Data Extraction Mastery
CSV Processing
Basic Operations with cut
# Extract column 2
cut -d',' -f2 data.csv
# Extract columns 1, 3, 5
cut -d',' -f1,3,5 data.csv
# Extract columns 2-4
cut -d',' -f2-4 data.csv
# Skip header
tail -n +2 data.csv | cut -d',' -f2
AWK for CSV
# Print specific column
awk -F',' '{print $2}' data.csv
# Print with custom output
awk -F',' '{print $1 ": " $3}' data.csv
# Filter by column value
awk -F',' '$3 > 100 {print $1, $3}' data.csv
# Sum a column
awk -F',' '{sum += $3} END {print "Total:", sum}' data.csv
# Average
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Avg:", sum/count}' data.csv
# Group by column
awk -F',' 'NR>1 {sum[$1] += $3} END {for (k in sum) print k, sum[k]}' data.csv
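A quick worked example of the group-by pattern, assuming a hypothetical sales.csv:
# sales.csv:
#   region,item,amount
#   east,widget,100
#   west,widget,250
#   east,gadget,50
awk -F',' 'NR>1 {sum[$1] += $3} END {for (k in sum) print k, sum[k]}' sales.csv
# expected output (key order not guaranteed):
#   east 150
#   west 250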
Handling Quoted CSV
# CSV with quotes: "field1","field,with,comma","field3"
# Using gawk with FPAT (field pattern)
gawk 'BEGIN {FPAT = "([^,]+)|(\"[^\"]+\")"} {print $2}' data.csv
# Using csvkit (install: pip install csvkit)
csvcut -c 2 data.csv
csvgrep -c 3 -m "pattern" data.csv
csvsort -c 2 data.csv
# Using Miller (mlr)
mlr --csv cut -f name,age data.csv
mlr --csv filter '$age > 30' data.csv
mlr --csv sort -f age data.csv
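Because Miller parses CSV properly, it can also convert between formats without quoting pitfalls (a sketch):
mlr --icsv --ojson cat data.csv    # CSV to JSON
mlr --icsv --otsv cat data.csv     # CSV to TSV, quoted commas handled correctly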
CSV Transformation
# CSV to TSV (naive: assumes no quoted fields containing commas)
sed 's/,/\t/g' data.csv > data.tsv
# TSV to CSV (same caveat)
sed 's/\t/,/g' data.tsv > data.csv
# Add header
echo "name,age,city" | cat - data.csv > data_with_header.csv
# Remove header
tail -n +2 data.csv > data_no_header.csv
# Reorder columns
awk -F',' -v OFS=',' '{print $3,$1,$2}' data.csv
# Add new column
awk -F',' -v OFS=',' '{print $0, $2*2}' data.csv
# Merge two CSVs by key (simple join)
join -t',' -1 1 -2 1 <(sort -t',' -k1,1 file1.csv) <(sort -t',' -k1,1 file2.csv)
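To illustrate the join, assume two hypothetical headerless files keyed on column 1:
# users.csv:           orders.csv:
#   1,alice              1,book
#   2,bob                2,pen
join -t',' -1 1 -2 1 <(sort -t',' -k1,1 users.csv) <(sort -t',' -k1,1 orders.csv)
# expected output:
#   1,alice,book
#   2,bob,pen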
CSV Statistics
# Count rows
wc -l data.csv
# Count unique values in column
awk -F',' '{print $2}' data.csv | sort | uniq -c | sort -rn
# Min/Max
awk -F',' 'NR==2 {min=max=$3} NR>1 {if($3<min)min=$3; if($3>max)max=$3} END {print "Min:",min,"Max:",max}' data.csv
# Standard deviation (awk)
awk -F',' 'NR>1 {sum+=$3; sumsq+=$3*$3; n++} END {print "StdDev:", sqrt(sumsq/n - (sum/n)^2)}' data.csv
JSON Processing
jq Basics
# Pretty print
cat data.json | jq '.'
# Extract field
cat data.json | jq '.name'
# Remove quotes from string
cat data.json | jq -r '.name'
# Multiple fields
cat data.json | jq '{name: .name, age: .age}'
# Array elements
cat data.json | jq '.users[]'
cat data.json | jq '.users[0]'
cat data.json | jq '.users[0:3]'
jq Filtering and Mapping
# Filter array
cat data.json | jq '.users[] | select(.age > 30)'
# Map to new structure
cat data.json | jq '.users | map({n: .name, a: .age})'
# Get all values of key
cat data.json | jq '.users[].name'
# Unique values
cat data.json | jq '[.users[].city] | unique'
# Sort
cat data.json | jq '.users | sort_by(.age)'
# Length
cat data.json | jq '.users | length'
# Group by
cat data.json | jq '.users | group_by(.city) | map({city: .[0].city, count: length})'
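For reference, the group_by line on a hypothetical data.json:
# data.json (hypothetical content):
#   {"users":[{"name":"ann","city":"NY"},{"name":"bob","city":"NY"},{"name":"cat","city":"LA"}]}
cat data.json | jq -c '.users | group_by(.city) | map({city: .[0].city, count: length})'
# result: [{"city":"LA","count":1},{"city":"NY","count":2}]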
jq Transformation
# Add field
cat data.json | jq '.users[] | . + {status: "active"}'
# Remove field
cat data.json | jq '.users[] | del(.password)'
# Update field
cat data.json | jq '.users[] | .age += 1'
# Conditional
cat data.json | jq '.users[] | if .age > 30 then .category = "senior" else .category = "junior" end'
# Combine arrays
jq -s '.[0].users + .[1].users' file1.json file2.json
# JSON to CSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @csv'
# JSON to TSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @tsv'
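To put a header row on the CSV output, emit it before the data rows (same assumed .users structure as above):
cat data.json | jq -r '["name","age","city"], (.users[] | [.name, .age, .city]) | @csv'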
Nested JSON
# Deep access
cat data.json | jq '.config.database.host'
# With null handling
cat data.json | jq '.config.database.host // "localhost"'
# Flatten nested
cat data.json | jq '.orders[] | {id, customer: .customer.name, total: .items | map(.price) | add}'
# Recursive descent
cat data.json | jq '.. | .email? // empty'
Log Parsing
Apache/Nginx Access Logs
# Standard format: IP - - [timestamp] "METHOD URL PROTO" STATUS SIZE
# Extract IPs
awk '{print $1}' access.log | sort | uniq -c | sort -rn
# Status codes
awk '{print $9}' access.log | sort | uniq -c | sort -rn
# 5xx errors
awk '$9 ~ /^5/ {print $0}' access.log
# URLs with 404
awk '$9 == "404" {print $7}' access.log | sort | uniq -c | sort -rn
# Requests per hour
awk '{print $4}' access.log | cut -d: -f1,2 | uniq -c
# Bandwidth by URL
awk '{sum[$7] += $10} END {for (url in sum) print sum[url], url}' access.log | sort -rn | head -20
# Top user agents
awk -F'"' '{print $6}' access.log | sort | uniq -c | sort -rn | head -20
# Combined pattern
awk '$9 >= 400 {
  ip[$1]++
  url[$7]++
}
END {
  cmd = "sort -rn | head -10"
  print "=== Top IPs with errors ==="
  for (i in ip) print ip[i], i | cmd
  close(cmd)   # flush before reusing the pipe, so the two lists do not mix
  print "\n=== Top URLs with errors ==="
  for (u in url) print url[u], u | cmd
}' access.log
Syslog Format
# Format: Mon DD HH:MM:SS hostname process[pid]: message
# Extract by severity (if present)
grep -E '\[(error|crit|alert|emerg)\]' /var/log/syslog
# By process
grep 'sshd\[' /var/log/auth.log
# Extract SSH failed attempts
grep "Failed password" /var/log/auth.log | \
sed -E 's/.*from ([0-9.]+).*/\1/' | \
sort | uniq -c | sort -rn
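Building on the same extraction, the attempted username can be pulled out too (a sketch, assuming the standard OpenSSH "Failed password for [invalid user] NAME from IP" wording):
grep "Failed password" /var/log/auth.log | \
  sed -E 's/.*for (invalid user )?([^ ]+) from ([0-9.]+).*/\2 \3/' | \
  sort | uniq -c | sort -rn | head -20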
# Time-based analysis
awk '{print $1, $2, $3}' /var/log/syslog | cut -d: -f1,2 | uniq -c
JSON Logs
# Common format: {"timestamp":"...","level":"...","message":"..."}
# Filter by level
cat app.log | jq -c 'select(.level == "ERROR")'
# Extract messages
cat app.log | jq -r '.message'
# Time range
cat app.log | jq -c 'select(.timestamp > "2026-02-13T00:00:00")'
# Aggregate by level
cat app.log | jq -r '.level' | sort | uniq -c
# Complex extraction
cat app.log | jq -r 'select(.level == "ERROR") | [.timestamp, .message] | @tsv'
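Errors per hour can be derived by truncating the ISO timestamp (a sketch, assuming timestamps like 2026-02-13T14:05:00):
cat app.log | jq -r 'select(.level == "ERROR") | .timestamp[0:13]' | sort | uniq -c
# .timestamp[0:13] keeps "YYYY-MM-DDTHH"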
Custom Log Formats
# Key=value format
grep "error" app.log | \
sed -E 's/.*user=([^ ]+).*/\1/' | \
sort | uniq -c | sort -rn
# Extract all key=value pairs
grep -oE '[a-z_]+=[^ ]+' logfile | \
awk -F= '{print $1, $2}' | \
sort | uniq -c
# Multi-line logs (stack traces): exception line plus indented continuation lines
awk '/Exception/ {p=1; print; next} p && /^[[:space:]]/ {print; next} {p=0}' app.log
# Parse with perl
perl -ne 'print "$1 $2\n" if /user=(\w+).*ip=(\d+\.\d+\.\d+\.\d+)/' logfile
Binary Data Extraction
hexdump and xxd
# Hex dump
xxd file.bin | head -20
hexdump -C file.bin | head -20
# Extract specific bytes
xxd -s 0x100 -l 64 file.bin
# Convert hex to binary
echo "48656c6c6f" | xxd -r -p
# Binary to base64
base64 file.bin > file.b64
# Search for magic bytes (use -p: the default xxd output groups bytes as "504b 0304", which breaks the grep)
xxd -p file.bin | tr -d '\n' | grep -o "504b0304" | wc -l   # count ZIP magic bytes
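To carve a raw byte range into a separate file (rather than just viewing it), dd works; chunk.bin is just an example output name:
dd if=file.bin of=chunk.bin bs=1 skip=256 count=64   # 64 bytes starting at offset 0x100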
XML Processing
xmllint
# Pretty print
xmllint --format data.xml
# XPath query
xmllint --xpath "//user/name/text()" data.xml
# Multiple elements
xmllint --xpath "//user" data.xml
# Count elements
xmllint --xpath "count(//user)" data.xml
# With namespaces
xmllint --xpath "//*[local-name()='user']" data.xml
xmlstarlet
# Select elements
xmlstarlet sel -t -v "//user/name" data.xml
# With template
xmlstarlet sel -t -m "//user" -v "name" -o ": " -v "age" -n data.xml
# Edit XML
xmlstarlet ed -u "//user/age" -v "31" data.xml
# Delete element
xmlstarlet ed -d "//user[@id='1']" data.xml
# Add element
xmlstarlet ed -s "//users" -t elem -n "user" -v "" data.xml
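A quick worked example of the template form, assuming a hypothetical data.xml:
# data.xml: <users><user><name>ann</name><age>30</age></user><user><name>bob</name><age>25</age></user></users>
xmlstarlet sel -t -m "//user" -v "name" -o ": " -v "age" -n data.xml
# expected output:
#   ann: 30
#   bob: 25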
CTF Data Extraction
Common Patterns
# Find flags
grep -rn "flag{" .
grep -roE "flag\{[^}]+\}" .
strings binary | grep -E "CTF|flag|key"
# Base64 encoded data
grep -oE '[A-Za-z0-9+/]{40,}={0,2}' file | while read b64; do
echo "=== Trying: ${b64:0:20}..."
echo "$b64" | base64 -d 2>/dev/null | file - | grep -v "empty\|data"
done
# Hex encoded
grep -oE '[0-9a-fA-F]{20,}' file | while read hex; do
echo "$hex" | xxd -r -p | file -
done
# ROT13/Caesar
cat cipher.txt | tr 'A-Za-z' 'N-ZA-Mn-za-m'
# All rotations
for i in {1..25}; do
  echo "=== ROT$i ==="
  upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ; lower=abcdefghijklmnopqrstuvwxyz
  tr "${upper}${lower}" "${upper:$i}${upper:0:$i}${lower:$i}${lower:0:$i}" < cipher.txt
done
Memory Dump Analysis
# Extract strings with context
strings -n 8 memdump.bin | grep -A2 -B2 "password"
# Find URLs
strings memdump.bin | grep -oE 'https?://[^[:space:]]+'
# Extract email addresses
strings memdump.bin | grep -oE "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
# Find file signatures (use xxd -p so xxd's byte grouping doesn't split the pattern)
xxd -p memdump.bin | tr -d '\n' | grep -o "ffd8ff" | wc -l     # JPEG
xxd -p memdump.bin | tr -d '\n' | grep -o "89504e47" | wc -l   # PNG
xxd -p memdump.bin | tr -d '\n' | grep -o "504b0304" | wc -l   # ZIP
Network Capture Analysis
# Extract HTTP data with tshark
tshark -r capture.pcap -Y "http" -T fields -e http.host -e http.request.uri
# DNS queries
tshark -r capture.pcap -Y "dns.flags.response == 0" -T fields -e dns.qry.name
# Extract files
tshark -r capture.pcap --export-objects "http,/tmp/exported"
# Follow TCP stream
tshark -r capture.pcap -q -z "follow,tcp,ascii,0"
# Extract credentials
tshark -r capture.pcap -Y "http.authbasic" -T fields -e http.authbasic
Data Validation
Checksums and Hashes
# Generate
md5sum file.txt
sha256sum file.txt
# Verify (note: two spaces between hash and filename)
echo "expected_hash  file.txt" | sha256sum -c
# Multiple files
sha256sum *.txt > checksums.sha256
sha256sum -c checksums.sha256
# Extract hash from file
grep -oE '[0-9a-f]{64}' hashfile.txt
Data Cleaning
# Remove blank lines
sed '/^$/d' file
# Remove leading/trailing whitespace
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' file
# Remove duplicate lines (keeping order)
awk '!seen[$0]++' file
# Remove non-printable characters
tr -cd '[:print:]\n' < file
# Normalize line endings
sed 's/\r$//' file # CRLF to LF
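These steps chain into a single cleanup pass (a sketch; "cleaned" is just an example output name):
sed 's/\r$//; s/^[[:space:]]*//; s/[[:space:]]*$//' file | awk 'NF && !seen[$0]++' > cleaned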
Quick Reference
# CSV
cut -d',' -f2 file.csv # Extract column 2
awk -F',' '$3>100' file.csv # Filter by condition
csvcut -c name,age file.csv # csvkit extraction
# JSON
jq '.key' file.json # Extract key
jq '.array[]' file.json # Iterate array
jq -r '.key' file.json # Raw output (no quotes)
jq 'select(.age>30)' file.json # Filter
# Logs
awk '{print $1}' log | sort | uniq -c # Count by first field
grep -oE 'pattern' log | sort | uniq # Extract matching patterns
# Binary
strings file.bin # Extract ASCII strings
xxd file.bin | head # Hex dump
file unknown # Identify file type
# XML
xmllint --xpath "//tag" file.xml # XPath query
grep -oP '(?<=<tag>)[^<]+' file # Quick extraction