Data Extraction Mastery

Overview

Real-world data extraction using command-line tools. No Python needed for most tasks.

CSV Processing

Basic Operations with cut

# Extract column 2
cut -d',' -f2 data.csv

# Extract columns 1, 3, 5
cut -d',' -f1,3,5 data.csv

# Extract columns 2-4
cut -d',' -f2-4 data.csv

# Skip header
tail -n +2 data.csv | cut -d',' -f2

AWK for CSV

# Print specific column
awk -F',' '{print $2}' data.csv

# Print with custom output
awk -F',' '{print $1 ": " $3}' data.csv

# Filter by column value
awk -F',' '$3 > 100 {print $1, $3}' data.csv

# Sum a column
awk -F',' '{sum += $3} END {print "Total:", sum}' data.csv

# Average
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Avg:", sum/count}' data.csv

# Group by column
awk -F',' 'NR>1 {sum[$1] += $3} END {for (k in sum) print k, sum[k]}' data.csv

Handling Quoted CSV

# CSV with quotes: "field1","field,with,comma","field3"

# Using gawk with FPAT (field pattern)
gawk 'BEGIN {FPAT = "([^,]+)|(\"[^\"]+\")"} {print $2}' data.csv

# Using csvkit (install: pip install csvkit)
csvcut -c 2 data.csv
csvgrep -c 3 -m "pattern" data.csv
csvsort -c 2 data.csv

# Using Miller (mlr)
mlr --csv cut -f name,age data.csv
mlr --csv filter '$age > 30' data.csv
mlr --csv sort -f age data.csv

CSV Transformation

# CSV to TSV (naive: breaks if quoted fields contain commas — use mlr/csvkit for those)
sed 's/,/\t/g' data.csv > data.tsv

# TSV to CSV
sed 's/\t/,/g' data.tsv > data.csv

# Add header
echo "name,age,city" | cat - data.csv > data_with_header.csv

# Remove header
tail -n +2 data.csv > data_no_header.csv

# Reorder columns
awk -F',' -v OFS=',' '{print $3,$1,$2}' data.csv

# Add new column
awk -F',' -v OFS=',' '{print $0, $2*2}' data.csv

# Merge two CSVs by key (simple join)
join -t',' -1 1 -2 1 <(sort -t',' -k1,1 file1.csv) <(sort -t',' -k1,1 file2.csv)

CSV Statistics

# Count rows
wc -l data.csv

# Count unique values in column
awk -F',' '{print $2}' data.csv | sort | uniq -c | sort -rn

# Min/Max
awk -F',' 'NR==2 {min=max=$3} NR>1 {if($3<min)min=$3; if($3>max)max=$3} END {print "Min:",min,"Max:",max}' data.csv

# Standard deviation (awk)
awk -F',' 'NR>1 {sum+=$3; sumsq+=$3*$3; n++} END {print "StdDev:", sqrt(sumsq/n - (sum/n)^2)}' data.csv

JSON Processing

jq Basics

# Pretty print
cat data.json | jq '.'

# Extract field
cat data.json | jq '.name'

# Remove quotes from string
cat data.json | jq -r '.name'

# Multiple fields
cat data.json | jq '{name: .name, age: .age}'

# Array elements
cat data.json | jq '.users[]'
cat data.json | jq '.users[0]'
cat data.json | jq '.users[0:3]'

jq Filtering and Mapping

# Filter array
cat data.json | jq '.users[] | select(.age > 30)'

# Map to new structure
cat data.json | jq '.users | map({n: .name, a: .age})'

# Get all values of key
cat data.json | jq '.users[].name'

# Unique values
cat data.json | jq '[.users[].city] | unique'

# Sort
cat data.json | jq '.users | sort_by(.age)'

# Length
cat data.json | jq '.users | length'

# Group by
cat data.json | jq '.users | group_by(.city) | map({city: .[0].city, count: length})'

jq Transformation

# Add field
cat data.json | jq '.users[] | . + {status: "active"}'

# Remove field
cat data.json | jq '.users[] | del(.password)'

# Update field
cat data.json | jq '.users[] | .age += 1'

# Conditional
cat data.json | jq '.users[] | if .age > 30 then .category = "senior" else .category = "junior" end'

# Combine arrays
jq -s '.[0].users + .[1].users' file1.json file2.json

# JSON to CSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @csv'

# JSON to TSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @tsv'

Nested JSON

# Deep access
cat data.json | jq '.config.database.host'

# With null handling
cat data.json | jq '.config.database.host // "localhost"'

# Flatten nested
cat data.json | jq '.orders[] | {id, customer: .customer.name, total: .items | map(.price) | add}'

# Recursive descent
cat data.json | jq '.. | .email? // empty'

Create JSON

# From variables
jq -n --arg name "John" --arg age 30 '{name: $name, age: ($age | tonumber)}'

# From CSV
cat data.csv | jq -R 'split(",") | {name: .[0], age: .[1]|tonumber}' | jq -s '.'

# Build array
echo '{}' | jq '. + {items: ["a","b","c"]}'

Log Parsing

Apache/Nginx Access Logs

# Standard format: IP - - [timestamp] "METHOD URL PROTO" STATUS SIZE

# Extract IPs
awk '{print $1}' access.log | sort | uniq -c | sort -rn

# Status codes
awk '{print $9}' access.log | sort | uniq -c | sort -rn

# 5xx errors
awk '$9 ~ /^5/ {print $0}' access.log

# URLs with 404
awk '$9 == "404" {print $7}' access.log | sort | uniq -c | sort -rn

# Requests per hour
awk '{print $4}' access.log | cut -d: -f1,2 | uniq -c

# Bandwidth by URL
awk '{sum[$7] += $10} END {for (url in sum) print sum[url], url}' access.log | sort -rn | head -20

# Top user agents
awk -F'"' '{print $6}' access.log | sort | uniq -c | sort -rn | head -20

# Combined pattern
awk '$9 >= 400 {
    ip[$1]++
    url[$7]++
}
END {
    cmd = "sort -rn | head -10"
    print "=== Top IPs with errors ==="
    fflush()
    for (i in ip) print ip[i], i | cmd
    close(cmd)   # flush this section now; without close() both loops merge into one pipe
    print "\n=== Top URLs with errors ==="
    fflush()
    for (u in url) print url[u], u | cmd
    close(cmd)
}' access.log

Syslog Format

# Format: Mon DD HH:MM:SS hostname process[pid]: message

# Extract by severity (if present)
grep -E '\[(error|crit|alert|emerg)\]' /var/log/syslog

# By process
grep 'sshd\[' /var/log/auth.log

# Extract SSH failed attempts
grep "Failed password" /var/log/auth.log | \
    sed -E 's/.*from ([0-9.]+).*/\1/' | \
    sort | uniq -c | sort -rn

# Time-based analysis
awk '{print $1, $2, $3}' /var/log/syslog | cut -d: -f1,2 | uniq -c

JSON Logs

# Common format: {"timestamp":"...","level":"...","message":"..."}

# Filter by level
cat app.log | jq -c 'select(.level == "ERROR")'

# Extract messages
cat app.log | jq -r '.message'

# Time range
cat app.log | jq -c 'select(.timestamp > "2026-02-13T00:00:00")'

# Aggregate by level
cat app.log | jq -r '.level' | sort | uniq -c

# Complex extraction
cat app.log | jq -r 'select(.level == "ERROR") | [.timestamp, .message] | @tsv'

Custom Log Formats

# Key=value format
grep "error" app.log | \
    sed -E 's/.*user=([^ ]+).*/\1/' | \
    sort | uniq -c | sort -rn

# Extract all key=value pairs
grep -oE '[a-z_]+=[^ ]+' logfile | \
    awk -F= '{print $1, $2}' | \
    sort | uniq -c

# Multi-line logs (stack traces)
awk '/Exception/,/^[^ ]/' app.log

# Parse with perl
perl -ne 'print "$1 $2\n" if /user=(\w+).*ip=(\d+\.\d+\.\d+\.\d+)/' logfile

Binary Data Extraction

hexdump and xxd

# Hex dump
xxd file.bin | head -20
hexdump -C file.bin | head -20

# Extract specific bytes
xxd -s 0x100 -l 64 file.bin

# Convert hex to binary
echo "48656c6c6f" | xxd -r -p

# Binary to base64
base64 file.bin > file.b64

# Search in hex
xxd -p file.bin | tr -d '\n' | grep -o "504b0304"  # ZIP magic bytes (-p: plain hex; default xxd inserts spaces that break the match)

strings

# Extract ASCII strings
strings binary_file

# Minimum length
strings -n 10 binary_file

# Include offset
strings -t x binary_file

# Search for patterns
strings binary_file | grep -E "password|secret|key"

# Wide strings (UTF-16)
strings -e l binary_file

File Analysis

# File type
file unknown_file

# Magic bytes
xxd -l 16 file.bin

# Embedded files
binwalk firmware.bin

# Extract embedded
binwalk -e firmware.bin

# Entropy analysis
ent file.bin

XML Processing

xmllint

# Pretty print
xmllint --format data.xml

# XPath query
xmllint --xpath "//user/name/text()" data.xml

# Multiple elements
xmllint --xpath "//user" data.xml

# Count elements
xmllint --xpath "count(//user)" data.xml

# With namespaces
xmllint --xpath "//*[local-name()='user']" data.xml

xmlstarlet

# Select elements
xmlstarlet sel -t -v "//user/name" data.xml

# With template
xmlstarlet sel -t -m "//user" -v "name" -o ": " -v "age" -n data.xml

# Edit XML
xmlstarlet ed -u "//user/age" -v "31" data.xml

# Delete element
xmlstarlet ed -d "//user[@id='1']" data.xml

# Add element
xmlstarlet ed -s "//users" -t elem -n "user" -v "" data.xml

Using grep and sed

# Simple tag extraction (when proper XML tools unavailable)
grep -oP '(?<=<name>)[^<]+' data.xml

# Extract attribute
grep -oP 'id="\K[^"]+' data.xml

# Remove XML tags
sed 's/<[^>]*>//g' data.xml

CTF Data Extraction

Common Patterns

# Find flags
grep -rn "flag{" .
grep -roE "flag\{[^}]+\}" .
strings binary | grep -E "CTF|flag|key"

# Base64 encoded data
grep -oE '[A-Za-z0-9+/]{40,}={0,2}' file | while read b64; do
    echo "=== Trying: ${b64:0:20}..."
    echo "$b64" | base64 -d 2>/dev/null | file - | grep -v "empty\|data"
done

# Hex encoded
grep -oE '[0-9a-fA-F]{20,}' file | while read hex; do
    echo "$hex" | xxd -r -p | file -
done

# ROT13/Caesar
cat cipher.txt | tr 'A-Za-z' 'N-ZA-Mn-za-m'

# All rotations
upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ
lower=abcdefghijklmnopqrstuvwxyz
for i in {1..25}; do
    echo "=== ROT$i ==="
    tr "A-Za-z" "${upper:i}${upper:0:i}${lower:i}${lower:0:i}" < cipher.txt
done

Memory Dump Analysis

# Extract strings with context
strings -n 8 memdump.bin | grep -A2 -B2 "password"

# Find URLs
strings memdump.bin | grep -oE "https?://[^[:space:]]+"   # [^\s] in ERE would match "not backslash/s", not non-whitespace

# Extract email addresses
strings memdump.bin | grep -oE "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Find file signatures
xxd -p memdump.bin | tr -d '\n' | grep -o "ffd8ff"    # JPEG (use -p: default xxd spacing breaks multi-byte matches)
xxd -p memdump.bin | tr -d '\n' | grep -o "89504e47"  # PNG
xxd -p memdump.bin | tr -d '\n' | grep -o "504b0304"  # ZIP

Network Capture Analysis

# Extract HTTP data with tshark
tshark -r capture.pcap -Y "http" -T fields -e http.host -e http.request.uri

# DNS queries
tshark -r capture.pcap -Y "dns.flags.response == 0" -T fields -e dns.qry.name

# Extract files
tshark -r capture.pcap --export-objects "http,/tmp/exported"

# Follow TCP stream
tshark -r capture.pcap -z "follow,tcp,ascii,0"

# Extract credentials
tshark -r capture.pcap -Y "http.authbasic" -T fields -e http.authbasic

Data Validation

Checksums and Hashes

# Generate
md5sum file.txt
sha256sum file.txt

# Verify
echo "expected_hash  file.txt" | sha256sum -c

# Multiple files
sha256sum *.txt > checksums.sha256
sha256sum -c checksums.sha256

# Extract hash from file
grep -oE '[0-9a-f]{64}' hashfile.txt

Data Cleaning

# Remove blank lines
sed '/^$/d' file

# Remove leading/trailing whitespace
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' file

# Remove duplicate lines (keeping order)
awk '!seen[$0]++' file

# Remove non-printable characters
tr -cd '[:print:]\n' < file

# Normalize line endings
sed 's/\r$//' file  # CRLF to LF

Quick Reference

# CSV
cut -d',' -f2 file.csv           # Extract column 2
awk -F',' '$3>100' file.csv      # Filter by condition
csvcut -c name,age file.csv      # csvkit extraction

# JSON
jq '.key' file.json              # Extract key
jq '.array[]' file.json          # Iterate array
jq -r '.key' file.json           # Raw output (no quotes)
jq 'select(.age>30)' file.json   # Filter

# Logs
awk '{print $1}' log | sort | uniq -c  # Count by first field
grep -oE 'pattern' log | sort | uniq   # Extract matching patterns

# Binary
strings file.bin                 # Extract ASCII strings
xxd file.bin | head              # Hex dump
file unknown                     # Identify file type

# XML
xmllint --xpath "//tag" file.xml # XPath query
grep -oP '(?<=<tag>)[^<]+' file  # Quick extraction