Data Extraction Mastery

Overview

Real-world data extraction using command-line tools. No Python needed for most tasks.

CSV Processing

Basic Operations with cut

# Extract column 2
cut -d',' -f2 data.csv

# Extract columns 1, 3, 5
cut -d',' -f1,3,5 data.csv

# Extract columns 2-4
cut -d',' -f2-4 data.csv

# Skip header
tail -n +2 data.csv | cut -d',' -f2

AWK for CSV

# Print specific column
awk -F',' '{print $2}' data.csv

# Print with custom output
awk -F',' '{print $1 ": " $3}' data.csv

# Filter by column value
awk -F',' '$3 > 100 {print $1, $3}' data.csv

# Sum a column
awk -F',' '{sum += $3} END {print "Total:", sum}' data.csv

# Average
awk -F',' 'NR>1 {sum+=$3; count++} END {print "Avg:", sum/count}' data.csv

# Group by column
awk -F',' 'NR>1 {sum[$1] += $3} END {for (k in sum) print k, sum[k]}' data.csv

Handling Quoted CSV

# CSV with quotes: "field1","field,with,comma","field3"

# Using gawk with FPAT (field pattern)
gawk 'BEGIN {FPAT = "([^,]+)|(\"[^\"]+\")"} {print $2}' data.csv

# Using csvkit (install: pip install csvkit)
csvcut -c 2 data.csv
csvgrep -c 3 -m "pattern" data.csv
csvsort -c 2 data.csv

# Using Miller (mlr)
mlr --csv cut -f name,age data.csv
mlr --csv filter '$age > 30' data.csv
mlr --csv sort -f age data.csv

CSV Transformation

# CSV to TSV (naive: breaks if quoted fields contain commas — use mlr/csvkit for those)
sed 's/,/\t/g' data.csv > data.tsv

# TSV to CSV
sed 's/\t/,/g' data.tsv > data.csv

# Add header
echo "name,age,city" | cat - data.csv > data_with_header.csv

# Remove header
tail -n +2 data.csv > data_no_header.csv

# Reorder columns
awk -F',' -v OFS=',' '{print $3,$1,$2}' data.csv

# Add new column
awk -F',' -v OFS=',' '{print $0, $2*2}' data.csv

# Merge two CSVs by key (simple join)
join -t',' -1 1 -2 1 <(sort -t',' -k1,1 file1.csv) <(sort -t',' -k1,1 file2.csv)

CSV Statistics

# Count rows
wc -l data.csv

# Count unique values in column
awk -F',' '{print $2}' data.csv | sort | uniq -c | sort -rn

# Min/Max
awk -F',' 'NR==2 {min=max=$3} NR>1 {if($3<min)min=$3; if($3>max)max=$3} END {print "Min:",min,"Max:",max}' data.csv

# Standard deviation (awk)
awk -F',' 'NR>1 {sum+=$3; sumsq+=$3*$3; n++} END {print "StdDev:", sqrt(sumsq/n - (sum/n)^2)}' data.csv

JSON Processing

jq Basics

# Pretty print
cat data.json | jq '.'

# Extract field
cat data.json | jq '.name'

# Remove quotes from string
cat data.json | jq -r '.name'

# Multiple fields
cat data.json | jq '{name: .name, age: .age}'

# Array elements
cat data.json | jq '.users[]'
cat data.json | jq '.users[0]'
cat data.json | jq '.users[0:3]'

jq Filtering and Mapping

# Filter array
cat data.json | jq '.users[] | select(.age > 30)'

# Map to new structure
cat data.json | jq '.users | map({n: .name, a: .age})'

# Get all values of key
cat data.json | jq '.users[].name'

# Unique values
cat data.json | jq '[.users[].city] | unique'

# Sort
cat data.json | jq '.users | sort_by(.age)'

# Length
cat data.json | jq '.users | length'

# Group by
cat data.json | jq '.users | group_by(.city) | map({city: .[0].city, count: length})'

jq Transformation

# Add field
cat data.json | jq '.users[] | . + {status: "active"}'

# Remove field
cat data.json | jq '.users[] | del(.password)'

# Update field
cat data.json | jq '.users[] | .age += 1'

# Conditional
cat data.json | jq '.users[] | if .age > 30 then .category = "senior" else .category = "junior" end'

# Combine arrays
jq -s '.[0].users + .[1].users' file1.json file2.json

# JSON to CSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @csv'

# JSON to TSV
cat data.json | jq -r '.users[] | [.name, .age, .city] | @tsv'

Nested JSON

# Deep access
cat data.json | jq '.config.database.host'

# With null handling
cat data.json | jq '.config.database.host // "localhost"'

# Flatten nested
cat data.json | jq '.orders[] | {id, customer: .customer.name, total: .items | map(.price) | add}'

# Recursive descent
cat data.json | jq '.. | .email? // empty'

Create JSON

# From variables
jq -n --arg name "John" --arg age 30 '{name: $name, age: ($age | tonumber)}'

# From CSV
cat data.csv | jq -R 'split(",") | {name: .[0], age: .[1]|tonumber}' | jq -s '.'

# Build array
echo '{}' | jq '. + {items: ["a","b","c"]}'

Log Parsing

Apache/Nginx Access Logs

# Standard format: IP - - [timestamp] "METHOD URL PROTO" STATUS SIZE

# Extract IPs
awk '{print $1}' access.log | sort | uniq -c | sort -rn

# Status codes
awk '{print $9}' access.log | sort | uniq -c | sort -rn

# 5xx errors
awk '$9 ~ /^5/ {print $0}' access.log

# URLs with 404
awk '$9 == "404" {print $7}' access.log | sort | uniq -c | sort -rn

# Requests per hour
awk '{print $4}' access.log | cut -d: -f1,2 | uniq -c

# Bandwidth by URL
awk '{sum[$7] += $10} END {for (url in sum) print sum[url], url}' access.log | sort -rn | head -20

# Top user agents
awk -F'"' '{print $6}' access.log | sort | uniq -c | sort -rn | head -20

# Combined pattern
awk '$9 >= 400 {
    ip[$1]++
    url[$7]++
}
END {
    cmd = "sort -rn | head -10"
    print "=== Top IPs with errors ==="
    fflush()
    for (i in ip) print ip[i], i | cmd
    close(cmd)   # flush this section now; without close() both loops merge into one pipe
    print "\n=== Top URLs with errors ==="
    fflush()
    for (u in url) print url[u], u | cmd
    close(cmd)
}' access.log

Syslog Format

# Format: Mon DD HH:MM:SS hostname process[pid]: message

# Extract by severity (if present)
grep -E '\[(error|crit|alert|emerg)\]' /var/log/syslog

# By process
grep 'sshd\[' /var/log/auth.log

# Extract SSH failed attempts
grep "Failed password" /var/log/auth.log | \
    sed -E 's/.*from ([0-9.]+).*/\1/' | \
    sort | uniq -c | sort -rn

# Time-based analysis
awk '{print $1, $2, $3}' /var/log/syslog | cut -d: -f1,2 | uniq -c

JSON Logs

# Common format: {"timestamp":"...","level":"...","message":"..."}

# Filter by level
cat app.log | jq -c 'select(.level == "ERROR")'

# Extract messages
cat app.log | jq -r '.message'

# Time range
cat app.log | jq -c 'select(.timestamp > "2026-02-13T00:00:00")'

# Aggregate by level
cat app.log | jq -r '.level' | sort | uniq -c

# Complex extraction
cat app.log | jq -r 'select(.level == "ERROR") | [.timestamp, .message] | @tsv'

Custom Log Formats

# Key=value format
grep "error" app.log | \
    sed -E 's/.*user=([^ ]+).*/\1/' | \
    sort | uniq -c | sort -rn

# Extract all key=value pairs
grep -oE '[a-z_]+=[^ ]+' logfile | \
    awk -F= '{print $1, $2}' | \
    sort | uniq -c

# Multi-line logs (stack traces)
awk '/Exception/,/^[^ ]/' app.log

# Parse with perl
perl -ne 'print "$1 $2\n" if /user=(\w+).*ip=(\d+\.\d+\.\d+\.\d+)/' logfile

Binary Data Extraction

hexdump and xxd

# Hex dump
xxd file.bin | head -20
hexdump -C file.bin | head -20

# Extract specific bytes
xxd -s 0x100 -l 64 file.bin

# Convert hex to binary
echo "48656c6c6f" | xxd -r -p

# Binary to base64
base64 file.bin > file.b64

# Search in hex
xxd -p file.bin | tr -d '\n' | grep -o "504b0304"  # ZIP magic bytes (-p: plain hex; default xxd inserts spaces that break the match)

strings

# Extract ASCII strings
strings binary_file

# Minimum length
strings -n 10 binary_file

# Include offset
strings -t x binary_file

# Search for patterns
strings binary_file | grep -E "password|secret|key"

# Wide strings (UTF-16)
strings -e l binary_file

File Analysis

# File type
file unknown_file

# Magic bytes
xxd -l 16 file.bin

# Embedded files
binwalk firmware.bin

# Extract embedded
binwalk -e firmware.bin

# Entropy analysis
ent file.bin

XML Processing

xmllint

# Pretty print
xmllint --format data.xml

# XPath query
xmllint --xpath "//user/name/text()" data.xml

# Multiple elements
xmllint --xpath "//user" data.xml

# Count elements
xmllint --xpath "count(//user)" data.xml

# With namespaces
xmllint --xpath "//*[local-name()='user']" data.xml

xmlstarlet

# Select elements
xmlstarlet sel -t -v "//user/name" data.xml

# With template
xmlstarlet sel -t -m "//user" -v "name" -o ": " -v "age" -n data.xml

# Edit XML
xmlstarlet ed -u "//user/age" -v "31" data.xml

# Delete element
xmlstarlet ed -d "//user[@id='1']" data.xml

# Add element
xmlstarlet ed -s "//users" -t elem -n "user" -v "" data.xml

Using grep and sed

# Simple tag extraction (when proper XML tools unavailable)
grep -oP '(?<=<name>)[^<]+' data.xml

# Extract attribute
grep -oP 'id="\K[^"]+' data.xml

# Remove XML tags
sed 's/<[^>]*>//g' data.xml

CTF Data Extraction

Common Patterns

# Find flags
grep -rn "flag{" .
grep -roE "flag\{[^}]+\}" .
strings binary | grep -E "CTF|flag|key"

# Base64 encoded data
grep -oE '[A-Za-z0-9+/]{40,}={0,2}' file | while read b64; do
    echo "=== Trying: ${b64:0:20}..."
    echo "$b64" | base64 -d 2>/dev/null | file - | grep -v "empty\|data"
done

# Hex encoded
grep -oE '[0-9a-fA-F]{20,}' file | while read hex; do
    echo "$hex" | xxd -r -p | file -
done

# ROT13/Caesar
cat cipher.txt | tr 'A-Za-z' 'N-ZA-Mn-za-m'

# All rotations
upper=ABCDEFGHIJKLMNOPQRSTUVWXYZ
lower=abcdefghijklmnopqrstuvwxyz
for i in {1..25}; do
    echo "=== ROT$i ==="
    tr "A-Za-z" "${upper:i}${upper:0:i}${lower:i}${lower:0:i}" < cipher.txt
done

Memory Dump Analysis

# Extract strings with context
strings -n 8 memdump.bin | grep -A2 -B2 "password"

# Find URLs
strings memdump.bin | grep -oE "https?://[^[:space:]]+"   # [^\s] in ERE would match "not backslash/s", not non-whitespace

# Extract email addresses
strings memdump.bin | grep -oE "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# Find file signatures
xxd -p memdump.bin | tr -d '\n' | grep -o "ffd8ff"    # JPEG (use -p: default xxd spacing breaks multi-byte matches)
xxd -p memdump.bin | tr -d '\n' | grep -o "89504e47"  # PNG
xxd -p memdump.bin | tr -d '\n' | grep -o "504b0304"  # ZIP

Network Capture Analysis

# Extract HTTP data with tshark
tshark -r capture.pcap -Y "http" -T fields -e http.host -e http.request.uri

# DNS queries
tshark -r capture.pcap -Y "dns.flags.response == 0" -T fields -e dns.qry.name

# Extract files
tshark -r capture.pcap --export-objects "http,/tmp/exported"

# Follow TCP stream
tshark -r capture.pcap -z "follow,tcp,ascii,0"

# Extract credentials
tshark -r capture.pcap -Y "http.authbasic" -T fields -e http.authbasic

Data Validation

Checksums and Hashes

# Generate
md5sum file.txt
sha256sum file.txt

# Verify
echo "expected_hash  file.txt" | sha256sum -c

# Multiple files
sha256sum *.txt > checksums.sha256
sha256sum -c checksums.sha256

# Extract hash from file
grep -oE '[0-9a-f]{64}' hashfile.txt

Data Cleaning

# Remove blank lines
sed '/^$/d' file

# Remove leading/trailing whitespace
sed 's/^[[:space:]]*//;s/[[:space:]]*$//' file

# Remove duplicate lines (keeping order)
awk '!seen[$0]++' file

# Remove non-printable characters
tr -cd '[:print:]\n' < file

# Normalize line endings
sed 's/\r$//' file  # CRLF to LF

Quick Reference

# CSV
cut -d',' -f2 file.csv           # Extract column 2
awk -F',' '$3>100' file.csv      # Filter by condition
csvcut -c name,age file.csv      # csvkit extraction

# JSON
jq '.key' file.json              # Extract key
jq '.array[]' file.json          # Iterate array
jq -r '.key' file.json           # Raw output (no quotes)
jq 'select(.age>30)' file.json   # Filter

# Logs
awk '{print $1}' log | sort | uniq -c  # Count by first field
grep -oE 'pattern' log | sort | uniq   # Extract matching patterns

# Binary
strings file.bin                 # Extract ASCII strings
xxd file.bin | head              # Hex dump
file unknown                     # Identify file type

# XML
xmllint --xpath "//tag" file.xml # XPath query
grep -oP '(?<=<tag>)[^<]+' file  # Quick extraction