AWK Mastery
Overview
AWK is a pattern-scanning and processing language. It excels at column-based text processing, making it essential for log analysis and data extraction.
Field Reference
| Variable | Description | Example |
|---|---|---|
| `$0` | Entire line | `awk '{print $0}' file` |
| `$1`, `$2`, … | Individual fields | `awk '{print $1, $2}' file` |
| `$NF` | Last field | `awk '{print $NF}' file` |
| `$(NF-1)` | Second to last | `awk '{print $(NF-1)}' file` |
| `NF` | Number of fields | `awk '{print NF}' file` |
| `NR` | Current record number | `awk '{print NR}' file` |
| `FNR` | Record number in current file | `awk '{print FNR}' file1 file2` |
| `FS` | Field separator (input) | `awk 'BEGIN{FS=":"}' file` |
| `OFS` | Output field separator | `awk 'BEGIN{OFS="\t"}' file` |
| `RS` | Record separator | `awk 'BEGIN{RS=""}' file` |
| `ORS` | Output record separator | `awk 'BEGIN{ORS="\n\n"}' file` |
| `FILENAME` | Current filename | `awk '{print FILENAME}' file` |
Field Delimiters
# Colon-separated (e.g. /etc/passwd)
awk -F':' '{print $1}' /etc/passwd
# Tab-separated
awk -F'\t' '{print $2}' file.tsv
# Multiple delimiters (comma or semicolon)
awk -F'[,;]' '{print $1}' file
# Set separator in a BEGIN block
awk 'BEGIN {FS=","} {print $1}' data.csv
Pattern Matching
Regular Expression Patterns
# Lines containing pattern
awk '/ERROR/' logfile
# Lines NOT containing pattern
awk '!/ERROR/' logfile
# Field matches pattern
awk '$1 ~ /admin/' file
# Field does NOT match
awk '$1 !~ /admin/' file
# Case insensitive
awk 'tolower($0) ~ /error/' file
Actions
Print Variations
# Print specific fields
awk '{print $1, $3}' file
# Print with custom separator
awk -F':' '{print $1 " -> " $7}' /etc/passwd
# Printf for formatting
awk '{printf "%-20s %10d\n", $1, $2}' file
# Print line numbers
awk '{print NR": "$0}' file
Variables and Arithmetic
# Sum a column
awk '{sum += $3} END {print sum}' file
# Average
awk '{sum += $3; count++} END {print sum/count}' file
# Count lines
awk 'END {print NR}' file
# Running total
awk '{total += $3; print $0, total}' file
String Manipulation
# Length
awk '{print length($1), $1}' file
# Substring
awk '{print substr($1, 1, 5)}' file
# Index (find position)
awk '{print index($0, "ERROR")}' file
# Split into array
awk '{split($1, a, "-"); print a[1], a[2]}' file
# Substitute (first occurrence)
awk '{sub(/old/, "new"); print}' file
# Substitute (all occurrences)
awk '{gsub(/old/, "new"); print}' file
# To lowercase
awk '{print tolower($1)}' file
# To uppercase
awk '{print toupper($1)}' file
Arrays
Associative Arrays
# Count occurrences
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file
# Sum by category
awk '{sum[$1] += $2} END {for (k in sum) print k, sum[k]}' file
# Unique values
awk '!seen[$1]++' file
# Check existence
awk '{if ($1 in seen) print "DUP:", $0; seen[$1]=1}' file
BEGIN and END
awk '
BEGIN {
FS = ":"
print "Processing /etc/passwd"
print "========================"
}
{
users++
if ($3 >= 1000) regular++
}
END {
print "========================"
print "Total users:", users
print "Regular users:", regular
}
' /etc/passwd
Production Examples
Log Analysis
# Apache access log - requests per IP
awk '{count[$1]++} END {for (ip in count) print count[ip], ip}' \
access.log | sort -rn | head -20
# HTTP status code distribution
awk '{count[$9]++} END {for (s in count) print s, count[s]}' access.log
# Requests per hour
awk -F'[/:]' '{hour=$4; count[hour]++} END {
for (h in count) print h":00", count[h]
}' access.log | sort -t: -k1n
# Slow requests (response time > 5s)
awk '$NF > 5 {print}' access.log
# Bandwidth by URL
awk '{sum[$7] += $10} END {
for (url in sum) print sum[url], url
}' access.log | sort -rn | head -20
System Administration
# Users with shells
awk -F: '$7 ~ /bash|zsh|fish/ {print $1, $7}' /etc/passwd
# Large processes (memory > 1%)
ps aux | awk '$4 > 1.0 {print $4"%", $11}'
# Disk usage > 80%
df -h | awk '$5+0 > 80 {print $6, $5}'
# Network connections by state
ss -ta | awk 'NR>1 {count[$1]++} END {for (s in count) print s, count[s]}'
# Top 10 largest files in directory
ls -lS | awk 'NR>1 {print $5, $9}' | head -10
CSV Processing
# Extract columns (skip header)
awk -F',' 'NR>1 {print $1, $3}' data.csv
# Sum column
awk -F',' 'NR>1 {sum += $3} END {print "Total:", sum}' data.csv
# Filter by condition
awk -F',' '$3 > 100' data.csv
# Convert to TSV
awk -F',' '{$1=$1; print}' OFS='\t' data.csv
# Add header
awk 'BEGIN {print "Name,Value,Status"} {print}' data.csv
SIEM and Security
# Failed SSH attempts per IP
grep "Failed password" /var/log/auth.log | \
awk '{print $(NF-3)}' | sort | uniq -c | sort -rn | head -20
# Successful logins
grep "Accepted" /var/log/auth.log | \
awk '{print $9, $11}' | sort | uniq
# Extract IPs from any log
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
print substr($0, RSTART, RLENGTH)
}' logfile | sort -u
# Parse syslog priority
awk -F'[<>]' '{
priority = $2
facility = int(priority / 8)
severity = priority % 8
print "Facility:", facility, "Severity:", severity
}' syslog
Data Transformation
# JSON-like output
awk -F: '{
printf "{\"user\": \"%s\", \"uid\": %s, \"shell\": \"%s\"}\n", $1, $3, $7
}' /etc/passwd
# Pivot data (rows to columns)
awk '{
key[$1] = $1
val[$1] = val[$1] " " $2
} END {
for (k in key) print key[k], val[k]
}' file
# Transpose (columns to rows)
awk '{
for (i=1; i<=NF; i++) a[NR,i] = $i
if (NF > max) max = NF
}
END {
for (i=1; i<=max; i++) {
for (j=1; j<=NR; j++) printf "%s ", a[j,i]
print ""
}
}' file
Multi-file Processing
# Process multiple files with filename
awk '{print FILENAME ":" NR ":" $0}' file1 file2
# Compare files (find common lines)
awk 'NR==FNR {a[$0]; next} $0 in a' file1 file2
# Find lines only in file1
awk 'NR==FNR {a[$0]; next} !($0 in a)' file2 file1
# Merge files by key
awk -F: 'NR==FNR {a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1 file2
Built-in Functions Reference
| Function | Description | Example |
|---|---|---|
| `length(s)` | String length | `length($1)` |
| `substr(s, start, len)` | Substring | `substr($0, 1, 5)` |
| `index(s, t)` | Find position | `index($0, "ERROR")` |
| `split(s, arr, sep)` | Split into array | `split($1, a, "-")` |
| `sub(regex, repl)` | Replace first | `sub(/old/, "new")` |
| `gsub(regex, repl)` | Replace all | `gsub(/old/, "new")` |
| `match(s, regex)` | Regex match | `match($0, /[0-9]+/)` |
| `tolower(s)` | Lowercase | `tolower($1)` |
| `toupper(s)` | Uppercase | `toupper($1)` |
| `sprintf(fmt, ...)` | Format string | `sprintf("%05d", $1)` |
| `int(x)` | Integer | `int(3.9)` |
| `sqrt(x)` | Square root | `sqrt($1)` |
| `log(x)` | Natural log | `log($1)` |
| `exp(x)` | Exponential | `exp($1)` |
| `rand()` | Random 0-1 | `rand()` |
| `srand(seed)` | Seed random | `srand(42)` |
| `system(cmd)` | Run command | `system("date")` |
| `getline` | Read next line | `getline line < "file"` |
CTF and Security Patterns
Flag Extraction
# Find flag patterns
strings binary | awk '/flag\{.*\}/ {print}'
awk '/CTF\{[^}]+\}/ {match($0, /CTF\{[^}]+\}/); print substr($0, RSTART, RLENGTH)}' file
# Extract between markers
awk '/BEGIN FLAG/,/END FLAG/' file
# Decode hex
echo "48656c6c6f" | awk '{
for (i=1; i<=length($0); i+=2)
printf "%c", strtonum("0x" substr($0,i,2))
print ""
}'
IP and Network Analysis
# Extract and count IPs
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
ip = substr($0, RSTART, RLENGTH)
count[ip]++
}
END {
for (ip in count) print count[ip], ip
}' access.log | sort -rn
# Filter by IP range
awk -F'.' '$1==10 && $2==50 {print}' file
# Calculate bandwidth per IP
awk '{bytes[$1] += $10} END {
for (ip in bytes) printf "%s: %.2f MB\n", ip, bytes[ip]/1048576
}' access.log | sort -t: -k2 -rn
Credential Hunting
# Find potential passwords
awk '/password|passwd|pwd|secret|token|key/ && /[=:]/ {print}' config_files
# Extract key=value pairs
awk -F'[=:]' '/password|api_key|secret/ {
gsub(/^[ \t]+|[ \t]+$/, "", $1)
gsub(/^[ \t"'\'']+|[ \t"'\'']+$/, "", $2)
print $1 " => " $2
}' file
# Parse environment variables
awk -F= '/^[A-Z_]+.*=/ {print $1, "=", $2}' .env
Data Transformation Patterns
CSV Operations
# CSV to JSON array
awk -F',' 'NR==1 {
for (i=1; i<=NF; i++) header[i]=$i
next
}
{
printf "{"
for (i=1; i<=NF; i++) {
printf "\"%s\":\"%s\"", header[i], $i
if (i<NF) printf ","
}
print "},"
}' data.csv
# Transpose CSV (rows to columns)
awk -F',' '{
for (i=1; i<=NF; i++) a[NR,i]=$i
if (NF>max) max=NF
}
END {
for (i=1; i<=max; i++) {
for (j=1; j<=NR; j++) printf "%s%s", a[j,i], (j<NR?",":"")
print ""
}
}' data.csv
# Merge two CSVs by first column
awk -F',' 'NR==FNR {a[$1]=$0; next} $1 in a {print a[$1] "," $2}' file1.csv file2.csv
# Pivot table
awk -F',' 'NR>1 {
sum[$1,$2] += $3
rows[$1]++
cols[$2]++
}
END {
for (r in rows) {
printf "%s", r
for (c in cols) printf ",%s", sum[r,c]+0
print ""
}
}' data.csv
Text Cleanup
# Normalize whitespace
awk '{$1=$1; print}' file
# Remove duplicate consecutive lines
awk 'prev != $0 {print} {prev=$0}' file
# Trim all fields
awk '{for(i=1;i<=NF;i++) gsub(/^[ \t]+|[ \t]+$/, "", $i); print}' file
# Convert CamelCase to snake_case
echo "CamelCaseString" | awk '{
gsub(/([A-Z])/, "_&"); sub(/^_/, ""); print tolower($0)
}'
# Wrap long lines at 80 characters
awk '{
while (length($0) > 80) {
print substr($0, 1, 80)
$0 = substr($0, 81)
}
print
}' file
Reporting
# Create formatted report
awk 'BEGIN {
printf "%-20s %10s %10s\n", "NAME", "VALUE", "PERCENT"
printf "%-20s %10s %10s\n", "----", "-----", "-------"
}
{
total += $2
data[NR] = $1 ":" $2
}
END {
for (i=1; i<=NR; i++) {
split(data[i], parts, ":")
printf "%-20s %10d %9.1f%%\n", parts[1], parts[2], (parts[2]/total)*100
}
printf "%-20s %10d\n", "TOTAL", total
}' data.txt
# Histogram
awk '{
count[$1]++
if (count[$1] > max) max = count[$1]
}
END {
for (k in count) {
bar = ""
for (i=0; i<count[k]; i++) bar = bar "#"
printf "%-20s %5d %s\n", k, count[k], bar
}
}' data.txt | sort -k2 -rn
Advanced Date/Time
# Parse and reformat dates
awk '{
split($1, d, "-")
printf "%s/%s/%s %s\n", d[2], d[3], d[1], $2
}' file # 2026-02-13 -> 02/13/2026
# Filter by date range
awk -F',' '$1 >= "2026-02-01" && $1 <= "2026-02-28"' data.csv
# Calculate time differences (HH:MM:SS)
awk 'BEGIN {FS=":"} {
total = $1*3600 + $2*60 + $3
printf "%d seconds\n", total
}' <<< "01:30:45"
# Aggregate by hour
awk -F'[: ]' '{hour[$2]++} END {for (h in hour) print h":00", hour[h]}' logfile | sort
Inline awk Programs
# One-liner templates
awk 'BEGIN{action} pattern{action} END{action}' file
# Useful shorthands
awk '1' # Print all (same as cat)
awk 'NF' # Print non-empty lines
awk '!a[$0]++' # Remove duplicate lines
awk '{print NF}' # Count fields per line
awk 'END{print NR}' # Count lines
awk '{s+=$1}END{print s}' # Sum first column
awk 'NR==10' # Print line 10
awk 'NR>=10&&NR<=20' # Print lines 10-20
awk '/start/,/end/' # Print range
awk 'NR%2' # Print odd lines
awk '!(NR%2)' # Print even lines
awk '{print NR": "$0}' # Number lines
awk '{$NF=""};1' # Remove last field
awk '{$1=""};1' # Remove first field
awk 'gsub(/old/,"new")' # Only print changed lines
awk 'length>80' # Lines longer than 80
awk '{print $NF}' # Print last field
awk '{print $(NF-1)}' # Print second to last
awk 'NF>5' # Lines with more than 5 fields
awk '/pattern/{getline;print}' # Print line after match
awk '/pattern/{print;getline;print}' # Print match and next line
Quick Reference
# Print first field
awk '{print $1}' file
# Print last field
awk '{print $NF}' file
# Print fields 1 and 3
awk '{print $1, $3}' file
# Custom delimiter
awk -F':' '{print $1}' file
# Sum column 3
awk '{sum+=$3} END {print sum}' file
# Count lines matching pattern
awk '/pattern/ {count++} END {print count}' file
# Remove duplicates
awk '!seen[$0]++' file
# Print lines longer than 80 chars
awk 'length > 80' file
# Add line numbers
awk '{print NR, $0}' file
# Print every other line
awk 'NR % 2 == 1' file
# Filter and transform
awk -F',' '$3>100 {print $1, $3*2}' file
# Group and sum
awk -F',' '{sum[$1]+=$2} END {for(k in sum) print k,sum[k]}' file