AWK Fundamentals
AWK field processing and pattern matching fundamentals.
Field Extraction
# Default: whitespace delimiter (any run of spaces/tabs)
awk '{print $1}' file # First field
awk '{print $1, $3}' file # Fields 1 and 3
awk '{print $NF}' file # Last field
awk '{print $(NF-1)}' file # Second to last
# Custom delimiter
awk -F':' '{print $1}' /etc/passwd # Colon separated
awk -F',' '{print $2}' data.csv # CSV (naive: breaks on quoted commas)
awk -F'\t' '{print $1}' data.tsv # Tab separated
awk -F'[,;:]' '{print $1}' file # Multiple delimiters (-F takes a regex)
# Output field separator (joins the comma-separated print arguments)
awk -F':' -v OFS=',' '{print $1, $3, $6}' /etc/passwd
# Field range (print fields 2 through 5)
awk '{for(i=2; i<=5; i++) printf "%s ", $i; print ""}' file
# All fields except first
# (clearing $1 rebuilds $0 with OFS; substr drops the leading separator)
awk '{$1=""; print substr($0,2)}' file
# Reformat output
awk -F':' '{printf "%-20s %s\n", $1, $6}' /etc/passwd
# Infrastructure: Parse ISE session fields
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .user_name, .nas_ip_address] | @tsv' | \
awk -F'\t' '{printf "MAC: %-20s User: %-15s Switch: %s\n", $1, $2, $3}'
# Infrastructure: Extract IP from interface output
# (first awk picks device + CIDR; second strips the /prefix-length)
ip -4 -o addr show | awk '{print $2, $4}' | awk -F'/' '{print $1}'
# Extract specific columns from ps (user, pid, cpu%, command)
ps aux | awk '{printf "%-10s %5s %5s %s\n", $1, $2, $3, $11}'
Built-in Variables
# Record/Field variables
# NR - Current record (line) number (across all files)
# FNR - Current record number in current file
# NF - Number of fields in current record
# $0 - Entire current record
# $n - nth field
# Separator variables
# FS - Input field separator (default: whitespace)
# OFS - Output field separator (default: space)
# RS - Input record separator (default: newline)
# ORS - Output record separator (default: newline)
# Print line numbers
awk '{print NR, $0}' file # Line number + content
# Field count per line
awk '{print NR, NF, "fields:", $0}' file
# Process specific line
awk 'NR==5 {print}' file # Only line 5
awk 'NR>=5 && NR<=10' file # Lines 5-10
awk 'NR>1' file # Skip header
# Last field of each line
awk '{print $NF}' file
# Process multiple files, track per-file line number
awk 'FNR==1 {print "=== " FILENAME " ==="} {print FNR, $0}' file1 file2
# Change output separator
# (OFS joins comma-separated print arguments; plain "print $0" is unaffected)
awk 'BEGIN {OFS=","} {print $1, $2, $3}' file
awk 'BEGIN {ORS="; "} {print $1}' file # Semicolon between records
# Multi-character field separator (a multi-char FS is treated as a regex)
awk 'BEGIN {FS="::"} {print $1, $2}' file
# Infrastructure: Format /etc/passwd as CSV
awk 'BEGIN {FS=":"; OFS=","} {print $1, $3, $6}' /etc/passwd
# Infrastructure: Numbered list of hosts
awk '{printf "%3d. %s\n", NR, $0}' hosts.txt
Pattern Matching
# Line contains pattern
awk '/ERROR/' file # Lines with ERROR
awk '/ERROR/ {print}' file # Same, explicit
awk '!/ERROR/' file # Lines WITHOUT ERROR
# Field matches pattern
awk '$1 ~ /^vault/' file # Field 1 starts with vault
awk '$3 !~ /failed/' file # Field 3 doesn't contain failed
# Case insensitive (POSIX)
awk 'tolower($0) ~ /error/' file
# Case insensitive (GNU awk)
awk 'BEGIN{IGNORECASE=1} /error/' file
# Multiple patterns
awk '/ERROR/ || /WARN/' file # OR
awk '/ERROR/ && /auth/' file # AND (on same line)
# Range patterns (print between START and END, markers included)
awk '/BEGIN/,/END/' file
# GOTCHA: /X/,/X/ with identical start/end patterns does NOT print the body.
# When one record matches both patterns, the range is that single record, so
# only the marker lines themselves print. Toggle a flag instead:
awk '/^---$/{f=!f; next} f' file # Body between YAML frontmatter markers
# Negated range (print everything EXCEPT between markers)
awk '/BEGIN/,/END/ {next} {print}' file
# Field-based conditions
awk '$3 > 100' file # Third field > 100
awk '$1 == "vault-01"' file # Exact match
awk 'length($2) > 10' file # Field length > 10
# Compound conditions
awk '$1 == "ERROR" && $3 > 5 {print $2, $4}' file
# Infrastructure: Filter failed ISE authentications
netapi ise mnt sessions --format json | jq -r '.[] | [.authentication_status, .user_name, .failure_reason] | @tsv' | \
awk -F'\t' '$1 == "FAILED" {print $2, $3}'
# Infrastructure: High CPU processes
ps aux | awk '$3 > 50 {printf "%-10s PID: %-6s CPU: %s%%\n", $1, $2, $3}'
Aggregation and Counting
# Sum a column
awk '{sum += $1} END {print "Total:", sum}' file
# Count lines
awk 'END {print NR}' file
# Count matching lines
# (prints an empty string when nothing matched; use count+0 to force 0)
awk '/ERROR/ {count++} END {print count}' file
# Average (NOTE: divides by zero on empty input)
awk '{sum += $1; count++} END {print "Avg:", sum/count}' file
# Min/Max
awk 'NR==1 || $1 < min {min=$1} NR==1 || $1 > max {max=$1} END {print "Min:", min, "Max:", max}' file
# Count occurrences (frequency)
awk '{count[$1]++} END {for(k in count) print k, count[k]}' file
# Sorted frequency (pipe to sort)
awk '{count[$1]++} END {for(k in count) print count[k], k}' file | sort -rn
# Group by and sum
awk '{sum[$1] += $2} END {for(k in sum) print k, sum[k]}' file
# Multi-dimensional aggregation (arrays of arrays: GNU awk only)
awk '{count[$1][$2]++} END {for(i in count) for(j in count[i]) print i, j, count[i][j]}' file
# Infrastructure: Count auth failures by user
netapi ise mnt sessions --format json | jq -r '.[] | select(.authentication_status == "FAILED") | .user_name' | \
awk '{count[$1]++} END {for(u in count) printf "%-20s %d failures\n", u, count[u]}' | sort -k2 -rn
# Infrastructure: Disk usage summary ($6=mount, $3=used, $4=avail)
df -h | awk 'NR>1 {used[$6]=$3; avail[$6]=$4} END {for(m in used) printf "%-30s Used: %8s Avail: %8s\n", m, used[m], avail[m]}'
# Infrastructure: Pod count by namespace
kubectl get pods -A --no-headers | awk '{count[$1]++} END {for(ns in count) printf "%-20s %d pods\n", ns, count[ns]}' | sort -k2 -rn
# Running totals
awk '{sum += $1; print $0, "Running:", sum}' file
BEGIN and END Blocks
# BEGIN: Execute before processing any input
# END: Execute after all input processed
# Print header and footer
awk 'BEGIN {print "=== Report ==="} {print} END {print "=== End ==="}' file
# Initialize variables (optional: unset awk variables default to 0 / "")
awk 'BEGIN {sum=0; count=0} {sum+=$1; count++} END {print sum/count}' file
# Set separators in BEGIN
awk 'BEGIN {FS=":"; OFS="\t"} {print $1, $3}' /etc/passwd
# Create formatted table
# (the trailing FS=':' is a command-line assignment: it is applied after
# BEGIN runs but before /etc/passwd is read, so field splitting still works)
awk 'BEGIN {
print "USERNAME UID SHELL"
print "------------------------------------"
}
{
printf "%-20s %-5s %s\n", $1, $3, $7
}
END {
print "------------------------------------"
print NR, "users total"
}' FS=':' /etc/passwd
# Multiple BEGIN/END blocks (executed in order)
awk 'BEGIN {print "Starting..."} BEGIN {print "Ready!"} {print} END {print "Done"}' file
# Infrastructure: ISE session report
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .user_name, .authentication_status] | @tsv' | \
awk 'BEGIN {
FS="\t"
print "MAC ADDRESS USER STATUS"
print "------------------------------------------------"
}
{
printf "%-20s %-16s %s\n", $1, $2, $3
if($3=="FAILED") fail++; else pass++
}
END {
print "------------------------------------------------"
printf "Passed: %d Failed: %d Total: %d\n", pass, fail, NR
}'
# No input, just generate (awk exits after BEGIN when there are no main rules)
awk 'BEGIN {for(i=1; i<=10; i++) print i}'
# Generate sequence with formatting
awk 'BEGIN {for(i=1; i<=10; i++) printf "Item %02d\n", i}'
String Functions
# length - string length
awk '{print length($1), $1}' file
awk 'length($0) > 80' file # Lines longer than 80 chars
# substr - extract substring (positions are 1-based)
awk '{print substr($1, 1, 3)}' file # First 3 chars
awk '{print substr($1, 4)}' file # From char 4 to end
# index - find position of substring (returns 0 if absent)
awk '{pos=index($0, "ERROR"); if(pos) print "Found at", pos}' file
# split - split string into array; returns the element count
awk '{n=split($1, arr, "-"); for(i=1; i<=n; i++) print arr[i]}' file
# sub - replace first occurrence (target defaults to $0)
awk '{sub(/old/, "new"); print}' file
# gsub - replace all occurrences (returns number of substitutions)
awk '{gsub(/old/, "new"); print}' file
awk '{gsub(/[^a-zA-Z0-9]/, "_"); print}' file # Replace non-alphanumeric
# tolower/toupper - case conversion
awk '{print tolower($0)}' file
awk '{print toupper($1)}' file
# sprintf - formatted string (printf that returns instead of printing)
awk '{s=sprintf("%05d-%s", NR, $1); print s}' file
# match - regex match; sets RSTART (position) and RLENGTH (match length)
awk '{if(match($0, /[0-9]+/)) print substr($0, RSTART, RLENGTH)}' file
# Infrastructure: Normalize MAC addresses
echo "14:F6:D8:7B:31:80" | awk '{gsub(/:/, "-"); print tolower($0)}'
# Infrastructure: Extract hostname from FQDN
echo "vault-01.inside.domusdigitalis.dev" | awk -F'.' '{print $1}'
# Infrastructure: Pad process IDs
ps aux | awk 'NR>1 {printf "PID: %06d CMD: %s\n", $2, $11}'
Conditionals and Control Flow
# if/else
awk '{if($3 > 100) print "HIGH:", $0; else print "LOW:", $0}' file
# Ternary operator
awk '{status = ($3 > 100) ? "HIGH" : "LOW"; print status, $0}' file
# Multiple conditions
awk '{
if($1 == "ERROR") print "ERROR:", $0
else if($1 == "WARN") print "WARNING:", $0
else print "INFO:", $0
}' file
# next - skip to next record
awk '/skip/ {next} {print}' file # Skip lines with "skip"
# exit - stop processing (the END block, if any, still runs)
awk '/STOP/ {exit} {print}' file # Print until STOP
# exit with code
awk 'END {exit (NR > 100) ? 1 : 0}' file # Exit 1 if >100 lines
# While loop
awk '{i=1; while(i<=NF) {print $i; i++}}' file
# For loop
awk '{for(i=1; i<=NF; i++) print i, $i}' file
# Infrastructure: Categorize log levels
# (NOTE(review): the cat is redundant - awk could read the file directly)
cat /var/log/messages | awk '{
level = "INFO"
if(/ERROR|FATAL|CRIT/) level = "CRITICAL"
else if(/WARN|WARNING/) level = "WARNING"
else if(/DEBUG/) level = "DEBUG"
print level, $0
}'
# Infrastructure: Health status
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .authentication_status] | @tsv' | \
awk -F'\t' '{
if($2 == "AUTHENTICATED") status = "✓"
else if($2 == "FAILED") status = "✗"
else status = "?"
printf "%s %s\n", status, $1
}'
Arrays
# Associative arrays (key-value)
awk '{count[$1]++} END {for(k in count) print k, count[k]}' file
# Check if key exists (the "in" test does not create the key)
awk '{if($1 in count) print "Duplicate:", $1; count[$1]++}' file
# Delete array element
awk '{count[$1]++} END {delete count["unwanted"]; for(k in count) print k, count[k]}' file
# Multi-dimensional arrays (arrays of arrays: GNU awk only)
awk '{data[$1][$2] = $3} END {for(i in data) for(j in data[i]) print i, j, data[i][j]}' file
# Array length (GNU awk)
awk '{arr[NR]=$0} END {print length(arr), "elements"}' file
# Sort array keys (GNU awk)
awk '{count[$1]++} END {PROCINFO["sorted_in"]="@val_num_desc"; for(k in count) print count[k], k}' file
# Infrastructure: Group ISE sessions by switch
# NOTE: the loop variable is "sw", not "switch" - "switch" is a reserved
# word in GNU awk (the switch/case statement) and is a syntax error there.
netapi ise mnt sessions --format json | jq -r '.[] | [.nas_ip_address, .calling_station_id] | @tsv' | \
awk -F'\t' '{
sessions[$1]++
macs[$1] = macs[$1] ? macs[$1] ", " $2 : $2
}
END {
for(sw in sessions) {
print "=== " sw " (" sessions[sw] " sessions) ==="
print macs[sw]
print ""
}
}'
# Infrastructure: Build lookup table
awk 'BEGIN {
hosts["vault-01"] = "10.50.1.60"
hosts["ise-01"] = "10.50.1.20"
hosts["bind-01"] = "10.50.1.90"
}
{
if($1 in hosts) print $1, "->", hosts[$1]
}' hostlist.txt
# Store all fields in array (arrays of arrays: GNU awk only)
awk '{for(i=1; i<=NF; i++) fields[NR][i]=$i} END {print fields[2][3]}' file
Output Formatting
# printf - formatted output (no automatic newline; add \n yourself)
awk '{printf "%-20s %10d %8.2f\n", $1, $2, $3}' file
# Format specifiers:
# %s - string
# %d - integer
# %f - float
# %e - scientific
# %x - hex
# %- - left align
# %10 - minimum width 10
# %.2 - 2 decimal places
# Right align numbers
awk '{printf "%10d\n", $1}' file
# Left align strings
awk '{printf "%-20s\n", $1}' file
# Fixed decimal places
awk '{printf "%.2f\n", $1}' file
# Leading zeros
awk '{printf "%05d\n", $1}' file
# Multiple columns (%% prints a literal percent sign)
awk '{printf "| %-15s | %8d | %6.2f%% |\n", $1, $2, $3}' file
# Table with header
awk 'BEGIN {
printf "%-15s %10s %10s\n", "HOST", "CPU%", "MEM%"
printf "%-15s %10s %10s\n", "----", "----", "----"
}
{
printf "%-15s %10.1f %10.1f\n", $1, $2, $3
}' stats.txt
# Infrastructure: Formatted pod listing
kubectl get pods -A --no-headers | awk '{
printf "%-20s %-40s %-10s %s\n", $1, $2, $4, $5
}'
# Infrastructure: Certificate expiry table
# (paste - - joins each cert's enddate and subject lines into one record;
# NOTE(review): splitting on [=,] assumes no extra '=' or ',' appears in
# the subject before the field of interest - fragile for complex subjects)
find /etc/ssl/certs -name "*.pem" -exec openssl x509 -in {} -noout -enddate -subject \; 2>/dev/null | \
paste - - | awk -F'[=,]' '{
gsub(/^ +| +$/, "", $2)
gsub(/^ +| +$/, "", $4)
printf "%-40s %s\n", substr($4, 1, 40), $2
}'
Multi-file Processing
# FILENAME - current file name
awk '{print FILENAME, $0}' file1 file2
# FNR vs NR
# NR - total records across all files
# FNR - record number in current file
awk 'FNR==1 {print "=== " FILENAME " ==="}' file1 file2 file3
# Per-file line counts
# (awk has no EOF variable - detect the switch to a new file at FNR==1 and
# flush the last file in END; GNU awk alternative: 'ENDFILE {print FILENAME, FNR}')
awk 'FNR==1 && NR>1 {print prevname, prevcnt} {prevcnt=FNR; prevname=FILENAME} END {print prevname, prevcnt}' file1 file2
# Process files differently based on argument position
# (NR==FNR is true while reading the first file; caveat: it stays true if
# the first file is empty)
awk 'NR==FNR {lookup[$1]=$2; next} $1 in lookup {print $0, lookup[$1]}' lookup.txt data.txt
# Compare two files (like diff)
awk 'NR==FNR {a[$0]; next} !($0 in a)' file1 file2 # Lines in file2 not in file1
# Find common lines
awk 'NR==FNR {a[$0]; next} $0 in a' file1 file2
# Join files on key
awk 'NR==FNR {data[$1]=$2; next} $1 in data {print $0, data[$1]}' file1.txt file2.txt
# Infrastructure: Compare host lists
awk 'NR==FNR {expected[$0]; next} !($0 in expected) {print "EXTRA:", $0}' expected_hosts.txt actual_hosts.txt
awk 'NR==FNR {actual[$0]; next} !($0 in actual) {print "MISSING:", $0}' actual_hosts.txt expected_hosts.txt
# Infrastructure: Enrich log with host info
awk 'NR==FNR {ip[$1]=$2; next} {print $0, "(" ip[$4] ")"}' hosts_lookup.txt access.log
Infrastructure Patterns
# SSH CA cert status across hosts
~/.local/bin/vault-ssh-test 2>&1 | awk '/===/ {host=$2} /Success|FAILED/ {
status = /Success/ ? "✓" : "✗"
printf "%s %-20s\n", status, host
}'
# k8s pod resource summary (strips the m / Mi unit suffixes before summing)
kubectl top pods -A --no-headers | awk '{
ns=$1; pod=$2
gsub(/m$/, "", $3); cpu=$3
gsub(/Mi$/, "", $4); mem=$4
ns_cpu[ns] += cpu
ns_mem[ns] += mem
ns_count[ns]++
}
END {
printf "%-20s %8s %10s %6s\n", "NAMESPACE", "CPU(m)", "MEM(Mi)", "PODS"
printf "%-20s %8s %10s %6s\n", "---------", "------", "-------", "----"
for(ns in ns_cpu) {
printf "%-20s %8d %10d %6d\n", ns, ns_cpu[ns], ns_mem[ns], ns_count[ns]
}
}'
# ISE session pivot table (by switch and status; arrays of arrays: GNU awk only)
netapi ise mnt sessions --format json | jq -r '.[] | [.nas_ip_address, .authentication_status] | @tsv' | \
awk -F'\t' '{
pivot[$1][$2]++
total[$1]++
}
END {
printf "%-20s %10s %10s %10s\n", "SWITCH", "PASSED", "FAILED", "TOTAL"
for(sw in total) {
printf "%-20s %10d %10d %10d\n", sw, pivot[sw]["AUTHENTICATED"]+0, pivot[sw]["FAILED"]+0, total[sw]
}
}'
# Log analysis: Requests per minute
# (assumes combined-log timestamps in $4, e.g. [10/Oct/2024:13:55:36)
awk '{
split($4, t, ":")
minute = t[2] ":" t[3]
count[minute]++
}
END {
for(m in count) print m, count[m]
}' access.log | sort
# Certificate expiry check
# NOTE: the epoch variable must NOT be named "exp" - exp() is a built-in awk
# function and using it as a variable is a syntax error.
# (date -d is GNU date; systime() is a GNU awk extension)
find /etc/ssl/certs -name "*.pem" -type f 2>/dev/null | while read -r cert; do
openssl x509 -in "$cert" -noout -enddate 2>/dev/null
done | awk -F'=' '{
cmd = "date -d \"" $2 "\" +%s"
cmd | getline end_ts
close(cmd)
now = systime()
days = int((end_ts - now) / 86400)
if(days < 30) printf "⚠️ %d days: %s\n", days, $2
}'
# Parse netstat/ss output (last :-separated chunk of $4 is the port)
ss -tlnp | awk 'NR>1 {
split($4, addr, ":")
port = addr[length(addr)]
gsub(/.*"/, "", $6)
gsub(/".*/, "", $6)
printf "%-6s %-20s\n", port, $6
}' | sort -n
# Vault audit log analysis
sudo cat /var/log/vault/audit.log | jq -r '[.time, .type, .request.operation, .request.path] | @tsv' | \
awk -F'\t' '{
ops[$3]++
paths[$4]++
}
END {
print "=== Operations ==="
for(op in ops) printf "%-15s %d\n", op, ops[op]
print "\n=== Top Paths ==="
for(path in paths) printf "%-40s %d\n", path, paths[path]
}' | sort -k2 -rn | head -20
AWK Gotchas
# WRONG: Forgetting to quote the awk program
awk {print $1} file # Shell expands $1!
# CORRECT: Always quote
awk '{print $1}' file
# WRONG: Using shell variables inside single quotes
var="pattern"
awk '/$var/ {print}' file # Literal $var, not expanded
# CORRECT: Use -v to pass variables
awk -v pat="$var" '$0 ~ pat {print}' file
# Or use double quotes (risky: any awk field ref $n must be escaped as \$n)
awk "/$var/ {print}" file
# NOT actually a gotcha: awk division is always floating point
awk 'BEGIN {print 3/4}' # 0.75 - awk has no integer division
awk 'BEGIN {print int(3/4)}' # 0 - use int() to truncate
# GOTCHA: a numeric-looking field compares numerically against the constant 9,
# but a field with stray characters falls back to string compare ("10" < "9")
awk '$1 > 9' file # String compare if $1 isn't purely numeric
# CORRECT: Force numeric comparison
awk '$1+0 > 9' file
# NOTE(review): gsub on $0 DOES re-split the fields, so $1 below reflects the
# substitution; the actual trap is the reverse - assigning to any field
# rebuilds $0 joined with OFS
awk '{gsub(/old/, "new"); print $1}' file # $1 re-split after the gsub
# Print the whole modified record
awk '{gsub(/old/, "new"); print $0}' file
# WRONG: Assuming fields exist
awk '{print $10}' file # Empty if <10 fields
# CORRECT: Check first
awk 'NF >= 10 {print $10}' file
# NOTE(review): slashes are fine in a dynamic regex - this matches literal
# "/var/log"; the real -v gotcha is backslash escape processing ("\\." -> "\.")
awk -v pat="/var/log" '$0 ~ pat' file # Works: "/" is not a regex metachar
# Pure substring test, no regex interpretation at all
awk -v pat="/var/log" 'index($0, pat)' file
# WRONG: Using = instead of == for comparison
awk '$1 = "value"' file # Assignment, not comparison!
# CORRECT: Use == for comparison
awk '$1 == "value"' file
# WRONG: Expecting associative array order
awk '{a[$1]++} END {for(k in a) print k}' file # Order not guaranteed
# CORRECT: Sort externally or use PROCINFO (gawk)
awk '{a[$1]++} END {for(k in a) print k}' file | sort
Circular Buffer (tail emulation)
# Last 10 lines - awk replacement for tail -10
# (if the file has fewer than N lines, the unfilled slots print as blank lines)
awk '{a[NR%10]=$0} END {for(i=NR+1;i<=NR+10;i++) print a[i%10]}' file
# Last 35 lines
awk '{a[NR%35]=$0} END {for(i=NR+1;i<=NR+35;i++) print a[i%35]}' file
# Variable N (pass as awk variable)
awk -v n=20 '{a[NR%n]=$0} END {for(i=NR+1;i<=NR+n;i++) print a[i%n]}' file
# How the circular buffer works:
#   a[NR%10]       store each line in slot 0-9 (wraps around)
#   NR=1  -> a[1]  line 1 stored in slot 1
#   NR=10 -> a[0]  line 10 stored in slot 0
#   NR=11 -> a[1]  line 11 OVERWRITES slot 1 (old line 1 gone)
#   END block      the array holds only the last 10 lines
# Use tail when: you simply need the last N lines and piping is acceptable.
# Use the awk circular buffer when: you are already in an awk pipeline (no
# context switch), you need awk logic over the last N lines, or you are
# learning awk arrays + modulo arithmetic.
Field Transforms
# Reorder fields
awk '{print $3, $1, $2}' file
# Reverse field order
awk '{for(i=NF;i>0;i--) printf "%s ", $i; print ""}' file
# Change delimiter (CSV -> TSV, passwd -> pipe)
# NOTE: OFS must be in effect before the first record is rebuilt by $1=$1,
# so pass it with -v; the trailing 1 is an always-true pattern that prints $0.
awk -F',' -v OFS='\t' '{$1=$1} 1' file.csv
awk -F':' -v OFS='|' '{$1=$1} 1' /etc/passwd
# Multiple input delimiters (regex character class)
awk -F'[ ,:]' '{print $1, $3}' file
# Sum / average a column
awk '{sum+=$1} END {print sum}' file
awk '{sum+=$1; c++} END {print sum/c}' file
# Percentage of total (buffer the values, divide in END)
awk '{a[NR]=$1; sum+=$1} END {for(i=1;i<=NR;i++) printf "%s %.2f%%\n", a[i], a[i]/sum*100}' file
# Computed column
awk '{print $1, $2, $1+$2}' file
# Running total
awk '{sum+=$1; print $0, sum}' file
# Column formatting
awk '{printf "%-20s %10s %8.2f\n", $1, $2, $3}' file
awk '{printf "%20s\n", $1}' file
awk '{printf "%05d\n", $1}' file
# Transpose rows and columns (assumes every row has the same field count)
awk '{for(i=1;i<=NF;i++) a[i,NR]=$i} END {for(i=1;i<=NF;i++) {for(j=1;j<=NR;j++) printf "%s ", a[i,j]; print ""}}' file
# Group values by key into a comma-separated list
awk '{a[$1]=a[$1] ? a[$1]","$2 : $2} END {for(k in a) print k, a[k]}' file
# Parse key=value pairs
echo "user=admin;role=root" | awk -F'[=;]' '{print $2, $4}'
awk -F'=' '/^hostname/ {print $2}' config.txt
# Conditional field replacement
awk '$3>100 {$3="HIGH"} {print}' file
# Blank out a field (leaves a doubled separator behind)
awk '{$2=""} {print}' file
# Default for an empty field
awk '{print $1, ($2?$2:"N/A"), $3}' file