AWK Fundamentals

AWK field processing and pattern matching fundamentals.

Field Extraction

# Default: whitespace delimiter
awk '{print $1}' file                    # First field
awk '{print $1, $3}' file                # Fields 1 and 3
awk '{print $NF}' file                   # Last field
awk '{print $(NF-1)}' file               # Second to last

# Custom delimiter
awk -F':' '{print $1}' /etc/passwd       # Colon separated
awk -F',' '{print $2}' data.csv          # CSV
awk -F'\t' '{print $1}' data.tsv         # Tab separated
awk -F'[,;:]' '{print $1}' file          # Multiple delimiters

# Output field separator
awk -F':' -v OFS=',' '{print $1, $3, $6}' /etc/passwd

# Field range (print fields 2 through 5)
awk '{for(i=2; i<=5; i++) printf "%s ", $i; print ""}' file

# All fields except first
awk '{$1=""; print substr($0,2)}' file

# Reformat output
awk -F':' '{printf "%-20s %s\n", $1, $6}' /etc/passwd

# Infrastructure: Parse ISE session fields
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .user_name, .nas_ip_address] | @tsv' | \
    awk -F'\t' '{printf "MAC: %-20s User: %-15s Switch: %s\n", $1, $2, $3}'

# Infrastructure: Extract IP from interface output
ip -4 -o addr show | awk '{print $2, $4}' | awk -F'/' '{print $1}'

# Extract specific columns from ps
ps aux | awk '{printf "%-10s %5s %5s %s\n", $1, $2, $3, $11}'

Built-in Variables

# Record/Field variables
# NR  - Current record (line) number (across all files)
# FNR - Current record number in current file
# NF  - Number of fields in current record
# $0  - Entire current record
# $n  - nth field

# Separator variables
# FS  - Input field separator (default: whitespace)
# OFS - Output field separator (default: space)
# RS  - Input record separator (default: newline)
# ORS - Output record separator (default: newline)

# Print line numbers
awk '{print NR, $0}' file                # Line number + content

# Field count per line
awk '{print NR, NF, "fields:", $0}' file

# Process specific line
awk 'NR==5 {print}' file                 # Only line 5
awk 'NR>=5 && NR<=10' file               # Lines 5-10
awk 'NR>1' file                          # Skip header

# Last field of each line
awk '{print $NF}' file

# Process multiple files, track per-file line number
awk 'FNR==1 {print "=== " FILENAME " ==="} {print FNR, $0}' file1 file2

# Change output separator
awk 'BEGIN {OFS=","} {print $1, $2, $3}' file
awk 'BEGIN {ORS="; "} {print $1}' file   # Semicolon between records

# Multi-character field separator
awk 'BEGIN {FS="::"} {print $1, $2}' file

# Infrastructure: Format /etc/passwd as CSV
awk 'BEGIN {FS=":"; OFS=","} {print $1, $3, $6}' /etc/passwd

# Infrastructure: Numbered list of hosts
awk '{printf "%3d. %s\n", NR, $0}' hosts.txt

Pattern Matching

# Line contains pattern
awk '/ERROR/' file                       # Lines with ERROR
awk '/ERROR/ {print}' file               # Same, explicit
awk '!/ERROR/' file                      # Lines WITHOUT ERROR

# Field matches pattern
awk '$1 ~ /^vault/' file                 # Field 1 starts with vault
awk '$3 !~ /failed/' file                # Field 3 doesn't contain failed

# Case insensitive (POSIX)
awk 'tolower($0) ~ /error/' file

# Case insensitive (GNU awk)
awk 'BEGIN{IGNORECASE=1} /error/' file

# Multiple patterns
awk '/ERROR/ || /WARN/' file             # OR
awk '/ERROR/ && /auth/' file             # AND (on same line)

# Range patterns (print between START and END)
awk '/BEGIN/,/END/' file
awk '/^---$/{f=!f; next} f' file         # Between identical markers (a /start/,/end/ range can't reuse the same regex: it ends on the same line it begins)

# Negated range (print everything EXCEPT between markers)
awk '/BEGIN/,/END/ {next} {print}' file

# Field-based conditions
awk '$3 > 100' file                      # Third field > 100
awk '$1 == "vault-01"' file              # Exact match
awk 'length($2) > 10' file               # Field length > 10

# Compound conditions
awk '$1 == "ERROR" && $3 > 5 {print $2, $4}' file

# Infrastructure: Filter failed ISE authentications
netapi ise mnt sessions --format json | jq -r '.[] | [.authentication_status, .user_name, .failure_reason] | @tsv' | \
    awk -F'\t' '$1 == "FAILED" {print $2, $3}'

# Infrastructure: High CPU processes
ps aux | awk '$3 > 50 {printf "%-10s PID: %-6s CPU: %s%%\n", $1, $2, $3}'

Aggregation and Counting

# Sum a column
awk '{sum += $1} END {print "Total:", sum}' file

# Count lines
awk 'END {print NR}' file

# Count matching lines
awk '/ERROR/ {count++} END {print count}' file

# Average
awk '{sum += $1; count++} END {print "Avg:", sum/count}' file

# Min/Max
awk 'NR==1 || $1 < min {min=$1} NR==1 || $1 > max {max=$1} END {print "Min:", min, "Max:", max}' file

# Count occurrences (frequency)
awk '{count[$1]++} END {for(k in count) print k, count[k]}' file

# Sorted frequency (pipe to sort)
awk '{count[$1]++} END {for(k in count) print count[k], k}' file | sort -rn

# Group by and sum
awk '{sum[$1] += $2} END {for(k in sum) print k, sum[k]}' file

# Multi-dimensional aggregation
awk '{count[$1][$2]++} END {for(i in count) for(j in count[i]) print i, j, count[i][j]}' file

# Infrastructure: Count auth failures by user
netapi ise mnt sessions --format json | jq -r '.[] | select(.authentication_status == "FAILED") | .user_name' | \
    awk '{count[$1]++} END {for(u in count) printf "%-20s %d failures\n", u, count[u]}' | sort -k2 -rn

# Infrastructure: Disk usage summary
df -h | awk 'NR>1 {used[$6]=$3; avail[$6]=$4} END {for(m in used) printf "%-30s Used: %8s  Avail: %8s\n", m, used[m], avail[m]}'

# Infrastructure: Pod count by namespace
kubectl get pods -A --no-headers | awk '{count[$1]++} END {for(ns in count) printf "%-20s %d pods\n", ns, count[ns]}' | sort -k2 -rn

# Running totals
awk '{sum += $1; print $0, "Running:", sum}' file

BEGIN and END Blocks

# BEGIN: Execute before processing any input
# END: Execute after all input processed

# Print header and footer
awk 'BEGIN {print "=== Report ==="} {print} END {print "=== End ==="}' file

# Initialize variables
awk 'BEGIN {sum=0; count=0} {sum+=$1; count++} END {print sum/count}' file

# Set separators in BEGIN
awk 'BEGIN {FS=":"; OFS="\t"} {print $1, $3}' /etc/passwd

# Create formatted table
awk 'BEGIN {
    print "USERNAME            UID   SHELL"
    print "------------------------------------"
}
{
    printf "%-20s %-5s %s\n", $1, $3, $7
}
END {
    print "------------------------------------"
    print NR, "users total"
}' FS=':' /etc/passwd

# Multiple BEGIN/END blocks (executed in order)
awk 'BEGIN {print "Starting..."} BEGIN {print "Ready!"} {print} END {print "Done"}' file

# Infrastructure: ISE session report
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .user_name, .authentication_status] | @tsv' | \
    awk 'BEGIN {
        FS="\t"
        print "MAC ADDRESS          USER             STATUS"
        print "------------------------------------------------"
    }
    {
        printf "%-20s %-16s %s\n", $1, $2, $3
        if($3=="FAILED") fail++; else pass++
    }
    END {
        print "------------------------------------------------"
        printf "Passed: %d  Failed: %d  Total: %d\n", pass, fail, NR
    }'

# No input, just generate
awk 'BEGIN {for(i=1; i<=10; i++) print i}'

# Generate sequence with formatting
awk 'BEGIN {for(i=1; i<=10; i++) printf "Item %02d\n", i}'

String Functions

# length - string length
awk '{print length($1), $1}' file
awk 'length($0) > 80' file               # Lines longer than 80 chars

# substr - extract substring
awk '{print substr($1, 1, 3)}' file      # First 3 chars
awk '{print substr($1, 4)}' file         # From char 4 to end

# index - find position of substring
awk '{pos=index($0, "ERROR"); if(pos) print "Found at", pos}' file

# split - split string into array
awk '{n=split($1, arr, "-"); for(i=1; i<=n; i++) print arr[i]}' file

# sub - replace first occurrence
awk '{sub(/old/, "new"); print}' file

# gsub - replace all occurrences
awk '{gsub(/old/, "new"); print}' file
awk '{gsub(/[^a-zA-Z0-9]/, "_"); print}' file  # Replace non-alphanumeric

# tolower/toupper - case conversion
awk '{print tolower($0)}' file
awk '{print toupper($1)}' file

# sprintf - formatted string
awk '{s=sprintf("%05d-%s", NR, $1); print s}' file

# match - regex match with position
awk '{if(match($0, /[0-9]+/)) print substr($0, RSTART, RLENGTH)}' file

# Infrastructure: Normalize MAC addresses
echo "14:F6:D8:7B:31:80" | awk '{gsub(/:/, "-"); print tolower($0)}'

# Infrastructure: Extract hostname from FQDN
echo "vault-01.inside.domusdigitalis.dev" | awk -F'.' '{print $1}'

# Infrastructure: Pad process IDs
ps aux | awk 'NR>1 {printf "PID: %06d  CMD: %s\n", $2, $11}'

Conditionals and Control Flow

# if/else
awk '{if($3 > 100) print "HIGH:", $0; else print "LOW:", $0}' file

# Ternary operator
awk '{status = ($3 > 100) ? "HIGH" : "LOW"; print status, $0}' file

# Multiple conditions
awk '{
    if($1 == "ERROR") print "ERROR:", $0
    else if($1 == "WARN") print "WARNING:", $0
    else print "INFO:", $0
}' file

# next - skip to next record
awk '/skip/ {next} {print}' file         # Skip lines with "skip"

# exit - stop processing
awk '/STOP/ {exit} {print}' file         # Print until STOP

# exit with code
awk 'END {exit (NR > 100) ? 1 : 0}' file # Exit 1 if >100 lines

# While loop
awk '{i=1; while(i<=NF) {print $i; i++}}' file

# For loop
awk '{for(i=1; i<=NF; i++) print i, $i}' file

# Infrastructure: Categorize log levels
awk '{
    level = "INFO"
    if(/ERROR|FATAL|CRIT/) level = "CRITICAL"
    else if(/WARN|WARNING/) level = "WARNING"
    else if(/DEBUG/) level = "DEBUG"
    print level, $0
}' /var/log/messages

# Infrastructure: Health status
netapi ise mnt sessions --format json | jq -r '.[] | [.calling_station_id, .authentication_status] | @tsv' | \
    awk -F'\t' '{
        if($2 == "AUTHENTICATED") status = "✓"
        else if($2 == "FAILED") status = "✗"
        else status = "?"
        printf "%s %s\n", status, $1
    }'

Arrays

# Associative arrays (key-value)
awk '{count[$1]++} END {for(k in count) print k, count[k]}' file

# Check if key exists
awk '{if($1 in count) print "Duplicate:", $1; count[$1]++}' file

# Delete array element
awk '{count[$1]++} END {delete count["unwanted"]; for(k in count) print k, count[k]}' file

# Multi-dimensional arrays
awk '{data[$1][$2] = $3} END {for(i in data) for(j in data[i]) print i, j, data[i][j]}' file

# Array length (GNU awk)
awk '{arr[NR]=$0} END {print length(arr), "elements"}' file

# Sort array keys (GNU awk)
awk '{count[$1]++} END {PROCINFO["sorted_in"]="@val_num_desc"; for(k in count) print count[k], k}' file

# Infrastructure: Group ISE sessions by switch
netapi ise mnt sessions --format json | jq -r '.[] | [.nas_ip_address, .calling_station_id] | @tsv' | \
    awk -F'\t' '{
        sessions[$1]++
        macs[$1] = macs[$1] ? macs[$1] ", " $2 : $2
    }
    END {
        for(sw in sessions) {
            print "=== " sw " (" sessions[sw] " sessions) ==="
            print macs[sw]
            print ""
        }
    }'

# Infrastructure: Build lookup table
awk 'BEGIN {
    hosts["vault-01"] = "10.50.1.60"
    hosts["ise-01"] = "10.50.1.20"
    hosts["bind-01"] = "10.50.1.90"
}
{
    if($1 in hosts) print $1, "->", hosts[$1]
}' hostlist.txt

# Store all fields in array
awk '{for(i=1; i<=NF; i++) fields[NR][i]=$i} END {print fields[2][3]}' file

Output Formatting

# printf - formatted output
awk '{printf "%-20s %10d %8.2f\n", $1, $2, $3}' file

# Format specifiers:
# %s   - string
# %d   - integer
# %f   - float
# %e   - scientific
# %x   - hex
# %-   - left align
# %10  - minimum width 10
# %.2  - 2 decimal places

# Right align numbers
awk '{printf "%10d\n", $1}' file

# Left align strings
awk '{printf "%-20s\n", $1}' file

# Fixed decimal places
awk '{printf "%.2f\n", $1}' file

# Leading zeros
awk '{printf "%05d\n", $1}' file

# Multiple columns
awk '{printf "| %-15s | %8d | %6.2f%% |\n", $1, $2, $3}' file

# Table with header
awk 'BEGIN {
    printf "%-15s %10s %10s\n", "HOST", "CPU%", "MEM%"
    printf "%-15s %10s %10s\n", "----", "----", "----"
}
{
    printf "%-15s %10.1f %10.1f\n", $1, $2, $3
}' stats.txt

# Infrastructure: Formatted pod listing
kubectl get pods -A --no-headers | awk '{
    printf "%-20s %-40s %-10s %s\n", $1, $2, $4, $5
}'

# Infrastructure: Certificate expiry table
find /etc/ssl/certs -name "*.pem" -exec openssl x509 -in {} -noout -enddate -subject \; 2>/dev/null | \
    paste - - | awk -F'[=,]' '{
        gsub(/^ +| +$/, "", $2)
        gsub(/^ +| +$/, "", $4)
        printf "%-40s %s\n", substr($4, 1, 40), $2
    }'

Multi-file Processing

# FILENAME - current file name
awk '{print FILENAME, $0}' file1 file2

# FNR vs NR
# NR  - total records across all files
# FNR - record number in current file
awk 'FNR==1 {print "=== " FILENAME " ==="}' file1 file2 file3

# Reset counter per file
awk 'ENDFILE {print FILENAME, FNR}' file1 file2   # gawk ENDFILE; awk has no EOF variable

# Process files differently based on argument position
awk 'NR==FNR {lookup[$1]=$2; next} $1 in lookup {print $0, lookup[$1]}' lookup.txt data.txt

# Compare two files (like diff)
awk 'NR==FNR {a[$0]; next} !($0 in a)' file1 file2    # Lines in file2 not in file1

# Find common lines
awk 'NR==FNR {a[$0]; next} $0 in a' file1 file2

# Join files on key
awk 'NR==FNR {data[$1]=$2; next} $1 in data {print $0, data[$1]}' file1.txt file2.txt

# Infrastructure: Compare host lists
awk 'NR==FNR {expected[$0]; next} !($0 in expected) {print "EXTRA:", $0}' expected_hosts.txt actual_hosts.txt
awk 'NR==FNR {actual[$0]; next} !($0 in actual) {print "MISSING:", $0}' actual_hosts.txt expected_hosts.txt

# Infrastructure: Enrich log with host info
awk 'NR==FNR {ip[$1]=$2; next} {print $0, "(" ip[$4] ")"}' hosts_lookup.txt access.log

Infrastructure Patterns

# SSH CA cert status across hosts
~/.local/bin/vault-ssh-test 2>&1 | awk '/===/ {host=$2} /Success|FAILED/ {
    status = /Success/ ? "✓" : "✗"
    printf "%s %-20s\n", status, host
}'

# k8s pod resource summary
kubectl top pods -A --no-headers | awk '{
    ns=$1; pod=$2
    gsub(/m$/, "", $3); cpu=$3
    gsub(/Mi$/, "", $4); mem=$4
    ns_cpu[ns] += cpu
    ns_mem[ns] += mem
    ns_count[ns]++
}
END {
    printf "%-20s %8s %10s %6s\n", "NAMESPACE", "CPU(m)", "MEM(Mi)", "PODS"
    printf "%-20s %8s %10s %6s\n", "---------", "------", "-------", "----"
    for(ns in ns_cpu) {
        printf "%-20s %8d %10d %6d\n", ns, ns_cpu[ns], ns_mem[ns], ns_count[ns]
    }
}'

# ISE session pivot table (by switch and status)
netapi ise mnt sessions --format json | jq -r '.[] | [.nas_ip_address, .authentication_status] | @tsv' | \
    awk -F'\t' '{
        pivot[$1][$2]++
        total[$1]++
    }
    END {
        printf "%-20s %10s %10s %10s\n", "SWITCH", "PASSED", "FAILED", "TOTAL"
        for(sw in total) {
            printf "%-20s %10d %10d %10d\n", sw, pivot[sw]["AUTHENTICATED"]+0, pivot[sw]["FAILED"]+0, total[sw]
        }
    }'

# Log analysis: Requests per minute
awk '{
    split($4, t, ":")
    minute = t[2] ":" t[3]
    count[minute]++
}
END {
    for(m in count) print m, count[m]
}' access.log | sort

# Certificate expiry check
find /etc/ssl/certs -name "*.pem" -type f 2>/dev/null | while read cert; do
    openssl x509 -in "$cert" -noout -enddate 2>/dev/null
done | awk -F'=' '{
    cmd = "date -d \"" $2 "\" +%s"
    cmd | getline exp
    close(cmd)
    now = systime()
    days = int((exp - now) / 86400)
    if(days < 30) printf "⚠️  %d days: %s\n", days, $2
}'

# Parse netstat/ss output
ss -tlnp | awk 'NR>1 {
    split($4, addr, ":")
    port = addr[length(addr)]
    gsub(/.*"/, "", $6)
    gsub(/".*/, "", $6)
    printf "%-6s %-20s\n", port, $6
}' | sort -n

# Vault audit log analysis
sudo cat /var/log/vault/audit.log | jq -r '[.time, .type, .request.operation, .request.path] | @tsv' | \
    awk -F'\t' '{
        ops[$3]++
        paths[$4]++
    }
    END {
        print "=== Operations ==="
        for(op in ops) printf "%-15s %d\n", op, ops[op]
        print "\n=== Top Paths ==="
        for(path in paths) printf "%-40s %d\n", path, paths[path]
    }' | sort -k2 -rn | head -20

AWK Gotchas

# WRONG: Forgetting to quote the awk program
awk {print $1} file                      # Shell expands $1!

# CORRECT: Always quote
awk '{print $1}' file

# WRONG: Using shell variables inside single quotes
var="pattern"
awk '/$var/ {print}' file                # Literal $var, not expanded

# CORRECT: Use -v to pass variables
awk -v pat="$var" '$0 ~ pat {print}' file

# Or use double quotes (but escape $ for awk)
awk "/$var/ {print}" file

# NOTE: awk division is always floating point (no integer division)
awk 'BEGIN {print 3/4}'                  # 0.75
awk 'BEGIN {print int(3/4)}'             # 0 - truncate explicitly if you want an integer

# WRONG: Comparing against a string constant forces string comparison
awk '$1 > "9"' file                      # "10" < "9" as strings!

# CORRECT: Force numeric comparison
awk '$1+0 > 9' file

# WRONG: Modifying $0 without recalculating fields
awk '{gsub(/old/, "new"); print $1}' file  # $1 is from BEFORE gsub

# CORRECT: Reference $0 or re-parse
awk '{gsub(/old/, "new"); print $0}' file

# WRONG: Assuming fields exist
awk '{print $10}' file                   # Empty if <10 fields

# CORRECT: Check first
awk 'NF >= 10 {print $10}' file

# GOTCHA: -v processes backslash escapes in the value
awk -v pat="/var/log" '$0 ~ pat' file    # Works: dynamic regexes need no slash escaping
awk -v pat="\\." '$0 ~ pat' file         # "\\." becomes "\." before awk sees it

# Literal substring match sidesteps regex escaping entirely
awk -v pat="/var/log" 'index($0, pat)' file

# WRONG: Using = instead of == for comparison
awk '$1 = "value"' file                  # Assignment, not comparison!

# CORRECT: Use == for comparison
awk '$1 == "value"' file

# WRONG: Expecting associative array order
awk '{a[$1]++} END {for(k in a) print k}' file  # Order not guaranteed

# CORRECT: Sort externally or use PROCINFO (gawk)
awk '{a[$1]++} END {for(k in a) print k}' file | sort

Circular Buffer (tail emulation)

Last N lines using modulo array (no external commands)
# Last 10 lines - awk replacement for tail -10
awk '{a[NR%10]=$0} END {for(i=NR+1;i<=NR+10;i++) print a[i%10]}' file

# Last 35 lines
awk '{a[NR%35]=$0} END {for(i=NR+1;i<=NR+35;i++) print a[i%35]}' file

# Variable N (pass as awk variable)
awk -v n=20 '{a[NR%n]=$0} END {for(i=NR+1;i<=NR+n;i++) print a[i%n]}' file
How it works
a[NR%10]     # Store line in array slot 0-9 (wraps around)
NR=1 → a[1]  # Line 1 stored in slot 1
NR=10 → a[0] # Line 10 stored in slot 0
NR=11 → a[1] # Line 11 OVERWRITES slot 1 (old line 1 gone)
...
END block    # Array contains only last 10 lines
When to use this vs tail
# Use tail when:
- Simple last N lines
- Piping is acceptable

# Use awk circular buffer when:
- Already in awk pipeline (no context switch)
- Need to process last N lines with awk logic
- Learning awk arrays + modulo

Field Transforms

awk '{print $3, $1, $2}' file

awk '{for(i=NF;i>0;i--) printf "%s ", $i; print ""}' file

awk -F',' -v OFS='\t' '{$1=$1; print}' file.csv

awk -F':' -v OFS='|' '{$1=$1; print}' /etc/passwd

awk -F'[ ,:]' '{print $1, $3}' file

awk '{sum+=$1} END {print sum}' file

awk '{sum+=$1; c++} END {print sum/c}' file

awk '{a[NR]=$1; sum+=$1} END {for(i=1;i<=NR;i++) printf "%s %.2f%%\n", a[i], a[i]/sum*100}' file

awk '{print $1, $2, $1+$2}' file

awk '{sum+=$1; print $0, sum}' file

awk '{printf "%-20s %10s %8.2f\n", $1, $2, $3}' file

awk '{printf "%20s\n", $1}' file

awk '{printf "%05d\n", $1}' file

awk '{for(i=1;i<=NF;i++) a[i,NR]=$i} END {for(i=1;i<=NF;i++) {for(j=1;j<=NR;j++) printf "%s ", a[i,j]; print ""}}' file

awk '{a[$1]=a[$1] ? a[$1]","$2 : $2} END {for(k in a) print k, a[k]}' file

echo "user=admin;role=root" | awk -F'[=;]' '{print $2, $4}'

awk -F'=' '/^hostname/ {print $2}' config.txt

awk '$3>100 {$3="HIGH"} {print}' file

awk '{$2=""} {print}' file

awk '{print $1, ($2?$2:"N/A"), $3}' file