AWK Mastery

Overview

AWK is a pattern-scanning and processing language. It excels at column-based text processing, making it essential for log analysis and data extraction.

Basic Syntax

awk 'pattern { action }' file
awk -F'delimiter' 'pattern { action }' file

Field Reference

Variable Description Example

$0

Entire line

print $0

$1, $2, ...

Individual fields

print $1, $3

$NF

Last field

print $NF

$(NF-1)

Second to last

print $(NF-1)

NF

Number of fields

if (NF > 5)

NR

Current record number

print NR, $0

FNR

Record number in current file

if (FNR == 1)

FS

Field separator (input)

FS=":"

OFS

Output field separator

OFS="\t"

RS

Record separator

RS="\n\n"

ORS

Output record separator

ORS="\n\n"

FILENAME

Current filename

print FILENAME

Field Delimiters

Setting the Delimiter

# Command line
awk -F':' '{print $1}' /etc/passwd

# In script
awk 'BEGIN { FS=":" } { print $1 }' /etc/passwd

# Multiple character delimiter
awk -F'::' '{print $1}' file

# Regex delimiter
awk -F'[,;:]' '{print $1}' file

Common Delimiters

# CSV (simple)
awk -F',' '{print $2}' data.csv

# Tab-delimited
awk -F'\t' '{print $1}' data.tsv

# Colon (passwd, shadow)
awk -F':' '{print $1, $3}' /etc/passwd

# Space (default)
awk '{print $1}' file

# Multiple spaces
awk -F' +' '{print $1}' file

Pattern Matching

Regular Expression Patterns

# Lines containing pattern
awk '/ERROR/' logfile

# Lines NOT containing pattern
awk '!/ERROR/' logfile

# Field matches pattern
awk '$1 ~ /admin/' file

# Field does NOT match
awk '$1 !~ /admin/' file

# Case insensitive
awk 'tolower($0) ~ /error/' file

Comparison Patterns

# Numeric comparison
awk '$3 > 100' file
awk '$3 >= 100 && $3 <= 200' file

# String comparison
awk '$1 == "admin"' file
awk '$1 != "root"' file

# Empty field
awk '$2 == ""' file

# Non-empty field
awk '$2 != ""' file

# Length comparison
awk 'length($1) > 10' file

Range Patterns

# From pattern A to pattern B (inclusive)
awk '/START/,/END/' file

# From line 5 to line 10
awk 'NR==5,NR==10' file

# From pattern to EOF
awk '/START/,0' file

Actions

Print Variations

# Print specific fields
awk '{print $1, $3}' file

# Print with custom separator
awk -F':' '{print $1 " -> " $7}' /etc/passwd

# Printf for formatting
awk '{printf "%-20s %10d\n", $1, $2}' file

# Print line numbers
awk '{print NR": "$0}' file

Variables and Arithmetic

# Sum a column
awk '{sum += $3} END {print sum}' file

# Average
awk '{sum += $3; count++} END {print sum/count}' file

# Count lines
awk 'END {print NR}' file

# Running total
awk '{total += $3; print $0, total}' file

String Manipulation

# Length
awk '{print length($1), $1}' file

# Substring
awk '{print substr($1, 1, 5)}' file

# Index (find position)
awk '{print index($0, "ERROR")}' file

# Split into array
awk '{split($1, a, "-"); print a[1], a[2]}' file

# Substitute (first occurrence)
awk '{sub(/old/, "new"); print}' file

# Substitute (all occurrences)
awk '{gsub(/old/, "new"); print}' file

# To lowercase
awk '{print tolower($1)}' file

# To uppercase
awk '{print toupper($1)}' file

Conditionals

If-Else

awk '{
  if ($3 > 100)
    print $1, "HIGH"
  else if ($3 > 50)
    print $1, "MEDIUM"
  else
    print $1, "LOW"
}' file

Ternary Operator

awk '{print $1, ($3 > 100 ? "HIGH" : "LOW")}' file

Loops

For Loop

# Print all fields
awk '{for (i=1; i<=NF; i++) print $i}' file

# Reverse fields
awk '{for (i=NF; i>=1; i--) printf "%s ", $i; print ""}' file

While Loop

awk '{
  i = 1
  while (i <= NF) {
    print $i
    i++
  }
}' file

Arrays

Associative Arrays

# Count occurrences
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file

# Sum by category
awk '{sum[$1] += $2} END {for (k in sum) print k, sum[k]}' file

# Unique values
awk '!seen[$1]++' file

# Check existence
awk '{if ($1 in seen) print "DUP:", $0; seen[$1]=1}' file

Sorting Output

# Sort by count
awk '{count[$1]++} END {for (k in count) print count[k], k}' file | sort -rn

# Using PROCINFO (GNU AWK)
awk '{count[$1]++} END {
  PROCINFO["sorted_in"] = "@val_num_desc"
  for (k in count) print k, count[k]
}' file

BEGIN and END

awk '
BEGIN {
  FS = ":"
  print "Processing /etc/passwd"
  print "========================"
}
{
  users++
  if ($3 >= 1000) regular++
}
END {
  print "========================"
  print "Total users:", users
  print "Regular users:", regular
}
' /etc/passwd

Production Examples

Log Analysis

# Apache access log - requests per IP
awk '{count[$1]++} END {for (ip in count) print count[ip], ip}' \
    access.log | sort -rn | head -20

# HTTP status code distribution
awk '{count[$9]++} END {for (s in count) print s, count[s]}' access.log

# Requests per hour
awk -F'[/:]' '{hour=$4; count[hour]++} END {
  for (h in count) print h":00", count[h]
}' access.log | sort -t: -k1n

# Slow requests (response time > 5s)
awk '$NF > 5 {print}' access.log

# Bandwidth by URL
awk '{sum[$7] += $10} END {
  for (url in sum) print sum[url], url
}' access.log | sort -rn | head -20

System Administration

# Users with shells
awk -F: '$7 ~ /bash|zsh|fish/ {print $1, $7}' /etc/passwd

# Large processes (memory > 1%)
ps aux | awk '$4 > 1.0 {print $4"%", $11}'

# Disk usage > 80%
df -h | awk '$5+0 > 80 {print $6, $5}'

# Network connections by state
ss -ta | awk 'NR>1 {count[$1]++} END {for (s in count) print s, count[s]}'

# Top 10 largest files in directory
ls -lS | awk 'NR>1 {print $5, $9}' | head -10

CSV Processing

# Extract columns (skip header)
awk -F',' 'NR>1 {print $1, $3}' data.csv

# Sum column
awk -F',' 'NR>1 {sum += $3} END {print "Total:", sum}' data.csv

# Filter by condition
awk -F',' '$3 > 100' data.csv

# Convert to TSV
awk -F',' '{$1=$1; print}' OFS='\t' data.csv

# Add header
awk -F',' 'NR==1 {print "Name,Value,Status"} NR>0 {print}' data.csv

SIEM and Security

# Failed SSH attempts per IP
grep "Failed password" /var/log/auth.log | \
  awk '{print $(NF-3)}' | sort | uniq -c | sort -rn | head -20

# Successful logins
grep "Accepted" /var/log/auth.log | \
  awk '{print $9, $11}' | sort | uniq

# Extract IPs from any log
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
  print substr($0, RSTART, RLENGTH)
}' logfile | sort -u

# Parse syslog priority
awk -F'[<>]' '{
  priority = $2
  facility = int(priority / 8)
  severity = priority % 8
  print "Facility:", facility, "Severity:", severity
}' syslog

Data Transformation

# JSON-like output
awk -F: '{
  printf "{\"user\": \"%s\", \"uid\": %s, \"shell\": \"%s\"}\n", $1, $3, $7
}' /etc/passwd

# Pivot data (rows to columns)
awk '{
  key[$1] = $1
  val[$1] = val[$1] " " $2
} END {
  for (k in key) print key[k], val[k]
}' file

# Transpose (columns to rows)
awk '{
  for (i=1; i<=NF; i++) a[NR,i] = $i
  if (NF > max) max = NF
}
END {
  for (i=1; i<=max; i++) {
    for (j=1; j<=NR; j++) printf "%s ", a[j,i]
    print ""
  }
}' file

Multi-file Processing

# Process multiple files with filename
awk '{print FILENAME ":" NR ":" $0}' file1 file2

# Compare files (find common lines)
awk 'NR==FNR {a[$0]; next} $0 in a' file1 file2

# Find lines only in file1
awk 'NR==FNR {a[$0]; next} !($0 in a)' file2 file1

# Merge files by key
awk -F: 'NR==FNR {a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1 file2

Built-in Functions Reference

Function Description Example

length(s)

String length

length($1)

substr(s,start,len)

Substring

substr($1,1,5)

index(s,target)

Find position

index($0,"ERROR")

split(s,arr,sep)

Split into array

split($1,a,"-")

sub(re,repl,s)

Replace first

sub(/old/,"new",$0)

gsub(re,repl,s)

Replace all

gsub(/old/,"new",$0)

match(s,re)

Regex match

match($0,/[0-9]+/)

tolower(s)

Lowercase

tolower($1)

toupper(s)

Uppercase

toupper($1)

sprintf(fmt, ...)

Format string

sprintf("%05d",$1)

int(x)

Integer

int(3.14)

sqrt(x)

Square root

sqrt($1)

log(x)

Natural log

log($1)

exp(x)

Exponential

exp(1)

rand()

Random 0-1

rand()

srand(seed)

Seed random

srand()

system(cmd)

Run command

system("date")

getline

Read next line

getline < "file"

CTF and Security Patterns

Flag Extraction

# Find flag patterns
strings binary | awk '/flag\{.*\}/ {print}'
awk '/CTF\{[^}]+\}/ {match($0, /CTF\{[^}]+\}/); print substr($0, RSTART, RLENGTH)}' file

# Extract between markers
awk '/BEGIN FLAG/,/END FLAG/' file

# Decode hex to ASCII (strtonum is a GNU awk extension)
echo "48656c6c6f" | awk '{
    for (i=1; i<=length($0); i+=2)
        printf "%c", strtonum("0x" substr($0,i,2))
    print ""
}'

IP and Network Analysis

# Extract and count IPs
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
    ip = substr($0, RSTART, RLENGTH)
    count[ip]++
}
END {
    for (ip in count) print count[ip], ip
}' access.log | sort -rn

# Filter by IP range (expects lines that are bare IPs, e.g. 10.50.x.x)
awk -F'.' '$1==10 && $2==50 {print}' file

# Calculate bandwidth per IP
awk '{bytes[$1] += $10} END {
    for (ip in bytes) printf "%s: %.2f MB\n", ip, bytes[ip]/1048576
}' access.log | sort -t: -k2 -rn

Credential Hunting

# Find potential passwords
awk '/password|passwd|pwd|secret|token|key/ && /[=:]/ {print}' config_files

# Extract key=value pairs
awk -F'[=:]' '/password|api_key|secret/ {
    gsub(/^[ \t]+|[ \t]+$/, "", $1)
    gsub(/^[ \t"'\'']+|[ \t"'\'']+$/, "", $2)
    print $1 " => " $2
}' file

# Parse environment variables
awk -F= '/^[A-Z_]+.*=/ {print $1, "=", $2}' .env

Data Transformation Patterns

CSV Operations

# CSV rows to JSON objects (one per line; trim the trailing comma and add brackets for a valid array)
awk -F',' 'NR==1 {
    for (i=1; i<=NF; i++) header[i]=$i
    next
}
{
    printf "{"
    for (i=1; i<=NF; i++) {
        printf "\"%s\":\"%s\"", header[i], $i
        if (i<NF) printf ","
    }
    print "},"
}' data.csv

# Transpose CSV (rows to columns)
awk -F',' '{
    for (i=1; i<=NF; i++) a[NR,i]=$i
    if (NF>max) max=NF
}
END {
    for (i=1; i<=max; i++) {
        for (j=1; j<=NR; j++) printf "%s%s", a[j,i], (j<NR?",":"")
        print ""
    }
}' data.csv

# Merge two CSVs by first column
awk -F',' 'NR==FNR {a[$1]=$0; next} $1 in a {print a[$1] "," $2}' file1.csv file2.csv

# Pivot table
awk -F',' 'NR>1 {
    sum[$1,$2] += $3
    rows[$1]++
    cols[$2]++
}
END {
    for (r in rows) {
        printf "%s", r
        for (c in cols) printf ",%s", sum[r,c]+0
        print ""
    }
}' data.csv

Text Cleanup

# Normalize whitespace
awk '{$1=$1; print}' file

# Remove duplicate consecutive lines
awk 'prev != $0 {print} {prev=$0}' file

# Trim all fields
awk '{for(i=1;i<=NF;i++) gsub(/^[ \t]+|[ \t]+$/, "", $i); print}' file

# Convert CamelCase to snake_case
echo "CamelCaseString" | awk '{
    gsub(/([A-Z])/, "_&"); sub(/^_/, ""); print tolower($0)
}'

# Wrap long lines at 80 characters
awk '{
    while (length($0) > 80) {
        print substr($0, 1, 80)
        $0 = substr($0, 81)
    }
    print
}' file

Reporting

# Create formatted report
awk 'BEGIN {
    printf "%-20s %10s %10s\n", "NAME", "VALUE", "PERCENT"
    printf "%-20s %10s %10s\n", "----", "-----", "-------"
}
{
    total += $2
    data[NR] = $1 ":" $2
}
END {
    for (i=1; i<=NR; i++) {
        split(data[i], parts, ":")
        printf "%-20s %10d %9.1f%%\n", parts[1], parts[2], (parts[2]/total)*100
    }
    printf "%-20s %10d\n", "TOTAL", total
}' data.txt

# Histogram
awk '{
    count[$1]++
    if (count[$1] > max) max = count[$1]
}
END {
    for (k in count) {
        bar = ""
        for (i=0; i<count[k]; i++) bar = bar "#"
        printf "%-20s %5d %s\n", k, count[k], bar
    }
}' data.txt | sort -k2 -rn

Advanced Date/Time

# Parse and reformat dates
awk '{
    split($1, d, "-")
    printf "%s/%s/%s %s\n", d[2], d[3], d[1], $2
}' file  # 2026-02-13 -> 02/13/2026

# Filter by date range
awk -F',' '$1 >= "2026-02-01" && $1 <= "2026-02-28"' data.csv

# Calculate time differences (HH:MM:SS)
awk 'BEGIN {FS=":"} {
    total = $1*3600 + $2*60 + $3
    printf "%d seconds\n", total
}' <<< "01:30:45"

# Aggregate by hour (with FS='[: ]', $2 is the hour for "YYYY-MM-DD HH:MM:SS" logs; adjust the field for syslog-style "Mon DD HH:MM:SS" timestamps)
awk -F'[: ]' '{hour[$2]++} END {for (h in hour) print h":00", hour[h]}' logfile | sort

Inline awk Programs

# One-liner templates
awk 'BEGIN{action} pattern{action} END{action}' file

# Useful shorthands
awk '1'                    # Print all (same as cat)
awk 'NF'                   # Print non-empty lines
awk '!a[$0]++'             # Remove duplicate lines
awk '{print NF}'           # Count fields per line
awk 'END{print NR}'        # Count lines
awk '{s+=$1}END{print s}'  # Sum first column
awk 'NR==10'               # Print line 10
awk 'NR>=10&&NR<=20'       # Print lines 10-20
awk '/start/,/end/'        # Print range
awk 'NR%2'                 # Print odd lines
awk '!(NR%2)'              # Print even lines
awk '{print NR": "$0}'     # Number lines
awk '{$NF=""};1'           # Remove last field
awk '{$1=""};1'            # Remove first field
awk 'gsub(/old/,"new")'    # Only print changed lines
awk 'length>80'            # Lines longer than 80
awk '{print $NF}'          # Print last field
awk '{print $(NF-1)}'      # Print second to last
awk 'NF>5'                 # Lines with more than 5 fields
awk '/pattern/{getline;print}' # Print line after match
awk '/pattern/{print;getline;print}' # Print match and next line

Quick Reference

# Print first field
awk '{print $1}' file

# Print last field
awk '{print $NF}' file

# Print fields 1 and 3
awk '{print $1, $3}' file

# Custom delimiter
awk -F':' '{print $1}' file

# Sum column 3
awk '{sum+=$3} END {print sum}' file

# Count lines matching pattern
awk '/pattern/ {count++} END {print count}' file

# Remove duplicates
awk '!seen[$0]++' file

# Print lines longer than 80 chars
awk 'length > 80' file

# Add line numbers
awk '{print NR, $0}' file

# Print every other line
awk 'NR % 2 == 1' file

# Filter and transform
awk -F',' '$3>100 {print $1, $3*2}' file

# Group and sum
awk -F',' '{sum[$1]+=$2} END {for(k in sum) print k,sum[k]}' file