AWK Mastery

Overview

AWK is a pattern-scanning and processing language. It excels at column-based text processing, making it essential for log analysis and data extraction.

Basic Syntax

awk 'pattern { action }' file
awk -F'delimiter' 'pattern { action }' file

Field Reference

Variable Description Example

$0

Entire line

print $0

$1, $2, ...

Individual fields

print $1, $3

$NF

Last field

print $NF

$(NF-1)

Second to last

print $(NF-1)

NF

Number of fields

if (NF > 5)

NR

Current record number

print NR, $0

FNR

Record number in current file

if (FNR == 1)

FS

Field separator (input)

FS=":"

OFS

Output field separator

OFS="\t"

RS

Record separator

RS="\n\n"

ORS

Output record separator

ORS="\n\n"

FILENAME

Current filename

print FILENAME

Field Delimiters

Setting the Delimiter

# Command line
awk -F':' '{print $1}' /etc/passwd

# In script
awk 'BEGIN { FS=":" } { print $1 }' /etc/passwd

# Multiple character delimiter
awk -F'::' '{print $1}' file

# Regex delimiter
awk -F'[,;:]' '{print $1}' file

Common Delimiters

# CSV (simple)
awk -F',' '{print $2}' data.csv

# Tab-delimited
awk -F'\t' '{print $1}' data.tsv

# Colon (passwd, shadow)
awk -F':' '{print $1, $3}' /etc/passwd

# Space (default)
awk '{print $1}' file

# Multiple spaces
awk -F' +' '{print $1}' file

Pattern Matching

Regular Expression Patterns

# Lines containing pattern
awk '/ERROR/' logfile

# Lines NOT containing pattern
awk '!/ERROR/' logfile

# Field matches pattern
awk '$1 ~ /admin/' file

# Field does NOT match
awk '$1 !~ /admin/' file

# Case insensitive
awk 'tolower($0) ~ /error/' file

Comparison Patterns

# Numeric comparison
awk '$3 > 100' file
awk '$3 >= 100 && $3 <= 200' file

# String comparison
awk '$1 == "admin"' file
awk '$1 != "root"' file

# Empty field
awk '$2 == ""' file

# Non-empty field
awk '$2 != ""' file

# Length comparison
awk 'length($1) > 10' file

Range Patterns

# From pattern A to pattern B (inclusive)
awk '/START/,/END/' file

# From line 5 to line 10
awk 'NR==5,NR==10' file

# From pattern to EOF
awk '/START/,0' file

Actions

Print Variations

# Print specific fields
awk '{print $1, $3}' file

# Print with custom separator
awk -F':' '{print $1 " -> " $7}' /etc/passwd

# Printf for formatting
awk '{printf "%-20s %10d\n", $1, $2}' file

# Print line numbers
awk '{print NR": "$0}' file

Variables and Arithmetic

# Sum a column
awk '{sum += $3} END {print sum}' file

# Average
awk '{sum += $3; count++} END {print sum/count}' file

# Count lines
awk 'END {print NR}' file

# Running total
awk '{total += $3; print $0, total}' file

String Manipulation

# Length
awk '{print length($1), $1}' file

# Substring
awk '{print substr($1, 1, 5)}' file

# Index (find position)
awk '{print index($0, "ERROR")}' file

# Split into array
awk '{split($1, a, "-"); print a[1], a[2]}' file

# Substitute (first occurrence)
awk '{sub(/old/, "new"); print}' file

# Substitute (all occurrences)
awk '{gsub(/old/, "new"); print}' file

# To lowercase
awk '{print tolower($1)}' file

# To uppercase
awk '{print toupper($1)}' file

Conditionals

If-Else

awk '{
  if ($3 > 100)
    print $1, "HIGH"
  else if ($3 > 50)
    print $1, "MEDIUM"
  else
    print $1, "LOW"
}' file

Ternary Operator

awk '{print $1, ($3 > 100 ? "HIGH" : "LOW")}' file

Loops

For Loop

# Print all fields
awk '{for (i=1; i<=NF; i++) print $i}' file

# Reverse fields
awk '{for (i=NF; i>=1; i--) printf "%s ", $i; print ""}' file

While Loop

awk '{
  i = 1
  while (i <= NF) {
    print $i
    i++
  }
}' file

Arrays

Associative Arrays

# Count occurrences
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file

# Sum by category
awk '{sum[$1] += $2} END {for (k in sum) print k, sum[k]}' file

# Unique values
awk '!seen[$1]++' file

# Check existence
awk '{if ($1 in seen) print "DUP:", $0; seen[$1]=1}' file

Sorting Output

# Sort by count
awk '{count[$1]++} END {for (k in count) print count[k], k}' file | sort -rn

# Using PROCINFO (GNU AWK)
awk '{count[$1]++} END {
  PROCINFO["sorted_in"] = "@val_num_desc"
  for (k in count) print k, count[k]
}' file

BEGIN and END

awk '
BEGIN {
  FS = ":"
  print "Processing /etc/passwd"
  print "========================"
}
{
  users++
  if ($3 >= 1000) regular++
}
END {
  print "========================"
  print "Total users:", users
  print "Regular users:", regular
}
' /etc/passwd

Production Examples

Log Analysis

# Apache access log - requests per IP
awk '{count[$1]++} END {for (ip in count) print count[ip], ip}' \
    access.log | sort -rn | head -20

# HTTP status code distribution
awk '{count[$9]++} END {for (s in count) print s, count[s]}' access.log

# Requests per hour
awk -F'[/:]' '{hour=$4; count[hour]++} END {
  for (h in count) print h":00", count[h]
}' access.log | sort -t: -k1n

# Slow requests (response time > 5s)
awk '$NF > 5 {print}' access.log

# Bandwidth by URL
awk '{sum[$7] += $10} END {
  for (url in sum) print sum[url], url
}' access.log | sort -rn | head -20

System Administration

# Users with shells
awk -F: '$7 ~ /bash|zsh|fish/ {print $1, $7}' /etc/passwd

# Large processes (memory > 1%)
ps aux | awk '$4 > 1.0 {print $4"%", $11}'

# Disk usage > 80%
df -h | awk '$5+0 > 80 {print $6, $5}'

# Network connections by state
ss -ta | awk 'NR>1 {count[$1]++} END {for (s in count) print s, count[s]}'

# Top 10 largest files in directory
ls -lS | awk 'NR>1 {print $5, $9}' | head -10

CSV Processing

# Extract columns (skip header)
awk -F',' 'NR>1 {print $1, $3}' data.csv

# Sum column
awk -F',' 'NR>1 {sum += $3} END {print "Total:", sum}' data.csv

# Filter by condition
awk -F',' '$3 > 100' data.csv

# Convert to TSV
awk -F',' '{$1=$1; print}' OFS='\t' data.csv

# Add header
awk -F',' 'NR==1 {print "Name,Value,Status"} NR>0 {print}' data.csv

SIEM and Security

# Failed SSH attempts per IP
grep "Failed password" /var/log/auth.log | \
  awk '{print $(NF-3)}' | sort | uniq -c | sort -rn | head -20

# Successful logins
grep "Accepted" /var/log/auth.log | \
  awk '{print $9, $11}' | sort | uniq

# Extract IPs from any log
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
  print substr($0, RSTART, RLENGTH)
}' logfile | sort -u

# Parse syslog priority
awk -F'[<>]' '{
  priority = $2
  facility = int(priority / 8)
  severity = priority % 8
  print "Facility:", facility, "Severity:", severity
}' syslog

Data Transformation

# JSON-like output
awk -F: '{
  printf "{\"user\": \"%s\", \"uid\": %s, \"shell\": \"%s\"}\n", $1, $3, $7
}' /etc/passwd

# Pivot data (rows to columns)
awk '{
  key[$1] = $1
  val[$1] = val[$1] " " $2
} END {
  for (k in key) print key[k], val[k]
}' file

# Transpose (columns to rows)
awk '{
  for (i=1; i<=NF; i++) a[NR,i] = $i
  if (NF > max) max = NF
}
END {
  for (i=1; i<=max; i++) {
    for (j=1; j<=NR; j++) printf "%s ", a[j,i]
    print ""
  }
}' file

Multi-file Processing

# Process multiple files with filename
awk '{print FILENAME ":" NR ":" $0}' file1 file2

# Compare files (find common lines)
awk 'NR==FNR {a[$0]; next} $0 in a' file1 file2

# Find lines only in file1
awk 'NR==FNR {a[$0]; next} !($0 in a)' file2 file1

# Merge files by key
awk -F: 'NR==FNR {a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1 file2

Built-in Functions Reference

Function Description Example

length(s)

String length

length($1)

substr(s,start,len)

Substring

substr($1,1,5)

index(s,target)

Find position

index($0,"ERROR")

split(s,arr,sep)

Split into array

split($1,a,"-")

sub(re,repl,s)

Replace first

sub(/old/,"new",$0)

gsub(re,repl,s)

Replace all

gsub(/old/,"new",$0)

match(s,re)

Regex match

match($0,/[0-9]+/)

tolower(s)

Lowercase

tolower($1)

toupper(s)

Uppercase

toupper($1)

sprintf(fmt, ...)

Format string

sprintf("%05d",$1)

int(x)

Integer

int(3.14)

sqrt(x)

Square root

sqrt($1)

log(x)

Natural log

log($1)

exp(x)

Exponential

exp(1)

rand()

Random 0-1

rand()

srand(seed)

Seed random

srand()

system(cmd)

Run command

system("date")

getline

Read next line

getline < "file"

CTF and Security Patterns

Flag Extraction

# Find flag patterns
strings binary | awk '/flag\{.*\}/ {print}'
awk '/CTF\{[^}]+\}/ {match($0, /CTF\{[^}]+\}/); print substr($0, RSTART, RLENGTH)}' file

# Extract between markers
awk '/BEGIN FLAG/,/END FLAG/' file

# Decode hex to ASCII (strtonum is a GNU awk extension)
echo "48656c6c6f" | awk '{
    for (i=1; i<=length($0); i+=2)
        printf "%c", strtonum("0x" substr($0,i,2))
    print ""
}'

IP and Network Analysis

# Extract and count IPs
awk 'match($0, /[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/) {
    ip = substr($0, RSTART, RLENGTH)
    count[ip]++
}
END {
    for (ip in count) print count[ip], ip
}' access.log | sort -rn

# Filter by IP range (expects lines that are bare IPs, e.g. 10.50.x.x)
awk -F'.' '$1==10 && $2==50 {print}' file

# Calculate bandwidth per IP
awk '{bytes[$1] += $10} END {
    for (ip in bytes) printf "%s: %.2f MB\n", ip, bytes[ip]/1048576
}' access.log | sort -t: -k2 -rn

Credential Hunting

# Find potential passwords
awk '/password|passwd|pwd|secret|token|key/ && /[=:]/ {print}' config_files

# Extract key=value pairs
awk -F'[=:]' '/password|api_key|secret/ {
    gsub(/^[ \t]+|[ \t]+$/, "", $1)
    gsub(/^[ \t"'\'']+|[ \t"'\'']+$/, "", $2)
    print $1 " => " $2
}' file

# Parse environment variables
awk -F= '/^[A-Z_]+.*=/ {print $1, "=", $2}' .env

Data Transformation Patterns

CSV Operations

# CSV rows to JSON objects (one per line; trim the trailing comma and add brackets for a valid array)
awk -F',' 'NR==1 {
    for (i=1; i<=NF; i++) header[i]=$i
    next
}
{
    printf "{"
    for (i=1; i<=NF; i++) {
        printf "\"%s\":\"%s\"", header[i], $i
        if (i<NF) printf ","
    }
    print "},"
}' data.csv

# Transpose CSV (rows to columns)
awk -F',' '{
    for (i=1; i<=NF; i++) a[NR,i]=$i
    if (NF>max) max=NF
}
END {
    for (i=1; i<=max; i++) {
        for (j=1; j<=NR; j++) printf "%s%s", a[j,i], (j<NR?",":"")
        print ""
    }
}' data.csv

# Merge two CSVs by first column
awk -F',' 'NR==FNR {a[$1]=$0; next} $1 in a {print a[$1] "," $2}' file1.csv file2.csv

# Pivot table
awk -F',' 'NR>1 {
    sum[$1,$2] += $3
    rows[$1]++
    cols[$2]++
}
END {
    for (r in rows) {
        printf "%s", r
        for (c in cols) printf ",%s", sum[r,c]+0
        print ""
    }
}' data.csv

Text Cleanup

# Normalize whitespace
awk '{$1=$1; print}' file

# Remove duplicate consecutive lines
awk 'prev != $0 {print} {prev=$0}' file

# Trim all fields
awk '{for(i=1;i<=NF;i++) gsub(/^[ \t]+|[ \t]+$/, "", $i); print}' file

# Convert CamelCase to snake_case
echo "CamelCaseString" | awk '{
    gsub(/([A-Z])/, "_&"); sub(/^_/, ""); print tolower($0)
}'

# Wrap long lines at 80 characters
awk '{
    while (length($0) > 80) {
        print substr($0, 1, 80)
        $0 = substr($0, 81)
    }
    print
}' file

Reporting

# Create formatted report
awk 'BEGIN {
    printf "%-20s %10s %10s\n", "NAME", "VALUE", "PERCENT"
    printf "%-20s %10s %10s\n", "----", "-----", "-------"
}
{
    total += $2
    data[NR] = $1 ":" $2
}
END {
    for (i=1; i<=NR; i++) {
        split(data[i], parts, ":")
        printf "%-20s %10d %9.1f%%\n", parts[1], parts[2], (parts[2]/total)*100
    }
    printf "%-20s %10d\n", "TOTAL", total
}' data.txt

# Histogram
awk '{
    count[$1]++
    if (count[$1] > max) max = count[$1]
}
END {
    for (k in count) {
        bar = ""
        for (i=0; i<count[k]; i++) bar = bar "#"
        printf "%-20s %5d %s\n", k, count[k], bar
    }
}' data.txt | sort -k2 -rn

Advanced Date/Time

# Parse and reformat dates
awk '{
    split($1, d, "-")
    printf "%s/%s/%s %s\n", d[2], d[3], d[1], $2
}' file  # 2026-02-13 -> 02/13/2026

# Filter by date range
awk -F',' '$1 >= "2026-02-01" && $1 <= "2026-02-28"' data.csv

# Calculate time differences (HH:MM:SS)
awk 'BEGIN {FS=":"} {
    total = $1*3600 + $2*60 + $3
    printf "%d seconds\n", total
}' <<< "01:30:45"

# Aggregate by hour (with FS='[: ]', $2 is the hour for "YYYY-MM-DD HH:MM:SS" logs; adjust the field for syslog-style "Mon DD HH:MM:SS" timestamps)
awk -F'[: ]' '{hour[$2]++} END {for (h in hour) print h":00", hour[h]}' logfile | sort

Inline awk Programs

# One-liner templates
awk 'BEGIN{action} pattern{action} END{action}' file

# Useful shorthands
awk '1'                    # Print all (same as cat)
awk 'NF'                   # Print non-empty lines
awk '!a[$0]++'             # Remove duplicate lines
awk '{print NF}'           # Count fields per line
awk 'END{print NR}'        # Count lines
awk '{s+=$1}END{print s}'  # Sum first column
awk 'NR==10'               # Print line 10
awk 'NR>=10&&NR<=20'       # Print lines 10-20
awk '/start/,/end/'        # Print range
awk 'NR%2'                 # Print odd lines
awk '!(NR%2)'              # Print even lines
awk '{print NR": "$0}'     # Number lines
awk '{$NF=""};1'           # Remove last field
awk '{$1=""};1'            # Remove first field
awk 'gsub(/old/,"new")'    # Only print changed lines
awk 'length>80'            # Lines longer than 80
awk '{print $NF}'          # Print last field
awk '{print $(NF-1)}'      # Print second to last
awk 'NF>5'                 # Lines with more than 5 fields
awk '/pattern/{getline;print}' # Print line after match
awk '/pattern/{print;getline;print}' # Print match and next line

Quick Reference

# Print first field
awk '{print $1}' file

# Print last field
awk '{print $NF}' file

# Print fields 1 and 3
awk '{print $1, $3}' file

# Custom delimiter
awk -F':' '{print $1}' file

# Sum column 3
awk '{sum+=$3} END {print sum}' file

# Count lines matching pattern
awk '/pattern/ {count++} END {print count}' file

# Remove duplicates
awk '!seen[$0]++' file

# Print lines longer than 80 chars
awk 'length > 80' file

# Add line numbers
awk '{print NR, $0}' file

# Print every other line
awk 'NR % 2 == 1' file

# Filter and transform
awk -F',' '$3>100 {print $1, $3*2}' file

# Group and sum
awk -F',' '{sum[$1]+=$2} END {for(k in sum) print k,sum[k]}' file