awk
awk is a complete programming language for text processing. Master field extraction, pattern matching, arrays, and functions for powerful data manipulation.
Basic Syntax
awk 'pattern { action }' file
awk -F: 'pattern { action }' file # Custom field separator
awk -v var=value '{ print var }' file # Pass variable
Program Structure
BEGIN { setup code } # Runs before input
/pattern/ { action } # For matching lines
{ default action } # For all lines
END { cleanup/summary code } # Runs after input
Fields and Records
Field Access
awk '{print $1}' file # First field
awk '{print $2, $3}' file # Fields 2 and 3
awk '{print $NF}' file # Last field
awk '{print $(NF-1)}' file # Second to last
awk '{print $0}' file # Entire line
Field Separator
awk -F: '{print $1}' /etc/passwd # Colon separator
awk -F'[ \t]+' '{print $1}' file # Whitespace (regex)
awk -F, '{print $2}' file.csv # Comma (CSV)
awk 'BEGIN{FS=":"} {print $1}' file # Set in BEGIN
Output Field Separator
awk -F: 'BEGIN{OFS=","} {print $1,$3}' /etc/passwd
# user,1000
Record Separator
# Paragraph mode (blank line separates records)
awk 'BEGIN{RS=""} {print NR": "$0}' file
Built-in Variables
| Variable | Meaning |
|---|---|
| NR | Number of Records (current line number across all files) |
| NF | Number of Fields in current record |
| FNR | File Number of Records (record number within the current file; resets per file) |
| FS | Field Separator (input) |
| OFS | Output Field Separator |
| RS | Record Separator (input) |
| ORS | Output Record Separator |
| FILENAME | Current filename |
| $0 | Entire current record |
| $1, $2… | Individual fields |
Pattern Matching
Regex Pattern
awk '/error/' file # Lines containing "error"
awk '/^#/' file # Lines starting with #
awk '!/^#/' file # Lines NOT starting with #
awk '/error|warning/' file # Either pattern
Field Pattern
awk '$3 ~ /error/' file # Field 3 contains "error"
awk '$3 !~ /error/' file # Field 3 doesn't contain
awk '$1 == "admin"' file # Field 1 equals "admin"
Comparison
awk '$3 > 100' file # Field 3 > 100
awk '$3 >= 100 && $3 <= 200' file # Range
awk '$3 != 0' file # Field 3 not zero
awk 'NF > 3' file # More than 3 fields
Line Number
awk 'NR == 1' file # First line
awk 'NR >= 10 && NR <= 20' file # Lines 10-20
awk 'NR > 1' file # Skip header
awk 'FNR == 1' file1 file2 # First line of each file
Range Pattern
awk '/start/,/end/' file # From start to end pattern
awk 'NR==5,NR==10' file # Lines 5-10
Actions
awk '{print}' file # Print entire line
awk '{print $1}' file # Print first field
awk '{print $1, $2}' file # Print with OFS
awk '{print $1 " - " $2}' file # Print with custom separator
awk '{printf "%s\t%d\n", $1, $2}' file # Formatted print
Assignment
awk '{$2 = $2 * 2; print}' file # Modify field
awk '{total += $1} END{print total}' # Accumulate
Next and Exit
awk '/skip/{next} {print}' file # Skip matching lines
awk '/found/{exit}' file # Stop at match
Printf Formatting
# Format specifiers
%s # String
%d # Integer
%f # Float
%e # Scientific
%x # Hex
%% # Literal %
# Width and precision
%-10s # Left-align, 10 chars
%10s # Right-align, 10 chars
%8.2f # 8 total, 2 decimal
awk '{printf "%-20s %8d\n", $1, $2}' file
Arithmetic
awk '{print $1 + $2}' file # Addition
awk '{print $1 - $2}' file # Subtraction
awk '{print $1 * $2}' file # Multiplication
awk '{print $1 / $2}' file # Division
awk '{print $1 % $2}' file # Modulo
awk '{print $1 ^ 2}' file # Power
awk '{print int($1)}' file # Integer part
awk '{print sqrt($1)}' file # Square root
Accumulation
# Sum
awk '{sum += $1} END {print sum}' file
# Count
awk 'END {print NR}' file
# Average
awk '{sum += $1} END {print sum/NR}' file
# Min/Max
awk 'NR==1||$1>max{max=$1} END{print max}' file
awk 'NR==1||$1<min{min=$1} END{print min}' file
String Functions
# Length
awk '{print length($1)}' file
# Substring
awk '{print substr($1, 1, 5)}' file # First 5 chars
# Index (find position)
awk '{print index($0, "pattern")}' file
# Split
awk '{n=split($1,arr,":"); print arr[1]}' file
# Substitution
awk '{gsub(/old/, "new"); print}' file # Global
awk '{sub(/old/, "new"); print}' file # First only
# Case
awk '{print toupper($1)}' file
awk '{print tolower($1)}' file
# Match
awk 'match($0, /[0-9]+/) {print substr($0, RSTART, RLENGTH)}' file
Arrays
Associative Arrays
# Count occurrences
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file
# Sum by key
awk '{sum[$1] += $2} END {for (k in sum) print k, sum[k]}' file
# Store and retrieve
awk '{data[NR] = $0} END {print data[5]}' file
Array Operations
# Check existence
awk '{if ($1 in seen) print "dup"; seen[$1]=1}' file
# Delete element
awk '{delete arr[key]}' file
# Array length (gawk)
awk '{a[NR]=$1} END {print length(a)}' file
Control Structures
If-Else
awk '{if ($1 > 100) print "high"; else print "low"}' file
awk '{
if ($1 > 100) status = "high"
else if ($1 > 50) status = "medium"
else status = "low"
print $1, status
}' file
Loops
# For loop
awk '{for (i=1; i<=NF; i++) print $i}' file
# While loop
awk '{i=1; while (i<=NF) {print $i; i++}}' file
# For-in (arrays)
awk '{a[$1]++} END {for (k in a) print k, a[k]}' file
Ternary Operator
awk '{print ($1 > 100 ? "high" : "low")}' file
Infrastructure Patterns
Parse /etc/passwd
# List users with UID >= 1000
awk -F: '$3 >= 1000 {print $1}' /etc/passwd
# Users with bash shell
awk -F: '$7 ~ /bash/ {print $1}' /etc/passwd
Parse ps Output
# Top memory consumers
ps aux | awk 'NR>1 {print $4, $11}' | sort -rn | head
# By user
ps aux | awk 'NR>1 {user[$1] += $4} END {for (u in user) print u, user[u]"%"}'
Parse ss/netstat
# Count connections by state
ss -tn | awk 'NR>1 {count[$1]++} END {for (s in count) print s, count[s]}'
# Connections by remote IP
ss -tn | awk 'NR>1 {split($5,a,":"); count[a[1]]++} END {for (ip in count) print count[ip], ip}' | sort -rn
Parse Logs
# Errors per hour
awk '/ERROR/ {print substr($1,1,13)}' app.log | sort | uniq -c
# ISE auth failures by user
awk -F, '/FAILED/ {count[$3]++} END {for (u in count) print count[u], u}' ise.csv | sort -rn
Parse CSV
# Skip header, sum column 3
awk -F, 'NR>1 {sum+=$3} END {print sum}' data.csv
# Filter by column value
awk -F, '$2 == "active" {print $1}' data.csv
JSON Field Extraction
# Simple extraction (not robust for nested JSON)
awk -F'"' '/"name":/ {print $4}' file.json
# For real JSON, use jq - but awk works for simple cases
Network Calculations
# IP to integer
awk -F. '{print ($1*256^3)+($2*256^2)+($3*256)+$4}' <<< "192.168.1.1"
# Calculate bandwidth
awk '/bytes/ {sum+=$NF} END {printf "%.2f GB\n", sum/1024/1024/1024}' log
Line Numbering with Format
awk '{printf "%4d: %s\n", NR, $0}' file
Print Line Range
awk 'NR>=100 && NR<=110' file
Join Lines
# Join all lines
awk '{printf "%s ", $0} END {print ""}' file
# Join every 3 lines
awk 'NR%3{printf "%s ", $0; next}{print}' file
Transpose
# Rows to columns
awk '{for(i=1;i<=NF;i++) a[i,NR]=$i; max=(NF>max?NF:max)}
END {for(i=1;i<=max;i++){for(j=1;j<=NR;j++) printf "%s ", a[i,j]; print ""}}' file
User-Defined Functions
awk '
function abs(x) { return (x < 0 ? -x : x) }
function max(a, b) { return (a > b ? a : b) }
{ print abs($1), max($2, $3) }
' file
Multiple Files
# Process multiple files
awk '{print FILENAME, $0}' file1 file2
# Join files (like paste)
awk 'FNR==NR {a[FNR]=$0; next} {print a[FNR], $0}' file1 file2
# Subtract file2 from file1
awk 'FNR==NR {a[$1]; next} !($1 in a)' file2 file1
Performance Tips
# Exit early
awk '/pattern/ {print; exit}' file
# Use next to skip
awk '/skip/{next} {expensive_operation}' file
# Avoid regex when simple string works
awk '$1 == "exact"' file # Faster than $1 ~ /^exact$/
Quick Reference
# Structure
awk 'BEGIN{} pattern{action} END{}' file
# Common patterns
/regex/ # Match regex
$1 == "value" # Field equals
$1 ~ /regex/ # Field matches
NR == 5 # Line 5
NR > 1 # Skip first line
NR >= 10 && NR <= 20 # Line range
# Common variables
NR NF $0 $1 FILENAME FS OFS
# Common functions
length() substr() split() gsub() printf() sprintf()
# Common idioms
{count[$1]++} # Count by key
{sum+=$1} END{print sum} # Sum column
{if ($1>max) max=$1} # Find max
BEGIN{FS=","} {print $2} # CSV field
Key Takeaways
- `-F` for field separator — match your data format
- `$1`, `$2`, `$NF` — access fields by position
- `NR` for line number — filter by line or range
- `BEGIN`/`END` — setup and summary
- Associative arrays — count, sum, group
- `gsub`/`sub` — in-line substitution
- `printf` — formatted output
Next Module
xargs - Build commands from input.