awk

awk is a complete programming language for text processing. Master field extraction, pattern matching, arrays, and functions for powerful data manipulation.

Basic Syntax

awk 'pattern { action }' file
awk -F: 'pattern { action }' file      # Custom field separator
awk -v var=value '{ print var }' file  # Pass variable

Program Structure

BEGIN { setup code }           # Runs before input
/pattern/ { action }           # For matching lines
{ default action }             # For all lines
END { cleanup/summary code }   # Runs after input

Fields and Records

Field Access

awk '{print $1}' file          # First field
awk '{print $2, $3}' file      # Fields 2 and 3
awk '{print $NF}' file         # Last field
awk '{print $(NF-1)}' file     # Second to last
awk '{print $0}' file          # Entire line

Field Separator

awk -F: '{print $1}' /etc/passwd       # Colon separator
awk -F'[ \t]+' '{print $1}' file       # Whitespace (regex)
awk -F, '{print $2}' file.csv          # Comma (CSV)
awk 'BEGIN{FS=":"} {print $1}' file    # Set in BEGIN

Output Field Separator

awk -F: 'BEGIN{OFS=","} {print $1,$3}' /etc/passwd
# user,1000

Record Separator

# Paragraph mode (blank line separates records)
awk 'BEGIN{RS=""} {print NR": "$0}' file

Built-in Variables

Variable Meaning

NR

Number of Records (current line number across all files)

NF

Number of Fields in current record

FNR

Record number within the current file (resets to 1 at the start of each input file)

FS

Field Separator (input)

OFS

Output Field Separator

RS

Record Separator (input)

ORS

Output Record Separator

FILENAME

Current filename

$0

Entire current record

$1, $2, …

Individual fields

Pattern Matching

Regex Pattern

awk '/error/' file                     # Lines containing "error"
awk '/^#/' file                        # Lines starting with #
awk '!/^#/' file                       # Lines NOT starting with #
awk '/error|warning/' file             # Either pattern

Field Pattern

awk '$3 ~ /error/' file                # Field 3 contains "error"
awk '$3 !~ /error/' file               # Field 3 doesn't contain
awk '$1 == "admin"' file               # Field 1 equals "admin"

Comparison

awk '$3 > 100' file                    # Field 3 > 100
awk '$3 >= 100 && $3 <= 200' file      # Range
awk '$3 != 0' file                     # Field 3 not zero
awk 'NF > 3' file                      # More than 3 fields

Line Number

awk 'NR == 1' file                     # First line
awk 'NR >= 10 && NR <= 20' file        # Lines 10-20
awk 'NR > 1' file                      # Skip header
awk 'FNR == 1' file1 file2             # First line of each file

Range Pattern

awk '/start/,/end/' file               # From start to end pattern
awk 'NR==5,NR==10' file                # Lines 5-10

Actions

Print

awk '{print}' file                     # Print entire line
awk '{print $1}' file                  # Print first field
awk '{print $1, $2}' file              # Print with OFS
awk '{print $1 " - " $2}' file         # Print with custom separator
awk '{printf "%s\t%d\n", $1, $2}' file # Formatted print

Assignment

awk '{$2 = $2 * 2; print}' file        # Modify field
awk '{total += $1} END{print total}'   # Accumulate

Next and Exit

awk '/skip/{next} {print}' file        # Skip matching lines
awk '/found/{exit}' file               # Stop at match

Printf Formatting

# Format specifiers
%s      # String
%d      # Integer
%f      # Float
%e      # Scientific
%x      # Hex
%%      # Literal %

# Width and precision
%-10s   # Left-align, 10 chars
%10s    # Right-align, 10 chars
%8.2f   # 8 total, 2 decimal
awk '{printf "%-20s %8d\n", $1, $2}' file

Arithmetic

awk '{print $1 + $2}' file             # Addition
awk '{print $1 - $2}' file             # Subtraction
awk '{print $1 * $2}' file             # Multiplication
awk '{print $1 / $2}' file             # Division
awk '{print $1 % $2}' file             # Modulo
awk '{print $1 ^ 2}' file              # Power
awk '{print int($1)}' file             # Integer part
awk '{print sqrt($1)}' file            # Square root

Accumulation

# Sum
awk '{sum += $1} END {print sum}' file

# Count
awk 'END {print NR}' file

# Average
awk '{sum += $1} END {print sum/NR}' file

# Min/Max
awk 'NR==1||$1>max{max=$1} END{print max}' file
awk 'NR==1||$1<min{min=$1} END{print min}' file

String Functions

# Length
awk '{print length($1)}' file

# Substring
awk '{print substr($1, 1, 5)}' file    # First 5 chars

# Index (find position)
awk '{print index($0, "pattern")}' file

# Split
awk '{n=split($1,arr,":"); print arr[1]}' file

# Substitution
awk '{gsub(/old/, "new"); print}' file    # Global
awk '{sub(/old/, "new"); print}' file     # First only

# Case
awk '{print toupper($1)}' file
awk '{print tolower($1)}' file

# Match
awk 'match($0, /[0-9]+/) {print substr($0, RSTART, RLENGTH)}' file

Arrays

Associative Arrays

# Count occurrences
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file

# Sum by key
awk '{sum[$1] += $2} END {for (k in sum) print k, sum[k]}' file

# Store and retrieve
awk '{data[NR] = $0} END {print data[5]}' file

Array Operations

# Check existence
awk '{if ($1 in seen) print "dup"; seen[$1]=1}' file

# Delete element
awk '{delete arr[key]}' file

# Array length (gawk)
awk '{a[NR]=$1} END {print length(a)}' file

Control Structures

If-Else

awk '{if ($1 > 100) print "high"; else print "low"}' file

awk '{
    if ($1 > 100) status = "high"
    else if ($1 > 50) status = "medium"
    else status = "low"
    print $1, status
}' file

Loops

# For loop
awk '{for (i=1; i<=NF; i++) print $i}' file

# While loop
awk '{i=1; while (i<=NF) {print $i; i++}}' file

# For-in (arrays)
awk '{a[$1]++} END {for (k in a) print k, a[k]}' file

Ternary Operator

awk '{print ($1 > 100 ? "high" : "low")}' file

Infrastructure Patterns

Parse /etc/passwd

# List users with UID >= 1000
awk -F: '$3 >= 1000 {print $1}' /etc/passwd

# Users with bash shell
awk -F: '$7 ~ /bash/ {print $1}' /etc/passwd

Parse ps Output

# Top memory consumers
ps aux | awk 'NR>1 {print $4, $11}' | sort -rn | head

# By user
ps aux | awk '{user[$1] += $4} END {for (u in user) print u, user[u]"%"}'

Parse ss/netstat

# Count connections by state
ss -tn | awk 'NR>1 {count[$1]++} END {for (s in count) print s, count[s]}'

# Connections by remote IP
ss -tn | awk 'NR>1 {split($5,a,":"); count[a[1]]++} END {for (ip in count) print count[ip], ip}' | sort -rn

Parse Logs

# Errors per hour
awk '/ERROR/ {print substr($1,1,13)}' app.log | sort | uniq -c

# ISE auth failures by user
awk -F, '/FAILED/ {count[$3]++} END {for (u in count) print count[u], u}' ise.csv | sort -rn

Parse CSV

# Skip header, sum column 3
awk -F, 'NR>1 {sum+=$3} END {print sum}' data.csv

# Filter by column value
awk -F, '$2 == "active" {print $1}' data.csv

JSON Field Extraction

# Simple extraction (not robust for nested JSON)
awk -F'"' '/"name":/ {print $4}' file.json

# For real JSON, use jq - but awk works for simple cases

Network Calculations

# IP to integer
awk -F. '{print ($1*256^3)+($2*256^2)+($3*256)+$4}' <<< "192.168.1.1"

# Calculate bandwidth
awk '/bytes/ {sum+=$NF} END {printf "%.2f GB\n", sum/1024/1024/1024}' log

Line Numbering with Format

awk '{printf "%4d: %s\n", NR, $0}' file

Print Line Range

awk 'NR>=100 && NR<=110' file

Join Lines

# Join all lines
awk '{printf "%s ", $0} END {print ""}' file

# Join every 3 lines
awk 'NR%3{printf "%s ", $0; next}{print}' file

Transpose

# Rows to columns
awk '{for(i=1;i<=NF;i++) a[i,NR]=$i; max=(NF>max?NF:max)}
     END {for(i=1;i<=max;i++){for(j=1;j<=NR;j++) printf "%s ", a[i,j]; print ""}}' file

User-Defined Functions

awk '
function abs(x) { return (x < 0 ? -x : x) }
function max(a, b) { return (a > b ? a : b) }
{ print abs($1), max($2, $3) }
' file

Multiple Files

# Process multiple files
awk '{print FILENAME, $0}' file1 file2

# Join files (like paste)
awk 'FNR==NR {a[FNR]=$0; next} {print a[FNR], $0}' file1 file2

# Subtract file2 from file1
awk 'FNR==NR {a[$1]; next} !($1 in a)' file2 file1

Performance Tips

# Exit early
awk '/pattern/ {print; exit}' file

# Use next to skip
awk '/skip/{next} {expensive_operation}' file

# Avoid regex when simple string works
awk '$1 == "exact"' file           # Faster than /^exact$/

Quick Reference

# Structure
awk 'BEGIN{} pattern{action} END{}' file

# Common patterns
/regex/          # Match regex
$1 == "value"    # Field equals
$1 ~ /regex/     # Field matches
NR == 5          # Line 5
NR > 1           # Skip first line
NR >= 10 && NR <= 20  # Line range

# Common variables
NR  NF  $0  $1  FILENAME  FS  OFS

# Common functions
length() substr() split() gsub() sub() sprintf()   # (print/printf are statements, not functions)

# Common idioms
{count[$1]++}                    # Count by key
{sum+=$1} END{print sum}         # Sum column
{if ($1>max) max=$1}             # Find max
BEGIN{FS=","} {print $2}         # CSV field

Key Takeaways

  1. -F for field separator - Match your data format

  2. $1, $2, $NF - Access fields by position

  3. NR for line number - Filter/range

  4. BEGIN/END - Setup and summary

  5. Associative arrays - Count, sum, group

  6. gsub/sub - In-line substitution

  7. printf - Formatted output

Next Module

xargs - Build commands from input.