awk: Text Processing & Data Extraction

awk is a complete programming language disguised as a text processor. It’s the original data pipeline tool.


Core Concepts

The awk Model

┌─────────────────────────────────────────────────────────────────┐
│                      AWK PROCESSING MODEL                        │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│   INPUT          PATTERN-ACTION          OUTPUT                  │
│   ─────          ──────────────          ──────                  │
│                                                                  │
│   line 1  ──►   /pattern/ { action }  ──►  transformed line     │
│   line 2  ──►   condition { action }  ──►  transformed line     │
│   line 3  ──►   BEGIN { setup }       ──►  (runs first)         │
│   ...           END { summary }       ──►  (runs last)          │
│                                                                  │
│   FIELDS (columns)                                               │
│   ────────────────                                               │
│   $0 = entire line                                               │
│   $1 = first field                                               │
│   $2 = second field                                              │
│   $NF = last field                                               │
│   $(NF-1) = second-to-last field                                 │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘

Built-in Variables

Variable Meaning Example

$0

Entire line

{print $0}

$1, $2, …

Field 1, 2, …

{print $1, $3}

NF

Number of fields

{print NF}

$NF

Last field

{print $NF}

NR

Current line number

NR==5 {print}

FNR

Line number in current file

FNR==1 {print "new file"}

FS

Field separator (input)

BEGIN {FS=":"}

OFS

Output field separator

BEGIN {OFS="\t"}

RS

Record separator

BEGIN {RS=""} # paragraphs

ORS

Output record separator

BEGIN {ORS="\n\n"}

FILENAME

Current filename

{print FILENAME, $0}


Field Extraction

Basic Column Selection

# Print first column
awk '{print $1}' file.txt

# Print multiple columns
awk '{print $1, $3, $5}' file.txt

# Print last column
awk '{print $NF}' file.txt

# Print second-to-last
awk '{print $(NF-1)}' file.txt

# Print all but first column
awk '{$1=""; print $0}' file.txt | sed 's/^ //'

Custom Delimiters

# Colon-delimited (passwd file)
awk -F: '{print $1, $3}' /etc/passwd

# Comma-delimited (CSV)
awk -F, '{print $1, $2}' data.csv

# Tab-delimited
awk -F'\t' '{print $1, $2}' data.tsv

# Multiple delimiters
awk -F'[,;:]' '{print $1, $2}' mixed.txt

# Set both input and output
awk 'BEGIN {FS=":"; OFS="\t"} {print $1, $3}' /etc/passwd

Column Ranges

# Columns 2 through 5
awk '{for(i=2; i<=5; i++) printf "%s ", $i; print ""}' file.txt

# All columns from 3 onward
awk '{for(i=3; i<=NF; i++) printf "%s ", $i; print ""}' file.txt

# Every other column
awk '{for(i=1; i<=NF; i+=2) printf "%s ", $i; print ""}' file.txt

Pattern Matching

Regex Patterns

# Lines containing "ERROR"
awk '/ERROR/ {print}' app.log

# Lines NOT containing "DEBUG"
awk '!/DEBUG/' app.log

# Case insensitive (GNU awk)
awk 'BEGIN{IGNORECASE=1} /error/' app.log

# Match in specific field
awk '$3 ~ /fail/' app.log

# Negative match in field
awk '$3 !~ /success/' app.log

Comparison Patterns

# Numeric comparison
awk '$3 > 100 {print $1, $3}' data.txt

# String comparison
awk '$1 == "admin" {print}' users.txt

# Multiple conditions (AND)
awk '$3 > 100 && $4 == "active" {print}' data.txt

# Multiple conditions (OR)
awk '$1 == "error" || $1 == "warning" {print}' log.txt

# Range (between patterns)
awk '/START/,/END/' file.txt

Line Number Patterns

# Specific line
awk 'NR==5' file.txt

# Line range
awk 'NR>=10 && NR<=20' file.txt

# First 10 lines
awk 'NR<=10' file.txt

# Skip header (line 1)
awk 'NR>1' file.txt

# Every 5th line
awk 'NR%5==0' file.txt

Aggregation

Counting

# Count lines
awk 'END {print NR}' file.txt

# Count matching lines
awk '/ERROR/ {count++} END {print count}' app.log

# Count by field value
awk '{count[$1]++} END {for (k in count) print k, count[k]}' access.log

# Count unique values
awk '!seen[$1]++ {count++} END {print count}' file.txt

Summing

# Sum a column
awk '{sum += $3} END {print sum}' data.txt

# Sum with formatting
awk '{sum += $3} END {printf "Total: %.2f\n", sum}' data.txt

# Sum by category
awk '{sum[$1] += $3} END {for (k in sum) print k, sum[k]}' data.txt

Statistics

# Average
awk '{sum += $3; count++} END {print sum/count}' data.txt

# Min and max
awk 'NR==1 {min=max=$3} $3<min {min=$3} $3>max {max=$3} END {print "min:", min, "max:", max}' data.txt

# Full stats
awk '{
  sum += $3
  sumsq += $3^2
  count++
  if (NR==1 || $3<min) min=$3
  if (NR==1 || $3>max) max=$3
}
END {
  avg = sum/count
  var = (sumsq - sum^2/count) / count
  std = sqrt(var)
  print "Count:", count
  print "Sum:", sum
  print "Avg:", avg
  print "Min:", min
  print "Max:", max
  print "StdDev:", std
}' data.txt

Text Transformation

Field Modification

# Replace field value
awk '{$3 = "REDACTED"; print}' data.txt

# Multiply field
awk '{$3 = $3 * 1.1; print}' salaries.txt

# Add new field
awk '{$(NF+1) = $2 * $3; print}' data.txt

# Conditional modification
awk '{if ($3 > 100) $3 = "HIGH"; print}' data.txt

String Functions

# Uppercase
awk '{print toupper($1)}' file.txt

# Lowercase
awk '{print tolower($0)}' file.txt

# Length
awk '{print $1, length($1)}' file.txt

# Substring
awk '{print substr($1, 1, 3)}' file.txt

# Split into array
awk '{split($1, arr, "-"); print arr[1], arr[2]}' file.txt

# Global substitution
awk '{gsub(/old/, "new"); print}' file.txt

# Substitution in field
awk '{gsub(/\./, "-", $2); print}' file.txt

Formatting

# Printf formatting
awk '{printf "%-20s %10.2f\n", $1, $3}' data.txt

# Padded output
awk '{printf "%05d %s\n", NR, $0}' file.txt

# Tabular output
awk 'BEGIN {printf "%-15s %-10s %s\n", "NAME", "COUNT", "STATUS"}
     {printf "%-15s %-10d %s\n", $1, $2, $3}' data.txt

# Header and separator
awk 'BEGIN {
  print "Name            Count      Status"
  print "-----------------------------------"
}
{printf "%-15s %-10d %s\n", $1, $2, $3}' data.txt

Infrastructure Examples

Log Analysis

# Top IPs in access log
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -20

# With awk only (faster for large files)
awk '{ip[$1]++} END {for (i in ip) print ip[i], i}' access.log | sort -rn | head -20

# HTTP status codes distribution
awk '{status[$9]++} END {for (s in status) print s, status[s]}' access.log | sort

# Requests per hour
awk '{split($4, t, ":"); hour=t[2]; count[hour]++} END {for (h in count) print h, count[h]}' access.log | sort -n

# Failed SSH attempts by IP
awk '/Failed password/ {print $(NF-3)}' /var/log/auth.log | sort | uniq -c | sort -rn

# Average response time
awk '{sum += $NF; count++} END {print sum/count "ms"}' access.log

Process Analysis

# Top memory consumers
ps aux | awk 'NR>1 {print $4, $11}' | sort -rn | head -10

# Total memory by user
ps aux | awk 'NR>1 {mem[$1] += $4} END {for (u in mem) print u, mem[u]"%"}' | sort -k2 -rn

# Processes per user
ps aux | awk 'NR>1 {count[$1]++} END {for (u in count) print u, count[u]}' | sort -k2 -rn

Disk Analysis

# Filesystems over 80% full
df -h | awk 'NR>1 && int($5) > 80 {print $6, $5}'

# Disk usage by mount point
df -h | awk 'NR>1 {print $6, $3}' | column -t

# Total disk used/available
df | awk 'NR>1 {used+=$3; avail+=$4} END {print "Used:", used/1024/1024 "GB", "Avail:", avail/1024/1024 "GB"}'

Network Analysis

# Connections by state
ss -tan | awk 'NR>1 {state[$1]++} END {for (s in state) print s, state[s]}'

# Connections per port
ss -tan | awk 'NR>1 {split($4, a, ":"); port=a[length(a)]; count[port]++} END {for (p in count) print p, count[p]}' | sort -k2 -rn

# IPs connected to port 443
ss -tan | awk '$4 ~ /:443$/ {split($5, a, ":"); print a[1]}' | sort -u

Configuration Parsing

# Extract key=value pairs
awk -F= '/^[^#]/ && NF==2 {print $1, $2}' config.ini

# Parse INI sections
awk '/^\[/ {section=$0} /^[^#\[]/ && NF {print section, $0}' config.ini

# Extract specific values
awk -F= '$1=="hostname" {print $2}' config.ini

# YAML-like parsing (simple)
awk '/^[a-z]/ {key=$1} /^  / {print key, $0}' config.yaml

Multi-File Processing

Process Multiple Files

# Print filename with each line
awk '{print FILENAME, $0}' file1.txt file2.txt

# First line of each file
awk 'FNR==1 {print FILENAME, $0}' *.txt

# Count lines per file (GNU awk; a plain END block runs once and would report only the last file)
awk 'ENDFILE {print FILENAME, FNR}' *.txt

# Sum column across all files
awk '{sum+=$3} END {print sum}' *.txt

Join Files

# Join on first column (like SQL join)
awk 'NR==FNR {a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1.txt file2.txt

# Left join (keep all from file2)
awk 'NR==FNR {a[$1]=$2; next} {print $0, ($1 in a ? a[$1] : "N/A")}' file1.txt file2.txt

Associative Arrays

Grouping and Lookup

# Group by field
awk '{groups[$1] = groups[$1] " " $2} END {for (g in groups) print g ":" groups[g]}' data.txt

# Lookup table
awk 'NR==FNR {lookup[$1]=$2; next} {$2=lookup[$2]; print}' lookup.txt data.txt

# Deduplication
awk '!seen[$0]++' file.txt

# Unique by column
awk '!seen[$1]++' file.txt

Multi-dimensional Arrays

# Cross-tabulation
awk '{count[$1,$2]++} END {for (k in count) {split(k, a, SUBSEP); print a[1], a[2], count[k]}}' data.txt

# Pivot table style
awk '{
  rows[$1] = 1
  cols[$2] = 1
  data[$1,$2] += $3
}
END {
  for (r in rows) {
    printf "%s", r
    for (c in cols) printf "\t%s", data[r,c]+0
    print ""
  }
}' data.txt

Control Structures

Conditionals

# If-else
awk '{
  if ($3 > 100)
    print $1, "HIGH"
  else if ($3 > 50)
    print $1, "MEDIUM"
  else
    print $1, "LOW"
}' data.txt

# Ternary operator
awk '{print $1, ($3 > 100 ? "HIGH" : "LOW")}' data.txt

Loops

# For loop (process fields)
awk '{for (i=1; i<=NF; i++) print $i}' file.txt

# While loop
awk '{i=1; while (i<=NF) {print $i; i++}}' file.txt

# Break and continue
awk '{
  for (i=1; i<=NF; i++) {
    if ($i == "skip") continue
    if ($i == "stop") break
    print $i
  }
}' file.txt

Quick Reference

Task Command

Print column

awk '{print $1}'

Print last column

awk '{print $NF}'

Custom delimiter

awk -F: '{print $1}'

Filter by pattern

awk '/pattern/'

Filter by column value

awk '$3 > 100'

Sum column

awk '{s+=$3} END {print s}'

Count lines

awk 'END {print NR}'

Count by group

awk '{c[$1]++} END {for(k in c) print k,c[k]}'

Skip header

awk 'NR>1'

Line range

awk 'NR>=5 && NR<=10'

Deduplicate

awk '!seen[$0]++'

Replace text

awk '{gsub(/old/,"new"); print}'

Join files

awk 'NR==FNR {a[$1]=$2;next} {print $0,a[$1]}' f1 f2

Formatted output

awk '{printf "%-10s %5d\n", $1, $2}'