awk: Text Processing & Data Extraction
awk is a complete programming language disguised as a text processor. It’s the original data pipeline tool.
Core Concepts
The awk Model
┌─────────────────────────────────────────────────────────────────┐
│ AWK PROCESSING MODEL │
├─────────────────────────────────────────────────────────────────┤
│ │
│ INPUT PATTERN-ACTION OUTPUT │
│ ───── ────────────── ────── │
│ │
│ line 1 ──► /pattern/ { action } ──► transformed line │
│ line 2 ──► condition { action } ──► transformed line │
│ line 3 ──► BEGIN { setup } ──► (runs first) │
│ ... END { summary } ──► (runs last) │
│ │
│ FIELDS (columns) │
│ ──────────────── │
│ $0 = entire line │
│ $1 = first field │
│ $2 = second field │
│ $NF = last field │
│ $(NF-1) = second-to-last field │
│ │
└─────────────────────────────────────────────────────────────────┘
Built-in Variables
| Variable | Meaning | Example |
|---|---|---|
| `$0` | Entire line | `awk '{print $0}'` |
| `$1`, `$2`, … | Field 1, 2, … | `awk '{print $1}'` |
| `NF` | Number of fields | `awk '{print NF}'` |
| `$NF` | Last field | `awk '{print $NF}'` |
| `NR` | Current line number | `awk '{print NR, $0}'` |
| `FNR` | Line number in current file | `awk '{print FNR, $0}'` |
| `FS` | Field separator (input) | `awk 'BEGIN {FS=":"}'` |
| `OFS` | Output field separator | `awk 'BEGIN {OFS="\t"}'` |
| `RS` | Record separator | `awk 'BEGIN {RS=""}'` |
| `ORS` | Output record separator | `awk 'BEGIN {ORS="\n\n"}'` |
| `FILENAME` | Current filename | `awk '{print FILENAME}'` |
Field Extraction
Basic Column Selection
# Print first column
awk '{print $1}' file.txt
# Print multiple columns
awk '{print $1, $3, $5}' file.txt
# Print last column
awk '{print $NF}' file.txt
# Print second-to-last
awk '{print $(NF-1)}' file.txt
# Print all but first column
awk '{$1=""; print $0}' file.txt | sed 's/^ //'
Custom Delimiters
# Colon-delimited (passwd file)
awk -F: '{print $1, $3}' /etc/passwd
# Comma-delimited (CSV)
awk -F, '{print $1, $2}' data.csv
# Tab-delimited
awk -F'\t' '{print $1, $2}' data.tsv
# Multiple delimiters
awk -F'[,;:]' '{print $1, $2}' mixed.txt
# Set both input and output
awk 'BEGIN {FS=":"; OFS="\t"} {print $1, $3}' /etc/passwd
Pattern Matching
Regex Patterns
# Lines containing "ERROR"
awk '/ERROR/ {print}' app.log
# Lines NOT containing "DEBUG"
awk '!/DEBUG/' app.log
# Case insensitive (GNU awk)
awk 'BEGIN{IGNORECASE=1} /error/' app.log
# Match in specific field
awk '$3 ~ /fail/' app.log
# Negative match in field
awk '$3 !~ /success/' app.log
Comparison Patterns
# Numeric comparison
awk '$3 > 100 {print $1, $3}' data.txt
# String comparison
awk '$1 == "admin" {print}' users.txt
# Multiple conditions (AND)
awk '$3 > 100 && $4 == "active" {print}' data.txt
# Multiple conditions (OR)
awk '$1 == "error" || $1 == "warning" {print}' log.txt
# Range (between patterns)
awk '/START/,/END/' file.txt
Aggregation
Counting
# Count lines
awk 'END {print NR}' file.txt
# Count matching lines
awk '/ERROR/ {count++} END {print count}' app.log
# Count by field value
awk '{count[$1]++} END {for (k in count) print k, count[k]}' access.log
# Count unique values
awk '!seen[$1]++ {count++} END {print count}' file.txt
Summing
# Sum a column
awk '{sum += $3} END {print sum}' data.txt
# Sum with formatting
awk '{sum += $3} END {printf "Total: %.2f\n", sum}' data.txt
# Sum by category
awk '{sum[$1] += $3} END {for (k in sum) print k, sum[k]}' data.txt
Statistics
# Average
awk '{sum += $3; count++} END {print sum/count}' data.txt
# Min and max
awk 'NR==1 {min=max=$3} $3<min {min=$3} $3>max {max=$3} END {print "min:", min, "max:", max}' data.txt
# Full stats
awk '{
sum += $3
sumsq += $3^2
count++
if (NR==1 || $3<min) min=$3
if (NR==1 || $3>max) max=$3
}
END {
avg = sum/count
var = (sumsq - sum^2/count) / count
std = sqrt(var)
print "Count:", count
print "Sum:", sum
print "Avg:", avg
print "Min:", min
print "Max:", max
print "StdDev:", std
}' data.txt
Text Transformation
Field Modification
# Replace field value
awk '{$3 = "REDACTED"; print}' data.txt
# Multiply field
awk '{$3 = $3 * 1.1; print}' salaries.txt
# Add new field
awk '{$(NF+1) = $2 * $3; print}' data.txt
# Conditional modification
awk '{if ($3 > 100) $3 = "HIGH"; print}' data.txt
String Functions
# Uppercase
awk '{print toupper($1)}' file.txt
# Lowercase
awk '{print tolower($0)}' file.txt
# Length
awk '{print $1, length($1)}' file.txt
# Substring
awk '{print substr($1, 1, 3)}' file.txt
# Split into array
awk '{split($1, arr, "-"); print arr[1], arr[2]}' file.txt
# Global substitution
awk '{gsub(/old/, "new"); print}' file.txt
# Substitution in field
awk '{gsub(/\./, "-", $2); print}' file.txt
Formatting
# Printf formatting
awk '{printf "%-20s %10.2f\n", $1, $3}' data.txt
# Padded output
awk '{printf "%05d %s\n", NR, $0}' file.txt
# Tabular output
awk 'BEGIN {printf "%-15s %-10s %s\n", "NAME", "COUNT", "STATUS"}
{printf "%-15s %-10d %s\n", $1, $2, $3}' data.txt
# Header and separator
awk 'BEGIN {
print "Name Count Status"
print "-----------------------------------"
}
{printf "%-15s %-10d %s\n", $1, $2, $3}' data.txt
Infrastructure Examples
Log Analysis
# Top IPs in access log
awk '{print $1}' access.log | sort | uniq -c | sort -rn | head -20
# With awk only (faster for large files)
awk '{ip[$1]++} END {for (i in ip) print ip[i], i}' access.log | sort -rn | head -20
# HTTP status codes distribution
awk '{status[$9]++} END {for (s in status) print s, status[s]}' access.log | sort
# Requests per hour
awk '{split($4, t, ":"); hour=t[2]; count[hour]++} END {for (h in count) print h, count[h]}' access.log | sort -n
# Failed SSH attempts by IP
awk '/Failed password/ {print $(NF-3)}' /var/log/auth.log | sort | uniq -c | sort -rn
# Average response time
awk '{sum += $NF; count++} END {print sum/count "ms"}' access.log
Process Analysis
# Top memory consumers
ps aux | awk 'NR>1 {print $4, $11}' | sort -rn | head -10
# Total memory by user
ps aux | awk 'NR>1 {mem[$1] += $4} END {for (u in mem) print u, mem[u]"%"}' | sort -k2 -rn
# Processes per user
ps aux | awk 'NR>1 {count[$1]++} END {for (u in count) print u, count[u]}' | sort -k2 -rn
Disk Analysis
# Filesystems over 80% full
df -h | awk 'NR>1 && int($5) > 80 {print $6, $5}'
# Disk usage by mount point
df -h | awk 'NR>1 {print $6, $3}' | column -t
# Total disk used/available
df | awk 'NR>1 {used+=$3; avail+=$4} END {print "Used:", used/1024/1024 "GB", "Avail:", avail/1024/1024 "GB"}'
Network Analysis
# Connections by state
ss -tan | awk 'NR>1 {state[$1]++} END {for (s in state) print s, state[s]}'
# Connections per port
ss -tan | awk 'NR>1 {split($4, a, ":"); port=a[length(a)]; count[port]++} END {for (p in count) print p, count[p]}' | sort -k2 -rn
# IPs connected to port 443
ss -tan | awk '$4 ~ /:443$/ {split($5, a, ":"); print a[1]}' | sort -u
Configuration Parsing
# Extract key=value pairs
awk -F= '/^[^#]/ && NF==2 {print $1, $2}' config.ini
# Parse INI sections
awk '/^\[/ {section=$0} /^[^#\[]/ && NF {print section, $0}' config.ini
# Extract specific values
awk -F= '$1=="hostname" {print $2}' config.ini
# YAML-like parsing (simple)
awk '/^[a-z]/ {key=$1} /^ / {print key, $0}' config.yaml
Multi-File Processing
# NR is the global line counter; FNR resets for each file
# NR==FNR is true only while reading the first file
awk 'FNR==1 {print "==>", FILENAME, "<=="} {print}' file1.txt file2.txt
Associative Arrays
Grouping and Lookup
# Group by field
awk '{groups[$1] = groups[$1] " " $2} END {for (g in groups) print g ":" groups[g]}' data.txt
# Lookup table
awk 'NR==FNR {lookup[$1]=$2; next} {$2=lookup[$2]; print}' lookup.txt data.txt
# Deduplication
awk '!seen[$0]++' file.txt
# Unique by column
awk '!seen[$1]++' file.txt
Multi-dimensional Arrays
# Cross-tabulation
awk '{count[$1,$2]++} END {for (k in count) {split(k, a, SUBSEP); print a[1], a[2], count[k]}}' data.txt
# Pivot table style
awk '{
rows[$1] = 1
cols[$2] = 1
data[$1,$2] += $3
}
END {
for (r in rows) {
printf "%s", r
for (c in cols) printf "\t%s", data[r,c]+0
print ""
}
}' data.txt
Control Structures
# if/else
awk '{if ($3 > 100) print $1, "high"; else print $1, "low"}' data.txt
# for loop over fields
awk '{for (i = 1; i <= NF; i++) print i, $i}' file.txt
# while loop
awk '{i = 1; while (i <= NF) {print $i; i++}}' file.txt
Quick Reference
| Task | Command |
|---|---|
| Print column | `awk '{print $1}'` |
| Print last column | `awk '{print $NF}'` |
| Custom delimiter | `awk -F: '{print $1}'` |
| Filter by pattern | `awk '/regex/'` |
| Filter by column value | `awk '$3 > 100'` |
| Sum column | `awk '{sum += $3} END {print sum}'` |
| Count lines | `awk 'END {print NR}'` |
| Count by group | `awk '{count[$1]++} END {for (k in count) print k, count[k]}'` |
| Skip header | `awk 'NR > 1'` |
| Line range | `awk 'NR >= 10 && NR <= 20'` |
| Deduplicate | `awk '!seen[$0]++'` |
| Replace text | `awk '{gsub(/old/, "new"); print}'` |
| Join files | `awk 'NR==FNR {lookup[$1]=$2; next} {print $0, lookup[$1]}' f1 f2` |
| Formatted output | `awk '{printf "%-20s %10.2f\n", $1, $3}'` |
Related
- sed — Stream editing (simpler substitutions)
- jq Mastery — For JSON data instead of text