AWK for SIEM Mastery
awk uses POSIX ERE (extended regular expression) patterns. Foundation learning:
Overview
AWK excels at column-based data manipulation - perfect for SIEM logs, CSV exports, and structured data. It’s your go-to for "show me column 3 where column 2 equals X".
Core Syntax
awk 'pattern { action }' file
awk -F',' 'pattern { action }' file # Custom delimiter
awk -v var=value 'pattern { action }' # Pass variables
Built-in Variables
| Variable | Description | Example |
|---|---|---|
| `$0` | Entire line | `awk '{print $0}'` |
| `$1`, `$2`, ... | Field 1, 2, etc. | `awk '{print $1}'` |
| `NF` | Number of fields | `awk '{print $NF}'` |
| `NR` | Record (line) number | `awk 'NR > 1'` |
| `FS` | Field separator | `awk -F','` or `awk -v FS=','` |
| `OFS` | Output field separator | `awk -v OFS=','` |
| `RS` | Record separator | `awk -v RS='\n\n'` |
| `ORS` | Output record separator | `awk -v ORS=';'` |
| `FILENAME` | Current filename | `awk '{print FILENAME}'` |
Column Operations
Select Columns
# Print columns 1 and 3
awk '{print $1, $3}' file
# Print last column
awk '{print $NF}' file
# Print second-to-last column
awk '{print $(NF-1)}' file
# Print all except first column
awk '{$1=""; print $0}' file
# Print columns 3 through end
awk '{for(i=3;i<=NF;i++) printf "%s ", $i; print ""}' file
Reorder Columns
# Swap columns: 1,2,3 → 3,1,2
awk '{print $3, $1, $2}' file
# With custom output separator
awk -v OFS=',' '{print $3, $1, $2}' file
Filter by Column Value
# Where column 2 equals "ERROR"
awk '$2 == "ERROR"' file
# Where column 3 > 100
awk '$3 > 100' file
# Where column 1 matches pattern
awk '$1 ~ /192\.168/' file
# Where column 1 does NOT match
awk '$1 !~ /192\.168/' file
# Multiple conditions
awk '$2 == "ERROR" && $3 > 100' file
awk '$2 == "ERROR" || $2 == "WARN"' file
SIEM Log Analysis
Parse Firewall Logs
# Extract source IP, dest IP, port from iptables
awk '/SRC=/ {
for(i=1;i<=NF;i++) {
if($i ~ /^SRC=/) src=substr($i,5)
if($i ~ /^DST=/) dst=substr($i,5)
if($i ~ /^DPT=/) dpt=substr($i,5)
}
print src, dst, dpt
}' /var/log/firewall.log
Aggregate Connection Counts
# Count connections per source IP
awk '{count[$1]++} END {for(ip in count) print count[ip], ip}' connections.log | sort -rn | head -20
# Count by source-destination pair
awk '{pair=$1"->"$2; count[pair]++} END {for(p in count) print count[p], p}' flows.log | sort -rn
QRadar AQL Output Processing
# Sum bytes by source IP
awk -F',' 'NR>1 {bytes[$1]+=$3} END {for(ip in bytes) print ip, bytes[ip]}' qradar_export.csv
# Average magnitude per category
awk -F',' 'NR>1 {sum[$2]+=$4; count[$2]++} END {for(cat in sum) print cat, sum[cat]/count[cat]}' events.csv
ISE Session Analysis
# Parse netapi ISE sessions - count by auth method
netapi ise mnt sessions | awk 'NR>2 {count[$3]++} END {for(m in count) print m, count[m]}'
# Extract sessions older than 1 hour (simplistic; assumes $5 is session age in seconds)
netapi ise mnt sessions | awk 'NR>2 && $5 > 3600 {print $1, $2, $5}'
Windows Event Analysis
# Count failed logins (4625) by username
awk '/EventID.*4625/ {
match($0, /TargetUserName="([^"]+)"/, arr)
if(arr[1]) users[arr[1]]++
} END {
for(u in users) print users[u], u
}' security.log | sort -rn | head -20
Data Transformation
CSV/TSV Conversion
# CSV to TSV
awk -F',' -v OFS='\t' '{$1=$1; print}' file.csv > file.tsv
# TSV to CSV
awk -F'\t' -v OFS=',' '{$1=$1; print}' file.tsv > file.csv
# Add header
awk 'BEGIN{print "IP,Count,Bytes"} {print}' data.csv
JSON-like Output
# Convert columns to JSON objects
awk -F',' '{print "{\"ip\":\""$1"\",\"port\":"$2",\"bytes\":"$3"}"}' data.csv
# Pretty JSON array
awk -F',' 'BEGIN{print "["} NR>1{if(NR>2)print ","; print " {\"src\":\""$1"\",\"dst\":\""$2"\"}"}END{print "]"}' data.csv
Timestamp Processing
# Extract hour from timestamp and count
awk -F'[: ]' '{hour=$2; count[hour]++} END {for(h in count) print h":00", count[h]}' access.log | sort
# Filter by time range
awk -F',' '$1 >= "2026-02-12 08:00" && $1 <= "2026-02-12 17:00"' events.csv
Statistical Operations
Basic Statistics
# Sum column 3
awk '{sum+=$3} END {print "Total:", sum}' file
# Average column 3
awk '{sum+=$3; count++} END {print "Average:", sum/count}' file
# Min and Max
awk 'NR==1{min=max=$3} {if($3<min)min=$3; if($3>max)max=$3} END {print "Min:", min, "Max:", max}' file
# Full stats
awk '{
sum+=$3; sumsq+=$3*$3; count++
if(NR==1 || $3<min) min=$3
if(NR==1 || $3>max) max=$3
} END {
avg=sum/count
std=sqrt(sumsq/count - avg*avg)
print "Count:", count
print "Sum:", sum
print "Avg:", avg
print "Min:", min
print "Max:", max
print "Std:", std
}' file
Top N Analysis
# Top 10 IPs by request count
awk '{count[$1]++} END {for(ip in count) print count[ip], ip}' access.log | sort -rn | head -10
# Top 10 IPs by bytes transferred
awk '{bytes[$1]+=$10} END {for(ip in bytes) print bytes[ip], ip}' access.log | sort -rn | head -10
Multi-File Operations
Process Multiple Files
# Print filename with each line
awk '{print FILENAME, $0}' *.log
# Combine and deduplicate
awk '!seen[$0]++' file1 file2 file3
# Different action per file
awk 'FNR==1{print "=== " FILENAME " ==="} {print}' *.log
Join Files
# Simple join on first column
awk 'NR==FNR{a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1 file2
Pattern Matching
Regular Expressions
# Lines containing pattern
awk '/ERROR/' file
# Lines NOT containing pattern
awk '!/DEBUG/' file
# Field matches pattern
awk '$2 ~ /^192\.168/' file
# Case-insensitive (GNU awk)
awk 'tolower($0) ~ /error/' file
# Extract with regex
awk 'match($0, /SRC=([0-9.]+)/, arr) {print arr[1]}' file
Range Patterns
# Between START and END (inclusive)
awk '/START/,/END/' file
# Line numbers 10-20
awk 'NR>=10 && NR<=20' file
# After header (skip first line)
awk 'NR>1' file
Output Formatting
Printf for Precise Output
# Formatted table
awk 'BEGIN{printf "%-20s %10s %10s\n", "IP", "Requests", "Bytes"}
{printf "%-20s %10d %10d\n", $1, $2, $3}' data.txt
# Right-aligned numbers
awk '{printf "%15s %8d %12.2f\n", $1, $2, $3}' file
# Zero-padded
awk '{printf "%s %05d\n", $1, $2}' file
Generate Reports
# Summary report
awk 'BEGIN{
print "================================"
print " SECURITY REPORT "
print "================================"
}
{
total++
if($2=="ERROR") errors++
if($2=="WARN") warnings++
}
END{
print "Total Events:", total
print "Errors:", errors
print "Warnings:", warnings
print "Error Rate:", (errors/total)*100 "%"
print "================================"
}' events.log
SIEM Quick Reference Card
┌──────────────────────────────────────────────────────────────────────────────┐
│                          AWK SIEM QUICK REFERENCE                            │
├──────────────────────────────────────────────────────────────────────────────┤
│ COLUMN SELECTION                                                             │
│ Print col 1,3:    awk '{print $1, $3}'                                       │
│ Last column:      awk '{print $NF}'                                          │
│ All but first:    awk '{$1=""; print}'                                       │
├──────────────────────────────────────────────────────────────────────────────┤
│ FILTERING                                                                    │
│ Column equals:    awk '$2 == "ERROR"'                                        │
│ Column matches:   awk '$1 ~ /192\.168/'                                      │
│ Column > value:   awk '$3 > 1000'                                            │
│ Skip header:      awk 'NR > 1'                                               │
│ Line range:       awk 'NR >= 10 && NR <= 20'                                 │
├──────────────────────────────────────────────────────────────────────────────┤
│ AGGREGATION                                                                  │
│ Count by field:   awk '{c[$1]++} END {for(k in c) print c[k], k}'            │
│ Sum by field:     awk '{s[$1]+=$2} END {for(k in s) print k, s[k]}'          │
│ Total sum:        awk '{s+=$2} END {print s}'                                │
│ Average:          awk '{s+=$2; n++} END {print s/n}'                         │
├──────────────────────────────────────────────────────────────────────────────┤
│ TRANSFORMATION                                                               │
│ Custom delimiter: awk -F',' '{print $1}'                                     │
│ Output delimiter: awk -v OFS=',' '{$1=$1; print}'                            │
│ Add column:       awk '{print $0, "NEW"}'                                    │
│ Modify column:    awk '{$2=toupper($2); print}'                              │
├──────────────────────────────────────────────────────────────────────────────┤
│ FORMATTING                                                                   │
│ Table format:     awk '{printf "%-20s %10d\n", $1, $2}'                      │
│ CSV output:       awk -v OFS=',' '{$1=$1; print}'                            │
│ JSON object:      awk '{print "{\"ip\":\""$1"\"}"}'                          │
├──────────────────────────────────────────────────────────────────────────────┤
│ COMMON PATTERNS                                                              │
│ Top 10 by count:  awk '{c[$1]++} END{for(k in c)print c[k],k}' \             │
│                     | sort -rn | head -10                                    │
│ Unique values:    awk '!seen[$1]++ {print $1}'                               │
│ Between patterns: awk '/START/,/END/'                                        │
└──────────────────────────────────────────────────────────────────────────────┘
Real-World Workflows
Daily Security Report
#!/bin/bash
# Generate daily security metrics from the system auth log.
# Requires GNU awk (gawk) for the 3-argument match() used below.
LOG=/var/log/auth.log

echo "=== Security Report $(date +%Y-%m-%d) ==="

printf '\n--- Top 10 Source IPs ---\n'
# Field 1 of auth.log is the month name, so pull the IP that follows
# "from" instead of printing $1.
awk 'match($0, /from ([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/, ip) {print ip[1]}' \
  "$LOG" | sort | uniq -c | sort -rn | head -10

printf '\n--- Failed Auth by User ---\n'
# Handles both "Failed password for USER from ..." and
# "Failed password for invalid user USER from ..." (the old
# /user ([^ ]+)/ pattern missed valid-user failures entirely).
awk '/Failed password/ {
  match($0, /for (invalid user )?([^ ]+) from/, m)
  if (m[2]) users[m[2]]++
} END {
  for (u in users) printf "%5d %s\n", users[u], u
}' "$LOG" | sort -rn | head -10

printf '\n--- Hourly Distribution ---\n'
# NOTE(review): assumes traditional syslog timestamps ("Feb 12 08:15:01"),
# where field 3 is HH:MM:SS — confirm on hosts using ISO 8601 timestamps.
awk '{
  split($3, t, ":")
  hour[t[1]]++
} END {
  for (h = 0; h < 24; h++) printf "%02d:00 %d\n", h, hour[sprintf("%02d", h)]
}' "$LOG"
ISE Export Analysis
# Analyze ISE DataConnect export (requires gawk for PROCINFO["sorted_in"]).
# The previous version piped the whole report through "sort -rn", which
# interleaved the "=== ... ===" section headers with the counts; instead,
# iterate each count array in descending numeric order inside awk.
awk -F',' '
NR==1 {next}        # skip the CSV header row
{
  method[$5]++      # NOTE(review): assumes col 5 = auth method — verify export schema
  result[$8]++      # col 8 = auth result
  nas[$6]++         # col 6 = network access device (NAD)
}
END {
  PROCINFO["sorted_in"] = "@val_num_desc"   # traverse arrays by count, descending
  print "=== Authentication Methods ==="
  for (m in method) print method[m], m
  print "\n=== Results ==="
  for (r in result) print result[r], r
  print "\n=== Top NADs ==="
  for (n in nas) print nas[n], n
}' ise_export.csv