AWK for SIEM Mastery
awk uses POSIX ERE (extended regular expression) patterns. Foundation learning:
Overview
AWK excels at column-based data manipulation - perfect for SIEM logs, CSV exports, and structured data. It’s your go-to for "show me column 3 where column 2 equals X".
Core Syntax
awk 'pattern { action }' file
awk -F',' 'pattern { action }' file # Custom delimiter
awk -v var=value 'pattern { action }' # Pass variables
Built-in Variables
| Variable | Description | Example |
|---|---|---|
| `$0` | Entire line | `awk '{print $0}'` |
| `$1`, `$2`, ... | Field 1, 2, etc. | `awk '{print $1}'` |
| `NF` | Number of fields | `awk '{print $NF}'` |
| `NR` | Record (line) number | `awk 'NR > 1'` |
| `FS` | Field separator | `awk -F','` or `awk -v FS=','` |
| `OFS` | Output field separator | `awk -v OFS=','` |
| `RS` | Record separator | `awk -v RS='\n\n'` |
| `ORS` | Output record separator | `awk -v ORS=';'` |
| `FILENAME` | Current filename | `awk '{print FILENAME}'` |
Column Operations
Select Columns
# Print columns 1 and 3
awk '{print $1, $3}' file
# Print last column
awk '{print $NF}' file
# Print second-to-last column
awk '{print $(NF-1)}' file
# Print all except first column
awk '{$1=""; print $0}' file
# Print columns 3 through end
awk '{for(i=3;i<=NF;i++) printf "%s ", $i; print ""}' file
Reorder Columns
# Swap columns: 1,2,3 → 3,1,2
awk '{print $3, $1, $2}' file
# With custom output separator
awk -v OFS=',' '{print $3, $1, $2}' file
Filter by Column Value
# Where column 2 equals "ERROR"
awk '$2 == "ERROR"' file
# Where column 3 > 100
awk '$3 > 100' file
# Where column 1 matches pattern
awk '$1 ~ /192\.168/' file
# Where column 1 does NOT match
awk '$1 !~ /192\.168/' file
# Multiple conditions
awk '$2 == "ERROR" && $3 > 100' file
awk '$2 == "ERROR" || $2 == "WARN"' file
SIEM Log Analysis
Parse Firewall Logs
# Extract source IP, dest IP, port from iptables
awk '/SRC=/ {
for(i=1;i<=NF;i++) {
if($i ~ /^SRC=/) src=substr($i,5)
if($i ~ /^DST=/) dst=substr($i,5)
if($i ~ /^DPT=/) dpt=substr($i,5)
}
print src, dst, dpt
}' /var/log/firewall.log
Aggregate Connection Counts
# Count connections per source IP
awk '{count[$1]++} END {for(ip in count) print count[ip], ip}' connections.log | sort -rn | head -20
# Count by source-destination pair
awk '{pair=$1"->"$2; count[pair]++} END {for(p in count) print count[p], p}' flows.log | sort -rn
QRadar AQL Output Processing
# Sum bytes by source IP
awk -F',' 'NR>1 {bytes[$1]+=$3} END {for(ip in bytes) print ip, bytes[ip]}' qradar_export.csv
# Average magnitude per category
awk -F',' 'NR>1 {sum[$2]+=$4; count[$2]++} END {for(cat in sum) print cat, sum[cat]/count[cat]}' events.csv
ISE Session Analysis
# Parse netapi ISE sessions - count by auth method
netapi ise mnt sessions | awk 'NR>2 {count[$3]++} END {for(m in count) print m, count[m]}'
# Extract sessions older than 1 hour (simplistic; assumes $5 is session age in seconds)
netapi ise mnt sessions | awk 'NR>2 && $5 > 3600 {print $1, $2, $5}'
Windows Event Analysis
# Count failed logins (4625) by username
awk '/EventID.*4625/ {
match($0, /TargetUserName="([^"]+)"/, arr)
if(arr[1]) users[arr[1]]++
} END {
for(u in users) print users[u], u
}' security.log | sort -rn | head -20
Data Transformation
CSV/TSV Conversion
# CSV to TSV
awk -F',' -v OFS='\t' '{$1=$1; print}' file.csv > file.tsv
# TSV to CSV
awk -F'\t' -v OFS=',' '{$1=$1; print}' file.tsv > file.csv
# Add header
awk 'BEGIN{print "IP,Count,Bytes"} {print}' data.csv
JSON-like Output
# Convert columns to JSON objects
awk -F',' '{print "{\"ip\":\""$1"\",\"port\":"$2",\"bytes\":"$3"}"}' data.csv
# Pretty JSON array
awk -F',' 'BEGIN{print "["} NR>1{if(NR>2)print ","; print " {\"src\":\""$1"\",\"dst\":\""$2"\"}"}END{print "]"}' data.csv
Timestamp Processing
# Extract hour from timestamp and count
awk -F'[: ]' '{hour=$2; count[hour]++} END {for(h in count) print h":00", count[h]}' access.log | sort
# Filter by time range
awk -F',' '$1 >= "2026-02-12 08:00" && $1 <= "2026-02-12 17:00"' events.csv
Statistical Operations
Basic Statistics
# Sum column 3
awk '{sum+=$3} END {print "Total:", sum}' file
# Average column 3
awk '{sum+=$3; count++} END {print "Average:", sum/count}' file
# Min and Max
awk 'NR==1{min=max=$3} {if($3<min)min=$3; if($3>max)max=$3} END {print "Min:", min, "Max:", max}' file
# Full stats
awk '{
sum+=$3; sumsq+=$3*$3; count++
if(NR==1 || $3<min) min=$3
if(NR==1 || $3>max) max=$3
} END {
avg=sum/count
std=sqrt(sumsq/count - avg*avg)
print "Count:", count
print "Sum:", sum
print "Avg:", avg
print "Min:", min
print "Max:", max
print "Std:", std
}' file
Top N Analysis
# Top 10 IPs by request count
awk '{count[$1]++} END {for(ip in count) print count[ip], ip}' access.log | sort -rn | head -10
# Top 10 IPs by bytes transferred
awk '{bytes[$1]+=$10} END {for(ip in bytes) print bytes[ip], ip}' access.log | sort -rn | head -10
Multi-File Operations
Process Multiple Files
# Print filename with each line
awk '{print FILENAME, $0}' *.log
# Combine and deduplicate
awk '!seen[$0]++' file1 file2 file3
# Different action per file
awk 'FNR==1{print "=== " FILENAME " ==="} {print}' *.log
Join Files
# Simple join on first column
awk 'NR==FNR{a[$1]=$2; next} $1 in a {print $0, a[$1]}' file1 file2
Pattern Matching
Regular Expressions
# Lines containing pattern
awk '/ERROR/' file
# Lines NOT containing pattern
awk '!/DEBUG/' file
# Field matches pattern
awk '$2 ~ /^192\.168/' file
# Case-insensitive (GNU awk)
awk 'tolower($0) ~ /error/' file
# Extract with regex
awk 'match($0, /SRC=([0-9.]+)/, arr) {print arr[1]}' file
Range Patterns
# Between START and END (inclusive)
awk '/START/,/END/' file
# Line numbers 10-20
awk 'NR>=10 && NR<=20' file
# After header (skip first line)
awk 'NR>1' file
Output Formatting
Printf for Precise Output
# Formatted table
awk 'BEGIN{printf "%-20s %10s %10s\n", "IP", "Requests", "Bytes"}
{printf "%-20s %10d %10d\n", $1, $2, $3}' data.txt
# Right-aligned numbers
awk '{printf "%15s %8d %12.2f\n", $1, $2, $3}' file
# Zero-padded
awk '{printf "%s %05d\n", $1, $2}' file
Generate Reports
# Summary report
awk 'BEGIN{
print "================================"
print " SECURITY REPORT "
print "================================"
}
{
total++
if($2=="ERROR") errors++
if($2=="WARN") warnings++
}
END{
print "Total Events:", total
print "Errors:", errors
print "Warnings:", warnings
print "Error Rate:", (errors/total)*100 "%"
print "================================"
}' events.log
SIEM Quick Reference Card
┌──────────────────────────────────────────────────────────────────────────────┐
│                          AWK SIEM QUICK REFERENCE                            │
├──────────────────────────────────────────────────────────────────────────────┤
│ COLUMN SELECTION                                                             │
│ Print col 1,3:    awk '{print $1, $3}'                                       │
│ Last column:      awk '{print $NF}'                                          │
│ All but first:    awk '{$1=""; print}'                                       │
├──────────────────────────────────────────────────────────────────────────────┤
│ FILTERING                                                                    │
│ Column equals:    awk '$2 == "ERROR"'                                        │
│ Column matches:   awk '$1 ~ /192\.168/'                                      │
│ Column > value:   awk '$3 > 1000'                                            │
│ Skip header:      awk 'NR > 1'                                               │
│ Line range:       awk 'NR >= 10 && NR <= 20'                                 │
├──────────────────────────────────────────────────────────────────────────────┤
│ AGGREGATION                                                                  │
│ Count by field:   awk '{c[$1]++} END {for(k in c) print c[k], k}'            │
│ Sum by field:     awk '{s[$1]+=$2} END {for(k in s) print k, s[k]}'          │
│ Total sum:        awk '{s+=$2} END {print s}'                                │
│ Average:          awk '{s+=$2; n++} END {print s/n}'                         │
├──────────────────────────────────────────────────────────────────────────────┤
│ TRANSFORMATION                                                               │
│ Custom delimiter: awk -F',' '{print $1}'                                     │
│ Output delimiter: awk -v OFS=',' '{$1=$1; print}'                            │
│ Add column:       awk '{print $0, "NEW"}'                                    │
│ Modify column:    awk '{$2=toupper($2); print}'                              │
├──────────────────────────────────────────────────────────────────────────────┤
│ FORMATTING                                                                   │
│ Table format:     awk '{printf "%-20s %10d\n", $1, $2}'                      │
│ CSV output:       awk -v OFS=',' '{$1=$1; print}'                            │
│ JSON object:      awk '{print "{\"ip\":\""$1"\"}"}'                          │
├──────────────────────────────────────────────────────────────────────────────┤
│ COMMON PATTERNS                                                              │
│ Top 10 by count:  awk '{c[$1]++} END{for(k in c)print c[k],k}' \             │
│                     | sort -rn | head -10                                    │
│ Unique values:    awk '!seen[$1]++ {print $1}'                               │
│ Between patterns: awk '/START/,/END/'                                        │
└──────────────────────────────────────────────────────────────────────────────┘
Real-World Workflows
Daily Security Report
#!/bin/bash
# Generate daily security metrics from the system auth log.
# Requires GNU awk (gawk) for the 3-argument match() used below.
LOG=/var/log/auth.log

echo "=== Security Report $(date +%Y-%m-%d) ==="

printf '\n--- Top 10 Source IPs ---\n'
# Field 1 of auth.log is the month name, so pull the IP that follows
# "from" instead of printing $1.
awk 'match($0, /from ([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/, ip) {print ip[1]}' \
  "$LOG" | sort | uniq -c | sort -rn | head -10

printf '\n--- Failed Auth by User ---\n'
# Handles both "Failed password for USER from ..." and
# "Failed password for invalid user USER from ..." (the old
# /user ([^ ]+)/ pattern missed valid-user failures entirely).
awk '/Failed password/ {
  match($0, /for (invalid user )?([^ ]+) from/, m)
  if (m[2]) users[m[2]]++
} END {
  for (u in users) printf "%5d %s\n", users[u], u
}' "$LOG" | sort -rn | head -10

printf '\n--- Hourly Distribution ---\n'
# NOTE(review): assumes traditional syslog timestamps ("Feb 12 08:15:01"),
# where field 3 is HH:MM:SS — confirm on hosts using ISO 8601 timestamps.
awk '{
  split($3, t, ":")
  hour[t[1]]++
} END {
  for (h = 0; h < 24; h++) printf "%02d:00 %d\n", h, hour[sprintf("%02d", h)]
}' "$LOG"
ISE Export Analysis
# Analyze ISE DataConnect export (requires gawk for PROCINFO["sorted_in"]).
# The previous version piped the whole report through "sort -rn", which
# interleaved the "=== ... ===" section headers with the counts; instead,
# iterate each count array in descending numeric order inside awk.
awk -F',' '
NR==1 {next}        # skip the CSV header row
{
  method[$5]++      # NOTE(review): assumes col 5 = auth method — verify export schema
  result[$8]++      # col 8 = auth result
  nas[$6]++         # col 6 = network access device (NAD)
}
END {
  PROCINFO["sorted_in"] = "@val_num_desc"   # traverse arrays by count, descending
  print "=== Authentication Methods ==="
  for (m in method) print method[m], m
  print "\n=== Results ==="
  for (r in result) print result[r], r
  print "\n=== Top NADs ==="
  for (n in nas) print nas[n], n
}' ise_export.csv