Drill 03: Log Processing
Filter, transform, and aggregate log files.
Run This Drill
bash ~/atelier/_bibliotheca/domus-captures/docs/modules/ROOT/examples/etl-drills/03-log-processing.sh
Drill Script
#!/bin/bash
# ETL DRILL 03: LOG PROCESSING
# Paste this entire script into your terminal
# Topics: grep filtering, sed transforms, awk aggregation
echo "=================================================================="
echo " ETL DRILL 03: LOG PROCESSING "
echo "=================================================================="
echo ""
# Create test log data
cat << 'EOF' > /tmp/app.log
2024-03-18 10:30:45 INFO [main] Application started on port 8080
2024-03-18 10:30:46 INFO [main] Connected to database at 10.50.1.50:5432
2024-03-18 10:31:02 ERROR [worker-1] Connection refused to 10.50.1.20:443
2024-03-18 10:31:15 WARN [worker-2] High latency detected: 250ms
2024-03-18 10:31:30 INFO [worker-1] Retry successful
2024-03-18 10:32:00 ERROR [worker-3] Authentication failed for user admin
2024-03-18 10:32:15 WARN [worker-1] Memory usage at 85%
2024-03-18 10:32:45 INFO [main] Health check passed
2024-03-18 10:33:00 ERROR [worker-2] Timeout waiting for response from 10.50.1.21
2024-03-18 10:33:15 INFO [worker-1] Request completed in 45ms
EOF
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.1: GREP FILTERING"
echo "Extract relevant lines"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Find all errors"
echo "grep 'ERROR' /tmp/app.log"
grep 'ERROR' /tmp/app.log
echo ""
echo "Command: Find errors and warnings"
echo "grep -E 'ERROR|WARN' /tmp/app.log"
grep -E 'ERROR|WARN' /tmp/app.log
echo ""
echo "Command: Extract IPs from logs"
echo "grep -oE '[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+' /tmp/app.log | sort -u"
grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' /tmp/app.log | sort -u
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.2: SED TRANSFORMATIONS"
echo "Pattern substitution and extraction"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Redact IP addresses"
echo "sed 's/[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+\\.[0-9]\\+/X.X.X.X/g' /tmp/app.log | head -3"
sed 's/[0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+/X.X.X.X/g' /tmp/app.log | head -3
echo ""
echo "Command: Extract timestamp and level only"
echo "sed -n 's/^\\([0-9-]\\+ [0-9:]\\+\\) \\([A-Z]\\+\\).*/\\1 \\2/p' /tmp/app.log"
sed -n 's/^\([0-9-]\+ [0-9:]\+\) \([A-Z]\+\).*/\1 \2/p' /tmp/app.log
echo ""
echo "Command: Convert to JSON lines"
echo "sed 's/^\\([0-9-]\\+ [0-9:]\\+\\) \\([A-Z]\\+\\) \\[\\([^]]*\\)\\] \\(.*\\)/{\"time\":\"\\1\",\"level\":\"\\2\",\"thread\":\"\\3\",\"msg\":\"\\4\"}/' /tmp/app.log | head -3"
sed 's/^\([0-9-]\+ [0-9:]\+\) \([A-Z]\+\) \[\([^]]*\)\] \(.*\)/{"time":"\1","level":"\2","thread":"\3","msg":"\4"}/' /tmp/app.log | head -3
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.3: AWK AGGREGATION"
echo "Count, group, calculate"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Count by log level"
echo "awk '{print \$3}' /tmp/app.log | sort | uniq -c | sort -rn"
awk '{print $3}' /tmp/app.log | sort | uniq -c | sort -rn
echo ""
echo "Command: Count by log level (pure awk)"
echo "awk '{count[\$3]++} END {for (level in count) print level, count[level]}' /tmp/app.log"
awk '{count[$3]++} END {for (level in count) print level, count[level]}' /tmp/app.log
echo ""
echo "Command: Count errors by thread"
echo "awk '/ERROR/ {gsub(/[\\[\\]]/, \"\", \$4); count[\$4]++} END {for (t in count) print t, count[t]}' /tmp/app.log"
awk '/ERROR/ {gsub(/[\[\]]/, "", $4); count[$4]++} END {for (t in count) print t, count[t]}' /tmp/app.log
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.4: EXTRACT METRICS"
echo "Parse numbers from logs"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Extract latency values"
echo "grep -oE '[0-9]+ms' /tmp/app.log"
grep -oE '[0-9]+ms' /tmp/app.log
echo ""
echo "Command: Calculate average latency"
echo "grep -oE '[0-9]+ms' /tmp/app.log | sed 's/ms//' | awk '{sum+=\$1; count++} END {print \"Average:\", sum/count, \"ms\"}'"
grep -oE '[0-9]+ms' /tmp/app.log | sed 's/ms//' | awk '{sum+=$1; count++} END {print "Average:", sum/count, "ms"}'
echo ""
echo "Command: Extract memory percentage"
echo "grep 'Memory usage' /tmp/app.log | grep -oE '[0-9]+%'"
grep 'Memory usage' /tmp/app.log | grep -oE '[0-9]+%'
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.5: TIME-BASED FILTERING"
echo "Filter by timestamp"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Logs after 10:32:00"
echo "awk '\$2 >= \"10:32:00\"' /tmp/app.log"
awk '$2 >= "10:32:00"' /tmp/app.log
echo ""
echo "Command: Logs between 10:31:00 and 10:32:30"
echo "awk '\$2 >= \"10:31:00\" && \$2 <= \"10:32:30\"' /tmp/app.log"
awk '$2 >= "10:31:00" && $2 <= "10:32:30"' /tmp/app.log
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "DRILL 3.6: COMPLETE LOG ANALYSIS PIPELINE"
echo "Multi-stage processing for report"
echo "------------------------------------------------------------------"
echo ""
echo "Command: Error summary report"
cat << 'SCRIPT'
#!/bin/bash
LOG="/tmp/app.log"
echo "=== Log Analysis Report ==="
echo "Time range: $(head -1 "$LOG" | awk '{print $1, $2}') to $(tail -1 "$LOG" | awk '{print $1, $2}')"
echo ""
echo "=== Count by Level ==="
awk '{count[$3]++} END {for (l in count) printf " %-6s %d\n", l, count[l]}' "$LOG"
echo ""
echo "=== Errors ==="
grep ERROR "$LOG" | awk '{$1=$2=$3=$4=""; print " -" $0}'
echo ""
echo "=== Unique IPs ==="
grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "$LOG" | sort -u | sed 's/^/ /'
SCRIPT
echo ""
echo "Executing..."
LOG="/tmp/app.log"
echo "=== Log Analysis Report ==="
echo "Time range: $(head -1 "$LOG" | awk '{print $1, $2}') to $(tail -1 "$LOG" | awk '{print $1, $2}')"
echo ""
echo "=== Count by Level ==="
awk '{count[$3]++} END {for (l in count) printf " %-6s %d\n", l, count[l]}' "$LOG"
echo ""
echo "=== Errors ==="
grep ERROR "$LOG" | awk '{$1=$2=$3=$4=""; print " -" $0}'
echo ""
echo "=== Unique IPs ==="
grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' "$LOG" | sort -u | sed 's/^/ /'
echo ""
# ---------------------------------------------------------------------------
echo "------------------------------------------------------------------"
echo "YOUR TURN - TRY THESE:"
echo "------------------------------------------------------------------"
echo ""
echo "1. Find logs from worker-1 only:"
echo " grep '\\[worker-1\\]' /tmp/app.log"
echo ""
echo "2. Convert to CSV:"
echo " awk '{print \$1\",\"\$2\",\"\$3}' /tmp/app.log"
echo ""
echo "3. Find the busiest minute:"
echo " awk '{print \$1, substr(\$2,1,5)}' /tmp/app.log | sort | uniq -c | sort -rn | head -1"
echo ""
echo "------------------------------------------------------------------"
echo "KEY TAKEAWAYS:"
echo "1. grep -E for extended regex, -o for match only"
echo "2. sed 's/pattern/replacement/' for transforms"
echo "3. awk '{count[\$N]++}' for aggregation"
echo "4. Combine: grep filter | sed transform | awk aggregate"
echo "5. sort | uniq -c for quick counting"
echo "------------------------------------------------------------------"