Regex Session 08: Python re Module

Python’s re module brings regex into automation. Move beyond one-liners to building reusable parsers, validators, and data extractors.

Why Python for Regex?

Advantage Description

Named groups

(?P<name>...) for readable extraction

Multiple matches

findall(), finditer() for all occurrences

Compiled patterns

re.compile() for reuse and performance

Flag combinations

re.IGNORECASE | re.MULTILINE

Substitution functions

Pass a function to re.sub() for dynamic replacement

Core Functions

import re

# Match at START of string only; gives a match object, or None if no match
re.match(pattern, string)

# Search ANYWHERE in string (use this most often); same return convention
re.search(pattern, string)

# Find ALL non-overlapping matches as a list of strings
# (a list of tuples when the pattern contains several groups)
re.findall(pattern, string)

# Iterator of match objects (each one exposes its groups and position)
re.finditer(pattern, string)

# Replace matches (every occurrence unless count= limits it)
re.sub(pattern, replacement, string)

# Split by pattern
re.split(pattern, string)

# Compile for reuse; the compiled object offers the same operations as methods
compiled = re.compile(pattern)
compiled.search(string)

Test Setup

#!/usr/bin/env python3
"""Regex practice module."""

import re

# Sample data shared by the lessons below. Each constant is a newline-joined
# block with no leading or trailing blank line.
LOG_LINES = "\n".join((
    "2026-03-15T10:30:45 [INFO] User admin logged in from 192.168.1.100",
    "2026-03-15T10:31:02 [WARN] Disk space low: 15% remaining",
    "2026-03-15T10:32:00 [ERROR] Connection failed to db-01.example.com:5432",
    "2026-03-15T10:33:00 [DEBUG] Query took 145ms for /api/v1/users",
))

# key=value configuration lines, one setting per line
CONFIG_DATA = "\n".join((
    "server_name=web-prod-01",
    "server_ip=192.168.1.100",
    "server_port=8080",
    "admin_email=admin@example.com",
    "db_host=db-prod-01.internal:5432",
))

# One device per line: IP, MAC and VLAN fields
NETWORK_INFO = "\n".join((
    "IP: 192.168.1.100 MAC: AA:BB:CC:DD:EE:FF VLAN: 100",
    "IP: 10.50.1.20 MAC: 14:F6:D8:7B:31:80 VLAN: 10",
    "IP: 172.16.0.1 MAC: 98:BB:1E:1F:A7:13 VLAN: 999",
))
# Lesson 1: match() vs search()
# NOTE(review): the lesson heading appears to be missing at this point in the
# source; the title above is inferred from the examples below - confirm.
text = "Error: Connection failed"

# match() - only at START
# Both match() and search() give None on failure, hence the conditional print.
result = re.match(r'Error', text)
print(result.group() if result else "No match")
# Output: Error

# match() fails if pattern not at start
result = re.match(r'Connection', text)
print(result.group() if result else "No match")
# Output: No match

# search() - finds ANYWHERE
result = re.search(r'Connection', text)
print(result.group() if result else "No match")
# Output: Connection

Rule: Use search() unless you specifically need start-of-string matching.

Lesson 2: Capture Groups

# Numbered groups: each (...) is numbered left to right, starting at 1
pattern = r'(\d{4})-(\d{2})-(\d{2})'
# search() would give None if no date were present; this input is known to
# contain one, so the result is used without a guard.
match = re.search(pattern, "Date: 2026-03-15")

print(match.group(0))  # Entire match: 2026-03-15
print(match.group(1))  # First group: 2026
print(match.group(2))  # Second group: 03
print(match.group(3))  # Third group: 15
print(match.groups())  # Tuple: ('2026', '03', '15')

Named Groups (Python Power Feature)

# Named groups with (?P<name>...)
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
# Unguarded on purpose: the sample input is known to contain a date.
match = re.search(pattern, "Date: 2026-03-15")

# Groups are looked up by the name given in the pattern; values are strings.
print(match.group('year'))   # 2026
print(match.group('month'))  # 03
print(match.group('day'))    # 15
print(match.groupdict())     # {'year': '2026', 'month': '03', 'day': '15'}

Named groups = self-documenting code.

Lesson 3: findall() and finditer()

# The literal starts with a newline (the line break right after the opening
# quotes), so the first label begins at offset 1, not 0.
text = """
IP: 192.168.1.100
Gateway: 10.50.1.1
DNS: 10.50.1.90
"""

# findall() - returns list of strings
pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
ips = re.findall(pattern, text)
print(ips)
# Output: ['192.168.1.100', '10.50.1.1', '10.50.1.90']

# findall() with groups - returns list of tuples
# (one tuple per match, holding the group contents instead of the whole match)
pattern = r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})'
octets = re.findall(pattern, text)
print(octets)
# Output: [('192', '168', '1', '100'), ('10', '50', '1', '1'),
#          ('10', '50', '1', '90')]

finditer() for Match Objects

# finditer() - returns iterator of match objects (more info than findall)
# label = the word before the colon, ip = the dotted quad after it
pattern = r'(?P<label>\w+):\s+(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'

for match in re.finditer(pattern, text):
    print(f"{match.group('label')}: {match.group('ip')}")
    # start()/end() are character offsets into text; end() is one past the
    # last matched character.
    print(f"  Position: {match.start()}-{match.end()}")

Output:

IP: 192.168.1.100
  Position: 1-18
Gateway: 10.50.1.1
  Position: 19-37
DNS: 10.50.1.90
  Position: 38-53

Lesson 4: Compiled Patterns

# Compile for reuse (better performance when used multiple times)
IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# The repeated "XX:" part uses a NON-capturing group (?:...). findall() returns
# group contents instead of the whole match when a pattern has capture groups,
# so a capturing ([A-Fa-f0-9]{2}:){5} would yield only the last repeated pair
# (e.g. 'EE:') rather than the full MAC address.
MAC_PATTERN = re.compile(r'(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}')

# Use compiled patterns
# NOTE: findall() returns group contents, not whole matches, whenever the
# compiled pattern contains capture groups (see Lesson 3).
ips = IP_PATTERN.findall(NETWORK_INFO)  # one dotted-quad string per sample line
macs = MAC_PATTERN.findall(NETWORK_INFO)

# Compile with flags
# IGNORECASE lets the lowercase-only character classes match uppercase as well.
EMAIL_PATTERN = re.compile(
    r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}',
    re.IGNORECASE
)

Common Flags

Flag Effect

re.IGNORECASE (re.I)

Case-insensitive matching

re.MULTILINE (re.M)

^ and $ match line boundaries

re.DOTALL (re.S)

. matches newline too

re.VERBOSE (re.X)

Allow whitespace and comments

Lesson 5: Substitution with re.sub()

# Basic replacement (sub() returns a new string; text itself is not modified)
text = "Error: Connection failed"
result = re.sub(r'Error', 'WARNING', text)
print(result)
# Output: WARNING: Connection failed

# Replace all occurrences (the default when no count is given)
text = "foo bar foo baz foo"
result = re.sub(r'foo', 'XXX', text)
print(result)
# Output: XXX bar XXX baz XXX

# Limit replacements: count=1 stops after the first match
result = re.sub(r'foo', 'XXX', text, count=1)
print(result)
# Output: XXX bar foo baz foo

Backreferences in Replacement

# Use \1, \2 or \g<name> for backreferences
# The replacement is a raw string so that \2 reaches re.sub() as a
# backreference instead of being read as a string escape by Python.
text = "2026-03-15"
result = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', text)
print(result)
# Output: 03/15/2026

# Named backreferences
# Same reordering as above, addressed by group name; result is again
# '03/15/2026' (it is not printed here).
result = re.sub(
    r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',
    r'\g<month>/\g<day>/\g<year>',
    text
)

Function as Replacement (Advanced)

def mask_ip(match):
    """Return the matched dotted address with its last two parts masked.

    Intended as the replacement callable for re.sub(): it receives the match
    object, keeps the first two dot-separated parts and substitutes "X" for
    the remaining two.
    """
    parts = match.group().split('.')
    return '.'.join([parts[0], parts[1], 'X', 'X'])

text = "Server at 192.168.1.100, backup at 10.50.1.20"
result = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', mask_ip, text)
print(result)
# Output: Server at 192.168.X.X, backup at 10.50.X.X

Lesson 6: Practical Parsers

Log Parser

def parse_log_line(line: str) -> dict | None:
    """Parse structured log line into components."""
    pattern = re.compile(
        r'(?P<timestamp>\d{4}-\d{2}-\d{2}T[\d:]+)\s+'
        r'\[(?P<level>\w+)\]\s+'
        r'(?P<message>.*)'
    )
    match = pattern.match(line)
    if match:
        return match.groupdict()
    return None

# Usage
for line in LOG_LINES.split('\n'):
    parsed = parse_log_line(line)
    if parsed:  # None (no match) means the line is not in the log format
        print(f"[{parsed['level']}] {parsed['message']}")
# Output:
# [INFO] User admin logged in from 192.168.1.100
# [WARN] Disk space low: 15% remaining
# [ERROR] Connection failed to db-01.example.com:5432
# [DEBUG] Query took 145ms for /api/v1/users

Config File Parser

def parse_config(config_text: str) -> dict:
    """Turn key=value lines into a dictionary.

    Only lines of the form "<word>=<non-empty value>" are picked up; anything
    else is ignored. If a key occurs more than once, the last value wins.
    """
    # With two groups, findall() yields (key, value) tuples, which dict()
    # consumes directly. MULTILINE makes ^ and $ apply to every line.
    pairs = re.findall(r'^(?P<key>\w+)=(?P<value>.+)$', config_text, re.MULTILINE)
    return dict(pairs)

config = parse_config(CONFIG_DATA)
print(config)
# Output: {'server_name': 'web-prod-01', 'server_ip': '192.168.1.100', ...}

Network Info Extractor

def extract_network_info(text: str) -> list[dict]:
    """Collect one {'ip', 'mac', 'vlan'} dict per "IP: .. MAC: .. VLAN: .." entry.

    Matching is case-insensitive and all captured values are strings.
    Returns an empty list when no entry is found.
    """
    entry_re = re.compile(
        r'IP:\s+(?P<ip>[\d.]+)\s+'
        r'MAC:\s+(?P<mac>[A-F0-9:]+)\s+'
        r'VLAN:\s+(?P<vlan>\d+)',
        re.IGNORECASE
    )
    records = []
    for hit in entry_re.finditer(text):
        records.append(hit.groupdict())
    return records

devices = extract_network_info(NETWORK_INFO)
for device in devices:
    print(f"IP: {device['ip']}, VLAN: {device['vlan']}")
# Output:
# IP: 192.168.1.100, VLAN: 100
# IP: 10.50.1.20, VLAN: 10
# IP: 172.16.0.1, VLAN: 999

Lesson 7: Validation Functions

def is_valid_ip(ip: str) -> bool:
    """Validate IPv4 address format and range.

    Accepts exactly four dot-separated groups of 1-3 ASCII digits, each with
    a value of 0-255. Leading zeros (e.g. "010") are accepted.

    Args:
        ip: Candidate address; the whole string must be the address.

    Returns:
        True if the string is a well-formed IPv4 address, otherwise False.
    """
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline, which would let "1.2.3.4\n" through.
    # [0-9] instead of \d: \d also matches non-ASCII digits, which int()
    # would silently convert.
    match = re.fullmatch(
        r'([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})', ip
    )
    if match is None:
        return False
    # The groups hold digits only, so they cannot be negative; check the top.
    return all(int(octet) <= 255 for octet in match.groups())

def is_valid_email(email: str) -> bool:
    """Validate email format (basic).

    Checks the shape local@domain.tld using ASCII letters, digits and a small
    set of punctuation; it does not verify that the address exists.

    Args:
        email: Candidate address; the whole string must be the address.

    Returns:
        True if the string has the expected shape, otherwise False.
    """
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline, which would let "user@example.com\n" through.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return re.fullmatch(pattern, email) is not None

def is_valid_mac(mac: str) -> bool:
    """Validate MAC address format.

    Accepts six colon-separated pairs of hex digits in either case, e.g.
    "AA:BB:CC:DD:EE:FF".

    Args:
        mac: Candidate address; the whole string must be the address.

    Returns:
        True if the string is a colon-separated MAC address, otherwise False.
    """
    # fullmatch() instead of match() with ^...$: "$" also matches just before
    # a trailing newline. The repeated part is grouped with (?:...) because
    # its contents are never read back.
    pattern = r'(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}'
    return re.fullmatch(pattern, mac) is not None

# Usage
print(is_valid_ip("192.168.1.100"))  # True
print(is_valid_ip("999.999.999.999"))  # False (out of range)
print(is_valid_email("user@example.com"))  # True
print(is_valid_mac("AA:BB:CC:DD:EE:FF"))  # True

Lesson 8: Verbose Patterns

Use re.VERBOSE for complex patterns:

# Hard to read
pattern = r'^([A-Za-z0-9._%+-]+)@([A-Za-z0-9.-]+)\.([A-Za-z]{2,})$'

# Self-documenting with VERBOSE
# Same expression as the one-liner above, but with named groups. Under
# re.VERBOSE the layout whitespace and the "# ..." notes inside the literal
# are not part of the pattern.
EMAIL_PATTERN = re.compile(r'''
    ^                       # Start of string
    (?P<local>              # Local part (before @)
        [A-Za-z0-9._%+-]+   # Allowed characters
    )
    @                       # Literal @
    (?P<domain>             # Domain part
        [A-Za-z0-9.-]+      # Domain name
    )
    \.                      # Literal dot
    (?P<tld>                # Top-level domain
        [A-Za-z]{2,}        # 2+ letters
    )
    $                       # End of string
''', re.VERBOSE)

# The domain class also allows dots, so 'tld' ends up holding only the letters
# after the LAST dot and 'domain' everything between "@" and that dot.
match = EMAIL_PATTERN.match("user@example.com")
if match:
    print(match.groupdict())
    # Output: {'local': 'user', 'domain': 'example', 'tld': 'com'}

Quick Reference

import re

# Most common operations
re.search(r'pattern', string)           # Find first match
re.findall(r'pattern', string)          # All matches as list
re.sub(r'old', 'new', string)           # Replace all
re.split(r'delimiter', string)          # Split by pattern

# Compiled pattern
pattern = re.compile(r'pattern', re.IGNORECASE)
pattern.findall(string)

# Named groups
r'(?P<name>pattern)'                    # Define
match.group('name')                     # Access
match.groupdict()                       # All as dict

# Flags
re.IGNORECASE  # Case insensitive
re.MULTILINE   # ^ and $ match line bounds
re.DOTALL      # . matches newline
re.VERBOSE     # Allow whitespace/comments

Exercises to Complete

  1. [ ] Write a function to extract all URLs from text

  2. [ ] Parse ISE RADIUS log to extract username, MAC, and result

  3. [ ] Create a config validator that checks required keys exist

  4. [ ] Write a log analyzer that counts errors by type

  5. [ ] Build a data sanitizer that masks sensitive fields

Self-Check

Solutions
# 1. Extract URLs
def extract_urls(text: str) -> list[str]:
    """Return every http:// or https:// URL found in text, in order.

    A URL runs until the first whitespace character or one of the delimiter
    characters excluded by the pattern (angle brackets, quotes, braces, ...).
    """
    url_re = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    return [hit.group(0) for hit in url_re.finditer(text)]

# 2. Parse ISE RADIUS log
def parse_radius_log(line: str) -> dict | None:
    pattern = re.compile(
        r'User-Name=(?P<user>\S+).*'
        r'Calling-Station-Id=(?P<mac>[A-Fa-f0-9:-]+).*'
        r'(?P<result>PASS|FAIL)',
        re.IGNORECASE
    )
    match = pattern.search(line)
    return match.groupdict() if match else None

# 3. Config validator
def validate_config(config: dict, required: list[str]) -> list[str]:
    """Return the required keys that config lacks, in the order given.

    An empty list means every required key is present.
    """
    missing = []
    for name in required:
        if name in config:
            continue
        missing.append(name)
    return missing

# 4. Error counter
def count_errors(log_text: str) -> dict[str, int]:
    """Count [ERROR] entries by the first word of their message.

    Args:
        log_text: Log text, possibly spanning many lines.

    Returns:
        A dict mapping the word that follows each "[ERROR]" tag to the number
        of times it occurs; empty when there are no error entries.
    """
    # Local import keeps this solution self-contained.
    from collections import Counter

    # Counter tallies in a single pass; calling list.count() once per distinct
    # word would rescan the whole list every time.
    first_words = re.findall(r'\[ERROR\]\s+(\w+)', log_text)
    return dict(Counter(first_words))

# 5. Data sanitizer
def sanitize(text: str, fields: list[str]) -> str:
    """Mask the values of sensitive fields in text.

    For each field name, every "<field>=<value>" or "<field>: <value>"
    occurrence (case-insensitive) has its value, up to the next whitespace,
    replaced by "[REDACTED]".

    Args:
        text: Text to clean.
        fields: Literal field names whose values must be hidden.

    Returns:
        The text with the matching values replaced.
    """
    for field in fields:
        # Field names are literal text, not regex fragments: escaping keeps
        # "." from matching any character and stops "(" or "[" from raising
        # re.error.
        pattern = rf'({re.escape(field)}[=:]\s*)(\S+)'
        text = re.sub(pattern, r'\1[REDACTED]', text, flags=re.IGNORECASE)
    return text

Next Session

Session 09: Advanced Patterns - Complex extraction, multi-line, and real-world challenges.