Regex Session 08: Python re Module
Python’s re module brings regex into automation. Move beyond one-liners to building reusable parsers, validators, and data extractors.
Why Python for Regex?
| Advantage | Description |
|---|---|
| Named groups | `(?P<name>...)` makes patterns self-documenting |
| Multiple matches | `findall()` / `finditer()` extract every match in one call |
| Compiled patterns | `re.compile()` builds a pattern once for efficient reuse |
| Flag combinations | `re.IGNORECASE`, `re.MULTILINE`, `re.DOTALL`, `re.VERBOSE` can be combined |
| Substitution functions | Pass a function to `re.sub()` for dynamic replacements |
Core Functions
import re
# Match at START of string only
re.match(pattern, string)
# Search ANYWHERE in string (use this most often)
re.search(pattern, string)
# Find ALL non-overlapping matches
re.findall(pattern, string)
# Iterator of match objects
re.finditer(pattern, string)
# Replace matches
re.sub(pattern, replacement, string)
# Split by pattern
re.split(pattern, string)
# Compile for reuse
compiled = re.compile(pattern)
compiled.search(string)
Test Setup
#!/usr/bin/env python3
"""Regex practice module."""
import re
# Sample data
LOG_LINES = """
2026-03-15T10:30:45 [INFO] User admin logged in from 192.168.1.100
2026-03-15T10:31:02 [WARN] Disk space low: 15% remaining
2026-03-15T10:32:00 [ERROR] Connection failed to db-01.example.com:5432
2026-03-15T10:33:00 [DEBUG] Query took 145ms for /api/v1/users
""".strip()
CONFIG_DATA = """
server_name=web-prod-01
server_ip=192.168.1.100
server_port=8080
admin_email=admin@example.com
db_host=db-prod-01.internal:5432
""".strip()
NETWORK_INFO = """
IP: 192.168.1.100 MAC: AA:BB:CC:DD:EE:FF VLAN: 100
IP: 10.50.1.20 MAC: 14:F6:D8:7B:31:80 VLAN: 10
IP: 172.16.0.1 MAC: 98:BB:1E:1F:A7:13 VLAN: 999
""".strip()
Lesson 1: match() vs search()
text = "Error: Connection failed"
# match() - only at START
result = re.match(r'Error', text)
print(result.group() if result else "No match")
# Output: Error
# match() fails if pattern not at start
result = re.match(r'Connection', text)
print(result.group() if result else "No match")
# Output: No match
# search() - finds ANYWHERE
result = re.search(r'Connection', text)
print(result.group() if result else "No match")
# Output: Connection
Rule: Use search() unless you specifically need start-of-string matching.
Lesson 2: Capture Groups
# Numbered groups
pattern = r'(\d{4})-(\d{2})-(\d{2})'
match = re.search(pattern, "Date: 2026-03-15")
print(match.group(0)) # Entire match: 2026-03-15
print(match.group(1)) # First group: 2026
print(match.group(2)) # Second group: 03
print(match.group(3)) # Third group: 15
print(match.groups()) # Tuple: ('2026', '03', '15')
Named Groups (Python Power Feature)
# Named groups with (?P<name>...)
pattern = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})'
match = re.search(pattern, "Date: 2026-03-15")
print(match.group('year')) # 2026
print(match.group('month')) # 03
print(match.group('day')) # 15
print(match.groupdict()) # {'year': '2026', 'month': '03', 'day': '15'}
Named groups = self-documenting code.
Lesson 3: findall() and finditer()
text = """
IP: 192.168.1.100
Gateway: 10.50.1.1
DNS: 10.50.1.90
"""
# findall() - returns list of strings
pattern = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
ips = re.findall(pattern, text)
print(ips)
# Output: ['192.168.1.100', '10.50.1.1', '10.50.1.90']
# findall() with groups - returns list of tuples
pattern = r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})'
octets = re.findall(pattern, text)
print(octets)
# Output: [('192', '168', '1', '100'), ('10', '50', '1', '1'), ...]
finditer() for Match Objects
# finditer() - returns iterator of match objects (more info than findall)
pattern = r'(?P<label>\w+):\s+(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
for match in re.finditer(pattern, text):
print(f"{match.group('label')}: {match.group('ip')}")
print(f" Position: {match.start()}-{match.end()}")
Output:
IP: 192.168.1.100
  Position: 1-18
Gateway: 10.50.1.1
  Position: 19-37
DNS: 10.50.1.90
  Position: 38-52
Lesson 4: Compiled Patterns
# Compile for reuse (better performance when used multiple times)
IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
MAC_PATTERN = re.compile(r'([A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}')
# Use compiled patterns
ips = IP_PATTERN.findall(NETWORK_INFO)
macs = MAC_PATTERN.findall(NETWORK_INFO)
# Compile with flags
EMAIL_PATTERN = re.compile(
r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}',
re.IGNORECASE
)
Common Flags
| Flag | Effect |
|---|---|
| `re.IGNORECASE` (`re.I`) | Case-insensitive matching |
| `re.MULTILINE` (`re.M`) | `^` and `$` match at line boundaries |
| `re.DOTALL` (`re.S`) | `.` also matches newline |
| `re.VERBOSE` (`re.X`) | Allow whitespace and comments |
Lesson 5: Substitution with re.sub()
# Basic replacement
text = "Error: Connection failed"
result = re.sub(r'Error', 'WARNING', text)
print(result)
# Output: WARNING: Connection failed
# Replace all occurrences
text = "foo bar foo baz foo"
result = re.sub(r'foo', 'XXX', text)
print(result)
# Output: XXX bar XXX baz XXX
# Limit replacements
result = re.sub(r'foo', 'XXX', text, count=1)
print(result)
# Output: XXX bar foo baz foo
Backreferences in Replacement
# Use \1, \2 or \g<name> for backreferences
text = "2026-03-15"
result = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\2/\3/\1', text)
print(result)
# Output: 03/15/2026
# Named backreferences
result = re.sub(
r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',
r'\g<month>/\g<day>/\g<year>',
text
)
Function as Replacement (Advanced)
def mask_ip(match):
    """Replace the last two octets of a matched IPv4 address with 'X'."""
    first, second, _, _ = match.group().split('.')
    return f"{first}.{second}.X.X"
text = "Server at 192.168.1.100, backup at 10.50.1.20"
result = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', mask_ip, text)
print(result)
# Output: Server at 192.168.X.X, backup at 10.50.X.X
Lesson 6: Practical Parsers
Log Parser
def parse_log_line(line: str) -> dict | None:
"""Parse structured log line into components."""
pattern = re.compile(
r'(?P<timestamp>\d{4}-\d{2}-\d{2}T[\d:]+)\s+'
r'\[(?P<level>\w+)\]\s+'
r'(?P<message>.*)'
)
match = pattern.match(line)
if match:
return match.groupdict()
return None
# Usage
for line in LOG_LINES.split('\n'):
parsed = parse_log_line(line)
if parsed:
print(f"[{parsed['level']}] {parsed['message']}")
Config File Parser
def parse_config(config_text: str) -> dict:
    """Turn key=value lines into a {key: value} dictionary."""
    line_re = re.compile(r'^(?P<key>\w+)=(?P<value>.+)$', re.MULTILINE)
    parsed = {}
    for entry in line_re.finditer(config_text):
        parsed[entry.group('key')] = entry.group('value')
    return parsed
config = parse_config(CONFIG_DATA)
print(config)
# Output: {'server_name': 'web-prod-01', 'server_ip': '192.168.1.100', ...}
Network Info Extractor
def extract_network_info(text: str) -> list[dict]:
    """Pull one {'ip', 'mac', 'vlan'} dict per 'IP: ... MAC: ... VLAN: ...' line."""
    row_re = re.compile(
        r'IP:\s+(?P<ip>[\d.]+)\s+'
        r'MAC:\s+(?P<mac>[A-F0-9:]+)\s+'
        r'VLAN:\s+(?P<vlan>\d+)',
        re.IGNORECASE
    )
    devices = []
    for row in row_re.finditer(text):
        devices.append(row.groupdict())
    return devices
devices = extract_network_info(NETWORK_INFO)
for device in devices:
print(f"IP: {device['ip']}, VLAN: {device['vlan']}")
Lesson 7: Validation Functions
def is_valid_ip(ip: str) -> bool:
    """Validate IPv4 dotted-quad format and 0-255 octet range.

    Uses re.fullmatch instead of re.match with '$': the '$' anchor
    tolerates a trailing newline, so '1.2.3.4\\n' used to validate.
    """
    match = re.fullmatch(r'(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})', ip)
    if not match:
        return False
    # Format is right; now check each octet's numeric range.
    return all(0 <= int(octet) <= 255 for octet in match.groups())
def is_valid_email(email: str) -> bool:
    """Validate email format (basic local@domain.tld shape).

    Uses re.fullmatch instead of '^...$' with re.match, which would
    accept a trailing newline.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    return bool(re.fullmatch(pattern, email))
def is_valid_mac(mac: str) -> bool:
    """Validate a colon-separated MAC address (six hex-pair groups).

    Uses re.fullmatch instead of '^...$' with re.match, which would
    accept a trailing newline.
    """
    return bool(re.fullmatch(r'([A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2}', mac))
# Usage
print(is_valid_ip("192.168.1.100")) # True
print(is_valid_ip("999.999.999.999")) # False (out of range)
print(is_valid_email("user@example.com")) # True
print(is_valid_mac("AA:BB:CC:DD:EE:FF")) # True
Lesson 8: Verbose Patterns
Use re.VERBOSE for complex patterns:
# Hard to read
pattern = r'^([A-Za-z0-9._%+-]+)@([A-Za-z0-9.-]+)\.([A-Za-z]{2,})$'
# Self-documenting with VERBOSE
EMAIL_PATTERN = re.compile(r'''
^ # Start of string
(?P<local> # Local part (before @)
[A-Za-z0-9._%+-]+ # Allowed characters
)
@ # Literal @
(?P<domain> # Domain part
[A-Za-z0-9.-]+ # Domain name
)
\. # Literal dot
(?P<tld> # Top-level domain
[A-Za-z]{2,} # 2+ letters
)
$ # End of string
''', re.VERBOSE)
match = EMAIL_PATTERN.match("user@example.com")
if match:
print(match.groupdict())
# Output: {'local': 'user', 'domain': 'example', 'tld': 'com'}
Quick Reference
import re
# Most common operations
re.search(r'pattern', string) # Find first match
re.findall(r'pattern', string) # All matches as list
re.sub(r'old', 'new', string) # Replace all
re.split(r'delimiter', string) # Split by pattern
# Compiled pattern
pattern = re.compile(r'pattern', re.IGNORECASE)
pattern.findall(string)
# Named groups
r'(?P<name>pattern)' # Define
match.group('name') # Access
match.groupdict() # All as dict
# Flags
re.IGNORECASE # Case insensitive
re.MULTILINE # ^ and $ match line bounds
re.DOTALL # . matches newline
re.VERBOSE # Allow whitespace/comments
Exercises to Complete
- [ ] Write a function to extract all URLs from text
- [ ] Parse ISE RADIUS log to extract username, MAC, and result
- [ ] Create a config validator that checks required keys exist
- [ ] Write a log analyzer that counts errors by type
- [ ] Build a data sanitizer that masks sensitive fields
Self-Check
Solutions
# 1. Extract URLs
def extract_urls(text: str) -> list[str]:
    """Return every http(s):// URL found in text, in order of appearance."""
    url_re = re.compile(r'https?://[^\s<>"{}|\\^`\[\]]+')
    return [m.group() for m in url_re.finditer(text)]
# 2. Parse ISE RADIUS log
def parse_radius_log(line: str) -> dict | None:
pattern = re.compile(
r'User-Name=(?P<user>\S+).*'
r'Calling-Station-Id=(?P<mac>[A-Fa-f0-9:-]+).*'
r'(?P<result>PASS|FAIL)',
re.IGNORECASE
)
match = pattern.search(line)
return match.groupdict() if match else None
# 3. Config validator
def validate_config(config: dict, required: list[str]) -> list[str]:
    """Return the required keys that are absent from config, in order."""
    missing = []
    for key in required:
        if key not in config:
            missing.append(key)
    return missing
# 4. Error counter
def count_errors(log_text: str) -> dict[str, int]:
    """Count [ERROR] log entries keyed by the first word of the message.

    Uses collections.Counter for a single O(n) pass; the previous
    version called list.count() once per distinct error (O(n^2)).
    """
    from collections import Counter
    error_re = re.compile(r'\[ERROR\]\s+(\w+)')
    # dict(...) keeps the original plain-dict return type.
    return dict(Counter(error_re.findall(log_text)))
# 5. Data sanitizer
def sanitize(text: str, fields: list[str]) -> str:
    """Mask the value after each 'field=' or 'field:' with [REDACTED].

    Field names are passed through re.escape so a name containing regex
    metacharacters (e.g. 'api.key') matches literally instead of
    over-matching or breaking the pattern.
    """
    for field in fields:
        pattern = rf'({re.escape(field)}[=:]\s*)(\S+)'
        text = re.sub(pattern, r'\1[REDACTED]', text, flags=re.IGNORECASE)
    return text
Next Session
Session 09: Advanced Patterns - Complex extraction, multi-line, and real-world challenges.