Alert Routing & Silences

Routing Tree

# Root route: every alert enters the tree here.
# Uses `matchers` (Alertmanager 0.22+) in place of the deprecated
# `match` / `match_re` keys.
route:
  receiver: 'default'
  # Alerts sharing these label values are batched into one notification.
  group_by: ['alertname', 'cluster']
  # How long to wait before sending the first notification for a new group.
  group_wait: 30s
  # How long to wait before notifying about new alerts added to a group.
  group_interval: 5m
  # How long to wait before re-sending a notification that is still firing.
  repeat_interval: 4h

  # Child routes are checked top to bottom. An alert stops at the first
  # matching route unless that route sets `continue: true`.
  routes:
    # Critical alerts to PagerDuty
    - matchers:
        - 'severity="critical"'
      receiver: 'pagerduty'
      # Explicit default: a critical alert is not evaluated against the
      # routes below, so a critical database alert only reaches PagerDuty.
      continue: false

    # Database alerts to DBA team
    - matchers:
        - 'alertname=~"^(MySQL|Postgres).*"'
      receiver: 'dba-team'

    # Everything else to Slack
    # A route without matchers matches every alert that reaches it, so the
    # root 'default' receiver is effectively never used with this tree.
    - receiver: 'slack'

Silences

# Create silence by alertname
amtool silence add alertname=HighMemory \
    --duration=2h \
    --author="evan" \
    --comment="Scheduled maintenance"

# Create silence with regex
# (=~ is a regex match; every matcher listed must match for the silence to apply.)
# A comment is required by default, so the command is rejected without one.
amtool silence add alertname=~"High.*" instance=server1 \
    --duration=1h \
    --comment="Silencing High* alerts on server1"

# List active silences
amtool silence query

# Expire a silence
amtool silence expire <silence-id>

# Expire all silences
# (-q prints only the silence IDs; the command errors out if there are none.)
amtool silence expire $(amtool silence query -q)

Inhibition Rules

# Suppress warnings when critical alert fires
# Uses `source_matchers` / `target_matchers` (Alertmanager 0.22+) in place of
# the deprecated `source_match` / `target_match` keys.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    # The warning is muted only when it has the same values for these labels
    # as the firing critical alert.
    equal: ['alertname', 'instance']

TODO: Add examples for time-based muting (`time_intervals` with `mute_time_intervals`) and more complex routing trees.