```yaml
# Alert for any instance that is unreachable for >5 minutes.
- alert: InstanceDown
  expr: up == 0
  for: 5m
  labels:
    severity: page
  annotations:
    summary: "Instance {{ $labels.instance }} down"
    description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

# Alert for any instance that has a median request latency >1s.
- alert: APIHighRequestLatency
  expr: api_http_request_latencies_second{quantile="0.5"} > 1
  for: 10m
  annotations:
    summary: "High request latency on {{ $labels.instance }}"
    description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
```
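For these rules to load, a Prometheus 2.x server expects them inside a named rule group, and `prometheus.yml` must reference both the rules file and an Alertmanager instance. A minimal sketch; the group name, file name, and Alertmanager address are assumptions, not part of the example above:

```yaml
# alert_rules.yml (file name assumed): wrap the rules above in a group.
groups:
- name: example            # group name is an assumption
  rules:
  # ... the two alert rules shown above ...
```

```yaml
# prometheus.yml: load the rules file and forward firing alerts.
rule_files:
- 'alert_rules.yml'

alerting:
  alertmanagers:
  - static_configs:
    - targets: ['localhost:9093']   # assumed Alertmanager address
```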
```yaml
global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alertmanager@example.org'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'password'
  # The auth token for Hipchat.
  hipchat_auth_token: '1234556789'
  # Alternative host for Hipchat.
  hipchat_api_url: 'https://hipchat.foobar.org/'

# The directory from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'
```
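The `smtp_*` globals above act as defaults for every email receiver, and a template defined in one of the `*.tmpl` files can be referenced by name from a receiver. A minimal sketch; the destination address and the template name `custom.subject` are hypothetical:

```yaml
receivers:
- name: 'team-X-mails'
  email_configs:
  - to: 'team-X+alerts@example.org'                 # assumed address; SMTP settings come from 'global:'
    headers:
      Subject: '{{ template "custom.subject" . }}'  # 'custom.subject' is a hypothetical template name
```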
```yaml
# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: ['...']
  group_by: ['alertname', 'cluster', 'service']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first
  # notification.
  group_wait: 30s

  # After the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has successfully been sent, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # The default receiver.
  receiver: team-X-mails

  # All the above attributes are inherited by all child routes and can be
  # overwritten on each.
```
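Because these attributes are inherited, a child route only needs to declare what it overrides. A hypothetical sketch (this sub-route is not part of the example config) that pages sooner and re-notifies more often for critical alerts:

```yaml
  # Hypothetical sub-route, nested under 'route:' above.
  routes:
  - match:
      severity: critical
    receiver: team-X-pager
    group_wait: 10s        # override the inherited 30s: page sooner
    repeat_interval: 1h    # override the inherited 3h: re-page more often
```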
The child route trees of the example config, nested under the root `route:` above:

```yaml
  # The child route trees.
  routes:
  # This route performs a regular expression match on alert labels to
  # catch alerts that are related to a list of services.
  - match_re:
      service: ^(foo1|foo2|baz)$
    receiver: team-X-mails
    # The service has a sub-route for critical alerts; any alerts
    # that do not match, i.e. severity != critical, fall back to the
    # parent node and are sent to 'team-X-mails'.
    routes:
    - match:
        severity: critical
      receiver: team-X-pager
  - match:
      service: files
    receiver: team-Y-mails

  # This route handles all alerts coming from a database service. If there's
  # no team to handle it, it defaults to the DB team.
  - match:
      service: database
    receiver: team-DB-pager
    # Also group alerts by affected database.
    group_by: [alertname, cluster, database]
    routes:
    - match:
        owner: team-X
      receiver: team-X-pager
      continue: true
    - match:
        owner: team-Y
      receiver: team-Y-pager
```
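The `continue: true` on the team-X sub-route matters because route evaluation normally stops at the first matching sibling; with `continue: true`, Alertmanager keeps checking the following siblings, so one alert can fan out to several receivers. A hypothetical illustration (labels and receiver names assumed):

```yaml
routes:
- match:
    severity: critical
  receiver: oncall-pager     # assumed receiver
  continue: true             # keep evaluating the next sibling route
- match:
    team: frontend
  receiver: frontend-mails   # assumed receiver
# An alert labeled {severity="critical", team="frontend"} is delivered to
# BOTH oncall-pager and frontend-mails.
```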
```yaml
# Inhibition rules allow muting a set of alerts while another alert is
# firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  # Apply inhibition only if alertname, cluster, and service all match
  # between the source and target alerts.
  equal: ['alertname', 'cluster', 'service']
```
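The routes above reference receivers (team-X-mails, team-X-pager, team-Y-mails, team-Y-pager, team-DB-pager) that must be defined in a `receivers:` section for the config to be complete. A minimal sketch; the addresses and PagerDuty service keys are placeholders, not part of the original example:

```yaml
receivers:
- name: 'team-X-mails'
  email_configs:
  - to: 'team-X+alerts@example.org'    # assumed address
- name: 'team-X-pager'
  pagerduty_configs:
  - service_key: '<team-X-key>'        # placeholder key
- name: 'team-Y-mails'
  email_configs:
  - to: 'team-Y+alerts@example.org'    # assumed address
- name: 'team-Y-pager'
  pagerduty_configs:
  - service_key: '<team-Y-key>'        # placeholder key
- name: 'team-DB-pager'
  pagerduty_configs:
  - service_key: '<team-DB-key>'       # placeholder key
```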