Commit 78271e6c authored by ale

Add playbook annotations ("runbook") to all alerts

Better to do it this way than automagically creating playbook links
in the email templates (as that affects only emails, while adding the
annotation explicitly works across all notification systems).
parent 9ab98958
Pipeline #6656 failed with stages
in 12 minutes and 16 seconds
......@@ -11,8 +11,8 @@ alertmanager_smtp_hello: "localhost"
alertmanager_smtp_auth_username: ""
alertmanager_smtp_auth_password: ""
# Define if you have a playbook website
#alertmanager_playbook_url: ""
# Point at something that actually exists.
alert_playbook_url: "https://playbooks.{{ domain }}"
# Custom blackbox probes.
prometheus_custom_blackbox_probes: {}
---
# Configure Prometheus components.
#
# Since Prometheus configurations and templates are heavy with Go template
# syntax, which uses the same escapes as Ansible, we override the Ansible
# template variable delimiters to '[[' and ']]', so that they do not conflict
# with the Go syntax.
- name: Create /etc/prometheus and subdirs
file:
......@@ -14,17 +19,21 @@
- "/etc/prometheus/console_libraries"
# Render every rule file matched by the globs below into /etc/prometheus/rules/.
# A task may invoke exactly one module. The delimiter overrides
# (variable_start_string / variable_end_string) are options of 'template',
# not of 'copy', so 'template' is the module used here.
- name: Install Prometheus rules
  template:
    src: "{{ item }}"
    dest: "/etc/prometheus/rules/"
    # Ansible variables in the rule files are written between '[[' and ']]'.
    variable_start_string: "[["
    variable_end_string: "]]"
  with_fileglob:
    - files/rules/*.conf.yml
    - templates/rules/*.conf.yml
  notify: "reload prometheus"
# Render each file under templates/alertmanager_templates/ into the
# Alertmanager template directory, with '[[' and ']]' as the Ansible
# variable delimiters.
- name: Install alertmanager templates
  with_fileglob:
    - templates/alertmanager_templates/*
  template:
    dest: "/etc/prometheus/alertmanager_templates/"
    src: "{{ item }}"
    variable_end_string: "]]"
    variable_start_string: "[["
  notify: "reload prometheus"
......
{% if alertmanager_playbook_url is defined %}
{% raw %}{{ define "playbook_url.html" }}{% endraw %}
<a href="{{ alertmanager_playbook_url }}/{% raw %}{{.Labels.alertname}}{% endraw %}.md">Playbook</a><br />
{% raw %}{{ end }}{% endraw %}
{% else %}
{% raw %}
{{ define "playbook_url.html" }}{{ end }}
{% endraw %}
{% endif %}
{% raw %}
{{ define "email.float.html" }}
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
......@@ -341,7 +330,6 @@ a {
{{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
{{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }}
{{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
{{ template "playbook_url.html" . }}
<a href="{{ .GeneratorURL }}">Source</a><br />
</td>
</tr>
......@@ -394,4 +382,3 @@ a {
</body>
</html>
{{ end }}
{% endraw %}
......@@ -35,3 +35,4 @@ groups:
annotations:
summary: 'ACME automation completely broken'
description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.'
runbook: '[[ alert_playbook_url ]]/ACMEBroken.md'
......@@ -17,3 +17,4 @@ groups:
annotations:
summary: '{{ $labels.dataset }} backup failure on {{ $labels.host }}'
description: 'Dataset {{ $labels.dataset }} has failed its backups on {{ $labels.host }} for two days.'
runbook: '[[ alert_playbook_url ]]/BackupFailed.md'
......@@ -63,6 +63,7 @@ groups:
annotations:
summary: 'Job {{ $labels.job }} is down globally'
description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
runbook: '[[ alert_playbook_url ]]/JobDown.md'
- alert: ProbeFailure
expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
......@@ -74,6 +75,7 @@ groups:
summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
for target {{ $labels.target }} (success ratio {{ $value }}).'
runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
- alert: ProbeFailure
expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
......@@ -85,3 +87,4 @@ groups:
summary: 'Probe {{ $labels.probe }} is failing globally'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
globally (success ratio {{ $value }}).'
runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
......@@ -22,4 +22,5 @@ groups:
annotations:
summary: 'Host {{$labels.host}} is thrashing'
description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.'
runbook: '[[ alert_playbook_url ]]/HostThrashing.md'
......@@ -10,3 +10,4 @@ groups:
annotations:
summary: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is almost full'
description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} will be full in less than 4 hours'
runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours.md'
......@@ -9,6 +9,8 @@ groups:
annotations:
description: Slave replication (IO or SQL) has been down for more than 2 minutes on {{ $labels.job }}@{{ $labels.host }}.
summary: Slave replication is not running for {{ $labels.job }}
runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken.md'
- alert: MySQLReplicationBehind
expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
for: 2m
......@@ -17,6 +19,8 @@ groups:
annotations:
description: The mysql slave replication has fallen behind and is not recovering on {{ $labels.job }}@{{ $labels.host }}.
summary: MySQL slave replication is lagging for {{ $labels.job }}
runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind.md'
- alert: MySQLInnoDBLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
labels:
......@@ -24,3 +28,5 @@ groups:
annotations:
description: The innodb logs are waiting for disk at a rate of {{$value}} / second on {{ $labels.job }}@{{ $labels.host }}
summary: MySQL innodb log writes stalling for {{ $labels.job }}
runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits.md'
......@@ -9,6 +9,8 @@ groups:
annotations:
description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.'
summary: 'Conntrack table on {{ $labels.instance }} is almost full'
runbook: '[[ alert_playbook_url ]]/ConntrackTableFull.md'
- alert: NetworkErrors
expr: instance:node_network_errs_total:rate5m > 1
for: 15m
......@@ -17,6 +19,8 @@ groups:
annotations:
summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.'
runbook: '[[ alert_playbook_url ]]/NetworkErrors.md'
- alert: NetworkDrops
expr: instance:node_network_drop_total:rate5m > 1
for: 15m
......@@ -25,6 +29,8 @@ groups:
annotations:
summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.'
runbook: '[[ alert_playbook_url ]]/NetworkDrops.md'
- alert: HostUnreachable
expr: probe_success{job="blackbox_ping"} < 1
for: 5m
......@@ -33,3 +39,4 @@ groups:
annotations:
summary: 'Host {{ $labels.instance }} is unreachable'
description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).'
runbook: '[[ alert_playbook_url ]]/HostUnreachable.md'
......@@ -12,6 +12,7 @@ groups:
annotations:
summary: 'High HTTP error ratio for {{$labels.vhost}} globally'
description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.'
runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
- alert: HTTPErrorRatioHigh
expr: (host:nginx_http_requests_errs:ratio > 0.2 and host:nginx_http_requests_total:rate5m > 0.1)
......@@ -23,3 +24,5 @@ groups:
annotations:
summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}'
description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.'
runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
groups:
- name: roles/prometheus/files/rules/alerts_prometheus.conf
rules:
# Critical: 'up' for a job matching prometheus.* has been 0 for 10 minutes.
- alert: PrometheusUnreachable
  expr: up{job=~"prometheus.*"} == 0
  for: 10m
  labels:
    pager: pagerduty
    service: prometheus
    severity: critical
  annotations:
    title: '{{$labels.job}} is unreachable'
    description: >-
      {{$labels.job}} at {{$labels.instance}} could not be scraped for
      over 10 minutes.
# Critical: process_start_time_seconds changed more than 3 times within a
# 30-minute window, and that condition has held for 30 minutes.
- alert: PrometheusManyRestarts
  expr: changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
  for: 30m
  labels:
    pager: pagerduty
    service: prometheus
    severity: critical
  annotations:
    title: '{{$labels.job}} is restarting frequently'
    description: >-
      {{$labels.job}} at {{$labels.instance}} has restarted more than
      3 times in the last 30 minutes. It might be crashlooping.
# Warning: the 0.9 quantile of prometheus_evaluator_duration_seconds has been
# above 60 for 10 minutes.
- alert: PrometheusRuleEvaluationSlow
  expr: prometheus_evaluator_duration_seconds{job=~"prometheus.*",quantile="0.9"} > 60
  for: 10m
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} is evaluating rules too slowly'
    description: >-
      {{$labels.job}} at {{$labels.instance}} has a 90th percentile
      latency of {{$value}}s completing rule evaluation cycles.
# Warning: the 15-minute average of the last checkpoint duration has exceeded
# max_chunks_to_persist / 5000 for 5 minutes.
- alert: PrometheusCheckpointingSlow
  expr: >-
    avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
    > prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000
  for: 5m
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} is checkpointing too slowly'
    description: >-
      {{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average
      for each checkpoint.
# Warning: indexing queue length as a percentage of its capacity has been
# above 10 for 30 minutes.
- alert: PrometheusIndexingBacklog
  expr: >-
    prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"}
    / prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
    * 100 > 10
  for: 30m
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} is backlogging on the indexing queue'
    description: >-
      {{$labels.job}} at {{$labels.instance}} is backlogging on the
      indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}%
      full.
# Pages when the 5-minute ingestion rate of samples has been exactly 0 for
# 5 minutes.
- alert: PrometheusNotIngestingSamples
  expr: rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m]) == 0
  for: 5m
  labels:
    service: prometheus
    scope: host
    # 'severity' must appear only once in this mapping; 'page' is the
    # effective value.
    severity: page
  annotations:
    description: >-
      {{$labels.job}} at {{$labels.instance}} has not ingested any samples
      in the last 10 minutes.
    title: '{{$labels.job}} is not ingesting samples'
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples.md'
# Warning: the 10-minute rate of persist errors is above 0. There is no
# 'for' clause on this rule.
- alert: PrometheusPersistErrors
  expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m]) > 0
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} has persist errors'
    description: >-
      {{$labels.job}} at {{$labels.instance}} has encountered {{$value}}
      persist errors per second in the last 10 minutes.
# Pages when the notifications queue length has been above 0 for 10 minutes.
- alert: PrometheusNotificationsBacklog
  expr: prometheus_notifications_queue_length{job=~"prometheus.*"} > 0
  for: 10m
  labels:
    pager: pagerduty
    service: prometheus
    scope: host
    # 'severity' must appear only once in this mapping; 'page' is the
    # effective value.
    severity: page
  annotations:
    description: >-
      {{$labels.job}} at {{$labels.instance}} is backlogging on the
      notifications queue. The queue has not been empty for 10 minutes. Current
      queue length: {{$value}}.
    title: '{{$labels.job}} is backlogging on the notifications queue'
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog.md'
# Warning: the 0.9 quantile of the target interval length, for intervals not
# matching ".*m.*", has been above 2 * 60 for 10 minutes.
- alert: PrometheusScrapingSlowly
  expr: >-
    prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"}
    > 2 * 60
  for: 10m
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} is scraping targets slowly'
    description: >-
      {{$labels.job}} at {{$labels.instance}} has a 90th percentile
      latency of {{$value}}s for scraping targets in the {{$labels.interval}} target
      pool.
# Warning: the storage inconsistencies counter is above 0. There is no 'for'
# clause on this rule.
- alert: PrometheusStorageInconsistent
  expr: prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} has an inconsistent storage'
    description: >-
      {{$labels.job}} at {{$labels.instance}} has detected a storage
      inconsistency. A server restart is needed to initiate recovery.
# Warning tier: urgency score above 0.8 and its 30-minute linear prediction
# 24 hours ahead (3600 * 24 seconds) above 1, sustained for 30 minutes.
- alert: PrometheusPersistencePressureTooHigh
  expr: >-
    prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
    > 0.8 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
    3600 * 24) > 1
  for: 30m
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} can not keep up persisting'
    description: >-
      {{$labels.job}} at {{$labels.instance}} is approaching critical
      persistence pressure. Throttled ingestion expected within the next 24h.
# Paging tier: urgency score above 0.85 and its 30-minute linear prediction
# 2 hours ahead (3600 * 2 seconds) above 1, sustained for 30 minutes.
- alert: PrometheusPersistencePressureTooHigh
  expr: >-
    prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
    > 0.85 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
    3600 * 2) > 1
  for: 30m
  labels:
    service: prometheus
    scope: host
    # 'severity' must appear only once in this mapping; 'page' is the
    # effective value.
    severity: page
  annotations:
    description: >-
      {{$labels.job}} at {{$labels.instance}} is approaching critical
      persistence pressure. Throttled ingestion expected within the next 2h.
    title: '{{$labels.job}} can not keep up persisting'
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh.md'
# Warning: in-memory series count divided by the 5-minute rate of
# maintenance_in_memory series ops, divided by 3600, is above 24 while
# prometheus_local_storage_rushed_mode == 1, sustained for 1 hour.
- alert: PrometheusSeriesMaintenanceStalled
  expr: >-
    prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance)
    rate(prometheus_local_storage_series_ops_total{job=~"prometheus.*",type="maintenance_in_memory"}[5m])
    / 3600 > 24 and ON(job, instance) prometheus_local_storage_rushed_mode == 1
  for: 1h
  labels:
    service: prometheus
    scope: host
    severity: warn
  annotations:
    title: '{{$labels.job}} is maintaining memory time series too slowly'
    description: >-
      {{$labels.job}} at {{$labels.instance}} is maintaining memory
      time series so slowly that it will take {{$value | printf `%.0f`}}h to complete
      a full cycle. This will lead to persistence falling behind.
# Pages when prometheus_config_last_reload_successful has been 0 for
# 30 minutes.
- alert: PrometheusInvalidConfigFile
  expr: prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0
  for: 30m
  labels:
    pager: pagerduty
    service: prometheus
    scope: host
    # 'severity' must appear only once in this mapping; 'page' is the
    # effective value.
    severity: page
  annotations:
    description: >-
      The configuration file for {{$labels.job}} at {{$labels.instance}}
      is invalid and was therefore not reloaded.
    title: '{{$labels.job}} has an invalid config'
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile.md'
- alert: PrometheusOutOfOrderSamplesDiscarded
expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
> 0
for: 1h
labels:
service: prometheus
scope: host
severity: warn
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has discarded {{$value}}
......
......@@ -10,6 +10,7 @@ groups:
annotations:
description: 'Availability too low for service {{ $labels.float_service }}'
summary: 'Availability too low for service {{ $labels.float_service }}'
runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow.md'
- alert: ServiceDegraded
expr: float_service:ok_by_host == 0
......
......@@ -2,11 +2,13 @@ groups:
- name: roles/prometheus/files/rules/alerts_ssl_probes.conf
rules:
# Pages when target:probe_ssl_cert_expiry:days has been below 10 for 3 hours.
# 'expr' and 'for' are each given exactly once; repeating a key inside one
# rule mapping is a duplicate-key error.
- alert: SSLCertificateAboutToExpire
  expr: target:probe_ssl_cert_expiry:days < 10
  for: 3h
  labels:
    scope: global
    severity: page
  annotations:
    summary: 'SSL certificate about to expire for {{ $labels.job }}@{{ $labels.target }}'
    description: >-
      The "{{ $labels.job }}" prober reports that {{ $labels.target }} is
      serving a SSL certificate that will expire in {{ $value }} days.
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire.md'
......@@ -2,10 +2,12 @@ groups:
- name: roles/prometheus/files/rules/alerts_syslog.conf
rules:
# Pages when rsyslog_queue_discarded:rate5m for the rsyslog-collector job has
# been above 10 for 15 minutes.
# Every key is given exactly once; repeating 'expr', 'for', 'description' or
# 'summary' inside one mapping is a duplicate-key error.
- alert: SyslogDroppingLogs
  expr: rsyslog_queue_discarded:rate5m{job="rsyslog-collector"} > 10
  for: 15m
  labels:
    scope: global
    severity: page
  annotations:
    description: 'Syslog collector is dropping logs on {{ $labels.host }}'
    summary: 'Syslog collector is dropping logs'
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs.md'
......@@ -19,12 +19,14 @@ groups:
summary: "RAID device {{ $labels.md_device }} on {{ $labels.host }} is unhealthy"
description: "The RAID device {{ $labels.md_device }} on {{ $labels.host }} is reporting a degraded state, which means that probably one or more of the disks in the array have failed."
# Pages when a hwmon temperature reading has been above 95% of its critical
# threshold (temp / temp_crit > 0.95) for 2 hours.
# The rule has a single name, matching the runbook page it links to; an extra
# '- alert:' item with no 'expr' would be an incomplete rule.
- alert: PhysicalComponentTooHot
  expr: node_hwmon_temp_celsius / node_hwmon_temp_crit_celsius > 0.95
  for: 2h
  labels:
    scope: host
    severity: page
  annotations:
    summary: "A physical component is running too hot on {{ $labels.host }}"
    description: >-
      A sensor is reporting that a physical component
      ({{ $labels.sensor }}/{{ $labels.chip }}) on {{ $labels.host }} has been
      running very close to the critical level ({{ $value }}) for the last 2 hours.
    # Link to the playbook page for this alert, rooted at the
    # alert_playbook_url variable.
    runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot.md'
......@@ -9,11 +9,15 @@ groups:
annotations:
summary: '{{ $labels.name }} has failed on {{ $labels.host }}'
description: 'The systemd unit {{ $labels.name }} has failed on {{ $labels.host }}.'
# Pages when instance:systemd_unit_restarts:delta10m has been above 12 for
# 30 minutes.
- alert: SystemdUnitCrashLooping
  expr: instance:systemd_unit_restarts:delta10m > 12
  for: 30m
  labels:
    severity: page
    scope: host
  annotations:
    runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping.md'
    summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}'
    description: >-
      Systemd unit {{ $labels.unit }} is being restarted repeatedly.
      Likely a configuration problem.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment