Add playbook annotations ("runbook") to all alerts

Adding the annotation explicitly is better than automatically generating playbook links in the email templates: generated links affect only emails, while an explicit annotation works across all notification systems.

Add playbook annotations ("runbook") to all alerts
78271e6c · ale · 9ab98958 · 78271e6c · 78271e6c · 78271e6c
Commit 78271e6c authored 5 years ago by ale
--- a/roles/prometheus/defaults/main.yml
+++ b/roles/prometheus/defaults/main.yml
@@ -11,8 +11,8 @@ alertmanager_smtp_hello: "localhost"
 alertmanager_smtp_auth_username: ""
 alertmanager_smtp_auth_password: ""

-# Define if you have a playbook website
-#alertmanager_playbook_url: ""
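+# Base URL of the playbook site, used to build the alerts' "runbook" annotations (<url>/<AlertName>.md). Point it at something that actually exists.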
+alert_playbook_url: "https://playbooks.{{ domain }}"

 # Custom blackbox probes.
 prometheus_custom_blackbox_probes: {}
--- a/roles/prometheus/tasks/prometheus.yml
+++ b/roles/prometheus/tasks/prometheus.yml
 ---

 # Configure Prometheus components.
+#
+# Prometheus configurations and templates are heavy with Go template syntax,
+# which uses the same '{{' and '}}' delimiters as Ansible's Jinja2 templates.
+# To avoid conflicts, the template tasks below override the Ansible variable
+# delimiters to '[[' and ']]'.

 - name: Create /etc/prometheus and subdirs
  file:
@@ -14,17 +19,21 @@
    - "/etc/prometheus/console_libraries"

 - name: Install Prometheus rules
-  copy:
+  template:
    src: "{{ item }}"
    dest: "/etc/prometheus/rules/"
+    variable_start_string: "[["
+    variable_end_string: "]]"
  with_fileglob:
-    - files/rules/*.conf.yml
+    - templates/rules/*.conf.yml
  notify: "reload prometheus"

 - name: Install alertmanager templates
  template:
    src: "{{ item }}"
    dest: "/etc/prometheus/alertmanager_templates/"
+    variable_start_string: "[["
+    variable_end_string: "]]"
  with_fileglob:
    - templates/alertmanager_templates/*
  notify: "reload prometheus"

--- a/roles/prometheus/templates/alertmanager_templates/email.tmpl
+++ b/roles/prometheus/templates/alertmanager_templates/email.tmpl
-{% if alertmanager_playbook_url is defined %}
-{% raw %}{{ define "playbook_url.html" }}{% endraw %}
-<a href="{{ alertmanager_playbook_url }}/{% raw %}{{.Labels.alertname}}{% endraw %}.md">Playbook</a><br />
-{% raw %}{{ end }}{% endraw %}
-{% else %}
-{% raw %}
-{{ define "playbook_url.html" }}{{ end }}
-{% endraw %}
-{% endif %}
-
-{% raw %}
 {{ define "email.float.html" }}
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
@@ -341,7 +330,6 @@ a {
                    {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                    {{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }}
                    {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
-                    {{ template "playbook_url.html" . }}
                    <a href="{{ .GeneratorURL }}">Source</a><br />
                  </td>
                </tr>
@@ -394,4 +382,3 @@ a {
 </body>
 </html>
 {{ end }}
-{% endraw %}
--- a/roles/prometheus/files/rules/alerts_acme.conf.yml
+++ b/roles/prometheus/files/rules/alerts_acme.conf.yml
@@ -35,3 +35,4 @@ groups:
    annotations:
      summary: 'ACME automation completely broken'
      description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.'
+      runbook: '[[ alert_playbook_url ]]/ACMEBroken.md'
--- a/roles/prometheus/files/rules/alerts_backup.conf.yml
+++ b/roles/prometheus/files/rules/alerts_backup.conf.yml
@@ -17,3 +17,4 @@ groups:
    annotations:
      summary: '{{ $labels.dataset }} backup failure on {{ $labels.host }}'
      description: 'Dataset {{ $labels.dataset }} has failed its backups on {{ $labels.host }} for two days.'
+      runbook: '[[ alert_playbook_url ]]/BackupFailed.md'
--- a/roles/prometheus/files/rules/alerts_base.conf.yml
+++ b/roles/prometheus/files/rules/alerts_base.conf.yml
@@ -63,6 +63,7 @@ groups:
    annotations:
      summary: 'Job {{ $labels.job }} is down globally'
      description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/JobDown.md'

  - alert: ProbeFailure
    expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -74,6 +75,7 @@ groups:
      summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing'
      description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
        for target {{ $labels.target }} (success ratio {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'

  - alert: ProbeFailure
    expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -85,3 +87,4 @@ groups:
      summary: 'Probe {{ $labels.probe }} is failing globally'
      description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
        globally (success ratio {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
--- a/roles/prometheus/files/rules/alerts_cpu.conf.yml
+++ b/roles/prometheus/files/rules/alerts_cpu.conf.yml
@@ -22,4 +22,5 @@ groups:
    annotations:
      summary: 'Host {{$labels.host}} is thrashing'
      description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.'
+      runbook: '[[ alert_playbook_url ]]/HostThrashing.md'

--- a/roles/prometheus/files/rules/alerts_disk.conf.yml
+++ b/roles/prometheus/files/rules/alerts_disk.conf.yml
@@ -10,3 +10,4 @@ groups:
    annotations:
      summary: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is almost full'
      description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} will be full in less than 4 hours'
+      runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours.md'
--- a/roles/prometheus/files/rules/alerts_mysql.conf.yml
+++ b/roles/prometheus/files/rules/alerts_mysql.conf.yml
@@ -9,6 +9,8 @@ groups:
        annotations:
          description: Slave replication (IO or SQL) has been down for more than 2 minutes on {{ $labels.job }}@{{ $labels.host }}.
          summary: Slave replication is not running for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken.md'
+
      - alert: MySQLReplicationBehind
        expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
        for: 2m
@@ -17,6 +19,8 @@ groups:
        annotations:
          description: The mysql slave replication has fallen behind and is not recovering on {{ $labels.job }}@{{ $labels.host }}.
          summary: MySQL slave replication is lagging for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind.md'
+
      - alert: MySQLInnoDBLogWaits
        expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
        labels:
@@ -24,3 +28,5 @@ groups:
        annotations:
          description: The innodb logs are waiting for disk at a rate of {{$value}} / second on {{ $labels.job }}@{{ $labels.host }}
          summary: MySQL innodb log writes stalling for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits.md'
+
--- a/roles/prometheus/files/rules/alerts_net.conf.yml
+++ b/roles/prometheus/files/rules/alerts_net.conf.yml
@@ -9,6 +9,8 @@ groups:
    annotations:
      description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.'
      summary: 'Conntrack table on {{ $labels.instance }} is almost full'
+      runbook: '[[ alert_playbook_url ]]/ConntrackTableFull.md'
+
  - alert: NetworkErrors
    expr: instance:node_network_errs_total:rate5m > 1
    for: 15m
@@ -17,6 +19,8 @@ groups:
    annotations:
      summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}'
      description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.'
+      runbook: '[[ alert_playbook_url ]]/NetworkErrors.md'
+
  - alert: NetworkDrops
    expr: instance:node_network_drop_total:rate5m > 1
    for: 15m
@@ -25,6 +29,8 @@ groups:
    annotations:
      summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}'
      description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.'
+      runbook: '[[ alert_playbook_url ]]/NetworkDrops.md'
+
  - alert: HostUnreachable
    expr: probe_success{job="blackbox_ping"} < 1
    for: 5m
@@ -33,3 +39,4 @@ groups:
    annotations:
      summary: 'Host {{ $labels.instance }} is unreachable'
      description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).'
+      runbook: '[[ alert_playbook_url ]]/HostUnreachable.md'
--- a/roles/prometheus/files/rules/alerts_nginx.conf.yml
+++ b/roles/prometheus/files/rules/alerts_nginx.conf.yml
@@ -12,6 +12,7 @@ groups:
    annotations:
      summary: 'High HTTP error ratio for {{$labels.vhost}} globally'
      description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'

  - alert: HTTPErrorRatioHigh
    expr: (host:nginx_http_requests_errs:ratio > 0.2 and host:nginx_http_requests_total:rate5m > 0.1)
@@ -23,3 +24,5 @@ groups:
    annotations:
      summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}'
      description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
+
--- a/roles/prometheus/files/rules/alerts_prometheus.conf.yml
+++ b/roles/prometheus/files/rules/alerts_prometheus.conf.yml
 groups:
 - name: roles/prometheus/files/rules/alerts_prometheus.conf
  rules:
-  - alert: PrometheusUnreachable
-    expr: up{job=~"prometheus.*"} == 0
-    for: 10m
-    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} could not be scraped for
-        over 10 minutes.'
-      title: '{{$labels.job}} is unreachable'
-  - alert: PrometheusManyRestarts
-    expr: changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
-    for: 30m
-    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} has restarted more than
-        3 times in the last 30 minutes. It might be crashlooping.'
-      title: '{{$labels.job}} is restarting frequently'
  - alert: PrometheusRuleEvaluationSlow
    expr: prometheus_evaluator_duration_seconds{job=~"prometheus.*",quantile="0.9"}
      > 60
    for: 10m
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile
        latency of {{$value}}s completing rule evaluation cycles.'
      title: '{{$labels.job}} is evaluating rules too slowly'
+
  - alert: PrometheusCheckpointingSlow
    expr: avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
      > prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000
    for: 5m
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average
        for each checkpoint.'
      title: '{{$labels.job}} is checkpointing too slowly'
+
  - alert: PrometheusIndexingBacklog
    expr: prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"} / prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
      * 100 > 10
    for: 30m
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the
        indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}%
        full.'
      title: '{{$labels.job}} is backlogging on the indexing queue'
+
  - alert: PrometheusNotIngestingSamples
    expr: rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m])
      == 0
    for: 5m
    labels:
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has not ingested any samples
        in the last 10 minutes.'
      title: '{{$labels.job}} is not ingesting samples'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples.md'
+
  - alert: PrometheusPersistErrors
    expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])
      > 0
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has encountered {{$value}}
        persist errors per second in the last 10 minutes.'
      title: '{{$labels.job}} has persist errors'
+
  - alert: PrometheusNotificationsBacklog
    expr: prometheus_notifications_queue_length{job=~"prometheus.*"} > 0
    for: 10m
    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the
        notifications queue. The queue has not been empty for 10 minutes. Current
        queue length: {{$value}}.'
      title: '{{$labels.job}} is backlogging on the notifications queue'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog.md'
+
  - alert: PrometheusScrapingSlowly
    expr: prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"}
      > 2 * 60
    for: 10m
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile
        latency of {{$value}}s for scraping targets in the {{$labels.interval}} target
        pool.'
      title: '{{$labels.job}} is scraping targets slowly'
+
  - alert: PrometheusStorageInconsistent
    expr: prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has detected a storage
        inconsistency. A server restart is needed to initiate recovery.'
      title: '{{$labels.job}} has an inconsistent storage'
+
  - alert: PrometheusPersistencePressureTooHigh
    expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
      > 0.8 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
      3600 * 24) > 1
    for: 30m
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
        persistence pressure. Throttled ingestion expected within the next 24h.'
      title: '{{$labels.job}} can not keep up persisting'
+
  - alert: PrometheusPersistencePressureTooHigh
    expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
      > 0.85 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
      3600 * 2) > 1
    for: 30m
    labels:
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
        persistence pressure. Throttled ingestion expected within the next 2h.'
      title: '{{$labels.job}} can not keep up persisting'
+      runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh.md'
+
  - alert: PrometheusSeriesMaintenanceStalled
    expr: prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance)
      rate(prometheus_local_storage_series_ops_total{job=~"prometheus.*",type="maintenance_in_memory"}[5m])
      / 3600 > 24 and ON(job, instance) prometheus_local_storage_rushed_mode == 1
    for: 1h
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} is maintaining memory
        time series so slowly that it will take {{$value | printf `%.0f`}}h to complete
        a full cycle. This will lead to persistence falling behind.'
      title: '{{$labels.job}} is maintaining memory time series too slowly'
+
  - alert: PrometheusInvalidConfigFile
    expr: prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0
    for: 30m
    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
    annotations:
      description: The configuration file for {{$labels.job}} at {{$labels.instance}}
        is invalid and was therefore not reloaded.
      title: '{{$labels.job}} has an invalid config'
+      runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile.md'
+
  - alert: PrometheusOutOfOrderSamplesDiscarded
    expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
      > 0
    for: 1h
    labels:
-      service: prometheus
+      scope: host
      severity: warn
    annotations:
      description: '{{$labels.job}} at {{$labels.instance}} has discarded {{$value}}

--- a/roles/prometheus/files/rules/alerts_services.conf.yml
+++ b/roles/prometheus/files/rules/alerts_services.conf.yml
@@ -10,6 +10,7 @@ groups:
        annotations:
          description: 'Availability too low for service {{ $labels.float_service }}'
          summary: 'Availability too low for service {{ $labels.float_service }}'
+          runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow.md'

      - alert: ServiceDegraded
        expr: float_service:ok_by_host == 0

--- a/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml
+++ b/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml
@@ -2,11 +2,13 @@ groups:
 - name: roles/prometheus/files/rules/alerts_ssl_probes.conf
  rules:
  - alert: SSLCertificateAboutToExpire
-    expr: target:probe_ssl_cert_expiry:days < 15
-    for: 1h
+    expr: target:probe_ssl_cert_expiry:days < 10
+    for: 3h
    labels:
+      scope: global
      severity: page 
    annotations:
      summary: 'SSL certificate about to expire for {{ $labels.job }}@{{ $labels.target }}'
      description: 'The "{{ $labels.job }}" prober reports that {{ $labels.target }} is serving a SSL certificate that will expire in {{ $value }} days.'
+      runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire.md'

--- a/roles/prometheus/files/rules/alerts_syslog.conf.yml
+++ b/roles/prometheus/files/rules/alerts_syslog.conf.yml
@@ -2,10 +2,12 @@ groups:
  - name: roles/prometheus/files/rules/alerts_syslog.conf
    rules:
      - alert: SyslogDroppingLogs
-        expr: rsyslog_queue_discarded:rate5m > 10
-        for: 10m
+        expr: rsyslog_queue_discarded:rate5m{job="rsyslog-collector"} > 10
+        for: 15m
        labels:
+          scope: global
          severity: page
        annotations:
-          description: Syslog collector is dropping logs on {{ $labels.host }}
-          summary: Syslog collector is dropping logs
+          description: 'Syslog collector is dropping logs on {{ $labels.host }}'
+          summary: 'Syslog collector is dropping logs'
+          runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs.md'
--- a/roles/prometheus/files/rules/alerts_system_health.conf.yml
+++ b/roles/prometheus/files/rules/alerts_system_health.conf.yml
@@ -19,12 +19,14 @@ groups:
        summary: "RAID device {{ $labels.md_device }} on {{ $labels.host }} is unhealthy"
        description: "The RAID device {{ $labels.md_device }} on {{ $labels.host }} is reporting a degraded state, which means that probably one or more of the disks in the array have failed."

-    - alert: HostPhysicalComponentTooHot
+    - alert: PhysicalComponentTooHot
      expr: node_hwmon_temp_celsius / node_hwmon_temp_crit_celsius > 0.95
      for: 2h
      labels:
+        scope: host
        severity: page
      annotations:
        summary: "A physical component is running too hot on {{ $labels.host }}"
        description: "A sensor is reporting that a physical component ({{ $labels.sensor }}/{{ $labels.chip }}) on {{ $labels.host }} has been running very close to the critical level ({{ $value }}) for the last 2 hours."
+        runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot.md'

--- a/roles/prometheus/files/rules/alerts_systemd.conf.yml
+++ b/roles/prometheus/files/rules/alerts_systemd.conf.yml
@@ -9,11 +9,15 @@ groups:
    annotations:
      summary: '{{ $labels.name }} has failed on {{ $labels.host }}'
      description: 'The systemd unit {{ $labels.name }} has failed on {{ $labels.host }}.'
+
  - alert: SystemdUnitCrashLooping
    expr: instance:systemd_unit_restarts:delta10m > 12
    for: 30m
    labels:
+      scope: host
      severity: page
    annotations:
      summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}'
      description: 'Systemd unit {{ $labels.unit }} is being restarted repeatedly. Likely a configuration problem.'
+      runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping.md'
+
--- a/roles/prometheus/files/rules/rules_acme.conf.yml
+++ b/roles/prometheus/files/rules/rules_acme.conf.yml
--- a/roles/prometheus/files/rules/rules_base.conf.yml
+++ b/roles/prometheus/files/rules/rules_base.conf.yml
--- a/roles/prometheus/files/rules/rules_cpu.conf.yml
+++ b/roles/prometheus/files/rules/rules_cpu.conf.yml