From 78271e6ce100717542d384883cf601eb606f9a89 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 28 Apr 2020 15:53:28 +0100
Subject: [PATCH] Add playbook ("runbook") annotations to all alerts

Better to do it this way than to automagically create playbook links
in the email templates: the template approach affects only emails,
while adding the annotation explicitly works across all notification
systems.
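
For illustration, with the default alert_playbook_url of
"https://playbooks.{{ domain }}", an alert annotated with

    runbook: '[[ alert_playbook_url ]]/BackupFailed.md'

ends up carrying a playbook link in every notification channel, not
just in email (e.g. https://playbooks.example.com/BackupFailed.md,
for a hypothetical domain of example.com).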
---
 roles/prometheus/defaults/main.yml            |  4 +-
 .../files/rules/alerts_syslog.conf.yml        | 11 ---
 roles/prometheus/tasks/prometheus.yml         | 18 ++++-
 .../alertmanager_templates/email.tmpl         | 13 ----
 .../rules/alerts_acme.conf.yml                |  1 +
 .../rules/alerts_backup.conf.yml              |  1 +
 .../rules/alerts_base.conf.yml                |  3 +
 .../rules/alerts_cpu.conf.yml                 |  1 +
 .../rules/alerts_disk.conf.yml                |  1 +
 .../rules/alerts_mysql.conf.yml               |  6 ++
 .../rules/alerts_net.conf.yml                 |  7 ++
 .../rules/alerts_nginx.conf.yml               |  3 +
 .../rules/alerts_prometheus.conf.yml          | 74 +++++++++----------
 .../rules/alerts_services.conf.yml            |  1 +
 .../rules/alerts_ssl_probes.conf.yml          |  6 +-
 .../templates/rules/alerts_syslog.conf.yml    | 13 ++++
 .../rules/alerts_system_health.conf.yml       |  4 +-
 .../rules/alerts_systemd.conf.yml             |  4 +
 .../rules/rules_acme.conf.yml                 |  0
 .../rules/rules_base.conf.yml                 |  0
 .../rules/rules_cpu.conf.yml                  |  0
 .../rules/rules_disk.conf.yml                 |  0
 .../rules/rules_elasticsearch.conf.yml        |  0
 .../rules/rules_mysql.conf.yml                |  0
 .../rules/rules_net.conf.yml                  |  0
 .../rules/rules_nginx.conf.yml                |  0
 .../rules/rules_node_016.conf.yml             |  0
 .../rules/rules_services.conf.yml             |  0
 .../rules/rules_ssl_probes.conf.yml           |  0
 .../rules/rules_syslog.conf.yml               |  0
 .../rules/rules_systemd.conf.yml              |  0
 31 files changed, 99 insertions(+), 72 deletions(-)
 delete mode 100644 roles/prometheus/files/rules/alerts_syslog.conf.yml
 rename roles/prometheus/{files => templates}/rules/alerts_acme.conf.yml (95%)
 rename roles/prometheus/{files => templates}/rules/alerts_backup.conf.yml (91%)
 rename roles/prometheus/{files => templates}/rules/alerts_base.conf.yml (94%)
 rename roles/prometheus/{files => templates}/rules/alerts_cpu.conf.yml (92%)
 rename roles/prometheus/{files => templates}/rules/alerts_disk.conf.yml (87%)
 rename roles/prometheus/{files => templates}/rules/alerts_mysql.conf.yml (85%)
 rename roles/prometheus/{files => templates}/rules/alerts_net.conf.yml (84%)
 rename roles/prometheus/{files => templates}/rules/alerts_nginx.conf.yml (87%)
 rename roles/prometheus/{files => templates}/rules/alerts_prometheus.conf.yml (82%)
 rename roles/prometheus/{files => templates}/rules/alerts_services.conf.yml (91%)
 rename roles/prometheus/{files => templates}/rules/alerts_ssl_probes.conf.yml (72%)
 create mode 100644 roles/prometheus/templates/rules/alerts_syslog.conf.yml
 rename roles/prometheus/{files => templates}/rules/alerts_system_health.conf.yml (91%)
 rename roles/prometheus/{files => templates}/rules/alerts_systemd.conf.yml (89%)
 rename roles/prometheus/{files => templates}/rules/rules_acme.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_base.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_cpu.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_disk.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_elasticsearch.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_mysql.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_net.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_nginx.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_node_016.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_services.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_ssl_probes.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_syslog.conf.yml (100%)
 rename roles/prometheus/{files => templates}/rules/rules_systemd.conf.yml (100%)

diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml
index 6200419e..aaf27ca9 100644
--- a/roles/prometheus/defaults/main.yml
+++ b/roles/prometheus/defaults/main.yml
@@ -11,8 +11,8 @@ alertmanager_smtp_hello: "localhost"
 alertmanager_smtp_auth_username: ""
 alertmanager_smtp_auth_password: ""
 
-# Define if you have a playbook website
-#alertmanager_playbook_url: ""
+# Base URL for the alert playbooks ("runbooks"); point it at something that actually exists.
+alert_playbook_url: "https://playbooks.{{ domain }}"
 
 # Custom blackbox probes.
 prometheus_custom_blackbox_probes: {}
diff --git a/roles/prometheus/files/rules/alerts_syslog.conf.yml b/roles/prometheus/files/rules/alerts_syslog.conf.yml
deleted file mode 100644
index 22079a95..00000000
--- a/roles/prometheus/files/rules/alerts_syslog.conf.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-groups:
-  - name: roles/prometheus/files/rules/alerts_syslog.conf
-    rules:
-      - alert: SyslogDroppingLogs
-        expr: rsyslog_queue_discarded:rate5m > 10
-        for: 10m
-        labels:
-          severity: page
-        annotations:
-          description: Syslog collector is dropping logs on {{ $labels.host }}
-          summary: Syslog collector is dropping logs
diff --git a/roles/prometheus/tasks/prometheus.yml b/roles/prometheus/tasks/prometheus.yml
index 42b83997..ce39da4f 100644
--- a/roles/prometheus/tasks/prometheus.yml
+++ b/roles/prometheus/tasks/prometheus.yml
@@ -1,6 +1,16 @@
 ---
 
 # Configure Prometheus components.
+#
+# Prometheus rule files and templates make heavy use of Go template
+# syntax, which uses the same '{{' '}}' delimiters as Ansible's Jinja2
+# templating, so we override the Ansible template variable delimiters
+# to '[[' and ']]' to avoid the conflict.
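+#
+# Rule files can then freely mix the two syntaxes, e.g.:
+#
+#   summary: 'Job {{ $labels.job }} is down globally'   (Go, passed through)
+#   runbook: '[[ alert_playbook_url ]]/JobDown.md'      (Ansible, expanded)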
 
 - name: Create /etc/prometheus and subdirs
   file:
@@ -14,17 +24,21 @@
     - "/etc/prometheus/console_libraries"
 
 - name: Install Prometheus rules
-  copy:
+  template:
     src: "{{ item }}"
     dest: "/etc/prometheus/rules/"
+    variable_start_string: "[["
+    variable_end_string: "]]"
   with_fileglob:
-    - files/rules/*.conf.yml
+    - templates/rules/*.conf.yml
   notify: "reload prometheus"
 
 - name: Install alertmanager templates
   template:
     src: "{{ item }}"
     dest: "/etc/prometheus/alertmanager_templates/"
+    variable_start_string: "[["
+    variable_end_string: "]]"
   with_fileglob:
     - templates/alertmanager_templates/*
   notify: "reload prometheus"
diff --git a/roles/prometheus/templates/alertmanager_templates/email.tmpl b/roles/prometheus/templates/alertmanager_templates/email.tmpl
index ff2e3320..10b507db 100644
--- a/roles/prometheus/templates/alertmanager_templates/email.tmpl
+++ b/roles/prometheus/templates/alertmanager_templates/email.tmpl
@@ -1,14 +1,3 @@
-{% if alertmanager_playbook_url is defined %}
-{% raw %}{{ define "playbook_url.html" }}{% endraw %}
-<a href="{{ alertmanager_playbook_url }}/{% raw %}{{.Labels.alertname}}{% endraw %}.md">Playbook</a><br />
-{% raw %}{{ end }}{% endraw %}
-{% else %}
-{% raw %}
-{{ define "playbook_url.html" }}{{ end }}
-{% endraw %}
-{% endif %}
-
-{% raw %}
 {{ define "email.float.html" }}
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml">
@@ -341,7 +330,6 @@ a {
                     {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                     {{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }}
                     {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
-                    {{ template "playbook_url.html" . }}
                     <a href="{{ .GeneratorURL }}">Source</a><br />
                   </td>
                 </tr>
@@ -394,4 +382,3 @@ a {
 </body>
 </html>
 {{ end }}
-{% endraw %}
diff --git a/roles/prometheus/files/rules/alerts_acme.conf.yml b/roles/prometheus/templates/rules/alerts_acme.conf.yml
similarity index 95%
rename from roles/prometheus/files/rules/alerts_acme.conf.yml
rename to roles/prometheus/templates/rules/alerts_acme.conf.yml
index 6f15c9e6..85e3be6f 100644
--- a/roles/prometheus/files/rules/alerts_acme.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_acme.conf.yml
@@ -35,3 +35,4 @@ groups:
     annotations:
       summary: 'ACME automation completely broken'
       description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.'
+      runbook: '[[ alert_playbook_url ]]/ACMEBroken.md'
diff --git a/roles/prometheus/files/rules/alerts_backup.conf.yml b/roles/prometheus/templates/rules/alerts_backup.conf.yml
similarity index 91%
rename from roles/prometheus/files/rules/alerts_backup.conf.yml
rename to roles/prometheus/templates/rules/alerts_backup.conf.yml
index 6d5d0f7d..45402b3f 100644
--- a/roles/prometheus/files/rules/alerts_backup.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_backup.conf.yml
@@ -17,3 +17,4 @@ groups:
     annotations:
       summary: '{{ $labels.dataset }} backup failure on {{ $labels.host }}'
       description: 'Dataset {{ $labels.dataset }} has failed its backups on {{ $labels.host }} for two days.'
+      runbook: '[[ alert_playbook_url ]]/BackupFailed.md'
diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/templates/rules/alerts_base.conf.yml
similarity index 94%
rename from roles/prometheus/files/rules/alerts_base.conf.yml
rename to roles/prometheus/templates/rules/alerts_base.conf.yml
index 1cc6bf60..a0aaf2d3 100644
--- a/roles/prometheus/files/rules/alerts_base.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_base.conf.yml
@@ -63,6 +63,7 @@ groups:
     annotations:
       summary: 'Job {{ $labels.job }} is down globally'
       description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/JobDown.md'
 
   - alert: ProbeFailure
     expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -74,6 +75,7 @@ groups:
       summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing'
       description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
         for target {{ $labels.target }} (success ratio {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
 
   - alert: ProbeFailure
     expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -85,3 +87,4 @@ groups:
       summary: 'Probe {{ $labels.probe }} is failing globally'
       description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
         globally (success ratio {{ $value }}).'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
diff --git a/roles/prometheus/files/rules/alerts_cpu.conf.yml b/roles/prometheus/templates/rules/alerts_cpu.conf.yml
similarity index 92%
rename from roles/prometheus/files/rules/alerts_cpu.conf.yml
rename to roles/prometheus/templates/rules/alerts_cpu.conf.yml
index bcc59126..96508963 100644
--- a/roles/prometheus/files/rules/alerts_cpu.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_cpu.conf.yml
@@ -22,4 +22,5 @@ groups:
     annotations:
       summary: 'Host {{$labels.host}} is thrashing'
       description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.'
+      runbook: '[[ alert_playbook_url ]]/HostThrashing.md'
 
diff --git a/roles/prometheus/files/rules/alerts_disk.conf.yml b/roles/prometheus/templates/rules/alerts_disk.conf.yml
similarity index 87%
rename from roles/prometheus/files/rules/alerts_disk.conf.yml
rename to roles/prometheus/templates/rules/alerts_disk.conf.yml
index 2796b298..6d0df8e4 100644
--- a/roles/prometheus/files/rules/alerts_disk.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_disk.conf.yml
@@ -10,3 +10,4 @@ groups:
     annotations:
       summary: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is almost full'
       description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} will be full in less than 4 hours'
+      runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours.md'
diff --git a/roles/prometheus/files/rules/alerts_mysql.conf.yml b/roles/prometheus/templates/rules/alerts_mysql.conf.yml
similarity index 85%
rename from roles/prometheus/files/rules/alerts_mysql.conf.yml
rename to roles/prometheus/templates/rules/alerts_mysql.conf.yml
index 4d33a13f..6cdccd78 100644
--- a/roles/prometheus/files/rules/alerts_mysql.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_mysql.conf.yml
@@ -9,6 +9,8 @@ groups:
         annotations:
           description: Slave replication (IO or SQL) has been down for more than 2 minutes on {{ $labels.job }}@{{ $labels.host }}.
           summary: Slave replication is not running for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken.md'
+
       - alert: MySQLReplicationBehind
         expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
         for: 2m
@@ -17,6 +19,8 @@ groups:
         annotations:
           description: The mysql slave replication has fallen behind and is not recovering on {{ $labels.job }}@{{ $labels.host }}.
           summary: MySQL slave replication is lagging for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind.md'
+
       - alert: MySQLInnoDBLogWaits
         expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
         labels:
@@ -24,3 +28,5 @@ groups:
         annotations:
           description: The innodb logs are waiting for disk at a rate of {{$value}} / second on {{ $labels.job }}@{{ $labels.host }}
           summary: MySQL innodb log writes stalling for {{ $labels.job }}
+          runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits.md'
+
diff --git a/roles/prometheus/files/rules/alerts_net.conf.yml b/roles/prometheus/templates/rules/alerts_net.conf.yml
similarity index 84%
rename from roles/prometheus/files/rules/alerts_net.conf.yml
rename to roles/prometheus/templates/rules/alerts_net.conf.yml
index 2d9f2ebb..3a94442b 100644
--- a/roles/prometheus/files/rules/alerts_net.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_net.conf.yml
@@ -9,6 +9,8 @@ groups:
     annotations:
       description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.'
       summary: 'Conntrack table on {{ $labels.instance }} is almost full'
+      runbook: '[[ alert_playbook_url ]]/ConntrackTableFull.md'
+
   - alert: NetworkErrors
     expr: instance:node_network_errs_total:rate5m > 1
     for: 15m
@@ -17,6 +19,8 @@ groups:
     annotations:
       summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}'
       description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.'
+      runbook: '[[ alert_playbook_url ]]/NetworkErrors.md'
+
   - alert: NetworkDrops
     expr: instance:node_network_drop_total:rate5m > 1
     for: 15m
@@ -25,6 +29,8 @@ groups:
     annotations:
       summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}'
       description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.'
+      runbook: '[[ alert_playbook_url ]]/NetworkDrops.md'
+
   - alert: HostUnreachable
     expr: probe_success{job="blackbox_ping"} < 1
     for: 5m
@@ -33,3 +39,4 @@ groups:
     annotations:
       summary: 'Host {{ $labels.instance }} is unreachable'
       description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).'
+      runbook: '[[ alert_playbook_url ]]/HostUnreachable.md'
diff --git a/roles/prometheus/files/rules/alerts_nginx.conf.yml b/roles/prometheus/templates/rules/alerts_nginx.conf.yml
similarity index 87%
rename from roles/prometheus/files/rules/alerts_nginx.conf.yml
rename to roles/prometheus/templates/rules/alerts_nginx.conf.yml
index 15d5e967..a830d140 100644
--- a/roles/prometheus/files/rules/alerts_nginx.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_nginx.conf.yml
@@ -12,6 +12,7 @@ groups:
     annotations:
       summary: 'High HTTP error ratio for {{$labels.vhost}} globally'
       description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
 
   - alert: HTTPErrorRatioHigh
     expr: (host:nginx_http_requests_errs:ratio > 0.2 and host:nginx_http_requests_total:rate5m > 0.1)
@@ -23,3 +24,5 @@ groups:
     annotations:
       summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}'
       description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
+
diff --git a/roles/prometheus/files/rules/alerts_prometheus.conf.yml b/roles/prometheus/templates/rules/alerts_prometheus.conf.yml
similarity index 82%
rename from roles/prometheus/files/rules/alerts_prometheus.conf.yml
rename to roles/prometheus/templates/rules/alerts_prometheus.conf.yml
index e384a7ba..24049492 100644
--- a/roles/prometheus/files/rules/alerts_prometheus.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_prometheus.conf.yml
@@ -1,170 +1,162 @@
 groups:
 - name: roles/prometheus/files/rules/alerts_prometheus.conf
   rules:
-  - alert: PrometheusUnreachable
-    expr: up{job=~"prometheus.*"} == 0
-    for: 10m
-    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} could not be scraped for
-        over 10 minutes.'
-      title: '{{$labels.job}} is unreachable'
-  - alert: PrometheusManyRestarts
-    expr: changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3
-    for: 30m
-    labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
-    annotations:
-      description: '{{$labels.job}} at {{$labels.instance}} has restarted more than
-        3 times in the last 30 minutes. It might be crashlooping.'
-      title: '{{$labels.job}} is restarting frequently'
   - alert: PrometheusRuleEvaluationSlow
     expr: prometheus_evaluator_duration_seconds{job=~"prometheus.*",quantile="0.9"}
       > 60
     for: 10m
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile
         latency of {{$value}}s completing rule evaluation cycles.'
       title: '{{$labels.job}} is evaluating rules too slowly'
+
   - alert: PrometheusCheckpointingSlow
     expr: avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m])
       > prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000
     for: 5m
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average
         for each checkpoint.'
       title: '{{$labels.job}} is checkpointing too slowly'
+
   - alert: PrometheusIndexingBacklog
     expr: prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"} / prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"}
       * 100 > 10
     for: 30m
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the
         indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}%
         full.'
       title: '{{$labels.job}} is backlogging on the indexing queue'
+
   - alert: PrometheusNotIngestingSamples
     expr: rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m])
       == 0
     for: 5m
     labels:
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has not ingested any samples
         in the last 10 minutes.'
       title: '{{$labels.job}} is not ingesting samples'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples.md'
+
   - alert: PrometheusPersistErrors
     expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])
       > 0
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has encountered {{$value}}
         persist errors per second in the last 10 minutes.'
       title: '{{$labels.job}} has persist errors'
+
   - alert: PrometheusNotificationsBacklog
     expr: prometheus_notifications_queue_length{job=~"prometheus.*"} > 0
     for: 10m
     labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the
         notifications queue. The queue has not been empty for 10 minutes. Current
         queue length: {{$value}}.'
       title: '{{$labels.job}} is backlogging on the notifications queue'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog.md'
+
   - alert: PrometheusScrapingSlowly
     expr: prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"}
       > 2 * 60
     for: 10m
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile
         latency of {{$value}}s for scraping targets in the {{$labels.interval}} target
         pool.'
       title: '{{$labels.job}} is scraping targets slowly'
+
   - alert: PrometheusStorageInconsistent
     expr: prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has detected a storage
         inconsistency. A server restart is needed to initiate recovery.'
       title: '{{$labels.job}} has an inconsistent storage'
+
   - alert: PrometheusPersistencePressureTooHigh
     expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
       > 0.8 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
       3600 * 24) > 1
     for: 30m
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
         persistence pressure. Throttled ingestion expected within the next 24h.'
       title: '{{$labels.job}} can not keep up persisting'
+
   - alert: PrometheusPersistencePressureTooHigh
     expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}
       > 0.85 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m],
       3600 * 2) > 1
     for: 30m
     labels:
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
         persistence pressure. Throttled ingestion expected within the next 2h.'
       title: '{{$labels.job}} can not keep up persisting'
+      runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh.md'
+
   - alert: PrometheusSeriesMaintenanceStalled
     expr: prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance)
       rate(prometheus_local_storage_series_ops_total{job=~"prometheus.*",type="maintenance_in_memory"}[5m])
       / 3600 > 24 and ON(job, instance) prometheus_local_storage_rushed_mode == 1
     for: 1h
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} is maintaining memory
         time series so slowly that it will take {{$value | printf `%.0f`}}h to complete
         a full cycle. This will lead to persistence falling behind.'
       title: '{{$labels.job}} is maintaining memory time series too slowly'
+
   - alert: PrometheusInvalidConfigFile
     expr: prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0
     for: 30m
     labels:
-      pager: pagerduty
-      service: prometheus
-      severity: critical
+      scope: host
+      severity: page
     annotations:
       description: The configuration file for {{$labels.job}} at {{$labels.instance}}
         is invalid and was therefore not reloaded.
       title: '{{$labels.job}} has an invalid config'
+      runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile.md'
+
   - alert: PrometheusOutOfOrderSamplesDiscarded
     expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
       > 0
     for: 1h
     labels:
-      service: prometheus
+      scope: host
       severity: warn
     annotations:
       description: '{{$labels.job}} at {{$labels.instance}} has discarded {{$value}}
diff --git a/roles/prometheus/files/rules/alerts_services.conf.yml b/roles/prometheus/templates/rules/alerts_services.conf.yml
similarity index 91%
rename from roles/prometheus/files/rules/alerts_services.conf.yml
rename to roles/prometheus/templates/rules/alerts_services.conf.yml
index 56711a0d..36e37b04 100644
--- a/roles/prometheus/files/rules/alerts_services.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_services.conf.yml
@@ -10,6 +10,7 @@ groups:
         annotations:
           description: 'Availability too low for service {{ $labels.float_service }}'
           summary: 'Availability too low for service {{ $labels.float_service }}'
+          runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow.md'
 
       - alert: ServiceDegraded
         expr: float_service:ok_by_host == 0
diff --git a/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml b/roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml
similarity index 72%
rename from roles/prometheus/files/rules/alerts_ssl_probes.conf.yml
rename to roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml
index cb436291..ddc0efc8 100644
--- a/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml
@@ -2,11 +2,13 @@ groups:
 - name: roles/prometheus/files/rules/alerts_ssl_probes.conf
   rules:
   - alert: SSLCertificateAboutToExpire
-    expr: target:probe_ssl_cert_expiry:days < 15
-    for: 1h
+    expr: target:probe_ssl_cert_expiry:days < 10
+    for: 3h
     labels:
+      scope: global
       severity: page 
     annotations:
       summary: 'SSL certificate about to expire for {{ $labels.job }}@{{ $labels.target }}'
       description: 'The "{{ $labels.job }}" prober reports that {{ $labels.target }} is serving a SSL certificate that will expire in {{ $value }} days.'
+      runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire.md'
 
diff --git a/roles/prometheus/templates/rules/alerts_syslog.conf.yml b/roles/prometheus/templates/rules/alerts_syslog.conf.yml
new file mode 100644
index 00000000..3b15c3e9
--- /dev/null
+++ b/roles/prometheus/templates/rules/alerts_syslog.conf.yml
@@ -0,0 +1,13 @@
+groups:
+  - name: roles/prometheus/files/rules/alerts_syslog.conf
+    rules:
+      - alert: SyslogDroppingLogs
+        expr: rsyslog_queue_discarded:rate5m{job="rsyslog-collector"} > 10
+        for: 15m
+        labels:
+          scope: global
+          severity: page
+        annotations:
+          description: 'Syslog collector is dropping logs on {{ $labels.host }}'
+          summary: 'Syslog collector is dropping logs'
+          runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs.md'
diff --git a/roles/prometheus/files/rules/alerts_system_health.conf.yml b/roles/prometheus/templates/rules/alerts_system_health.conf.yml
similarity index 91%
rename from roles/prometheus/files/rules/alerts_system_health.conf.yml
rename to roles/prometheus/templates/rules/alerts_system_health.conf.yml
index 476c4654..07f0b806 100644
--- a/roles/prometheus/files/rules/alerts_system_health.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_system_health.conf.yml
@@ -19,12 +19,14 @@ groups:
         summary: "RAID device {{ $labels.md_device }} on {{ $labels.host }} is unhealthy"
         description: "The RAID device {{ $labels.md_device }} on {{ $labels.host }} is reporting a degraded state, which means that probably one or more of the disks in the array have failed."
 
-    - alert: HostPhysicalComponentTooHot
+    - alert: PhysicalComponentTooHot
       expr: node_hwmon_temp_celsius / node_hwmon_temp_crit_celsius > 0.95
       for: 2h
       labels:
+        scope: host
         severity: page
       annotations:
         summary: "A physical component is running too hot on {{ $labels.host }}"
         description: "A sensor is reporting that a physical component ({{ $labels.sensor }}/{{ $labels.chip }}) on {{ $labels.host }} has been running very close to the critical level ({{ $value }}) for the last 2 hours."
+        runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot.md'
 
diff --git a/roles/prometheus/files/rules/alerts_systemd.conf.yml b/roles/prometheus/templates/rules/alerts_systemd.conf.yml
similarity index 89%
rename from roles/prometheus/files/rules/alerts_systemd.conf.yml
rename to roles/prometheus/templates/rules/alerts_systemd.conf.yml
index 07c2b48f..f28f67a1 100644
--- a/roles/prometheus/files/rules/alerts_systemd.conf.yml
+++ b/roles/prometheus/templates/rules/alerts_systemd.conf.yml
@@ -9,11 +9,15 @@ groups:
     annotations:
       summary: '{{ $labels.name }} has failed on {{ $labels.host }}'
       description: 'The systemd unit {{ $labels.name }} has failed on {{ $labels.host }}.'
+
   - alert: SystemdUnitCrashLooping
     expr: instance:systemd_unit_restarts:delta10m > 12
     for: 30m
     labels:
+      scope: host
       severity: page
     annotations:
       summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}'
       description: 'Systemd unit {{ $labels.unit }} is being restarted repeatedly. Likely a configuration problem.'
+      runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping.md'
+
diff --git a/roles/prometheus/files/rules/rules_acme.conf.yml b/roles/prometheus/templates/rules/rules_acme.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_acme.conf.yml
rename to roles/prometheus/templates/rules/rules_acme.conf.yml
diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/templates/rules/rules_base.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_base.conf.yml
rename to roles/prometheus/templates/rules/rules_base.conf.yml
diff --git a/roles/prometheus/files/rules/rules_cpu.conf.yml b/roles/prometheus/templates/rules/rules_cpu.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_cpu.conf.yml
rename to roles/prometheus/templates/rules/rules_cpu.conf.yml
diff --git a/roles/prometheus/files/rules/rules_disk.conf.yml b/roles/prometheus/templates/rules/rules_disk.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_disk.conf.yml
rename to roles/prometheus/templates/rules/rules_disk.conf.yml
diff --git a/roles/prometheus/files/rules/rules_elasticsearch.conf.yml b/roles/prometheus/templates/rules/rules_elasticsearch.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_elasticsearch.conf.yml
rename to roles/prometheus/templates/rules/rules_elasticsearch.conf.yml
diff --git a/roles/prometheus/files/rules/rules_mysql.conf.yml b/roles/prometheus/templates/rules/rules_mysql.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_mysql.conf.yml
rename to roles/prometheus/templates/rules/rules_mysql.conf.yml
diff --git a/roles/prometheus/files/rules/rules_net.conf.yml b/roles/prometheus/templates/rules/rules_net.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_net.conf.yml
rename to roles/prometheus/templates/rules/rules_net.conf.yml
diff --git a/roles/prometheus/files/rules/rules_nginx.conf.yml b/roles/prometheus/templates/rules/rules_nginx.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_nginx.conf.yml
rename to roles/prometheus/templates/rules/rules_nginx.conf.yml
diff --git a/roles/prometheus/files/rules/rules_node_016.conf.yml b/roles/prometheus/templates/rules/rules_node_016.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_node_016.conf.yml
rename to roles/prometheus/templates/rules/rules_node_016.conf.yml
diff --git a/roles/prometheus/files/rules/rules_services.conf.yml b/roles/prometheus/templates/rules/rules_services.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_services.conf.yml
rename to roles/prometheus/templates/rules/rules_services.conf.yml
diff --git a/roles/prometheus/files/rules/rules_ssl_probes.conf.yml b/roles/prometheus/templates/rules/rules_ssl_probes.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_ssl_probes.conf.yml
rename to roles/prometheus/templates/rules/rules_ssl_probes.conf.yml
diff --git a/roles/prometheus/files/rules/rules_syslog.conf.yml b/roles/prometheus/templates/rules/rules_syslog.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_syslog.conf.yml
rename to roles/prometheus/templates/rules/rules_syslog.conf.yml
diff --git a/roles/prometheus/files/rules/rules_systemd.conf.yml b/roles/prometheus/templates/rules/rules_systemd.conf.yml
similarity index 100%
rename from roles/prometheus/files/rules/rules_systemd.conf.yml
rename to roles/prometheus/templates/rules/rules_systemd.conf.yml
-- 
GitLab