From 78271e6ce100717542d384883cf601eb606f9a89 Mon Sep 17 00:00:00 2001 From: ale <ale@incal.net> Date: Tue, 28 Apr 2020 15:53:28 +0100 Subject: [PATCH] Add playbook annotations ("runbook") to all alerts Better to do it this way than automagically creating playbook links in the email templates (as that affects only emails, while adding the annotation explicitly works across all notification systems). --- roles/prometheus/defaults/main.yml | 4 +- .../files/rules/alerts_syslog.conf.yml | 11 --- roles/prometheus/tasks/prometheus.yml | 13 +++- .../alertmanager_templates/email.tmpl | 13 ---- .../rules/alerts_acme.conf.yml | 1 + .../rules/alerts_backup.conf.yml | 1 + .../rules/alerts_base.conf.yml | 3 + .../rules/alerts_cpu.conf.yml | 1 + .../rules/alerts_disk.conf.yml | 1 + .../rules/alerts_mysql.conf.yml | 6 ++ .../rules/alerts_net.conf.yml | 7 ++ .../rules/alerts_nginx.conf.yml | 3 + .../rules/alerts_prometheus.conf.yml | 74 +++++++++---------- .../rules/alerts_services.conf.yml | 1 + .../rules/alerts_ssl_probes.conf.yml | 6 +- .../templates/rules/alerts_syslog.conf.yml | 13 ++++ .../rules/alerts_system_health.conf.yml | 4 +- .../rules/alerts_systemd.conf.yml | 4 + .../rules/rules_acme.conf.yml | 0 .../rules/rules_base.conf.yml | 0 .../rules/rules_cpu.conf.yml | 0 .../rules/rules_disk.conf.yml | 0 .../rules/rules_elasticsearch.conf.yml | 0 .../rules/rules_mysql.conf.yml | 0 .../rules/rules_net.conf.yml | 0 .../rules/rules_nginx.conf.yml | 0 .../rules/rules_node_016.conf.yml | 0 .../rules/rules_services.conf.yml | 0 .../rules/rules_ssl_probes.conf.yml | 0 .../rules/rules_syslog.conf.yml | 0 .../rules/rules_systemd.conf.yml | 0 31 files changed, 94 insertions(+), 72 deletions(-) delete mode 100644 roles/prometheus/files/rules/alerts_syslog.conf.yml rename roles/prometheus/{files => templates}/rules/alerts_acme.conf.yml (95%) rename roles/prometheus/{files => templates}/rules/alerts_backup.conf.yml (91%) rename roles/prometheus/{files => templates}/rules/alerts_base.conf.yml (94%) rename roles/prometheus/{files => templates}/rules/alerts_cpu.conf.yml (92%) rename roles/prometheus/{files => templates}/rules/alerts_disk.conf.yml (87%) rename roles/prometheus/{files => templates}/rules/alerts_mysql.conf.yml (85%) rename roles/prometheus/{files => templates}/rules/alerts_net.conf.yml (84%) rename roles/prometheus/{files => templates}/rules/alerts_nginx.conf.yml (87%) rename roles/prometheus/{files => templates}/rules/alerts_prometheus.conf.yml (82%) rename roles/prometheus/{files => templates}/rules/alerts_services.conf.yml (91%) rename roles/prometheus/{files => templates}/rules/alerts_ssl_probes.conf.yml (72%) create mode 100644 roles/prometheus/templates/rules/alerts_syslog.conf.yml rename roles/prometheus/{files => templates}/rules/alerts_system_health.conf.yml (91%) rename roles/prometheus/{files => templates}/rules/alerts_systemd.conf.yml (89%) rename roles/prometheus/{files => templates}/rules/rules_acme.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_base.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_cpu.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_disk.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_elasticsearch.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_mysql.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_net.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_nginx.conf.yml (100%) rename 
roles/prometheus/{files => templates}/rules/rules_node_016.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_services.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_ssl_probes.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_syslog.conf.yml (100%) rename roles/prometheus/{files => templates}/rules/rules_systemd.conf.yml (100%) diff --git a/roles/prometheus/defaults/main.yml b/roles/prometheus/defaults/main.yml index 6200419e..aaf27ca9 100644 --- a/roles/prometheus/defaults/main.yml +++ b/roles/prometheus/defaults/main.yml @@ -11,8 +11,8 @@ alertmanager_smtp_hello: "localhost" alertmanager_smtp_auth_username: "" alertmanager_smtp_auth_password: "" -# Define if you have a playbook website -#alertmanager_playbook_url: "" +# Point at something that actually exists. +alert_playbook_url: "https://playbooks.{{ domain }}" # Custom blackbox probes. prometheus_custom_blackbox_probes: {} diff --git a/roles/prometheus/files/rules/alerts_syslog.conf.yml b/roles/prometheus/files/rules/alerts_syslog.conf.yml deleted file mode 100644 index 22079a95..00000000 --- a/roles/prometheus/files/rules/alerts_syslog.conf.yml +++ /dev/null @@ -1,11 +0,0 @@ -groups: - - name: roles/prometheus/files/rules/alerts_syslog.conf - rules: - - alert: SyslogDroppingLogs - expr: rsyslog_queue_discarded:rate5m > 10 - for: 10m - labels: - severity: page - annotations: - description: Syslog collector is dropping logs on {{ $labels.host }} - summary: Syslog collector is dropping logs diff --git a/roles/prometheus/tasks/prometheus.yml b/roles/prometheus/tasks/prometheus.yml index 42b83997..ce39da4f 100644 --- a/roles/prometheus/tasks/prometheus.yml +++ b/roles/prometheus/tasks/prometheus.yml @@ -1,6 +1,11 @@ --- # Configure Prometheus components. +# +# Since Prometheus configurations and templates are heavy with Go template +# syntax, which uses the same escapes as Ansible, we override the Ansible +# template variable delimiters to '[[' and ']]', so that they do not conflict +# with the Go syntax. 
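For illustration (not part of the patch): with the '[[' / ']]' override, a rules template can carry both templating layers on the same lines; Ansible expands the '[[ ]]' expressions at deploy time and leaves the Prometheus/Go '{{ }}' expressions untouched. A minimal sketch, reusing the JobDown annotations from alerts_base.conf.yml further down in this patch and assuming the default alert_playbook_url:

    # Sketch of a rule template fragment under templates/rules/.
    annotations:
      # Expanded by Ansible at deploy time, via the '[[' / ']]' delimiters:
      runbook: '[[ alert_playbook_url ]]/JobDown.md'
      # Ignored by Ansible, evaluated later by Prometheus (Go templating):
      summary: 'Job {{ $labels.job }} is down globally'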
- name: Create /etc/prometheus and subdirs file: @@ -14,17 +19,21 @@ - "/etc/prometheus/console_libraries" - name: Install Prometheus rules - copy: + template: src: "{{ item }}" dest: "/etc/prometheus/rules/" + variable_start_string: "[[" + variable_end_string: "]]" with_fileglob: - - files/rules/*.conf.yml + - templates/rules/*.conf.yml notify: "reload prometheus" - name: Install alertmanager templates template: src: "{{ item }}" dest: "/etc/prometheus/alertmanager_templates/" + variable_start_string: "[[" + variable_end_string: "]]" with_fileglob: - templates/alertmanager_templates/* notify: "reload prometheus" diff --git a/roles/prometheus/templates/alertmanager_templates/email.tmpl b/roles/prometheus/templates/alertmanager_templates/email.tmpl index ff2e3320..10b507db 100644 --- a/roles/prometheus/templates/alertmanager_templates/email.tmpl +++ b/roles/prometheus/templates/alertmanager_templates/email.tmpl @@ -1,14 +1,3 @@ -{% if alertmanager_playbook_url is defined %} -{% raw %}{{ define "playbook_url.html" }}{% endraw %} -<a href="{{ alertmanager_playbook_url }}/{% raw %}{{.Labels.alertname}}{% endraw %}.md">Playbook</a><br /> -{% raw %}{{ end }}{% endraw %} -{% else %} -{% raw %} -{{ define "playbook_url.html" }}{{ end }} -{% endraw %} -{% endif %} - -{% raw %} {{ define "email.float.html" }} <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> @@ -341,7 +330,6 @@ a { {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }} {{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }} {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }} - {{ template "playbook_url.html" . }} <a href="{{ .GeneratorURL }}">Source</a><br /> </td> </tr> @@ -394,4 +382,3 @@ a { </body> </html> {{ end }} -{% endraw %} diff --git a/roles/prometheus/files/rules/alerts_acme.conf.yml b/roles/prometheus/templates/rules/alerts_acme.conf.yml similarity index 95% rename from roles/prometheus/files/rules/alerts_acme.conf.yml rename to roles/prometheus/templates/rules/alerts_acme.conf.yml index 6f15c9e6..85e3be6f 100644 --- a/roles/prometheus/files/rules/alerts_acme.conf.yml +++ b/roles/prometheus/templates/rules/alerts_acme.conf.yml @@ -35,3 +35,4 @@ groups: annotations: summary: 'ACME automation completely broken' description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.' + runbook: '[[ alert_playbook_url ]]/ACMEBroken.md' diff --git a/roles/prometheus/files/rules/alerts_backup.conf.yml b/roles/prometheus/templates/rules/alerts_backup.conf.yml similarity index 91% rename from roles/prometheus/files/rules/alerts_backup.conf.yml rename to roles/prometheus/templates/rules/alerts_backup.conf.yml index 6d5d0f7d..45402b3f 100644 --- a/roles/prometheus/files/rules/alerts_backup.conf.yml +++ b/roles/prometheus/templates/rules/alerts_backup.conf.yml @@ -17,3 +17,4 @@ groups: annotations: summary: '{{ $labels.dataset }} backup failure on {{ $labels.host }}' description: 'Dataset {{ $labels.dataset }} has failed its backups on {{ $labels.host }} for two days.' 
+ runbook: '[[ alert_playbook_url ]]/BackupFailed.md' diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/templates/rules/alerts_base.conf.yml similarity index 94% rename from roles/prometheus/files/rules/alerts_base.conf.yml rename to roles/prometheus/templates/rules/alerts_base.conf.yml index 1cc6bf60..a0aaf2d3 100644 --- a/roles/prometheus/files/rules/alerts_base.conf.yml +++ b/roles/prometheus/templates/rules/alerts_base.conf.yml @@ -63,6 +63,7 @@ groups: annotations: summary: 'Job {{ $labels.job }} is down globally' description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).' + runbook: '[[ alert_playbook_url ]]/JobDown.md' - alert: ProbeFailure expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5 @@ -74,6 +75,7 @@ groups: summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing' description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing for target {{ $labels.target }} (success ratio {{ $value }}).' + runbook: '[[ alert_playbook_url ]]/ProbeFailure.md' - alert: ProbeFailure expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5 @@ -85,3 +87,4 @@ groups: summary: 'Probe {{ $labels.probe }} is failing globally' description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing globally (success ratio {{ $value }}).' + runbook: '[[ alert_playbook_url ]]/ProbeFailure.md' diff --git a/roles/prometheus/files/rules/alerts_cpu.conf.yml b/roles/prometheus/templates/rules/alerts_cpu.conf.yml similarity index 92% rename from roles/prometheus/files/rules/alerts_cpu.conf.yml rename to roles/prometheus/templates/rules/alerts_cpu.conf.yml index bcc59126..96508963 100644 --- a/roles/prometheus/files/rules/alerts_cpu.conf.yml +++ b/roles/prometheus/templates/rules/alerts_cpu.conf.yml @@ -22,4 +22,5 @@ groups: annotations: summary: 'Host {{$labels.host}} is thrashing' description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.' + runbook: '[[ alert_playbook_url ]]/HostThrashing.md' diff --git a/roles/prometheus/files/rules/alerts_disk.conf.yml b/roles/prometheus/templates/rules/alerts_disk.conf.yml similarity index 87% rename from roles/prometheus/files/rules/alerts_disk.conf.yml rename to roles/prometheus/templates/rules/alerts_disk.conf.yml index 2796b298..6d0df8e4 100644 --- a/roles/prometheus/files/rules/alerts_disk.conf.yml +++ b/roles/prometheus/templates/rules/alerts_disk.conf.yml @@ -10,3 +10,4 @@ groups: annotations: summary: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is almost full' description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} will be full in less than 4 hours' + runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours.md' diff --git a/roles/prometheus/files/rules/alerts_mysql.conf.yml b/roles/prometheus/templates/rules/alerts_mysql.conf.yml similarity index 85% rename from roles/prometheus/files/rules/alerts_mysql.conf.yml rename to roles/prometheus/templates/rules/alerts_mysql.conf.yml index 4d33a13f..6cdccd78 100644 --- a/roles/prometheus/files/rules/alerts_mysql.conf.yml +++ b/roles/prometheus/templates/rules/alerts_mysql.conf.yml @@ -9,6 +9,8 @@ groups: annotations: description: Slave replication (IO or SQL) has been down for more than 2 minutes on {{ $labels.job }}@{{ $labels.host }}. 
summary: Slave replication is not running for {{ $labels.job }} + runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken.md' + - alert: MySQLReplicationBehind expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0) for: 2m @@ -17,6 +19,8 @@ groups: annotations: description: The mysql slave replication has fallen behind and is not recovering on {{ $labels.job }}@{{ $labels.host }}. summary: MySQL slave replication is lagging for {{ $labels.job }} + runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind.md' + - alert: MySQLInnoDBLogWaits expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 labels: @@ -24,3 +28,5 @@ groups: annotations: description: The innodb logs are waiting for disk at a rate of {{$value}} / second on {{ $labels.job }}@{{ $labels.host }} summary: MySQL innodb log writes stalling for {{ $labels.job }} + runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits.md' + diff --git a/roles/prometheus/files/rules/alerts_net.conf.yml b/roles/prometheus/templates/rules/alerts_net.conf.yml similarity index 84% rename from roles/prometheus/files/rules/alerts_net.conf.yml rename to roles/prometheus/templates/rules/alerts_net.conf.yml index 2d9f2ebb..3a94442b 100644 --- a/roles/prometheus/files/rules/alerts_net.conf.yml +++ b/roles/prometheus/templates/rules/alerts_net.conf.yml @@ -9,6 +9,8 @@ groups: annotations: description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.' summary: 'Conntrack table on {{ $labels.instance }} is almost full' + runbook: '[[ alert_playbook_url ]]/ConntrackTableFull.md' + - alert: NetworkErrors expr: instance:node_network_errs_total:rate5m > 1 for: 15m @@ -17,6 +19,8 @@ groups: annotations: summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}' description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.' + runbook: '[[ alert_playbook_url ]]/NetworkErrors.md' + - alert: NetworkDrops expr: instance:node_network_drop_total:rate5m > 1 for: 15m @@ -25,6 +29,8 @@ groups: annotations: summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}' description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.' + runbook: '[[ alert_playbook_url ]]/NetworkDrops.md' + - alert: HostUnreachable expr: probe_success{job="blackbox_ping"} < 1 for: 5m @@ -33,3 +39,4 @@ groups: annotations: summary: 'Host {{ $labels.instance }} is unreachable' description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).' + runbook: '[[ alert_playbook_url ]]/HostUnreachable.md' diff --git a/roles/prometheus/files/rules/alerts_nginx.conf.yml b/roles/prometheus/templates/rules/alerts_nginx.conf.yml similarity index 87% rename from roles/prometheus/files/rules/alerts_nginx.conf.yml rename to roles/prometheus/templates/rules/alerts_nginx.conf.yml index 15d5e967..a830d140 100644 --- a/roles/prometheus/files/rules/alerts_nginx.conf.yml +++ b/roles/prometheus/templates/rules/alerts_nginx.conf.yml @@ -12,6 +12,7 @@ groups: annotations: summary: 'High HTTP error ratio for {{$labels.vhost}} globally' description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.' 
+ runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md' - alert: HTTPErrorRatioHigh expr: (host:nginx_http_requests_errs:ratio > 0.2 and host:nginx_http_requests_total:rate5m > 0.1) @@ -23,3 +24,5 @@ groups: annotations: summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}' description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.' + runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md' + diff --git a/roles/prometheus/files/rules/alerts_prometheus.conf.yml b/roles/prometheus/templates/rules/alerts_prometheus.conf.yml similarity index 82% rename from roles/prometheus/files/rules/alerts_prometheus.conf.yml rename to roles/prometheus/templates/rules/alerts_prometheus.conf.yml index e384a7ba..24049492 100644 --- a/roles/prometheus/files/rules/alerts_prometheus.conf.yml +++ b/roles/prometheus/templates/rules/alerts_prometheus.conf.yml @@ -1,170 +1,162 @@ groups: - name: roles/prometheus/files/rules/alerts_prometheus.conf rules: - - alert: PrometheusUnreachable - expr: up{job=~"prometheus.*"} == 0 - for: 10m - labels: - pager: pagerduty - service: prometheus - severity: critical - annotations: - description: '{{$labels.job}} at {{$labels.instance}} could not be scraped for - over 10 minutes.' - title: '{{$labels.job}} is unreachable' - - alert: PrometheusManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus.*"}[30m]) > 3 - for: 30m - labels: - pager: pagerduty - service: prometheus - severity: critical - annotations: - description: '{{$labels.job}} at {{$labels.instance}} has restarted more than - 3 times in the last 30 minutes. It might be crashlooping.' - title: '{{$labels.job}} is restarting frequently' - alert: PrometheusRuleEvaluationSlow expr: prometheus_evaluator_duration_seconds{job=~"prometheus.*",quantile="0.9"} > 60 for: 10m labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s completing rule evaluation cycles.' title: '{{$labels.job}} is evaluating rules too slowly' + - alert: PrometheusCheckpointingSlow expr: avg_over_time(prometheus_local_storage_checkpoint_last_duration_seconds{job=~"prometheus.*"}[15m]) > prometheus_local_storage_max_chunks_to_persist{job=~"prometheus.*"} / 5000 for: 5m labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} needs {{$value}}s on average for each checkpoint.' title: '{{$labels.job}} is checkpointing too slowly' + - alert: PrometheusIndexingBacklog expr: prometheus_local_storage_indexing_queue_length{job=~"prometheus.*"} / prometheus_local_storage_indexing_queue_capacity{job=~"prometheus.*"} * 100 > 10 for: 30m labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the indexing queue for more than 30m. Queue is currently {{$value | printf `%.0f`}}% full.' title: '{{$labels.job}} is backlogging on the indexing queue' + - alert: PrometheusNotIngestingSamples expr: rate(prometheus_local_storage_ingested_samples_total{job=~"prometheus.*"}[5m]) == 0 for: 5m labels: - service: prometheus - severity: critical + scope: host + severity: page annotations: description: '{{$labels.job}} at {{$labels.instance}} has not ingested any samples in the last 10 minutes.' 
title: '{{$labels.job}} is not ingesting samples' + runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples.md' + - alert: PrometheusPersistErrors expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m]) > 0 labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} has encountered {{$value}} persist errors per second in the last 10 minutes.' title: '{{$labels.job}} has persist errors' + - alert: PrometheusNotificationsBacklog expr: prometheus_notifications_queue_length{job=~"prometheus.*"} > 0 for: 10m labels: - pager: pagerduty - service: prometheus - severity: critical + scope: host + severity: page annotations: description: '{{$labels.job}} at {{$labels.instance}} is backlogging on the notifications queue. The queue has not been empty for 10 minutes. Current queue length: {{$value}}.' title: '{{$labels.job}} is backlogging on the notifications queue' + runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog.md' + - alert: PrometheusScrapingSlowly expr: prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"} > 2 * 60 for: 10m labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} has a 90th percentile latency of {{$value}}s for scraping targets in the {{$labels.interval}} target pool.' title: '{{$labels.job}} is scraping targets slowly' + - alert: PrometheusStorageInconsistent expr: prometheus_local_storage_inconsistencies_total{job=~"prometheus.*"} > 0 labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} has detected a storage inconsistency. A server restart is needed to initiate recovery.' title: '{{$labels.job}} has an inconsistent storage' + - alert: PrometheusPersistencePressureTooHigh expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.8 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 24) > 1 for: 30m labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 24h.' title: '{{$labels.job}} can not keep up persisting' + - alert: PrometheusPersistencePressureTooHigh expr: prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"} > 0.85 and predict_linear(prometheus_local_storage_persistence_urgency_score{job=~"prometheus.*"}[30m], 3600 * 2) > 1 for: 30m labels: - service: prometheus - severity: critical + scope: host + severity: page annotations: description: '{{$labels.job}} at {{$labels.instance}} is approaching critical persistence pressure. Throttled ingestion expected within the next 2h.' 
title: '{{$labels.job}} can not keep up persisting' + runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh.md' + - alert: PrometheusSeriesMaintenanceStalled expr: prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance) rate(prometheus_local_storage_series_ops_total{job=~"prometheus.*",type="maintenance_in_memory"}[5m]) / 3600 > 24 and ON(job, instance) prometheus_local_storage_rushed_mode == 1 for: 1h labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} is maintaining memory time series so slowly that it will take {{$value | printf `%.0f`}}h to complete a full cycle. This will lead to persistence falling behind.' title: '{{$labels.job}} is maintaining memory time series too slowly' + - alert: PrometheusInvalidConfigFile expr: prometheus_config_last_reload_successful{job=~"prometheus.*"} == 0 for: 30m labels: - pager: pagerduty - service: prometheus - severity: critical + scope: host + severity: page annotations: description: The configuration file for {{$labels.job}} at {{$labels.instance}} is invalid and was therefore not reloaded. title: '{{$labels.job}} has an invalid config' + runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile.md' + - alert: PrometheusOutOfOrderSamplesDiscarded expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m]) > 0 for: 1h labels: - service: prometheus + scope: host severity: warn annotations: description: '{{$labels.job}} at {{$labels.instance}} has discarded {{$value}} diff --git a/roles/prometheus/files/rules/alerts_services.conf.yml b/roles/prometheus/templates/rules/alerts_services.conf.yml similarity index 91% rename from roles/prometheus/files/rules/alerts_services.conf.yml rename to roles/prometheus/templates/rules/alerts_services.conf.yml index 56711a0d..36e37b04 100644 --- a/roles/prometheus/files/rules/alerts_services.conf.yml +++ b/roles/prometheus/templates/rules/alerts_services.conf.yml @@ -10,6 +10,7 @@ groups: annotations: description: 'Availability too low for service {{ $labels.float_service }}' summary: 'Availability too low for service {{ $labels.float_service }}' + runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow.md' - alert: ServiceDegraded expr: float_service:ok_by_host == 0 diff --git a/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml b/roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml similarity index 72% rename from roles/prometheus/files/rules/alerts_ssl_probes.conf.yml rename to roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml index cb436291..ddc0efc8 100644 --- a/roles/prometheus/files/rules/alerts_ssl_probes.conf.yml +++ b/roles/prometheus/templates/rules/alerts_ssl_probes.conf.yml @@ -2,11 +2,13 @@ groups: - name: roles/prometheus/files/rules/alerts_ssl_probes.conf rules: - alert: SSLCertificateAboutToExpire - expr: target:probe_ssl_cert_expiry:days < 15 - for: 1h + expr: target:probe_ssl_cert_expiry:days < 10 + for: 3h labels: + scope: global severity: page annotations: summary: 'SSL certificate about to expire for {{ $labels.job }}@{{ $labels.target }}' description: 'The "{{ $labels.job }}" prober reports that {{ $labels.target }} is serving a SSL certificate that will expire in {{ $value }} days.' 
+ runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire.md' diff --git a/roles/prometheus/templates/rules/alerts_syslog.conf.yml b/roles/prometheus/templates/rules/alerts_syslog.conf.yml new file mode 100644 index 00000000..3b15c3e9 --- /dev/null +++ b/roles/prometheus/templates/rules/alerts_syslog.conf.yml @@ -0,0 +1,13 @@ +groups: + - name: roles/prometheus/files/rules/alerts_syslog.conf + rules: + - alert: SyslogDroppingLogs + expr: rsyslog_queue_discarded:rate5m{job="rsyslog-collector"} > 10 + for: 15m + labels: + scope: global + severity: page + annotations: + description: 'Syslog collector is dropping logs on {{ $labels.host }}' + summary: 'Syslog collector is dropping logs' + runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs.md' diff --git a/roles/prometheus/files/rules/alerts_system_health.conf.yml b/roles/prometheus/templates/rules/alerts_system_health.conf.yml similarity index 91% rename from roles/prometheus/files/rules/alerts_system_health.conf.yml rename to roles/prometheus/templates/rules/alerts_system_health.conf.yml index 476c4654..07f0b806 100644 --- a/roles/prometheus/files/rules/alerts_system_health.conf.yml +++ b/roles/prometheus/templates/rules/alerts_system_health.conf.yml @@ -19,12 +19,14 @@ groups: summary: "RAID device {{ $labels.md_device }} on {{ $labels.host }} is unhealthy" description: "The RAID device {{ $labels.md_device }} on {{ $labels.host }} is reporting a degraded state, which means that probably one or more of the disks in the array have failed." - - alert: HostPhysicalComponentTooHot + - alert: PhysicalComponentTooHot expr: node_hwmon_temp_celsius / node_hwmon_temp_crit_celsius > 0.95 for: 2h labels: + scope: host severity: page annotations: summary: "A physical component is running too hot on {{ $labels.host }}" description: "A sensor is reporting that a physical component ({{ $labels.sensor }}/{{ $labels.chip }}) on {{ $labels.host }} has been running very close to the critical level ({{ $value }}) for the last 2 hours." + runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot.md' diff --git a/roles/prometheus/files/rules/alerts_systemd.conf.yml b/roles/prometheus/templates/rules/alerts_systemd.conf.yml similarity index 89% rename from roles/prometheus/files/rules/alerts_systemd.conf.yml rename to roles/prometheus/templates/rules/alerts_systemd.conf.yml index 07c2b48f..f28f67a1 100644 --- a/roles/prometheus/files/rules/alerts_systemd.conf.yml +++ b/roles/prometheus/templates/rules/alerts_systemd.conf.yml @@ -9,11 +9,15 @@ groups: annotations: summary: '{{ $labels.name }} has failed on {{ $labels.host }}' description: 'The systemd unit {{ $labels.name }} has failed on {{ $labels.host }}.' + - alert: SystemdUnitCrashLooping expr: instance:systemd_unit_restarts:delta10m > 12 for: 30m labels: + scope: host severity: page annotations: summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}' description: 'Systemd unit {{ $labels.unit }} is being restarted repeatedly. Likely a configuration problem.' 
+ runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping.md' + diff --git a/roles/prometheus/files/rules/rules_acme.conf.yml b/roles/prometheus/templates/rules/rules_acme.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_acme.conf.yml rename to roles/prometheus/templates/rules/rules_acme.conf.yml diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/templates/rules/rules_base.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_base.conf.yml rename to roles/prometheus/templates/rules/rules_base.conf.yml diff --git a/roles/prometheus/files/rules/rules_cpu.conf.yml b/roles/prometheus/templates/rules/rules_cpu.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_cpu.conf.yml rename to roles/prometheus/templates/rules/rules_cpu.conf.yml diff --git a/roles/prometheus/files/rules/rules_disk.conf.yml b/roles/prometheus/templates/rules/rules_disk.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_disk.conf.yml rename to roles/prometheus/templates/rules/rules_disk.conf.yml diff --git a/roles/prometheus/files/rules/rules_elasticsearch.conf.yml b/roles/prometheus/templates/rules/rules_elasticsearch.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_elasticsearch.conf.yml rename to roles/prometheus/templates/rules/rules_elasticsearch.conf.yml diff --git a/roles/prometheus/files/rules/rules_mysql.conf.yml b/roles/prometheus/templates/rules/rules_mysql.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_mysql.conf.yml rename to roles/prometheus/templates/rules/rules_mysql.conf.yml diff --git a/roles/prometheus/files/rules/rules_net.conf.yml b/roles/prometheus/templates/rules/rules_net.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_net.conf.yml rename to roles/prometheus/templates/rules/rules_net.conf.yml diff --git a/roles/prometheus/files/rules/rules_nginx.conf.yml b/roles/prometheus/templates/rules/rules_nginx.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_nginx.conf.yml rename to roles/prometheus/templates/rules/rules_nginx.conf.yml diff --git a/roles/prometheus/files/rules/rules_node_016.conf.yml b/roles/prometheus/templates/rules/rules_node_016.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_node_016.conf.yml rename to roles/prometheus/templates/rules/rules_node_016.conf.yml diff --git a/roles/prometheus/files/rules/rules_services.conf.yml b/roles/prometheus/templates/rules/rules_services.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_services.conf.yml rename to roles/prometheus/templates/rules/rules_services.conf.yml diff --git a/roles/prometheus/files/rules/rules_ssl_probes.conf.yml b/roles/prometheus/templates/rules/rules_ssl_probes.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_ssl_probes.conf.yml rename to roles/prometheus/templates/rules/rules_ssl_probes.conf.yml diff --git a/roles/prometheus/files/rules/rules_syslog.conf.yml b/roles/prometheus/templates/rules/rules_syslog.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_syslog.conf.yml rename to roles/prometheus/templates/rules/rules_syslog.conf.yml diff --git a/roles/prometheus/files/rules/rules_systemd.conf.yml b/roles/prometheus/templates/rules/rules_systemd.conf.yml similarity index 100% rename from roles/prometheus/files/rules/rules_systemd.conf.yml rename to 
roles/prometheus/templates/rules/rules_systemd.conf.yml
--
GitLab
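Usage note (illustrative, not part of the patch): after the role renders these templates, the files written to /etc/prometheus/rules/ carry concrete runbook URLs while the Prometheus template expressions survive intact. A rough sketch of the rendered JobDown fragment, assuming the default alert_playbook_url and a placeholder domain of example.org:

    # Hypothetical rendered fragment of /etc/prometheus/rules/alerts_base.conf.yml,
    # assuming domain=example.org and the default alert_playbook_url.
    - alert: JobDown
      # expr / for / labels as defined in the template
      annotations:
        summary: 'Job {{ $labels.job }} is down globally'
        description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
        runbook: 'https://playbooks.example.org/JobDown.md'

Deployments whose runbooks live elsewhere can override alert_playbook_url (for example in group vars) to point at that location.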