From af97738cf5190ae009d033c7301821fc498514fb Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 28 Apr 2019 16:01:02 +0100
Subject: [PATCH] Update Prometheus config to support multiple instances

Enable cross-scraping of probers, and add proper prober alerting
rules.
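
As an illustration of how the new job:up:* recording rules drive the
global JobDown alerts (purely hypothetical numbers): with three
instances of a job and one of them down,

    job:up:sum   = 2
    job:up:count = 3
    job:up:ratio = 2 / 3 ~= 0.67

so the ratio is below 1 and the warn-level "degraded redundancy" alert
fires, but it is still above 0.51, so the page-level "down globally"
alert does not.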

Configure alertmanagers' mesh protocol so they talk to each other.
Make Prometheus talk to all alertmanagers.
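
For example, with two Prometheus hosts (hypothetical names host1 and
host2, and assuming both domain_public[0] and domain are example.org),
the alertmanager defaults template now renders roughly as:

    ARGS="-web.external-url=https://alertmanager.example.org
      --mesh.password=<mesh secret> --mesh.listen-address=:6783
      --mesh.peer=host1.prometheus.example.org:6783
      --mesh.peer=host2.prometheus.example.org:6783"

(the template actually emits this as a single line), with port 6783
opened for the service so the alertmanager instances can gossip with
each other.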

Fixes issue #6.
---
 passwords.yml.default                         |   4 +
 .../files/rules/alerts_base.conf.yml          |  74 ++++++++-
 .../files/rules/rules_base.conf.yml           |  32 ++++
 .../templates/alertmanager.default.j2         |   2 +-
 .../prometheus/templates/alertmanager.yml.j2  |  11 +-
 roles/prometheus/templates/prometheus.yml.j2  | 146 +++++++++++-------
 services.yml.default                          |   2 +
 7 files changed, 198 insertions(+), 73 deletions(-)
 create mode 100644 roles/prometheus/files/rules/rules_base.conf.yml

diff --git a/passwords.yml.default b/passwords.yml.default
index f463d00b..34e506c2 100644
--- a/passwords.yml.default
+++ b/passwords.yml.default
@@ -30,3 +30,7 @@
 
 - name: acme_tsig_key
   type: tsig
+
+- name: prometheus_alertmanager_mesh_secret
+  description: Password for the Prometheus alertmanager mesh protocol
+  length: 32
diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/files/rules/alerts_base.conf.yml
index 4b68c0a7..b6dd693f 100644
--- a/roles/prometheus/files/rules/alerts_base.conf.yml
+++ b/roles/prometheus/files/rules/alerts_base.conf.yml
@@ -1,20 +1,78 @@
 groups:
 - name: roles/prometheus/files/rules/alerts_base.conf
   rules:
-  - alert: InstanceDown
-    expr: up < 1
-    for: 10m
+
+  # HostUnreachable is used as a gate for most other host-based pages
+  # (via inhibit rules in the alertmanager configuration).
+  - alert: HostUnreachable
+    expr: host_reachable == 0
+    for: 1m
     labels:
       severity: warn
     annotations:
-      DESCRIPTION: 'Job {{ $labels.job }} on {{ $labels.instance }} has been down
-        for more than 10 minutes.'
-      SUMMARY: Instance {{ $labels.job }} / {{ $labels.instance }} is down
+      summary: Host {{ $labels.host }} is down
+      description: 'Host {{ $labels.host }} is unreachable (icmp ping).'
+
   - alert: Reboot
     expr: os_uptime < 600
     for: 1m
     labels:
       severity: warn
     annotations:
-      DESCRIPTION: reboot on {{ $labels.instance }}
-      SUMMARY: reboot on {{ $labels.instance }}
+      description: reboot on {{ $labels.host }}
+      summary: reboot on {{ $labels.host }}
+
+  - alert: JobDown
+    expr: up < 1
+    for: 5m
+    labels:
+      severity: warn
+      scope: host
+    annotations:
+      summary: Job {{ $labels.job }}@{{ $labels.host }} is down
+      description: 'Job {{ $labels.job }} on {{ $labels.host }} has been down
+        for more than 5 minutes. If this is a prober job, then the alert refers
+        to the prometheus-blackbox-exporter service itself.'
+
+  - alert: JobDown
+    expr: job:up:ratio < 1
+    for: 5m
+    labels:
+      severity: warn
+      scope: global
+    annotations:
+      summary: Job {{ $labels.job }} has degraded redundancy
+      description: 'Job {{ $labels.job }} is running with degraded redundancy
+        ({{$value}}) and may be at risk if more instances fail.'
+
+  - alert: JobDown
+    expr: job:up:ratio < 0.51
+    for: 5m
+    labels:
+      severity: page
+      scope: global
+    annotations:
+      summary: Job {{ $labels.job }} is down globally
+      description: 'Job {{ $labels.job }} is down globally (availability {{$value}}).'
+
+  - alert: ProbeFailure
+    expr: target:probe_success:ratio{probe!="ping"} < 0.5
+    for: 5m
+    labels:
+      severity: page
+      scope: host
+    annotations:
+      summary: Probe {{ $labels.probe }}@{{ $labels.target }} is failing
+      description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
+        for target {{ $labels.target }} (success ratio {{ $value }}).'
+
+  - alert: ProbeFailure
+    expr: probe:probe_success:ratio{probe!="ping"} < 0.5
+    for: 5m
+    labels:
+      severity: page
+      scope: global
+    annotations:
+      summary: Probe {{ $labels.probe }} is failing globally
+      description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
+        globally (success ratio {{ $value }}).'
diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/files/rules/rules_base.conf.yml
new file mode 100644
index 00000000..cdc16006
--- /dev/null
+++ b/roles/prometheus/files/rules/rules_base.conf.yml
@@ -0,0 +1,32 @@
+groups:
+  - name: roles/prometheus/files/rules/rules_base.conf
+    rules:
+      - record: job:up:count
+        expr: count(up) by (job)
+      - record: job:up:sum
+        expr: sum(up) by (job)
+      - record: job:up:ratio
+        expr: job:up:sum / job:up:count
+
+      # Sum prober metrics over the probers (hosts), producing
+      # an aggregation by target.
+      - record: target:probe_success:count
+        expr: count(probe_success) by (probe,zone,target)
+      - record: target:probe_success:sum
+        expr: sum(probe_success) by (probe,zone,target)
+      - record: target:probe_success:ratio
+        expr: target:probe_success:sum / target:probe_success:count
+
+      # Sum prober metrics over targets, aggregating by probe.
+      - record: probe:probe_success:count
+        expr: count(probe_success) by (probe,zone)
+      - record: probe:probe_success:sum
+        expr: sum(probe_success) by (probe,zone)
+      - record: probe:probe_success:ratio
+        expr: probe:probe_success:sum / probe:probe_success:count
+
+      # Special metric for the ping probe: 1 when the host answers pings
+      # (success ratio above 0.6), 0 otherwise. label_replace() sets host
+      # to the target label (instead of the host running the prober).
+      - record: host_reachable
+        expr: label_replace(target:probe_success:ratio{probe="ping"} > bool 0.6, "host", "$1", "target", "(.*)")
diff --git a/roles/prometheus/templates/alertmanager.default.j2 b/roles/prometheus/templates/alertmanager.default.j2
index fa1d17c3..61b85cec 100644
--- a/roles/prometheus/templates/alertmanager.default.j2
+++ b/roles/prometheus/templates/alertmanager.default.j2
@@ -1,2 +1,2 @@
-ARGS="-web.external-url=https://alertmanager.{{ domain_public[0] }}"
+ARGS="-web.external-url=https://alertmanager.{{ domain_public[0] }} --mesh.password={{ prometheus_alertmanager_mesh_secret }} --mesh.listen-address=:6783{% for h in groups['prometheus'] %} --mesh.peer={{ h }}.prometheus.{{ domain }}:6783{% endfor %}"
 
diff --git a/roles/prometheus/templates/alertmanager.yml.j2 b/roles/prometheus/templates/alertmanager.yml.j2
index 042f7e58..716173ff 100644
--- a/roles/prometheus/templates/alertmanager.yml.j2
+++ b/roles/prometheus/templates/alertmanager.yml.j2
@@ -12,19 +12,22 @@ route:
   repeat_interval: 3h
   receiver: default
 
+# The following inhibit rules are meant to work both with the job/host
+# hierarchy (for standard metrics) and with the probe/target one (for
+# the blackbox prober metrics).
 inhibit_rules:
-  # Inhibit severity=warning alerts if the same alert is already active with severity=page.
+  # Inhibit severity=warn alerts if the same alert is already active with severity=page.
   - source_match:
       severity: 'page'
     target_match:
-      severity: 'warning'
-    equal: ['alertname', 'service']
+      severity: 'warn'
+    equal: ['alertname', 'job', 'probe']
   # Inhibit scope=host alerts if the same alert is active with scope=global.
   - source_match:
       scope: 'global'
     target_match:
       scope: 'host'
-    equal: ['alertname', 'service']
+    equal: ['alertname', 'job', 'probe']
   # Inhibit all host-level alerts if HostUnreachable is firing.
   - source_match:
       alertname: HostUnreachable
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 576b9761..6374e5da 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -7,60 +7,37 @@
         replacement: "${1}"
 {% endmacro %}
 
-global:
-  scrape_interval: "10s"
-  external_labels:
-    monitor: "{{ inventory_hostname }}"
-
-alerting:
-  alertmanagers:
-    - static_configs:
+{# Generate static targets for hosts in an Ansible group #}
+{% macro targets_for_group(group, port) %}
       - targets:
-        - localhost:9093
-
-scrape_configs:
-  - job_name: "node"
-    static_configs:
-      - targets:
-{% for host in groups['all'] %}{% if host != 'localhost' %}
-          - "{{ host }}:9100"
-{% endif %}{% endfor %}
-{{ relabel_configs() }}
-
-  - job_name: "cgroups-exporter"
-    static_configs:
-      - targets:
-{% for host in groups['all'] %}{% if host != 'localhost' %}
-          - "{{ host }}:3909"
-{% endif %}{% endfor %}
-{{ relabel_configs() }}
-
-  - job_name: "mtail"
-    static_configs:
-      - targets:
-{% for host in groups['all'] %}{% if host != 'localhost' %}
-          - "{{ host }}:3903"
-{% endif %}{% endfor %}
-{{ relabel_configs() }}
+{% for host in groups[group] %}
+          - "{{ host }}:{{ port }}"
+{% endfor %}
+{% endmacro %}
 
-  - job_name: "docker"
+{# Generate a static_configs entry for a scrape config #}
+{% macro static_configs_for_group(group, port, service_label='') %}
     static_configs:
-      - targets:
-{% for host in groups['all'] %}{% if host != 'localhost' %}
-          - "{{ host }}:9323"
-{% endif %}{% endfor %}
-{{ relabel_configs() }}
+{{ targets_for_group(group, port) }}
+{% if service_label %}
+        labels:
+          service: "{{ service_label }}"
+{% endif %}
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: host
+        regex: "([^.]*).*:[0-9]+"
+        replacement: "${1}"
+{% endmacro %}
 
-  - job_name: "backup-agent"
-    static_configs:
-      - targets:
-{% for host in groups['all'] %}{% if host != 'localhost' %}
-          - "{{ host }}:5331"
-{% endif %}{% endfor %}
-{{ relabel_configs() }}
+{# Simple group-based job scrape config #}
+{% macro job_static_config(job_name, group, port) %}
+  - job_name: "{{ job_name }}"
+{{ static_configs_for_group(group, port) }}
+{% endmacro %}
 
-{# Template for a static target. #}
-{% macro static_target(service_name, target_config) %}
+{# Job scrape config for a float service. #}
+{% macro job_service_config(service_name, target_config) %}
   - job_name: "{{ target_config.get('job_name', service_name) }}"
     scheme: "{{ target_config.get('scheme', 'https') }}"
 {% if target_config.get('scheme', 'https') == 'https' %}
@@ -69,42 +46,91 @@ scrape_configs:
       cert_file: /etc/credentials/x509/prometheus/client/cert.pem
       key_file: /etc/credentials/x509/prometheus/client/private_key.pem
 {% endif %}
-    static_configs:
-      - targets:
-{% for host in groups[target_config.get('group', service_name)] %}
-          - "{{ host }}.{{ service_name }}.{{ domain }}:{{ target_config['port'] }}"
-{% endfor %}
-{{ relabel_configs() }}
-
+{{ static_configs_for_group(target_config.get('group', service_name), target_config.port, service_name) }}
 {% endmacro %}
 
+global:
+  scrape_interval: "10s"
+  external_labels:
+    monitor: "{{ inventory_hostname }}"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+{{ targets_for_group('prometheus', 9093) }}
+
+scrape_configs:
+{{ job_static_config('node', 'all', 9100) }}
+{{ job_static_config('cgroups-exporter', 'all', 3909) }}
+{{ job_static_config('mtail', 'all', 3903) }}
+{{ job_static_config('docker', 'all', 9323) }}
+{{ job_static_config('backup-agent', 'all', 5331) }}
+
+
 {# Iterate over monitoring endpoints of all services. #}
 {% for service_name, service in services.items() %}
   {%- for target_config in service.get('monitoring_endpoints', []) %}
-{{ static_target(service_name, target_config) }}
+{{ job_service_config(service_name, target_config) }}
   {%- endfor %}
 {% endfor %}
 
 
 {# Blackbox prober configs: host targets (icmp) #}
-{# TODO: scrape *all* blackbox exporters #}
-  - job_name: "blackbox_ping"
+{% for prober_host in groups['prometheus'] %}
+
+  - job_name: "prober_ping_{{ loop.index }}"
     metrics_path: "/probe"
     params:
       module: [ping]
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: target
+        regex: "([^.]*).*"
+        replacement: "${1}"
+      - source_labels: [__address__]
+        target_label: __param_target
+      - source_labels: [__param_target]
+        target_label: instance
+      - target_label: __address__
+        replacement: {{ prober_host }}:9115
+      - target_label: host
+        replacement: {{ prober_host }}
     static_configs:
       - targets:
 {% for host in groups['all'] %}
         - "{{ host }}"
 {% endfor %}
+        labels:
+          zone: internal
+          probe: ping
+
+  - job_name: "prober_https_{{ loop.index }}"
+    metrics_path: "/probe"
+    params:
+      module: [https]
     relabel_configs:
+      - source_labels: [__address__]
+        target_label: target
+        regex: "http://([^.:/]*).*"
+        replacement: "${1}"
       - source_labels: [__address__]
         target_label: __param_target
       - source_labels: [__param_target]
         target_label: instance
       - target_label: __address__
-        replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
+        replacement: {{ prober_host }}:9115
+      - target_label: host
+        replacement: {{ prober_host }}
+    static_configs:
+      - targets:
+{% for host in groups['frontend'] %}
+        - "https://{{ host }}/"
+{% endfor %}
+        labels:
+          zone: public
+          probe: https
 
+{% endfor %}
 
 rule_files:
   - /etc/prometheus/rules/*.yml
diff --git a/services.yml.default b/services.yml.default
index d68f94a9..60f62ca0 100644
--- a/services.yml.default
+++ b/services.yml.default
@@ -102,6 +102,8 @@ prometheus:
     - prometheus.service
     - prometheus-alertmanager.service
     - prometheus-blackbox-exporter.service
+  ports:
+    - 6783
 
 sso-server:
   num_instances: 1
-- 
GitLab