From 676c70285ca6da98c338cb09069343625fa75c9a Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 26 Apr 2020 07:54:18 +0100
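Subject: [PATCH] Add "probeset" dimension to probe metrics

Attach a "probeset: base" label to the ping, https_up, https and dns
probe targets, and add "probeset" to the by() grouping of the
target:probe_success and probe:probe_success count/sum recording rules
so the label is kept on the aggregated series.

Both ProbeFailure alerts now also exclude series with
probeset="service".

This patch additionally:
 - changes the JobDown alert threshold from 0.51 to 0.5;
 - adds a "zone: external" label to the static_configs entry that is
   rendered from target.targets.
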
---
 roles/prometheus/files/rules/alerts_base.conf.yml | 6 +++---
 roles/prometheus/files/rules/rules_base.conf.yml  | 8 ++++----
 roles/prometheus/templates/prometheus.yml.j2      | 6 ++++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/files/rules/alerts_base.conf.yml
index fc392726..1cc6bf60 100644
--- a/roles/prometheus/files/rules/alerts_base.conf.yml
+++ b/roles/prometheus/files/rules/alerts_base.conf.yml
@@ -55,7 +55,7 @@ groups:
         redundancy ({{ $value }}) and may eventually be at risk.'
 
   - alert: JobDown
-    expr: job:up:ratio < 0.51
+    expr: job:up:ratio < 0.5
     for: 5m
     labels:
       severity: page
@@ -65,7 +65,7 @@ groups:
       description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
 
   - alert: ProbeFailure
-    expr: target:probe_success:ratio{probe!="ping"} < 0.5
+    expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
     for: 5m
     labels:
       severity: page
@@ -76,7 +76,7 @@ groups:
         for target {{ $labels.target }} (success ratio {{ $value }}).'
 
   - alert: ProbeFailure
-    expr: probe:probe_success:ratio{probe!="ping"} < 0.5
+    expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
     for: 5m
     labels:
       severity: page
diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/files/rules/rules_base.conf.yml
index cdc16006..7f08c3b7 100644
--- a/roles/prometheus/files/rules/rules_base.conf.yml
+++ b/roles/prometheus/files/rules/rules_base.conf.yml
@@ -11,17 +11,17 @@ groups:
       # Sum prober metrics over the probers (hosts), producing
       # an aggregation by target.
       - record: target:probe_success:count
-        expr: count(probe_success) by (probe,zone,target)
+        expr: count(probe_success) by (probe,probeset,zone,target)
       - record: target:probe_success:sum
-        expr: sum(probe_success) by (probe,zone,target)
+        expr: sum(probe_success) by (probe,probeset,zone,target)
       - record: target:probe_success:ratio
         expr: target:probe_success:sum / target:probe_success:count
 
       # Sum prober metrics over targets, aggregating by probe.
       - record: probe:probe_success:count
-        expr: count(probe_success) by (probe,zone)
+        expr: count(probe_success) by (probe,probeset,zone)
       - record: probe:probe_success:sum
-        expr: sum(probe_success) by (probe,zone)
+        expr: sum(probe_success) by (probe,probeset,zone)
       - record: probe:probe_success:ratio
         expr: probe:probe_success:sum / probe:probe_success:count
 
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 442289ea..b25d5239 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -124,6 +124,7 @@ scrape_configs:
         labels:
           zone: internal
           probe: ping
+          probeset: base
 
   - job_name: "prober_https_{{ prober_idx }}"
     metrics_path: "/probe"
@@ -151,6 +152,7 @@ scrape_configs:
         labels:
           zone: public
           probe: https_up
+          probeset: base
 
 {% for custom_probe in prometheus_custom_blackbox_probes.get('http', []) %}
   - job_name: "prober_https_{{ custom_probe.name }}_{{ prober_idx }}"
@@ -179,6 +181,7 @@ scrape_configs:
         labels:
           zone: public
           probe: https
+          probeset: base
 {% endfor %}
 
   - job_name: "prober_dns_{{ prober_idx }}"
@@ -206,6 +209,7 @@ scrape_configs:
         labels:
           zone: public
           probe: dns
+          probeset: base
 
 {% endfor %}
 
@@ -216,6 +220,8 @@ scrape_configs:
     metrics_path: "{{ target.metrics_path | default('/metrics') }}"
     static_configs:
       - targets: {{ target.targets | to_json }}
+        labels:
+          zone: external
     relabel_configs:
       - source_labels: [__address__]
         target_label: host
-- 
GitLab