From d1abcc792e630e87e0ea1283d6798259e3c61b5d Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 1 Feb 2022 11:19:04 +0000
Subject: [PATCH] Replace job:up base metric with the health check's
 probe_success

Look at healthcheck prober metrics to assess target state, rather than
using Prometheus' own "up" metric. This allows us to take advantage of
redundant blackbox probers by removing noise caused by individual
prober failures.

To do so without changing the semantics, we add a new 'float_job'
label to the health check probes that maps to the Prometheus target names.
---
 .../templates/prometheus.yml.j2                  |  1 +
 .../templates/rules/rules_base.conf.yml          | 16 ++++++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/roles/float-infra-prometheus/templates/prometheus.yml.j2 b/roles/float-infra-prometheus/templates/prometheus.yml.j2
index d3107047..9ce8fbdc 100644
--- a/roles/float-infra-prometheus/templates/prometheus.yml.j2
+++ b/roles/float-infra-prometheus/templates/prometheus.yml.j2
@@ -136,6 +136,7 @@ scrape_configs:
           probeset: health
           prober_float_service: prometheus
           float_service: "{{ service_name }}"
+          float_job: "{{ service_name }}_{{ target_config.port }}"
 {% endfor %}
 {% endfor %}
 
diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index c30dc102..f4ba4107 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -1,10 +1,14 @@
 groups:
   - name: roles/float-infra-prometheus/templates/rules/rules_base.conf
     rules:
+      # Look at prober metrics to assess target state, rather than
+      # Prometheus' "up" metric: a redundant blackbox prober setup
+      # removes noise caused by prober failures. float_job is copied
+      # into job; the outer sum() only strips the float_job label.
       - record: job:up:count
-        expr: count(up) by (job)
+        expr: sum(label_replace(count(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")) without (float_job)
       - record: job:up:sum
-        expr: sum(up) by (job)
+        expr: sum(label_replace(sum(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")) without (float_job)
       - record: job:up:ratio
         expr: job:up:sum / job:up:count
 
@@ -13,17 +17,17 @@ groups:
       # order to preserve additional probe_success labels that might
       # be present.
       - record: target:probe_success:count
-        expr: count(probe_success) without (job,instance,prober_host)
+        expr: count(probe_success) without (job,float_job,instance,prober_host)  # float_job must not split the groups
       - record: target:probe_success:sum
-        expr: sum(probe_success) without (job,instance,prober_host)
+        expr: sum(probe_success) without (job,float_job,instance,prober_host)  # same label set as the count rule
       - record: target:probe_success:ratio
         expr: target:probe_success:sum / target:probe_success:count
 
       # Sum prober metrics over targets, aggregating by probe.
       - record: probe:probe_success:count
-        expr: count(probe_success) without (job,instance,prober_host,host)
+        expr: count(probe_success) without (job,float_job,instance,prober_host,host)  # float_job must not split groups
       - record: probe:probe_success:sum
-        expr: sum(probe_success) without (job,instance,prober_host,host)
+        expr: sum(probe_success) without (job,float_job,instance,prober_host,host)  # same label set as the count rule
       - record: probe:probe_success:ratio
         expr: probe:probe_success:sum / probe:probe_success:count
 
-- 
GitLab