From d1abcc792e630e87e0ea1283d6798259e3c61b5d Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 1 Feb 2022 11:19:04 +0000
Subject: [PATCH] Replace job:up base metric with the health check's probe_success

Look at healthcheck prober metrics to assess target state, rather than
using Prometheus' own "up" metric. This allows us to take advantage of
redundant blackbox probers by removing noise caused by individual
prober failures.

To do so without changing the semantics, we add a new 'float_job' label
on the health check probes, that maps to the Prometheus target names.
---
Review notes (commentary only: this text sits between the "---"
separator and the first "diff --git" line, so it is not part of the
commit message or of the change itself):

* prometheus.yml.j2: every health-check probe target gains a
  'float_job' label rendered as "<service_name>_<target_config.port>",
  next to the existing probeset / prober_float_service / float_service
  labels.
* rules_base.conf.yml, job:up:count and job:up:sum: both now aggregate
  probe_success{probe="health"} by float_job instead of aggregating
  'up' by job. label_replace() then copies the float_job value into a
  'job' label (regex "(.*)", replacement "$1"), so the recorded series
  carry both 'job' and 'float_job'. job:up:ratio is untouched and
  still divides the two recorded series.
* rules_base.conf.yml, target:probe_success:* and probe:probe_success:*:
  'float_job' is added to the 'without' lists so the new label is
  aggregated away and does not split the existing aggregates.
* NOTE(review): the new rules select on probe="health", while the only
  health label visible in the first hunk's context is
  'probeset: health'. Confirm that the health-check targets also carry
  probe="health"; otherwise job:up:* would have no samples.
* NOTE(review): presumably the Prometheus scrape job names follow the
  same "<service>_<port>" scheme as float_job. This cannot be confirmed
  from the patch alone; verify against the scrape_configs.
* NOTE(review): the leading whitespace of the hunk lines below was
  re-indented from a whitespace-collapsed copy of this patch. Run
  `git apply --check` against the tree before applying.

 .../templates/prometheus.yml.j2                  |  1 +
 .../templates/rules/rules_base.conf.yml          | 16 ++++++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/roles/float-infra-prometheus/templates/prometheus.yml.j2 b/roles/float-infra-prometheus/templates/prometheus.yml.j2
index d3107047..9ce8fbdc 100644
--- a/roles/float-infra-prometheus/templates/prometheus.yml.j2
+++ b/roles/float-infra-prometheus/templates/prometheus.yml.j2
@@ -136,6 +136,7 @@ scrape_configs:
           probeset: health
           prober_float_service: prometheus
           float_service: "{{ service_name }}"
+          float_job: "{{ service_name }}_{{ target_config.port }}"
 {% endfor %}
 {% endfor %}
 
diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index c30dc102..f4ba4107 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -1,10 +1,14 @@
 groups:
 - name: roles/float-infra-prometheus/templates/rules/rules_base.conf
   rules:
+  # Look at prober metrics to assess target state, rather than
+  # using Prometheus' "up" metric. This allows us to take
+  # advantage of a redundant blackbox prober setup and remove
+  # noise caused by prober failures.
   - record: job:up:count
-    expr: count(up) by (job)
+    expr: label_replace(count(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
   - record: job:up:sum
-    expr: sum(up) by (job)
+    expr: label_replace(sum(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
   - record: job:up:ratio
     expr: job:up:sum / job:up:count
 
@@ -13,17 +17,17 @@ groups:
   # order to preserve additional probe_success labels that might
   # be present.
   - record: target:probe_success:count
-    expr: count(probe_success) without (job,instance,prober_host)
+    expr: count(probe_success) without (job,float_job,instance,prober_host)
   - record: target:probe_success:sum
-    expr: sum(probe_success) without (job,instance,prober_host)
+    expr: sum(probe_success) without (job,float_job,instance,prober_host)
   - record: target:probe_success:ratio
     expr: target:probe_success:sum / target:probe_success:count
 
   # Sum prober metrics over targets, aggregating by probe.
   - record: probe:probe_success:count
-    expr: count(probe_success) without (job,instance,prober_host,host)
+    expr: count(probe_success) without (job,float_job,instance,prober_host,host)
   - record: probe:probe_success:sum
-    expr: sum(probe_success) without (job,instance,prober_host,host)
+    expr: sum(probe_success) without (job,float_job,instance,prober_host,host)
   - record: probe:probe_success:ratio
     expr: probe:probe_success:sum / probe:probe_success:count
 
-- 
GitLab