# Patch summary (text ahead of the first "diff --git" header is skipped by
# patch tools, so this preamble is not part of either target file):
#   * prometheus.yml.j2: adds a float_job label, rendered as
#     "<service_name>_<target_config.port>", next to the existing
#     probeset / prober_float_service / float_service labels.
#   * rules_base.conf.yml: job:up:count and job:up:sum are now computed from
#     probe_success{probe="health"} grouped by float_job, and the float_job
#     value is copied into the "job" label with label_replace(). The
#     target:probe_success:* and probe:probe_success:* aggregations now also
#     exclude float_job.
# NOTE(review): this patch was recovered from a copy whose line breaks and
# indentation had been collapsed. The leading whitespace of every hunk line
# and the positions of the blank context lines are reconstructed from the
# hunk headers' line counts -- confirm them against the target files (or
# apply with whitespace checks relaxed) before relying on this patch.
diff --git a/roles/float-infra-prometheus/templates/prometheus.yml.j2 b/roles/float-infra-prometheus/templates/prometheus.yml.j2
index d31070475df48a17319439af75d08277fe7a1635..9ce8fbdcf475bad24cc97edc5e567fd7675e0b4a 100644
--- a/roles/float-infra-prometheus/templates/prometheus.yml.j2
+++ b/roles/float-infra-prometheus/templates/prometheus.yml.j2
@@ -136,6 +136,7 @@ scrape_configs:
           probeset: health
           prober_float_service: prometheus
           float_service: "{{ service_name }}"
+          float_job: "{{ service_name }}_{{ target_config.port }}"
 {% endfor %}
 {% endfor %}
 
diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index c30dc1027e89d50d9a92a44247b1499b6602425d..f4ba4107993904618b0a1aca4e1bbdef047c6e19 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -1,10 +1,14 @@
 groups:
 - name: roles/float-infra-prometheus/templates/rules/rules_base.conf
   rules:
+  # Look at prober metrics to assess target state, rather than
+  # using Prometheus' "up" metric. This allows us to take
+  # advantage of a redundant blackbox prober setup and remove
+  # noise caused by prober failures.
   - record: job:up:count
-    expr: count(up) by (job)
+    expr: label_replace(count(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
   - record: job:up:sum
-    expr: sum(up) by (job)
+    expr: label_replace(sum(probe_success{probe="health"}) by (float_job),"job","$1","float_job","(.*)")
   - record: job:up:ratio
     expr: job:up:sum / job:up:count
 
@@ -13,17 +17,17 @@ groups:
   # order to preserve additional probe_success labels that might
   # be present.
   - record: target:probe_success:count
-    expr: count(probe_success) without (job,instance,prober_host)
+    expr: count(probe_success) without (job,float_job,instance,prober_host)
   - record: target:probe_success:sum
-    expr: sum(probe_success) without (job,instance,prober_host)
+    expr: sum(probe_success) without (job,float_job,instance,prober_host)
   - record: target:probe_success:ratio
     expr: target:probe_success:sum / target:probe_success:count
 
   # Sum prober metrics over targets, aggregating by probe.
   - record: probe:probe_success:count
-    expr: count(probe_success) without (job,instance,prober_host,host)
+    expr: count(probe_success) without (job,float_job,instance,prober_host,host)
   - record: probe:probe_success:sum
-    expr: sum(probe_success) without (job,instance,prober_host,host)
+    expr: sum(probe_success) without (job,float_job,instance,prober_host,host)
   - record: probe:probe_success:ratio
     expr: probe:probe_success:sum / probe:probe_success:count
 