From 676c70285ca6da98c338cb09069343625fa75c9a Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sun, 26 Apr 2020 07:54:18 +0100
Subject: [PATCH] Add "probeset" dimension to probe metrics

---
Reviewer notes (this region, between "---" and the first "diff --git",
is not part of the commit message):

* Recording rules: "probeset" is added to the "by" clause of the
  target:probe_success:{count,sum} and probe:probe_success:{count,sum}
  aggregations.
* Alerts: both ProbeFailure expressions now also require
  probeset!="service".
* Scrape config: the static label sets carrying probe: ping, https_up,
  https and dns each gain "probeset: base".
* Not mentioned in the subject: the JobDown threshold changes from 0.51
  to 0.5, and the static_configs entry built from target.targets gains
  a "zone: external" label.
* NOTE(review): the leading indentation of the hunk lines below was
  reconstructed from a whitespace-collapsed copy; verify it against the
  target tree before running "git am".

 roles/prometheus/files/rules/alerts_base.conf.yml | 6 +++---
 roles/prometheus/files/rules/rules_base.conf.yml  | 8 ++++----
 roles/prometheus/templates/prometheus.yml.j2      | 6 ++++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/files/rules/alerts_base.conf.yml
index fc392726..1cc6bf60 100644
--- a/roles/prometheus/files/rules/alerts_base.conf.yml
+++ b/roles/prometheus/files/rules/alerts_base.conf.yml
@@ -55,7 +55,7 @@ groups:
         redundancy ({{ $value }}) and may eventually be at risk.'
 
   - alert: JobDown
-    expr: job:up:ratio < 0.51
+    expr: job:up:ratio < 0.5
     for: 5m
     labels:
       severity: page
@@ -65,7 +65,7 @@ groups:
       description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
 
   - alert: ProbeFailure
-    expr: target:probe_success:ratio{probe!="ping"} < 0.5
+    expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
     for: 5m
     labels:
       severity: page
@@ -76,7 +76,7 @@ groups:
         for target {{ $labels.target }} (success ratio {{ $value }}).'
 
   - alert: ProbeFailure
-    expr: probe:probe_success:ratio{probe!="ping"} < 0.5
+    expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
     for: 5m
     labels:
       severity: page
diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/files/rules/rules_base.conf.yml
index cdc16006..7f08c3b7 100644
--- a/roles/prometheus/files/rules/rules_base.conf.yml
+++ b/roles/prometheus/files/rules/rules_base.conf.yml
@@ -11,17 +11,17 @@ groups:
   # Sum prober metrics over the probers (hosts), producing
   # an aggregation by target.
   - record: target:probe_success:count
-    expr: count(probe_success) by (probe,zone,target)
+    expr: count(probe_success) by (probe,probeset,zone,target)
   - record: target:probe_success:sum
-    expr: sum(probe_success) by (probe,zone,target)
+    expr: sum(probe_success) by (probe,probeset,zone,target)
   - record: target:probe_success:ratio
     expr: target:probe_success:sum / target:probe_success:count
 
   # Sum prober metrics over targets, aggregating by probe.
   - record: probe:probe_success:count
-    expr: count(probe_success) by (probe,zone)
+    expr: count(probe_success) by (probe,probeset,zone)
   - record: probe:probe_success:sum
-    expr: sum(probe_success) by (probe,zone)
+    expr: sum(probe_success) by (probe,probeset,zone)
   - record: probe:probe_success:ratio
     expr: probe:probe_success:sum / probe:probe_success:count
 
diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2
index 442289ea..b25d5239 100644
--- a/roles/prometheus/templates/prometheus.yml.j2
+++ b/roles/prometheus/templates/prometheus.yml.j2
@@ -124,6 +124,7 @@ scrape_configs:
         labels:
           zone: internal
           probe: ping
+          probeset: base
 
   - job_name: "prober_https_{{ prober_idx }}"
     metrics_path: "/probe"
@@ -151,6 +152,7 @@ scrape_configs:
         labels:
           zone: public
           probe: https_up
+          probeset: base
 
 {% for custom_probe in prometheus_custom_blackbox_probes.get('http', []) %}
   - job_name: "prober_https_{{ custom_probe.name }}_{{ prober_idx }}"
@@ -179,6 +181,7 @@ scrape_configs:
         labels:
           zone: public
           probe: https
+          probeset: base
 {% endfor %}
 
   - job_name: "prober_dns_{{ prober_idx }}"
@@ -206,6 +209,7 @@ scrape_configs:
         labels:
           zone: public
           probe: dns
+          probeset: base
 
 {% endfor %}
 
@@ -216,6 +220,8 @@ scrape_configs:
     metrics_path: "{{ target.metrics_path | default('/metrics') }}"
     static_configs:
       - targets: {{ target.targets | to_json }}
+        labels:
+          zone: external
     relabel_configs:
       - source_labels: [__address__]
         target_label: host
-- 
GitLab