diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/files/rules/alerts_base.conf.yml index fc3927263b7038542a726c08a85b498d65ca7ce1..1cc6bf60221baa7eac0fb0fff4f6b8f03f731f86 100644 --- a/roles/prometheus/files/rules/alerts_base.conf.yml +++ b/roles/prometheus/files/rules/alerts_base.conf.yml @@ -55,7 +55,7 @@ groups: redundancy ({{ $value }}) and may eventually be at risk.' - alert: JobDown - expr: job:up:ratio < 0.51 + expr: job:up:ratio < 0.5 for: 5m labels: severity: page @@ -65,7 +65,7 @@ groups: description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).' - alert: ProbeFailure - expr: target:probe_success:ratio{probe!="ping"} < 0.5 + expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5 for: 5m labels: severity: page @@ -76,7 +76,7 @@ groups: for target {{ $labels.target }} (success ratio {{ $value }}).' - alert: ProbeFailure - expr: probe:probe_success:ratio{probe!="ping"} < 0.5 + expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5 for: 5m labels: severity: page diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/files/rules/rules_base.conf.yml index cdc16006688bfde2a7d9719d3ccaf4b136eb702b..7f08c3b7e6883af7d2f686103c794096475c5d99 100644 --- a/roles/prometheus/files/rules/rules_base.conf.yml +++ b/roles/prometheus/files/rules/rules_base.conf.yml @@ -11,17 +11,17 @@ groups: # Sum prober metrics over the probers (hosts), producing # an aggregation by target. - record: target:probe_success:count - expr: count(probe_success) by (probe,zone,target) + expr: count(probe_success) by (probe,probeset,zone,target) - record: target:probe_success:sum - expr: sum(probe_success) by (probe,zone,target) + expr: sum(probe_success) by (probe,probeset,zone,target) - record: target:probe_success:ratio expr: target:probe_success:sum / target:probe_success:count # Sum prober metrics over targets, aggregating by probe. - record: probe:probe_success:count - expr: count(probe_success) by (probe,zone) + expr: count(probe_success) by (probe,probeset,zone) - record: probe:probe_success:sum - expr: sum(probe_success) by (probe,zone) + expr: sum(probe_success) by (probe,probeset,zone) - record: probe:probe_success:ratio expr: probe:probe_success:sum / probe:probe_success:count diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 442289ea9a534a515ef3db2a57174ad68437c30d..b25d52391d9064973a8cc4b9fcf834bf5cab6b9d 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -124,6 +124,7 @@ scrape_configs: labels: zone: internal probe: ping + probeset: base - job_name: "prober_https_{{ prober_idx }}" metrics_path: "/probe" @@ -151,6 +152,7 @@ scrape_configs: labels: zone: public probe: https_up + probeset: base {% for custom_probe in prometheus_custom_blackbox_probes.get('http', []) %} - job_name: "prober_https_{{ custom_probe.name }}_{{ prober_idx }}" @@ -179,6 +181,7 @@ scrape_configs: labels: zone: public probe: https + probeset: base {% endfor %} - job_name: "prober_dns_{{ prober_idx }}" @@ -206,6 +209,7 @@ scrape_configs: labels: zone: public probe: dns + probeset: base {% endfor %} @@ -216,6 +220,8 @@ scrape_configs: metrics_path: "{{ target.metrics_path | default('/metrics') }}" static_configs: - targets: {{ target.targets | to_json }} + labels: + zone: external relabel_configs: - source_labels: [__address__] target_label: host