diff --git a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml index 6e8a55411623b0cb96fd40e5ed81d740f1593abc..11110d6f33cc5ddcd23e4e2b5935186326a7a732 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml @@ -73,8 +73,11 @@ groups: scope: host annotations: summary: 'Probe {{ $labels.probe }}@{{ $labels.host }} is failing' - description: 'Probe {{ $labels.probe }} ({{ $labels.host }}) is failing - for target {{ $labels.host }} (success ratio {{ $value }}).' + description: >- + Probe {{ $labels.probe }} ({{ $labels.host }}) is failing for target {{ $labels.host }} + (success ratio {{ $value }}). + + Failed probe logs: https://{{ $labels.prober_float_service }}.[[ domain_public[0] ]]/ runbook: '[[ alert_runbook_fmt | format("ProbeFailure") ]]' - alert: ProbeFailure @@ -85,8 +88,11 @@ groups: scope: global annotations: summary: 'Probe {{ $labels.probe }} is failing globally' - description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing - globally (success ratio {{ $value }}).' + description: >- + Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing globally + (success ratio {{ $value }}). + + Failed probe logs: https://{{ $labels.prober_float_service }}.[[ domain_public[0] ]]/ runbook: '[[ alert_runbook_fmt | format("ProbeFailure") ]]' - alert: CronJobFailure diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml index 456c1ecbbe869c072b60799a7d77b2f607a3ef29..86fdc16ca7959651130cc57f2e997bbb5e5dd8a5 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml @@ -11,17 +11,17 @@ groups: # Sum prober metrics over the probers (hosts), producing # an aggregation by target. - record: target:probe_success:count - expr: count(probe_success) by (probe,probeset,zone,host) + expr: count(probe_success) by (probe,probeset,zone,host,prober_float_service) - record: target:probe_success:sum - expr: sum(probe_success) by (probe,probeset,zone,host) + expr: sum(probe_success) by (probe,probeset,zone,host,prober_float_service) - record: target:probe_success:ratio expr: target:probe_success:sum / target:probe_success:count # Sum prober metrics over targets, aggregating by probe. - record: probe:probe_success:count - expr: count(probe_success) by (probe,probeset,zone) + expr: count(probe_success) by (probe,probeset,prober_float_service,zone) - record: probe:probe_success:sum - expr: sum(probe_success) by (probe,probeset,zone) + expr: sum(probe_success) by (probe,probeset,prober_float_service,zone) - record: probe:probe_success:ratio expr: probe:probe_success:sum / probe:probe_success:count