From 72a2eb63df1c97630ca82b78d4caf9235f82e5c2 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 4 Jan 2025 08:12:29 +0100
Subject: [PATCH] De-noise HostUnreachable signal

---
NOTE(review): line breaks and YAML indentation below were reconstructed from
a copy of this patch that had been flattened onto a single line. Indentation
of the hunk lines is assumed (rule items at 2 spaces, their keys at 4) --
confirm against the target files before applying.

Summary of the change: a new recording rule, smoothed_host_reachable, is
defined as max_over_time(host_reachable[2m]); the HostUnreachable alert now
evaluates that series instead of host_reachable, and its 'for' is reduced
from 2m to 1m.

 .../templates/rules/alerts_base.conf.yml | 4 ++--
 .../templates/rules/rules_base.conf.yml  | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
index 861cf3bd..c75be319 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
@@ -8,8 +8,8 @@ groups:
   # once the host becomes reachable again, so as to inhibit alerts that
   # might fire immediately after the transition.
   - alert: HostUnreachable
-    expr: min_over_time(host_reachable[10m]) == 0
-    for: 2m
+    expr: min_over_time(smoothed_host_reachable[10m]) == 0
+    for: 1m
     labels:
       severity: warn
     annotations:
diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index bb20fb70..6d328a82 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -37,5 +37,10 @@ groups:
 
   # Special metric for the ping probe.
   # The 'bool' qualifier makes the greater-than operation not act as a filter.
+  # The 'smoothed' metric verifies that the host has been unreachable for two
+  # consecutive minutes.
   - record: host_reachable
     expr: target:probe_success:ratio{probe="ping"} > bool 0.6
+  - record: smoothed_host_reachable
+    expr: max_over_time(host_reachable[2m])
+
-- 
GitLab