From 72a2eb63df1c97630ca82b78d4caf9235f82e5c2 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 4 Jan 2025 08:12:29 +0100
Subject: [PATCH] De-noise HostUnreachable signal

---
 .../templates/rules/alerts_base.conf.yml                     | 4 ++--
 .../templates/rules/rules_base.conf.yml                      | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
index 861cf3bd..c75be319 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml
@@ -8,8 +8,8 @@ groups:
   # once the host becomes reachable again, so as to inhibit alerts that
   # might fire immediately after the transition.
   - alert: HostUnreachable
-    expr: min_over_time(host_reachable[10m]) == 0
-    for: 2m
+    expr: min_over_time(smoothed_host_reachable[10m]) == 0
+    for: 1m
     labels:
       severity: warn
     annotations:
diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
index bb20fb70..6d328a82 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml
@@ -37,5 +37,10 @@ groups:
 
       # Special metric for the ping probe.
       # The 'bool' qualifier makes the greater-than operation not act as a filter.
+      # smoothed_host_reachable only drops to 0 once host_reachable has been 0
+      # for the whole trailing 2m window, filtering out brief probe failures.
       - record: host_reachable
         expr: target:probe_success:ratio{probe="ping"} > bool 0.6
+      - record: smoothed_host_reachable
+        expr: max_over_time(host_reachable[2m])
+
-- 
GitLab