diff --git a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml index 861cf3bdd4e5012d26bb1a406b95dd9f447caf0c..c75be319eb14007990540eb37687faefcefba9ba 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml @@ -8,8 +8,8 @@ groups: # once the host becomes reachable again, so as to inhibit alerts that # might fire immediately after the transition. - alert: HostUnreachable - expr: min_over_time(host_reachable[10m]) == 0 - for: 2m + expr: min_over_time(smoothed_host_reachable[10m]) == 0 + for: 1m labels: severity: warn annotations: diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml index bb20fb707ef003f56340eebff4a625e3d5c38302..6d328a82374e6dc0ece767e929c101958c9575fd 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml @@ -37,5 +37,10 @@ groups: # Special metric for the ping probe. # The 'bool' qualifier makes the greater-than operation not act as a filter. + # The 'smoothed' metric is 0 only when host_reachable has been 0 for two + # consecutive minutes; shorter ping failures leave it at 1. - record: host_reachable expr: target:probe_success:ratio{probe="ping"} > bool 0.6 + - record: smoothed_host_reachable + expr: max_over_time(host_reachable[2m]) +