From f2972b7b71cd9b6605d38669bdf819897c97b145 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Mon, 16 Dec 2024 11:44:38 +0000
Subject: [PATCH] Add another DiskUnhealthy trigger when disks fail SMART
 self-tests

---
 .../templates/rules/alerts_system_health.conf.yml     | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index b29f9030..77a8c16c 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -3,13 +3,22 @@ groups:
   rules:
     - alert: DiskUnhealthy
       expr: smartmon_device_smart_healthy < 1
-      for: 2h
+      for: 1h  # expr must stay true for a full hour before the alert fires
       labels:
         severity: warn
       annotations:
         summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is unhealthy"
         description: "Disk {{ $labels.disk }} on {{ $labels.host }} is reporting unhealthy SMART status and might need to be replaced."
 
+    - alert: DiskUnhealthy  # second rule under the same alert name as the SMART health check
+      expr: smartmon_self_test_status < 1  # NOTE(review): presumably 1 means the self-test passed - confirm against the exporter
+      for: 1h  # expr must stay true for a full hour before the alert fires
+      labels:
+        severity: warn  # same severity as the SMART health rule
+      annotations:  # NOTE(review): templates assume disk, host and test labels on the metric - confirm
+        summary: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its self-test"
+        description: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its {{ $labels.test }} SMART self-test, and might need to be replaced."
+
     - alert: RAIDDeviceUnhealthy
       expr: node_md_degraded > 0
       for: 15m
-- 
GitLab