From b882361d555114d3eb53b08f279372c7d1d52834 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Wed, 18 Dec 2024 07:18:26 +0000
Subject: [PATCH] Add a warning to detect when SMART self-tests aren't running

---
 .../templates/rules/alerts_system_health.conf.yml        | 9 +++++++++
 .../templates/rules/rules_disk.conf.yml                  | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index faa4cfe6..6617a848 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -28,6 +28,15 @@ groups:
         summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
         description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
 
+    - alert: DiskUnhealthy  # NOTE(review): name reads like a hardware fault, but per the summary this fires on stale self-tests; confirm the name is intended
+      expr: smartmon_self_test_age_hours > 336  # 336 hours = 14 days, the "two weeks" quoted in the description
+      for: 1h
+      labels:
+        severity: warn
+      annotations:
+        summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is not running SMART self-tests"
+        description: "SMART self-tests have not run for over two weeks for disk {{ $labels.disk }} on {{ $labels.host }}, check smartd logs."
+
     - alert: RAIDDeviceUnhealthy
       expr: node_md_degraded > 0
       for: 15m
diff --git a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
index c64d5f81..d78ffed5 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
@@ -8,3 +8,5 @@ groups:
   - record: volume:used_space:ratio
     expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)
 
+  - record: smartmon_self_test_age_hours  # power-on hours minus the highest smartmon_self_test_hours across values of the "test" label
+    expr: smartmon_power_on_hours - (max(smartmon_self_test_hours) without (test))  # NOTE(review): yields no sample where smartmon_self_test_hours is absent, so a disk that never logged a self-test presumably cannot alert; confirm exporter behavior
-- 
GitLab