From 81923b6040240521ba8abdf3047a316799fdbf71 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 17 Dec 2024 15:42:06 +0000
Subject: [PATCH] Add a DiskUnhealthy alert for active 'pending' sectors

---
 .../templates/rules/alerts_system_health.conf.yml        | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index 77a8c16c..209ec741 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -19,6 +19,15 @@ groups:
         summary: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its self-test"
         description: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its {{ $labels.test }} SMART self-test, and might need to be replaced."
 
+    - alert: DiskUnhealthy  # SMART "pending sectors" check
+      expr: smartmon_attribute{attr="current_pending_sector"} > 0  # matches any non-zero value
+      for: 1h  # the expression must stay true for a full hour before firing
+      labels:
+        severity: warn  # NOTE(review): presumably the non-paging level; confirm against alert routing
+      annotations:
+        summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
+        description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
+
     - alert: RAIDDeviceUnhealthy
       expr: node_md_degraded > 0
       for: 15m
-- 
GitLab