From 81923b6040240521ba8abdf3047a316799fdbf71 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Tue, 17 Dec 2024 15:42:06 +0000
Subject: [PATCH] Add a DiskUnhealthy alert for active 'pending' sectors

---
Reviewer notes (text in this section is ignored by `git am`):

  * Adds one alerting rule, DiskUnhealthy, between the preceding self-test
    rule and RAIDDeviceUnhealthy. The rule's condition is
    smartmon_attribute{attr="current_pending_sector"} > 0, with "for: 1h"
    and the label severity=warn.
  * The hunk adds 9 lines: 8 rule lines plus one blank separator line.
  * NOTE(review): this copy of the patch was recovered from a version in
    which all line breaks and leading whitespace had been collapsed. The
    line structure follows the hunk header (-19,6 +19,15), but the YAML
    indentation inside the hunk is reconstructed, not original. Verify it
    against the target file before applying.

 .../templates/rules/alerts_system_health.conf.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index 77a8c16c..209ec741 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -19,6 +19,15 @@ groups:
       summary: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its self-test"
       description: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its {{ $labels.test }} SMART self-test, and might need to be replaced."
 
+  - alert: DiskUnhealthy
+    expr: smartmon_attribute{attr="current_pending_sector"} > 0
+    for: 1h
+    labels:
+      severity: warn
+    annotations:
+      summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
+      description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
+
   - alert: RAIDDeviceUnhealthy
     expr: node_md_degraded > 0
     for: 15m
-- 
GitLab