From f2972b7b71cd9b6605d38669bdf819897c97b145 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Mon, 16 Dec 2024 11:44:38 +0000
Subject: [PATCH] Add another DiskUnhealthy trigger when disks fail SMART self-tests

---
Reviewer note: the line breaks and indentation of this patch were
reconstructed from a copy in which all whitespace had been collapsed.
The YAML nesting shown in the hunk (two-space steps, list items flush
with "rules:") is an assumption; confirm it against the target file
before applying.

The added rule reuses the existing DiskUnhealthy alert name on purpose,
so a failed SMART self-test is reported under the same alert as an
unhealthy SMART status.

 .../templates/rules/alerts_system_health.conf.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index b29f9030..77a8c16c 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -3,13 +3,22 @@ groups:
   rules:
   - alert: DiskUnhealthy
     expr: smartmon_device_smart_healthy < 1
-    for: 2h
+    for: 1h
     labels:
       severity: warn
     annotations:
       summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is unhealthy"
       description: "Disk {{ $labels.disk }} on {{ $labels.host }} is reporting unhealthy SMART status and might need to be replaced."
 
+  - alert: DiskUnhealthy
+    expr: smartmon_self_test_status < 1
+    for: 1h
+    labels:
+      severity: warn
+    annotations:
+      summary: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its self-test"
+      description: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its {{ $labels.test }} SMART self-test, and might need to be replaced."
+
   - alert: RAIDDeviceUnhealthy
     expr: node_md_degraded > 0
     for: 15m
-- 
GitLab