From b882361d555114d3eb53b08f279372c7d1d52834 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Wed, 18 Dec 2024 07:18:26 +0000
Subject: [PATCH] Add a warning to detect when SMART self-tests aren't running

---
Reviewer notes (ignored by git-am):
 * The alert threshold of 336 hours is 14 days, matching the "two weeks"
   wording in the alert description.
 * smartmon_self_test_age_hours is recorded as smartmon_power_on_hours
   minus the maximum smartmon_self_test_hours taken across the "test"
   label, i.e. the power-on hours elapsed since the latest self-test.

 .../templates/rules/alerts_system_health.conf.yml        | 9 +++++++++
 .../templates/rules/rules_disk.conf.yml                  | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index faa4cfe6..6617a848 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -28,6 +28,15 @@ groups:
       summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
       description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
 
+  - alert: DiskUnhealthy
+    expr: smartmon_self_test_age_hours > 336
+    for: 1h
+    labels:
+      severity: warn
+    annotations:
+      summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is not running SMART self-tests"
+      description: "SMART self-tests have not run for over two weeks for disk {{ $labels.disk }} on {{ $labels.host }}, check smartd logs."
+
   - alert: RAIDDeviceUnhealthy
     expr: node_md_degraded > 0
     for: 15m
diff --git a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
index c64d5f81..d78ffed5 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
@@ -8,3 +8,5 @@ groups:
   - record: volume:used_space:ratio
     expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)
 
+  - record: smartmon_self_test_age_hours
+    expr: smartmon_power_on_hours - (max(smartmon_self_test_hours) without (test))
-- 
GitLab