Add SMART self-test staleness monitoring.

rules_disk: record smartmon_self_test_age_hours as the disk's power-on hours
minus the highest smartmon_self_test_hours value across the "test" label.
alerts_system_health: raise DiskUnhealthy (severity warn) when that age stays
above 336 hours for 1h.

NOTE(review): line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch; confirm they match the target files
before applying.

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index faa4cfe686e2e60a1abb9257c91efb925ba7ffce..6617a848613a39ccc6439485079e582966e36ea9 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -28,6 +28,15 @@ groups:
       summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
       description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
 
+  - alert: DiskUnhealthy
+    expr: smartmon_self_test_age_hours > 336
+    for: 1h
+    labels:
+      severity: warn
+    annotations:
+      summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is not running SMART self-tests"
+      description: "SMART self-tests have not run for over two weeks for disk {{ $labels.disk }} on {{ $labels.host }}, check smartd logs."
+
   - alert: RAIDDeviceUnhealthy
     expr: node_md_degraded > 0
     for: 15m
diff --git a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
index c64d5f814bb93de3395ffc6e90584d3ca7422a41..d78ffed59280088c7fa970f82a08ae930340495c 100644
--- a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml
@@ -8,3 +8,5 @@ groups:
 
   - record: volume:used_space:ratio
     expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)
+  - record: smartmon_self_test_age_hours
+    expr: smartmon_power_on_hours - (max(smartmon_self_test_hours) without (test))