Prometheus system-health alerts: react faster to SMART health problems and
add an alert for failed SMART self-tests.

* DiskUnhealthy: the `for` duration is shortened from 2h to 1h.
* DiskSelfTestFailed (new): fires when smartmon_self_test_status < 1 has
  held for 1h. It carries its own alert name because the existing
  DiskUnhealthy rule has the same static labels (severity: warn), and
  `promtool check rules` reports rules sharing name and labels as duplicates.

NOTE(review): this patch had lost its line breaks and indentation; the
indentation of the hunk below is reconstructed and must be checked against
the target file before applying. The post-image blob id on the index line
predates the rename and the added comment.

diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
index b29f903034e597436224cfaa062e770264a78cb5..77a8c16c18356e77a2b067c20a2036a625cf51fa 100644
--- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
+++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml
@@ -3,13 +3,23 @@ groups:
   rules:
   - alert: DiskUnhealthy
     expr: smartmon_device_smart_healthy < 1
-    for: 2h
+    for: 1h
     labels:
       severity: warn
     annotations:
       summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is unhealthy"
       description: "Disk {{ $labels.disk }} on {{ $labels.host }} is reporting unhealthy SMART status and might need to be replaced."
 
+  # Separate name from DiskUnhealthy so the two rules are not name+label duplicates.
+  - alert: DiskSelfTestFailed
+    expr: smartmon_self_test_status < 1
+    for: 1h
+    labels:
+      severity: warn
+    annotations:
+      summary: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its self-test"
+      description: "Disk {{ $labels.disk }} on {{ $labels.host }} failed its {{ $labels.test }} SMART self-test, and might need to be replaced."
+
   - alert: RAIDDeviceUnhealthy
     expr: node_md_degraded > 0
     for: 15m