Skip to content
Snippets Groups Projects
Commit b882361d authored by ale
Browse files

Add a warning to detect when SMART self-tests aren't running

parent e4ce11ce
No related branches found
No related tags found
No related merge requests found
Pipeline #82765 passed
...@@ -28,6 +28,15 @@ groups: ...@@ -28,6 +28,15 @@ groups:
summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors" summary: "Disk {{ $labels.disk }} on {{ $labels.host }} has pending sectors"
description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced." description: "Disk {{ $labels.disk }} on {{ $labels.host }} seems to be experiencing hardware errors, and might need to be replaced."
# Warn when a disk's SMART self-tests appear to have stopped running.
# Fires once smartmon_self_test_age_hours has stayed above 336 (14 * 24,
# i.e. the "two weeks" quoted in the description) for a full hour.
# NOTE(review): the alert name does not describe the condition (the
# annotations talk about missing self-tests, not general disk health);
# consider a more specific name after checking Alertmanager routes/inhibits.
- alert: DiskUnhealthy
  expr: smartmon_self_test_age_hours > 336
  for: 1h
  labels:
    severity: warn
  annotations:
    summary: "Disk {{ $labels.disk }} on {{ $labels.host }} is not running SMART self-tests"
    # Folded scalar: the two lines below are joined with a single space.
    description: >-
      SMART self-tests have not run for over two weeks for disk
      {{ $labels.disk }} on {{ $labels.host }}, check smartd logs.
- alert: RAIDDeviceUnhealthy - alert: RAIDDeviceUnhealthy
expr: node_md_degraded > 0 expr: node_md_degraded > 0
for: 15m for: 15m
......
...@@ -8,3 +8,5 @@ groups: ...@@ -8,3 +8,5 @@ groups:
# Used-space ratio per filesystem: 1 minus (available bytes / total bytes),
# so the recorded value grows towards 1 as the volume fills up.
- record: volume:used_space:ratio
  expr: 1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes)
# Age of the most recent SMART self-test for each disk: the disk's current
# power-on hours minus the highest smartmon_self_test_hours value across
# all values of the `test` label (that label is aggregated away).
# NOTE(review): presumably both metrics count power-on hours, so the age is
# measured in powered-on time rather than wall-clock time — confirm.
# NOTE(review): a disk with no smartmon_self_test_hours series produces no
# sample here, so threshold alerts on this metric would stay silent for
# disks that have never run a self-test — verify this is covered elsewhere.
- record: smartmon_self_test_age_hours
  # Folded scalar: the two lines below are joined with a single space.
  expr: >-
    smartmon_power_on_hours -
    (max(smartmon_self_test_hours) without (test))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment