Commit 703b5ab0 authored by ale

Add a NeedsReboot alert when there are pending kernel upgrades

parent a2e7e7c4
Pipeline #3440 failed in 5 minutes and 52 seconds
@@ -13,14 +13,23 @@ groups:
summary: Host {{ $labels.host }} is down
description: 'Host {{ $labels.host }} is unreachable (icmp ping).'
- alert: NeedsReboot
expr: node_reboot_required > 0
for: 30m
labels:
severity: warn
annotations:
summary: 'Host {{ $labels.host }} needs to reboot'
description: 'Host {{ $labels.host }} needs to reboot, there are pending kernel upgrades.'
- alert: Reboot
expr: os_uptime < 600
for: 1m
expr: os_uptime < 900
for: 5m
labels:
severity: warn
annotations:
description: reboot on {{ $labels.host }}
summary: reboot on {{ $labels.host }}
summary: 'Reboot on {{ $labels.host }}'
description: 'The host {{ $labels.host }} has just rebooted. Hopefully this was expected.'
- alert: JobDown
expr: up < 1
@@ -29,7 +38,7 @@ groups:
severity: warn
scope: host
annotations:
summary: Job {{ $labels.job }}@{{ $labels.host }} is down
summary: 'Job {{ $labels.job }}@{{ $labels.host }} is down'
description: 'Job {{ $labels.job }} on {{ $labels.host }} has been down
for more than 5 minutes. If this is a prober job, then the alert refers
to the prometheus-blackbox-exporter service itself.'
@@ -41,9 +50,9 @@ groups:
severity: warn
scope: global
annotations:
summary: Job {{ $labels.job }} has degraded redundancy
summary: 'Job {{ $labels.job }} has degraded redundancy'
description: 'Job {{ $labels.job }} is running with slightly degraded
redundancy ({{$value}}) and may eventually be at risk.'
redundancy ({{ $value }}) and may eventually be at risk.'
- alert: JobDown
expr: job:up:ratio < 0.51
@@ -52,8 +61,8 @@ groups:
severity: page
scope: global
annotations:
summary: Job {{ $labels.job }} is down globally
description: 'Job {{ $labels.job }} is down globally (availability {{$value}}).'
summary: 'Job {{ $labels.job }} is down globally'
description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
- alert: ProbeFailure
expr: target:probe_success:ratio{probe!="ping"} < 0.5
@@ -62,7 +71,7 @@ groups:
severity: page
scope: host
annotations:
summary: Probe {{ $labels.probe }}@{{ $labels.target }} is failing
summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
for target {{ $labels.target }} (success ratio {{ $value }}).'
@@ -73,6 +82,6 @@ groups:
severity: page
scope: global
annotations:
summary: Probe {{ $labels.probe }} is failing globally
summary: 'Probe {{ $labels.probe }} is failing globally'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
globally (success ratio {{ $value }}).'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment