Commit cf1c5971 authored by ale
Browse files

Use node-exporter metrics for CPU throttling alert

parent fa2c9b4f
Pipeline #17495 passed with stages
in 6 minutes and 6 seconds
counter kernel_traps_total by failed_program
counter kernel_ooms_total
gauge kernel_cpu_throttled by cpu
def syslog {
/^(?P<date>(?P<legacy_date>\w+\s+\d+\s+\d+:\d+:\d+)|(?P<rfc3339_date>\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d+[-+]\d{2}:\d{2}))/ + /\s+(?:\w+@)?(?P<hostname>[-\w\.]+)\s+/ {
......@@ -22,13 +21,5 @@ def syslog {
/oom_reaper: reaped process \d+/ {
kernel_ooms_total++
}
/CPU(?P<cpu>\d+): \w+ temperature above threshold, cpu clock throttled/ {
kernel_cpu_throttled[$cpu] = 1
}
/CPU(?P<cpu>\d+): \w+ temperature\/speed normal/ {
kernel_cpu_throttled[$cpu] = 0
}
}
......@@ -24,8 +24,10 @@ groups:
description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.'
runbook: '[[ alert_runbook_fmt | format("HostThrashing") ]]'
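# The threshold here is set to 1 instead of 0 on purpose: the expression
# compares a per-second rate (5m window, summed over the host) against it,
# so low-frequency throttling events, which shouldn't be problematic,
# do not trigger the alert.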
- alert: ThermalEnvelopeThrottling
expr: host:kernel_cpu_throttled:max > 0
expr: (host:node_cpu_core_throttles_total:rate5m > 1) or (host:node_cpu_package_throttles_total:rate5m > 1)
for: 2h
labels:
scope: host
......
......@@ -9,5 +9,7 @@ groups:
expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (host, instance)
- record: instance_utilization:rate5m
expr: instance_utilization:node_cpu:rate5m / instance:node_cpus:count
- record: host:kernel_cpu_throttled:max
expr: max(kernel_cpu_throttled) by (host)
- record: host:node_cpu_core_throttles_total:rate5m
expr: sum(rate(node_cpu_core_throttles_total[5m])) by (host)
- record: host:node_cpu_package_throttles_total:rate5m
expr: sum(rate(node_cpu_package_throttles_total[5m])) by (host)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment