Commit f6f34fa4 authored by ale

Fix annotations for most alerts

parent 963f3941
Pipeline #3343 passed with one stage in 4 minutes and 55 seconds
......@@ -7,5 +7,5 @@ groups:
labels:
severity: warn
annotations:
DESCRIPTION: CPU utilization is high on {{ $labels.instance }}
SUMMARY: CPU utilization is high on {{ $labels.instance }}
summary: 'CPU utilization is high on {{ $labels.instance }}'
description: 'CPU utilization is high on {{ $labels.instance }}'
......@@ -6,7 +6,7 @@ groups:
for: 5m
labels:
severity: page
scope: host
annotations:
DESCRIPTION: disk {{ $labels.fs }} on {{ $labels.instance }} will be full in
less than 4 hours
SUMMARY: disk {{ $labels.fs }} on {{ $labels.instance }} is almost full
summary: 'Disk {{ $labels.fs }} on {{ $labels.instance }} is almost full'
description: 'Disk {{ $labels.fs }} on {{ $labels.instance }} will be full in less than 4 hours'
......@@ -7,33 +7,29 @@ groups:
labels:
severity: page
annotations:
DESCRIPTION: Conntrack table on {{ $labels.instance }} is more than 90% full.
SUMMARY: Conntrack table on {{ $labels.instance }} is almost full
description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.'
summary: 'Conntrack table on {{ $labels.instance }} is almost full'
# NetworkErrors: fires when instance:node_network_errs:rate5m stays above 1
# for 15 minutes; labelled severity=page.
# This is a rendered diff without +/- markers: the upper-case
# DESCRIPTION/SUMMARY lines appear to be the removed annotations and the
# lower-case quoted summary/description lines their replacements.
- alert: NetworkErrors
expr: instance:node_network_errs:rate5m > 1
for: 15m
labels:
severity: page
annotations:
# Presumably the pre-commit annotations (plain scalars folded over two lines).
DESCRIPTION: High rate of packet errors on {{ $labels.instance }}/{{ $labels.device
}}.
SUMMARY: High rate of packet errors on {{ $labels.instance }} device {{ $labels.device
}}
# Presumably the post-commit annotations (single-quoted, one line each).
# NOTE(review): the new summary carries the old DESCRIPTION wording
# ("instance/device") and the new description the old SUMMARY wording
# ("instance device X") - confirm the swap is intended.
summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.'
# NetworkDrops: fires when instance:node_network_drop:rate5m stays above 1
# for 15 minutes; labelled severity=page.
# This is a rendered diff without +/- markers: the upper-case
# DESCRIPTION/SUMMARY lines appear to be the removed annotations and the
# lower-case quoted summary/description lines their replacements.
- alert: NetworkDrops
expr: instance:node_network_drop:rate5m > 1
for: 15m
labels:
severity: page
annotations:
# Presumably the pre-commit annotations (plain scalars folded over two lines).
DESCRIPTION: High rate of packet drops on {{ $labels.instance }}/{{ $labels.device
}}.
SUMMARY: High rate of packet drops on {{ $labels.instance }} device {{ $labels.device
}}
# Presumably the post-commit annotations (single-quoted, one line each).
# NOTE(review): the new summary carries the old DESCRIPTION wording
# ("instance/device") and the new description the old SUMMARY wording
# ("instance device X") - confirm the swap is intended.
summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.'
# HostUnreachable: fires when probe_success for job "blackbox_ping" stays
# below 1 for 5 minutes; labelled severity=page.
# This is a rendered diff without +/- markers: the upper-case
# DESCRIPTION/SUMMARY lines appear to be the removed annotations and the
# lower-case quoted summary/description lines their replacements.
- alert: HostUnreachable
expr: probe_success{job="blackbox_ping"} < 1
for: 5m
labels:
severity: page
annotations:
# Presumably the pre-commit annotations.
DESCRIPTION: Host {{ $labels.instance }} is unreachable (icmp).
SUMMARY: Host {{ $labels.instance }} is unreachable
# Presumably the post-commit annotations; the description spells out
# "(icmp)" as "(does not respond to icmp)".
summary: 'Host {{ $labels.instance }} is unreachable'
description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).'
......@@ -2,26 +2,22 @@ groups:
- name: roles/prometheus/files/rules/alerts_nginx.conf
rules:
# HTTPErrorRatioHigh (scope=global): fires when the global nginx error ratio
# is above 0.02 while the global request rate is above 0.1, sustained for
# 5 minutes; labelled service=nginx, severity=page.
# This is a rendered diff without +/- markers: the two-line expr and the
# upper-case DESCRIPTION/SUMMARY lines appear to be the removed versions, the
# one-line expr and lower-case quoted annotations their replacements.
- alert: HTTPErrorRatioHigh
# Presumably the pre-commit expression (folded over two lines).
expr: (global:nginx_http_requests_errs:ratio > 0.02 and global:nginx_http_requests_total:rate5m
> 0.1)
# Presumably the post-commit expression; same text, on a single line.
expr: (global:nginx_http_requests_errs:ratio > 0.02 and global:nginx_http_requests_total:rate5m > 0.1)
for: 5m
labels:
scope: global
service: nginx
severity: page
annotations:
# Presumably the pre-commit annotations.
DESCRIPTION: We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on
all frontends.
SUMMARY: High HTTP error ratio for {{$labels.vhost}} globally
# Presumably the post-commit annotations (same wording, lower-case keys).
summary: 'High HTTP error ratio for {{$labels.vhost}} globally'
description: 'We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on all frontends.'
# HTTPErrorRatioHigh (scope=host): fires when the per-instance nginx error
# ratio is above 0.02 while the per-instance request rate is above 0.1,
# sustained for 10 minutes; labelled service=nginx, severity=page.
# This is a rendered diff without +/- markers: the two-line expr and the
# upper-case DESCRIPTION/SUMMARY lines appear to be the removed versions, the
# one-line expr and lower-case quoted annotations their replacements.
- alert: HTTPErrorRatioHigh
# Presumably the pre-commit expression (folded over two lines).
expr: (instance:nginx_http_requests_errs:ratio > 0.02 and instance:nginx_http_requests_total:rate5m
> 0.1)
# Presumably the post-commit expression; same text, on a single line.
expr: (instance:nginx_http_requests_errs:ratio > 0.02 and instance:nginx_http_requests_total:rate5m > 0.1)
for: 10m
labels:
scope: host
service: nginx
severity: page
annotations:
# Presumably the pre-commit annotations.
DESCRIPTION: We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on
{{$labels.host}}.
SUMMARY: High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}
# Presumably the post-commit annotations (same wording, lower-case keys).
summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}'
description: 'We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on {{$labels.host}}.'
......@@ -6,15 +6,17 @@ groups:
for: 5m
labels:
severity: page
scope: global
annotations:
DESCRIPTION: Availability too low for service {{ $labels.float_service }}
SUMMARY: Availability too low for service {{ $labels.float_service }}
description: 'Availability too low for service {{ $labels.float_service }}'
summary: 'Availability too low for service {{ $labels.float_service }}'
# ServiceDegraded: fires when float_service:ok_by_host equals 0 for
# 10 minutes; labelled severity=warn, scope=host.
# This is a rendered diff without +/- markers: the upper-case
# DESCRIPTION/SUMMARY lines appear to be the removed annotations and the
# lower-case quoted description/summary lines their replacements.
- alert: ServiceDegraded
expr: float_service:ok_by_host == 0
for: 10m
labels:
severity: warn
# NOTE(review): the hunk header (15 -> 17 lines) suggests the scope labels
# were added by this commit - confirm against the real diff.
scope: host
annotations:
# Presumably the pre-commit annotations.
DESCRIPTION: Service {{ $labels.float_service }} is failing or degraded on host {{ $labels.host }}
SUMMARY: Service {{ $labels.float_service }} is failing or degraded on host {{ $labels.host }}
# Presumably the post-commit annotations; description and summary carry
# identical text.
description: 'Service {{ $labels.float_service }} is failing or degraded on host {{ $labels.host }}'
summary: 'Service {{ $labels.float_service }} is failing or degraded on host {{ $labels.host }}'
......@@ -7,14 +7,13 @@ groups:
labels:
severity: warn
annotations:
DESCRIPTION: The systemd unit {{ $labels.name }} has failed on {{ $labels.host
}}.
SUMMARY: '{{ $labels.name }} has failed on {{ $labels.host }}'
summary: '{{ $labels.name }} has failed on {{ $labels.host }}'
description: 'The systemd unit {{ $labels.name }} has failed on {{ $labels.host }}.'
# SystemdUnitCrashLooping: fires when instance:systemd_unit_restarts:delta10m
# stays above 10 for 30 minutes; labelled severity=page.
# This is a rendered diff without +/- markers: the upper-case
# DESCRIPTION/SUMMARY lines appear to be the removed annotations and the
# lower-case quoted summary/description lines their replacements.
# NOTE(review): this rule templates {{ $labels.unit }} while the preceding
# systemd rule's annotations use {{ $labels.name }} - confirm each label
# exists on the series behind the respective expression.
- alert: SystemdUnitCrashLooping
expr: instance:systemd_unit_restarts:delta10m > 10
for: 30m
labels:
severity: page
annotations:
# Presumably the pre-commit annotations.
DESCRIPTION: Systemd unit {{ $labels.unit }} is being started repeatedly.
SUMMARY: Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}
# Presumably the post-commit annotations; the description now says
# "restarted" and adds a hint about the likely cause.
summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}'
description: 'Systemd unit {{ $labels.unit }} is being restarted repeatedly. Likely a configuration problem.'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment