Commit 86982d3c authored by ale's avatar ale

Add more ACME prometheus alerts

parent 68332329
Pipeline #3356 failed with stage
in 4 minutes and 38 seconds
......@@ -6,8 +6,32 @@ groups:
expr: cn:cert_ok < 1
for: 3d
labels:
severity: warn
scope: global
annotations:
summary: 'Missing certificate for {{$labels.cn}}'
description: 'The ACME automation could not generate a valid certificate for the domain "{{$labels.cn}}" for 3 days.'
# The magic number '3' is selected because the acme automation runs
# every 15 minutes, so 4 times an hour.
- alert: CertRenewalFailures
expr: cn:cert_renewal_errors:rate1h > 3
for: 6h
labels:
severity: warn
scope: global
annotations:
summary: 'Failure to renew certificate for {{$labels.cn}}'
description: 'The ACME automation has failed to renew the domain "{{$labels.cn}}" for the last 6 hours. If the situation does not recover, the certificate will eventually expire.'
# Let us know if the ACME automation is completely broken or has
# failed to load its certificates.
- alert: ACMEBroken
expr: count(cert_ok == 1) < 1
for: 30m
labels:
severity: page
scope: global
annotations:
summary: 'ACME automation completely broken'
description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.'
......@@ -3,3 +3,6 @@ groups:
rules:
- record: cn:cert_ok
expr: max(cert_ok) by (cn)
- record: cn:cert_renewal_errors:rate1h
expr: sum(delta(cert_renewal_errors[1h])) by (cn)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment