From c32336a6fd09eb21b551333b44abb93ed41c8872 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Thu, 24 Apr 2025 10:17:47 +0200
Subject: [PATCH] Conflate "deferred" and "active" Postfix queues for alerting

---
NOTE(review): this patch was recovered from a copy in which all line breaks
and leading whitespace had been collapsed. The line structure below follows
the hunk headers (7 old / 7 new lines per hunk), but the YAML indentation of
the context and changed lines is reconstructed, not original. Confirm it
against the target file, or apply with `git apply --ignore-whitespace`.

 .../templates/rules/alerts_postfix.conf.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
index 38605a79..24a128b7 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
@@ -11,7 +11,7 @@ groups:
   # The postfix-out instances should be allowed to have a large
   # deferred queue for outbound messages.
   - alert: PostfixQueueTooLarge
-    expr: postfix_queue_length{postfix_instance="postfix-out",queue="deferred"} > 5000
+    expr: postfix_queue_length{postfix_instance="postfix-out",queue=~"(deferred|active)"} > 5000
     for: 10m
     labels:
       severity: warn
@@ -25,7 +25,7 @@ groups:
         deliveries.
       runbook: '[[ alert_runbook_fmt | format("PostfixQueueTooLarge") ]]'
   - alert: PostfixQueueTooLarge
-    expr: sum(postfix_queue_length{postfix_instance="postfix-out",queue="deferred"}) > 10000
+    expr: sum(postfix_queue_length{postfix_instance="postfix-out",queue=~"(deferred|active)"}) > 10000
     for: 10m
     labels:
       severity: page
@@ -71,7 +71,7 @@ groups:
   # all. Note the longer timeout: it is fine for queues like
   # 'active' or 'incoming' to accomodate temporary spikes.
   - alert: PostfixUnexpectedQueueTooLarge
-    expr: postfix_queue_length{queue!="deferred"} > 50
+    expr: postfix_queue_length{queue!~"(deferred|active)"} > 50
     for: 1h
     labels:
       severity: page
@@ -85,7 +85,7 @@ groups:
         service malfunctioning, or having capacity issues.
       runbook: '[[ alert_runbook_fmt | format("PostfixQueueTooLarge") ]]'
   - alert: PostfixUnexpectedQueueTooLarge
-    expr: sum(postfix_queue_length{queue!="deferred"}) by (postfix_instance, queue) > 100
+    expr: sum(postfix_queue_length{queue!~"(deferred|active)"}) by (postfix_instance, queue) > 100
     for: 1h
     labels:
       severity: page
-- 
GitLab