diff --git a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml index 38605a79d17b6f2a38026cdf74ed5796839e65d4..24a128b7506fc33c479e489af82ab03d8ae22042 100644 --- a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml +++ b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml @@ -11,7 +11,7 @@ groups: # The postfix-out instances should be allowed to have a large # deferred queue for outbound messages. - alert: PostfixQueueTooLarge - expr: postfix_queue_length{postfix_instance="postfix-out",queue="deferred"} > 5000 + expr: postfix_queue_length{postfix_instance="postfix-out",queue=~"(deferred|active)"} > 5000 for: 10m labels: severity: warn @@ -25,7 +25,7 @@ groups: deliveries. runbook: '[[ alert_runbook_fmt | format("PostfixQueueTooLarge") ]]' - alert: PostfixQueueTooLarge - expr: sum(postfix_queue_length{postfix_instance="postfix-out",queue="deferred"}) > 10000 + expr: sum(postfix_queue_length{postfix_instance="postfix-out",queue=~"(deferred|active)"}) > 10000 for: 10m labels: severity: page @@ -71,7 +71,7 @@ groups: # all. Note the longer timeout: it is fine for queues like # 'active' or 'incoming' to accommodate temporary spikes. - alert: PostfixUnexpectedQueueTooLarge - expr: postfix_queue_length{queue!="deferred"} > 50 + expr: postfix_queue_length{queue!~"(deferred|active)"} > 50 for: 1h labels: severity: page @@ -85,7 +85,7 @@ groups: service malfunctioning, or having capacity issues. runbook: '[[ alert_runbook_fmt | format("PostfixQueueTooLarge") ]]' - alert: PostfixUnexpectedQueueTooLarge - expr: sum(postfix_queue_length{queue!="deferred"}) by (postfix_instance, queue) > 100 + expr: sum(postfix_queue_length{queue!~"(deferred|active)"}) by (postfix_instance, queue) > 100 for: 1h labels: severity: page