Alert rules: drop the '.md' suffix from the runbook annotation URLs of the
auth, postfix and service-prober alerts, and add a new alerts_security rules
file containing the UnexpectedConnections alert.

Review notes: line breaks and YAML indentation in this patch were reconstructed
from the hunk line counts; apply with whitespace-tolerant options if the target
files are indented differently. The blob id on the alerts_security index line
predates the expr fix and the added comment lines.

diff --git a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
index 30b99ce8820ded7a629fcabbd39ae61bbf54decb..4f05a9017adc50f99a85fd178793981533294d1b 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
@@ -17,7 +17,7 @@ groups:
         annotations:
           summary: 'Too many login failures for service {{$labels.exported_service}}'
           description: 'The percentage of successful authentications on auth-server with service={{$labels.exported_service}} is too low ({{$value}}). This can indicate a brute-forcing attack (depending on the service), or a failure in the auth-server itself.'
-          runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures.md'
+          runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures'
 
       # We also want to check that, for some important services, there
       # actually are any successful logins. The threshold is low on
@@ -35,5 +35,5 @@ groups:
         annotations:
           summary: 'No successful logins for service {{$labels.exported_service}}'
           description: 'The auth-server is not reporting successful logins with service={{$labels.exported_service}}. This might indicate something broken with the auth-server itself.'
-          runbook: '[[ alert_playbook_url ]]/NoLogins.md'
+          runbook: '[[ alert_playbook_url ]]/NoLogins'
 
diff --git a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
index d5c98c7ebcfbf13e7cf1ad0fd4d959e8400702e0..eb07a1e9216bc574d90d9ca7340d564966086165 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
@@ -29,7 +29,7 @@ groups:
         annotations:
           summary: "Postfix is deferring many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly deferring lots of messages. Perhaps some of the expected destinations are unreachable."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred"
 
       - alert: PostfixHighBounces
         expr: 'instance:smtp_bounces:rate10m{postfix_instance!="postfix-out"} > 2'
@@ -40,7 +40,7 @@ groups:
         annotations:
           summary: "Postfix is bouncing many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly bouncing lots of messages. Perhaps there are issues with the user databases."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighBounces.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighBounces"
 
       # Warn if there are many rejects on an instance: something may (or may not) be broken.
       - alert: PostfixHighRejects
@@ -63,7 +63,7 @@ groups:
         annotations:
           summary: "Postfix is rejecting many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is rejecting a high number of messages. Could be a spam attack, or something broken in the address delivery (if the failing deliveries are for real emails), which is often a DNS problem."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects"
 
       # Page if there are multiple Postfix instances with high rejects: stuff is really broken.
       - alert: PostfixHighRejects
@@ -75,7 +75,7 @@ groups:
         annotations:
           summary: "Postfix is rejecting many messages"
           description: "Multiple Postfix instances are rejecting a high number of messages. There is most likely something wrong with our internal email delivery."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects"
 
       # Alert when many emails are ratelimited (likely an account compromise)
       - alert: PostfixHighRatelimit
diff --git a/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml
new file mode 100644
index 0000000000000000000000000000000000000000..d0126f3b88679fd49e41cdbe97ea6297c347632a
--- /dev/null
+++ b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml
@@ -0,0 +1,22 @@
+groups:
+  - name: roles/ai3-prometheus/files/rules/alerts_security.conf
+    rules:
+
+      # Page when the per-host sum of the 1-minute delta of
+      # unexpected_connections (job="hark") stays above zero for 1m.
+      # NOTE(review): delta() is meant for gauges; if this metric is a
+      # counter, increase() is the usual choice - confirm with the exporter.
+      - alert: UnexpectedConnections
+        expr: sum(delta(unexpected_connections{job="hark"}[1m])) by (host) > 0
+        for: 1m
+        labels:
+          severity: page
+          scope: host
+        annotations:
+          runbook: '[[ alert_playbook_url ]]/UnexpectedConnections'
+          summary: '[SECURITY] Unexpected connections to {{$labels.host}}'
+          description: |
+            The 'hark' canary listener has detected unexpected connections
+            on host {{$labels.host}}, this could be a sign of an intruder
+            running a port scan on the internal network.
+
diff --git a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
index 39a55d65e768b723ca00221cffb090b9cb04f0fc..8830c3c3c94c67e0f54bd3f063d011db80de5d63 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
@@ -13,7 +13,7 @@ groups:
           description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
             for target {{ $labels.target }} (success ratio {{ $value }}). Check
             https://service-prober.autistici.org/ for the error details.'
-          runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+          runbook: '[[ alert_playbook_url ]]/ProbeFailure'
 
       - alert: ProbeFailure
         expr: probe:probe_success:ratio{probeset="service"} < 0.5
@@ -26,5 +26,5 @@ groups:
           description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
             globally (success ratio {{ $value }}). Check
             https://service-prober.autistici.org/ for the error details.'
-          runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+          runbook: '[[ alert_playbook_url ]]/ProbeFailure'
 