From d9f6e97d54da3e6da6e2e02ba1addcba473ff2f7 Mon Sep 17 00:00:00 2001 From: ale <ale@incal.net> Date: Sat, 16 May 2020 08:55:23 +0100 Subject: [PATCH] Add hark alerts --- .../templates/rules/alerts_auth.conf.yml | 4 ++-- .../templates/rules/alerts_postfix.conf.yml | 8 ++++---- .../templates/rules/alerts_security.conf.yml | 18 ++++++++++++++++++ .../rules/alerts_service_prober.conf.yml | 4 ++-- 4 files changed, 26 insertions(+), 8 deletions(-) create mode 100644 roles/ai3-prometheus/templates/rules/alerts_security.conf.yml diff --git a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml index 30b99ce8..4f05a901 100644 --- a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml +++ b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml @@ -17,7 +17,7 @@ groups: annotations: summary: 'Too many login failures for service {{$labels.exported_service}}' description: 'The percentage of successful authentications on auth-server with service={{$labels.exported_service}} is too low ({{$value}}). This can indicate a brute-forcing attack (depending on the service), or a failure in the auth-server itself.' - runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures.md' + runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures' # We also want to check that, for some important services, there # actually are any successful logins. The threshold is low on @@ -35,5 +35,5 @@ groups: annotations: summary: 'No successful logins for service {{$labels.exported_service}}' description: 'The auth-server is not reporting successful logins with service={{$labels.exported_service}}. This might indicate something broken with the auth-server itself.' 
- runbook: '[[ alert_playbook_url ]]/NoLogins.md' + runbook: '[[ alert_playbook_url ]]/NoLogins' diff --git a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml index d5c98c7e..eb07a1e9 100644 --- a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml +++ b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml @@ -29,7 +29,7 @@ groups: annotations: summary: "Postfix is deferring many messages on {{$labels.postfix_instance}}" description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly deferring lots of messages. Perhaps some of the expected destinations are unreachable." - runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred.md" + runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred" - alert: PostfixHighBounces expr: 'instance:smtp_bounces:rate10m{postfix_instance!="postfix-out"} > 2' @@ -40,7 +40,7 @@ groups: annotations: summary: "Postfix is bouncing many messages on {{$labels.postfix_instance}}" description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly bouncing lots of messages. Perhaps there are issues with the user databases." - runbook: "[[ alert_playbook_url ]]/PostfixHighBounces.md" + runbook: "[[ alert_playbook_url ]]/PostfixHighBounces" # Warn if there are many rejects on an instance: something may (or may not) be broken. - alert: PostfixHighRejects @@ -63,7 +63,7 @@ groups: annotations: summary: "Postfix is rejecting many messages on {{$labels.postfix_instance}}" description: "The Postfix instance {{$labels.postfix_instance}} is rejecting a high number of messages. Could be a spam attack, or something broken in the address delivery (if the failing deliveries are for real emails), which is often a DNS problem." - runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md" + runbook: "[[ alert_playbook_url ]]/PostfixHighRejects" # Page if there are multiple Postfix instances with high rejects: stuff is really broken. 
- alert: PostfixHighRejects @@ -75,7 +75,7 @@ groups: annotations: summary: "Postfix is rejecting many messages" description: "Multiple Postfix instances are rejecting a high number of messages. There is most likely something wrong with our internal email delivery." - runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md" + runbook: "[[ alert_playbook_url ]]/PostfixHighRejects" # Alert when many emails are ratelimited (likely an account compromise) - alert: PostfixHighRatelimit diff --git a/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml new file mode 100644 index 00000000..d0126f3b --- /dev/null +++ b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml @@ -0,0 +1,18 @@ +groups: + - name: roles/ai3-prometheus/files/rules/alerts_security.conf + rules: + + - alert: UnexpectedConnections + expr: sum(delta(unexpected_connections{job="hark"}[1m])) by (host) > 0 + for: 1m + labels: + severity: page + scope: host + annotations: + runbook: '[[ alert_playbook_url ]]/UnexpectedConnections' + summary: '[SECURITY] Unexpected connections to {{$labels.host}}' + description: | + The 'hark' canary listener has detected unexpected connections + on host {{$labels.host}}, this could be a sign of an intruder + running a port scan on the internal network. + diff --git a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml index 39a55d65..8830c3c3 100644 --- a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml +++ b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml @@ -13,7 +13,7 @@ groups: description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing for target {{ $labels.target }} (success ratio {{ $value }}). Check https://service-prober.autistici.org/ for the error details.' 
- runbook: '[[ alert_playbook_url ]]/ProbeFailure.md' + runbook: '[[ alert_playbook_url ]]/ProbeFailure' - alert: ProbeFailure expr: probe:probe_success:ratio{probeset="service"} < 0.5 @@ -26,5 +26,5 @@ groups: description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing globally (success ratio {{ $value }}). Check https://service-prober.autistici.org/ for the error details.' - runbook: '[[ alert_playbook_url ]]/ProbeFailure.md' + runbook: '[[ alert_playbook_url ]]/ProbeFailure' -- GitLab