From d9f6e97d54da3e6da6e2e02ba1addcba473ff2f7 Mon Sep 17 00:00:00 2001
From: ale <ale@incal.net>
Date: Sat, 16 May 2020 08:55:23 +0100
Subject: [PATCH] Add hark alerts

---
 .../templates/rules/alerts_auth.conf.yml       |  4 ++--
 .../templates/rules/alerts_postfix.conf.yml    |  8 ++++----
 .../templates/rules/alerts_security.conf.yml   | 18 ++++++++++++++++++
 .../rules/alerts_service_prober.conf.yml       |  4 ++--
 4 files changed, 26 insertions(+), 8 deletions(-)
 create mode 100644 roles/ai3-prometheus/templates/rules/alerts_security.conf.yml

diff --git a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
index 30b99ce8..4f05a901 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_auth.conf.yml
@@ -17,7 +17,7 @@ groups:
         annotations:
           summary: 'Too many login failures for service {{$labels.exported_service}}'
           description: 'The percentage of successful authentications on auth-server with service={{$labels.exported_service}} is too low ({{$value}}). This can indicate a brute-forcing attack (depending on the service), or a failure in the auth-server itself.'
-          runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures.md'
+          runbook: '[[ alert_playbook_url ]]/TooManyLoginFailures'
 
       # We also want to check that, for some important services, there
       # actually are any successful logins. The threshold is low on
@@ -35,5 +35,5 @@ groups:
         annotations:
           summary: 'No successful logins for service {{$labels.exported_service}}'
           description: 'The auth-server is not reporting successful logins with service={{$labels.exported_service}}. This might indicate something broken with the auth-server itself.'
-          runbook: '[[ alert_playbook_url ]]/NoLogins.md'
+          runbook: '[[ alert_playbook_url ]]/NoLogins'
 
diff --git a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
index d5c98c7e..eb07a1e9 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_postfix.conf.yml
@@ -29,7 +29,7 @@ groups:
         annotations:
           summary: "Postfix is deferring many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly deferring lots of messages. Perhaps some of the expected destinations are unreachable."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighDeferred"
 
       - alert: PostfixHighBounces
         expr: 'instance:smtp_bounces:rate10m{postfix_instance!="postfix-out"} > 2'
@@ -40,7 +40,7 @@ groups:
         annotations:
           summary: "Postfix is bouncing many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is unexpectedly bouncing lots of messages. Perhaps there are issues with the user databases."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighBounces.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighBounces"
 
       # Warn if there are many rejects on an instance: something may (or may not) be broken.
       - alert: PostfixHighRejects
@@ -63,7 +63,7 @@ groups:
         annotations:
           summary: "Postfix is rejecting many messages on {{$labels.postfix_instance}}"
           description: "The Postfix instance {{$labels.postfix_instance}} is rejecting a high number of messages. Could be a spam attack, or something broken in the address delivery (if the failing deliveries are for real emails), which is often a DNS problem."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects"
 
       # Page if there are multiple Postfix instances with high rejects: stuff is really broken.
       - alert: PostfixHighRejects
@@ -75,7 +75,7 @@ groups:
         annotations:
           summary: "Postfix is rejecting many messages"
           description: "Multiple Postfix instances are rejecting a high number of messages. There is most likely something wrong with our internal email delivery."
-          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects.md"
+          runbook: "[[ alert_playbook_url ]]/PostfixHighRejects"
 
       # Alert when many emails are ratelimited (likely an account compromise)
       - alert: PostfixHighRatelimit
diff --git a/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml
new file mode 100644
index 00000000..d0126f3b
--- /dev/null
+++ b/roles/ai3-prometheus/templates/rules/alerts_security.conf.yml
@@ -0,0 +1,18 @@
+groups:
+  - name: roles/ai3-prometheus/files/rules/alerts_security.conf
+    rules:
+      # Page per host when the 1m delta of hark's unexpected_connections is > 0.
+      - alert: UnexpectedConnections
+        expr: sum(delta(unexpected_connections{job="hark"}[1m])) by (host) > 0
+        for: 1m
+        labels:
+          severity: page
+          scope: host
+        annotations:
+          runbook: '[[ alert_playbook_url ]]/UnexpectedConnections'
+          summary: '[SECURITY] Unexpected connections to {{$labels.host}}'
+          description: |
+              The 'hark' canary listener has detected unexpected connections
+              on host {{$labels.host}}, this could be a sign of an intruder
+              running a port scan on the internal network.
+
diff --git a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
index 39a55d65..8830c3c3 100644
--- a/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
+++ b/roles/ai3-prometheus/templates/rules/alerts_service_prober.conf.yml
@@ -13,7 +13,7 @@ groups:
       description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
         for target {{ $labels.target }} (success ratio {{ $value }}). Check
         https://service-prober.autistici.org/ for the error details.'
-      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure'
 
   - alert: ProbeFailure
     expr: probe:probe_success:ratio{probeset="service"} < 0.5
@@ -26,5 +26,5 @@ groups:
       description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
         globally (success ratio {{ $value }}). Check
         https://service-prober.autistici.org/ for the error details.'
-      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure'
 
-- 
GitLab