From af97738cf5190ae009d033c7301821fc498514fb Mon Sep 17 00:00:00 2001 From: ale <ale@incal.net> Date: Sun, 28 Apr 2019 16:01:02 +0100 Subject: [PATCH] Update Prometheus config to support multiple instances Enable cross-scraping of probers, and add proper prober alerting rules. Configure alertmanagers' mesh protocol so they talk to each other. Make Prometheus talk to all alertmanagers. Fixes issue #6. --- passwords.yml.default | 4 + .../files/rules/alerts_base.conf.yml | 74 ++++++++- .../files/rules/rules_base.conf.yml | 32 ++++ .../templates/alertmanager.default.j2 | 2 +- .../prometheus/templates/alertmanager.yml.j2 | 11 +- roles/prometheus/templates/prometheus.yml.j2 | 146 +++++++++++------- services.yml.default | 2 + 7 files changed, 198 insertions(+), 73 deletions(-) create mode 100644 roles/prometheus/files/rules/rules_base.conf.yml diff --git a/passwords.yml.default b/passwords.yml.default index f463d00b..34e506c2 100644 --- a/passwords.yml.default +++ b/passwords.yml.default @@ -30,3 +30,7 @@ - name: acme_tsig_key type: tsig + +- name: prometheus_alertmanager_mesh_secret + description: Password for the Prometheus alertmanager mesh protocol + length: 32 diff --git a/roles/prometheus/files/rules/alerts_base.conf.yml b/roles/prometheus/files/rules/alerts_base.conf.yml index 4b68c0a7..b6dd693f 100644 --- a/roles/prometheus/files/rules/alerts_base.conf.yml +++ b/roles/prometheus/files/rules/alerts_base.conf.yml @@ -1,20 +1,78 @@ groups: - name: roles/prometheus/files/rules/alerts_base.conf rules: - - alert: InstanceDown - expr: up < 1 - for: 10m + + # HostUnreachable is used as a gate for most other host-based pages + # (via inhibit rules in the alertmanager configuration). + - alert: HostUnreachable + expr: host_reachable == 0 + for: 1m labels: severity: warn annotations: - DESCRIPTION: 'Job {{ $labels.job }} on {{ $labels.instance }} has been down - for more than 10 minutes.' 
- SUMMARY: Instance {{ $labels.job }} / {{ $labels.instance }} is down + summary: Host {{ $labels.host }} is down + description: 'Host {{ $labels.host }} is unreachable (icmp ping).' + - alert: Reboot expr: os_uptime < 600 for: 1m labels: severity: warn annotations: - DESCRIPTION: reboot on {{ $labels.instance }} - SUMMARY: reboot on {{ $labels.instance }} + description: reboot on {{ $labels.host }} + summary: reboot on {{ $labels.host }} + + - alert: JobDown + expr: up < 1 + for: 5m + labels: + severity: warn + scope: host + annotations: + summary: Job {{ $labels.job }}@{{ $labels.host }} is down + description: 'Job {{ $labels.job }} on {{ $labels.host }} has been down + for more than 5 minutes. If this is a prober job, then the alert refers + to the prometheus-blackbox-exporter service itself.' + + - alert: JobDown + expr: job:up:ratio < 1 + for: 5m + labels: + severity: warn + scope: global + annotations: + summary: Job {{ $labels.job }} has degraded redundancy + description: 'Job {{ $labels.job }} is running with slightly degraded + redundancy ({{$value}}) and may eventually be at risk.' + + - alert: JobDown + expr: job:up:ratio < 0.51 + for: 5m + labels: + severity: page + scope: global + annotations: + summary: Job {{ $labels.job }} is down globally + description: 'Job {{ $labels.job }} is down globally (availability {{$value}}).' + + - alert: ProbeFailure + expr: target:probe_success:ratio{probe!="ping"} < 0.5 + for: 5m + labels: + severity: page + scope: host + annotations: + summary: Probe {{ $labels.probe }}@{{ $labels.target }} is failing + description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing + for target {{ $labels.target }} (success ratio {{ $value }}).' 
+ + - alert: ProbeFailure + expr: probe:probe_success:ratio{probe!="ping"} < 0.5 + for: 5m + labels: + severity: page + scope: global + annotations: + summary: Probe {{ $labels.probe }} is failing globally + description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing + globally (success ratio {{ $value }}).' diff --git a/roles/prometheus/files/rules/rules_base.conf.yml b/roles/prometheus/files/rules/rules_base.conf.yml new file mode 100644 index 00000000..cdc16006 --- /dev/null +++ b/roles/prometheus/files/rules/rules_base.conf.yml @@ -0,0 +1,32 @@ +groups: + - name: roles/prometheus/files/rules/rules_base.conf + rules: + - record: job:up:count + expr: count(up) by (job) + - record: job:up:sum + expr: sum(up) by (job) + - record: job:up:ratio + expr: job:up:sum / job:up:count + + # Sum prober metrics over the probers (hosts), producing + # an aggregation by target. + - record: target:probe_success:count + expr: count(probe_success) by (probe,zone,target) + - record: target:probe_success:sum + expr: sum(probe_success) by (probe,zone,target) + - record: target:probe_success:ratio + expr: target:probe_success:sum / target:probe_success:count + + # Sum prober metrics over targets, aggregating by probe. + - record: probe:probe_success:count + expr: count(probe_success) by (probe,zone) + - record: probe:probe_success:sum + expr: sum(probe_success) by (probe,zone) + - record: probe:probe_success:ratio + expr: probe:probe_success:sum / probe:probe_success:count + + # Special metric for the ping probe. The label_replace() sets + # the host to the value of the target label (instead of the host + # running the prober). 
+ - record: host_reachable + expr: label_replace(target:probe_success:ratio{probe="ping"} > bool 0.6, "host", "$1", "target", "(.*)") diff --git a/roles/prometheus/templates/alertmanager.default.j2 b/roles/prometheus/templates/alertmanager.default.j2 index fa1d17c3..61b85cec 100644 --- a/roles/prometheus/templates/alertmanager.default.j2 +++ b/roles/prometheus/templates/alertmanager.default.j2 @@ -1,2 +1,2 @@ -ARGS="-web.external-url=https://alertmanager.{{ domain_public[0] }}" +ARGS="-web.external-url=https://alertmanager.{{ domain_public[0] }} --mesh.password={{ prometheus_alertmanager_mesh_secret }} --mesh.listen-address=:6783{% for h in groups['prometheus'] %} --mesh.peer={{ h }}.prometheus.{{ domain }}:6783{% endfor %}" diff --git a/roles/prometheus/templates/alertmanager.yml.j2 b/roles/prometheus/templates/alertmanager.yml.j2 index 042f7e58..716173ff 100644 --- a/roles/prometheus/templates/alertmanager.yml.j2 +++ b/roles/prometheus/templates/alertmanager.yml.j2 @@ -12,19 +12,22 @@ route: repeat_interval: 3h receiver: default +# The following inhibit rules are meant to work both with the job/host +# hierarchy (for standard metrics) and with the probe/target one (for +# the blackbox prober metrics). inhibit_rules: - # Inhibit severity=warning alerts if the same alert is already active with severity=page. + # Inhibit severity=warn alerts if the same alert is already active with severity=page. - source_match: severity: 'page' target_match: - severity: 'warning' - equal: ['alertname', 'service'] + severity: 'warn' + equal: ['alertname', 'job', 'probe'] # Inhibit scope=host alerts if the same alert is active with scope=global. - source_match: scope: 'global' target_match: scope: 'host' - equal: ['alertname', 'service'] + equal: ['alertname', 'job', 'probe'] # Inhibit all host-level alerts if HostUnreachable is firing. 
- source_match: alertname: HostUnreachable diff --git a/roles/prometheus/templates/prometheus.yml.j2 b/roles/prometheus/templates/prometheus.yml.j2 index 576b9761..6374e5da 100644 --- a/roles/prometheus/templates/prometheus.yml.j2 +++ b/roles/prometheus/templates/prometheus.yml.j2 @@ -7,60 +7,37 @@ replacement: "${1}" {% endmacro %} -global: - scrape_interval: "10s" - external_labels: - monitor: "{{ inventory_hostname }}" - -alerting: - alertmanagers: - - static_configs: +{# Generate static targets for hosts in an Ansible group #} +{% macro targets_for_group(group, port) %} - targets: - - localhost:9093 - -scrape_configs: - - job_name: "node" - static_configs: - - targets: -{% for host in groups['all'] %}{% if host != 'localhost' %} - - "{{ host }}:9100" -{% endif %}{% endfor %} -{{ relabel_configs() }} - - - job_name: "cgroups-exporter" - static_configs: - - targets: -{% for host in groups['all'] %}{% if host != 'localhost' %} - - "{{ host }}:3909" -{% endif %}{% endfor %} -{{ relabel_configs() }} - - - job_name: "mtail" - static_configs: - - targets: -{% for host in groups['all'] %}{% if host != 'localhost' %} - - "{{ host }}:3903" -{% endif %}{% endfor %} -{{ relabel_configs() }} +{% for host in groups[group] %} + - "{{ host }}:{{ port }}" +{% endfor %} +{% endmacro %} - - job_name: "docker" +{# Generate a static_configs entry for a scrape config #} +{% macro static_configs_for_group(group, port, service_label='') %} static_configs: - - targets: -{% for host in groups['all'] %}{% if host != 'localhost' %} - - "{{ host }}:9323" -{% endif %}{% endfor %} -{{ relabel_configs() }} +{{ targets_for_group(group, port) }} +{% if service_label %} + labels: + service: "{{ service_label }}" +{% endif %} + relabel_configs: + - source_labels: [__address__] + target_label: host + regex: "([^.]*).*:[0-9]+" + replacement: "${1}" +{% endmacro %} - - job_name: "backup-agent" - static_configs: - - targets: -{% for host in groups['all'] %}{% if host != 'localhost' %} - - "{{ host 
}}:5331" -{% endif %}{% endfor %} -{{ relabel_configs() }} +{# Simple group-based job scrape config #} +{% macro job_static_config(job_name, group, port) %} + - job_name: "{{ job_name }}" +{{ static_configs_for_group(group, port) }} +{% endmacro %} -{# Template for a static target. #} -{% macro static_target(service_name, target_config) %} +{# Job scrape config for a float service. #} +{% macro job_service_config(service_name, target_config) %} - job_name: "{{ target_config.get('job_name', service_name) }}" scheme: "{{ target_config.get('scheme', 'https') }}" {% if target_config.get('scheme', 'https') == 'https' %} @@ -69,42 +46,91 @@ scrape_configs: cert_file: /etc/credentials/x509/prometheus/client/cert.pem key_file: /etc/credentials/x509/prometheus/client/private_key.pem {% endif %} - static_configs: - - targets: -{% for host in groups[target_config.get('group', service_name)] %} - - "{{ host }}.{{ service_name }}.{{ domain }}:{{ target_config['port'] }}" -{% endfor %} -{{ relabel_configs() }} - +{{ static_configs_for_group(target_config.get('group', service_name), target_config.port, service_name) }} {% endmacro %} +global: + scrape_interval: "10s" + external_labels: + monitor: "{{ inventory_hostname }}" + +alerting: + alertmanagers: + - static_configs: +{{ targets_for_group('prometheus', 9093) }} + +scrape_configs: +{{ job_static_config('node', 'all', 9100) }} +{{ job_static_config('cgroups-exporter', 'all', 3909) }} +{{ job_static_config('mtail', 'all', 3903) }} +{{ job_static_config('docker', 'all', 9323) }} +{{ job_static_config('backup-agent', 'all', 5331) }} + + {# Iterate over monitoring endpoints of all services. 
#} {% for service_name, service in services.items() %} {%- for target_config in service.get('monitoring_endpoints', []) %} -{{ static_target(service_name, target_config) }} +{{ job_service_config(service_name, target_config) }} {%- endfor %} {% endfor %} {# Blackbox prober configs: host targets (icmp) #} -{# TODO: scrape *all* blackbox exporters #} - - job_name: "blackbox_ping" +{% for prober_host in groups['prometheus'] %} + + - job_name: "prober_ping_{{ loop.index }}" metrics_path: "/probe" params: module: [ping] + relabel_configs: + - source_labels: [__address__] + target_label: target + regex: "([^.]*).*" + replacement: "${1}" + - source_labels: [__address__] + target_label: __param_target + - source_labels: [__param_target] + target_label: instance + - target_label: __address__ + replacement: {{ prober_host }}:9115 + - target_label: host + replacement: {{ prober_host }} static_configs: - targets: {% for host in groups['all'] %} - "{{ host }}" {% endfor %} + labels: + zone: internal + probe: ping + + - job_name: "prober_https_{{ loop.index }}" + metrics_path: "/probe" + params: + module: [https] relabel_configs: + - source_labels: [__address__] + target_label: target + regex: "https?://([^.:/]*).*" + replacement: "${1}" - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ - replacement: 127.0.0.1:9115 # The blackbox exporter's real hostname:port. 
+ replacement: {{ prober_host }}:9115 + - target_label: host + replacement: {{ prober_host }} + static_configs: + - targets: +{% for host in groups['frontend'] %} + - "https://{{ host }}/" +{% endfor %} + labels: + zone: public + probe: https +{% endfor %} rule_files: - /etc/prometheus/rules/*.yml diff --git a/services.yml.default b/services.yml.default index d68f94a9..60f62ca0 100644 --- a/services.yml.default +++ b/services.yml.default @@ -102,6 +102,8 @@ prometheus: - prometheus.service - prometheus-alertmanager.service - prometheus-blackbox-exporter.service + ports: + - 6783 sso-server: num_instances: 1 -- GitLab