diff --git a/docs/reference.md b/docs/reference.md index ff5160bc571871933da4ed4ae3345b9f47a65d23..72b75f63f5ecf98f271d48b4a67c3040cade3883 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1120,7 +1120,7 @@ all the XML and weird enterprise edge cases. / Apache2 modules. * [git.autistici.org/id/go-sso](https://git.autistici.org/id/go-sso) SSO server and SSO proxy implementation. -* [the sso-server role README](../roles/sso-server/README.md) has +* [the sso-server role README](../roles/float-infra-sso-server/README.md) has details about the Ansible configuration of SSO parameters. ### User-encrypted secrets @@ -1265,8 +1265,7 @@ Monitoring dashboards are provided by Grafana. A few alerting rules are provided by default in -[roles/prometheus/files/rules/](roles/prometheus/files/rules/). This -includes: +[roles/float-infra-prometheus/templates/rules/](roles/float-infra-prometheus/templates/rules/). This includes: * host-level alerts (high CPU usage, disk full, network errors...) * service failures (systemd services down, or crash-looping) @@ -1472,7 +1471,7 @@ indexes. Float uses the following index types: * *audit-\** for audit logs, which usually have a longer retention We use Elasticsearch index templates (in -roles/log-collector/templates/elasticsearch/templates) to optimize the +roles/float-infra-log-collector/templates/elasticsearch/templates) to optimize the schema a bit, disabling indexing on problematic fields, and setting sane replication options. diff --git a/roles/float-infra-prometheus/README.md b/roles/float-infra-prometheus/README.md index 2d70f4b6d17d3ca19254182a130791586ee41221..fcfb3749ca1801736e96ad7e5bc61781bfb7ad24 100644 --- a/roles/float-infra-prometheus/README.md +++ b/roles/float-infra-prometheus/README.md @@ -48,7 +48,7 @@ but it will still be active and functional (via *amtool*). A few alerting rules are provided by default in -[roles/prometheus/files/rules/](roles/prometheus/files/rules/). 
This +[roles/float-infra-prometheus/templates/rules/](roles/float-infra-prometheus/templates/rules/). This includes: * host-level alerts (high CPU usage, disk full, network errors...) diff --git a/roles/float-infra-prometheus/templates/rules/alerts_acme.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_acme.conf.yml index a6bdf74e6df3070ccfc9f5f57144b401ce993037..baa8ab552f4012740ecce3637b16b1a9b5353efb 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_acme.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_acme.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_acme.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_acme.conf rules: - alert: CertMissing diff --git a/roles/float-infra-prometheus/templates/rules/alerts_backup.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_backup.conf.yml index 1937f76689040fd13f4856560928a139a7021427..429d90c5d33e4f8891ce90ee050c20311cbb1744 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_backup.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_backup.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_backup.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_backup.conf rules: - alert: BackupFailed expr: backup_ok != 1 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml index e4a2061e1408ad2ea942b05a789d5e725138cc13..ab5bbb00f673d5465c86ab9716de930477b67143 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_base.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_base.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_base.conf rules: # HostUnreachable is used as a gate for most other host-based pages diff --git 
a/roles/float-infra-prometheus/templates/rules/alerts_cpu.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_cpu.conf.yml index 4a94bde60ada47845c00fe6bd73466fe42338ff3..ec1b3a5d4db3a8d77845993aae59c489c0c5823a 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_cpu.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_cpu.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_cpu.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_cpu.conf rules: - alert: CPUUsageHigh expr: instance_utilization:rate5m > 0.96 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_disk.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_disk.conf.yml index 1366f74d5edc9b7cd04128a0dee14d223ed7989e..41a20cde2eef421924cf0bf551ca6a56a63427ee 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_disk.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_disk.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_disk.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_disk.conf rules: - alert: DiskWillFillIn4Hours expr: (predict_linear(node_filesystem_free_bytes[1h], 4 * 3600) < 0) and (node_filesystem_free_bytes / node_filesystem_size_bytes < 0.6) diff --git a/roles/float-infra-prometheus/templates/rules/alerts_mtail.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_mtail.conf.yml index e05b8e417f3e982c1604316fe2cc57609128fbb7..9c2061164153e12b4daff8f250e82d59d99e7549 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_mtail.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_mtail.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_mtail.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_mtail.conf rules: - alert: MtailProgramErrors expr: delta(mtail_prog_load_errors_total[1h]) > 0 diff --git 
a/roles/float-infra-prometheus/templates/rules/alerts_mysql.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_mysql.conf.yml index 3b77e822e484061d0867a83894c36c5d95621cb7..9521b6d6b0253dec25966c08622e789dc230a419 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_mysql.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_mysql.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/alerts_mysql.conf + - name: roles/float-infra-prometheus/templates/rules/alerts_mysql.conf rules: - alert: MySQLReplicationBroken expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_net.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_net.conf.yml index 22c0ef239ab369c8fccd50bd68d0f2a7f4510a9e..30683b6dc2e9d6ec6f5aa5fb95448be3e294a3f2 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_net.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_net.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_net.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_net.conf rules: - alert: ConntrackTableFull expr: instance:conntrack_full:ratio > 0.9 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_nginx.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_nginx.conf.yml index 7c9279659c6facc8cb1bd5e3db627ec13808dc1e..5c63354f0762ed924b9cda9bff6cb36a0b5a9182 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_nginx.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_nginx.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_nginx.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_nginx.conf rules: - alert: HTTPErrorRatioHigh diff --git a/roles/float-infra-prometheus/templates/rules/alerts_prometheus.conf.yml 
b/roles/float-infra-prometheus/templates/rules/alerts_prometheus.conf.yml index c7f539ebe9a4dba3bcea6ab62ae1acd166e31853..3f1b746005d2c4031499b2ecc2205138d782ea6a 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_prometheus.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_prometheus.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_prometheus.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_prometheus.conf rules: - alert: NodeExporterBroken diff --git a/roles/float-infra-prometheus/templates/rules/alerts_services.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_services.conf.yml index eb49748d4806585d534775a54f406e0158409174..fc486f8c5983b7b0bedcbbd181cd2ba108be0e89 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_services.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_services.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/alerts_services.conf + - name: roles/float-infra-prometheus/templates/rules/alerts_services.conf rules: - alert: ServiceAvailabilityTooLow expr: float_service:ok:ratio < 0.6 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_ssl_probes.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_ssl_probes.conf.yml index ae326f859429a4d24fdfd1ef7caeb665ca4e27f1..41de5c73ddfa4eca75c0bc6ca51c6e0c940cdf6d 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_ssl_probes.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_ssl_probes.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_ssl_probes.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_ssl_probes.conf rules: - alert: SSLCertificateAboutToExpire expr: target:probe_ssl_cert_expiry:days < 10 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_syslog.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_syslog.conf.yml index 
8f239c792ea4e3787657a6443d950f643fca028f..7bd61974a2d68e7fafd518818caaf94b82f1446c 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_syslog.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_syslog.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/alerts_syslog.conf + - name: roles/float-infra-prometheus/templates/rules/alerts_syslog.conf rules: - alert: SyslogDroppingLogs expr: rsyslog_queue_discarded:rate5m{job="rsyslog-collector"} > 0.04 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml index a8424bef94ffa6fbffa44e169ec46cba8cc2da54..b29f903034e597436224cfaa062e770264a78cb5 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_system_health.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_system_health.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_system_health.conf rules: - alert: DiskUnhealthy expr: smartmon_device_smart_healthy < 1 diff --git a/roles/float-infra-prometheus/templates/rules/alerts_systemd.conf.yml b/roles/float-infra-prometheus/templates/rules/alerts_systemd.conf.yml index a22faee75eaf1c519deade19b61a820456db7ced..4a9cdacfdbafbb553b4d92015685086c15f49ddf 100644 --- a/roles/float-infra-prometheus/templates/rules/alerts_systemd.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/alerts_systemd.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/alerts_systemd.conf +- name: roles/float-infra-prometheus/templates/rules/alerts_systemd.conf rules: - alert: SystemdUnitFailed expr: node_systemd_unit_state{state="failed"} > 0 diff --git a/roles/float-infra-prometheus/templates/rules/rules_acme.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_acme.conf.yml index 
13792587299d30d3c3e83c03262e7332ac92fd8a..58da68273c97d60e6766a00e6a10a4b5b73db8cb 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_acme.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_acme.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_acme.conf +- name: roles/float-infra-prometheus/templates/rules/rules_acme.conf rules: - record: cn:cert_ok expr: max(cert_ok) by (cn) diff --git a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml index cd3b4ab3f25fcd141824cc4c7076bf1935222293..1dd308271052710bfbfbc1e315fc977833a20b3f 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_base.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_base.conf + - name: roles/float-infra-prometheus/templates/rules/rules_base.conf rules: - record: job:up:count expr: count(up) by (job) diff --git a/roles/float-infra-prometheus/templates/rules/rules_cpu.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_cpu.conf.yml index 3d8f45ca2d206951067e182b7dcb800c64174c9f..4c29f336a3f028a601726ccca3a8db54237e9012 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_cpu.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_cpu.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_cpu.conf +- name: roles/float-infra-prometheus/templates/rules/rules_cpu.conf rules: - record: instance:node_cpus:count expr: count(node_cpu_seconds_total{mode="idle"}) by (host, instance) diff --git a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml index d7e5cc042df4f1de37ea2cbcd59118a12f13e0ac..73d168de041192586bd30731c51f67d7b40e151c 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml +++ 
b/roles/float-infra-prometheus/templates/rules/rules_disk.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_disk.conf +- name: roles/float-infra-prometheus/templates/rules/rules_disk.conf rules: - record: instance:node_disk_writes_completed_total:irate1m expr: sum(irate(node_disk_writes_completed_total{device=~"([vs]d|nvme).*"}[1m])) WITHOUT (device) diff --git a/roles/float-infra-prometheus/templates/rules/rules_elasticsearch.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_elasticsearch.conf.yml index 9da07ba5c8f0dcf0796c8ce2dceeedc929b515f5..6d939be0a8ebf0273e163a58c012b730eba55219 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_elasticsearch.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_elasticsearch.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_elasticsearch.conf + - name: roles/float-infra-prometheus/templates/rules/rules_elasticsearch.conf rules: - record: elasticsearch_filesystem_data_used_percent expr: 100 * (elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_free_bytes) / elasticsearch_filesystem_data_size_bytes diff --git a/roles/float-infra-prometheus/templates/rules/rules_mysql.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_mysql.conf.yml index 37bd767f8e69332a33f6285c33f593ea2807c558..48cb1734152ae8136696e39ef733bfa64523947f 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_mysql.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_mysql.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_mysql.conf + - name: roles/float-infra-prometheus/templates/rules/rules_mysql.conf rules: - record: mysql_slave_lag_seconds expr: mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay diff --git a/roles/float-infra-prometheus/templates/rules/rules_net.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_net.conf.yml index 
03e3a64cfafbd8bc1e6f72b491ed2d7438a94258..ba32532b1d9fd2a09a8914d715f5b422dd764bcf 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_net.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_net.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_net.conf +- name: roles/float-infra-prometheus/templates/rules/rules_net.conf rules: - record: instance:conntrack_full:ratio expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit diff --git a/roles/float-infra-prometheus/templates/rules/rules_nginx.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_nginx.conf.yml index 022edbffa1dc0dab8bbe3ba298adbf0accd42100..5eaee6b2ad98a1f8aed93a9a99ff951572b5b2a9 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_nginx.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_nginx.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_nginx.conf +- name: roles/float-infra-prometheus/templates/rules/rules_nginx.conf rules: - record: host:nginx_http_requests_total:rate5m expr: sum(rate(nginx_http_requests[5m])) by (host, vhost) diff --git a/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml index 553f00c75542b373b2c633f99b3bb6e75d085fc2..abe03a8e6bd3b39d1efb75bfa93bc8c8032c36eb 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_services.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_services.conf + - name: roles/float-infra-prometheus/templates/rules/rules_services.conf rules: - record: job:total:count expr: count(up) by (job) diff --git a/roles/float-infra-prometheus/templates/rules/rules_ssl_probes.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_ssl_probes.conf.yml index 4db3107abf7f416b79ae5238d3d4d2be5a669b4c..1997eb1fe28863d97d97d88689384508585a0999 
100644 --- a/roles/float-infra-prometheus/templates/rules/rules_ssl_probes.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_ssl_probes.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_ssl_probes.conf + - name: roles/float-infra-prometheus/templates/rules/rules_ssl_probes.conf rules: - record: target:probe_ssl_cert_expiry:days expr: ((min(probe_ssl_earliest_cert_expiry) by (probe,target)) - time()) / 86400 diff --git a/roles/float-infra-prometheus/templates/rules/rules_syslog.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_syslog.conf.yml index 6e16ceaa2fc13586c56c979b20e11a27184c3328..57b8ec80dbe9239fc0cac2ac21f970cbacef0855 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_syslog.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_syslog.conf.yml @@ -1,5 +1,5 @@ groups: - - name: roles/prometheus/files/rules/rules_syslog.conf + - name: roles/float-infra-prometheus/templates/rules/rules_syslog.conf rules: - record: rsyslog_action_failed:rate5m expr: rate(rsyslog_action_failed[5m]) diff --git a/roles/float-infra-prometheus/templates/rules/rules_systemd.conf.yml b/roles/float-infra-prometheus/templates/rules/rules_systemd.conf.yml index 0e7c4902c6d82e2a33bbcf8c3b05c53392413629..2aee5c53bcd8a41f1584aa39bb82d9680e6cca10 100644 --- a/roles/float-infra-prometheus/templates/rules/rules_systemd.conf.yml +++ b/roles/float-infra-prometheus/templates/rules/rules_systemd.conf.yml @@ -1,5 +1,5 @@ groups: -- name: roles/prometheus/files/rules/rules_systemd.conf +- name: roles/float-infra-prometheus/templates/rules/rules_systemd.conf rules: - record: instance:systemd_unit_restarts:delta10m expr: delta(systemd_unit_restarts[10m])