Commit d17ff9a4 authored by ale

Drop md extension from runbook URLs

parent 83ab0f1f
Pipeline #6774 failed in 22 minutes and 36 seconds
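For reference, each alert carries a runbook annotation built from the [[ alert_playbook_url ]] template variable plus the alert name; after this change the link is the bare page name, without the .md suffix. A minimal sketch of a rule in this style (the group name, expr and for values are illustrative placeholders, not taken from this repository):

    groups:
      - name: acme
        rules:
          - alert: ACMEBroken
            # expr and for below are illustrative placeholders
            expr: acme_valid_certificates == 0
            for: 30m
            annotations:
              summary: 'ACME automation completely broken'
              runbook: '[[ alert_playbook_url ]]/ACMEBroken'

Only the runbook annotations change in this commit; the alert expressions and thresholds are untouched.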
@@ -35,4 +35,4 @@ groups:
annotations:
summary: 'ACME automation completely broken'
description: 'It seems that the ACME automation has loaded zero valid certificates, something must be broken.'
-      runbook: '[[ alert_playbook_url ]]/ACMEBroken.md'
+      runbook: '[[ alert_playbook_url ]]/ACMEBroken'
@@ -17,4 +17,4 @@ groups:
annotations:
summary: '{{ $labels.dataset }} backup failure on {{ $labels.host }}'
description: 'Dataset {{ $labels.dataset }} has failed its backups on {{ $labels.host }} for two days.'
-      runbook: '[[ alert_playbook_url ]]/BackupFailed.md'
+      runbook: '[[ alert_playbook_url ]]/BackupFailed'
@@ -63,7 +63,7 @@ groups:
annotations:
summary: 'Job {{ $labels.job }} is down globally'
description: 'Job {{ $labels.job }} is down globally (availability {{ $value }}).'
-      runbook: '[[ alert_playbook_url ]]/JobDown.md'
+      runbook: '[[ alert_playbook_url ]]/JobDown'
- alert: ProbeFailure
expr: target:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -75,7 +75,7 @@ groups:
summary: 'Probe {{ $labels.probe }}@{{ $labels.target }} is failing'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
for target {{ $labels.target }} (success ratio {{ $value }}).'
-      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure'
- alert: ProbeFailure
expr: probe:probe_success:ratio{probe!="ping",probeset!="service"} < 0.5
@@ -87,4 +87,4 @@ groups:
summary: 'Probe {{ $labels.probe }} is failing globally'
description: 'Probe {{ $labels.probe }} ({{ $labels.zone }}) is failing
globally (success ratio {{ $value }}).'
-      runbook: '[[ alert_playbook_url ]]/ProbeFailure.md'
+      runbook: '[[ alert_playbook_url ]]/ProbeFailure'
@@ -22,5 +22,5 @@ groups:
annotations:
summary: 'Host {{$labels.host}} is thrashing'
description: 'Load average on host {{$labels.host}} is very high ({{$value}}), the host is likely unresponsive.'
-      runbook: '[[ alert_playbook_url ]]/HostThrashing.md'
+      runbook: '[[ alert_playbook_url ]]/HostThrashing'
@@ -10,4 +10,4 @@ groups:
annotations:
summary: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is almost full'
description: 'Disk {{ $labels.mountpoint }} on {{ $labels.instance }} will be full in less than 4 hours'
-      runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours.md'
+      runbook: '[[ alert_playbook_url ]]/DiskWillFillIn4Hours'
@@ -9,7 +9,7 @@ groups:
annotations:
description: Slave replication (IO or SQL) has been down for more than 2 minutes on {{ $labels.job }}@{{ $labels.host }}.
summary: Slave replication is not running for {{ $labels.job }}
-      runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken.md'
+      runbook: '[[ alert_playbook_url ]]/MySQLReplicationBroken'
- alert: MySQLReplicationBehind
expr: (mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0)
@@ -19,7 +19,7 @@ groups:
annotations:
description: The mysql slave replication has fallen behind and is not recovering on {{ $labels.job }}@{{ $labels.host }}.
summary: MySQL slave replication is lagging for {{ $labels.job }}
-      runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind.md'
+      runbook: '[[ alert_playbook_url ]]/MySQLReplicationBehind'
- alert: MySQLInnoDBLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
@@ -28,5 +28,5 @@ groups:
annotations:
description: The innodb logs are waiting for disk at a rate of {{$value}} / second on {{ $labels.job }}@{{ $labels.host }}
summary: MySQL innodb log writes stalling for {{ $labels.job }}
-      runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits.md'
+      runbook: '[[ alert_playbook_url ]]/MySQLInnoDBLogWaits'
@@ -9,7 +9,7 @@ groups:
annotations:
description: 'Conntrack table on {{ $labels.instance }} is more than 90% full.'
summary: 'Conntrack table on {{ $labels.instance }} is almost full'
-      runbook: '[[ alert_playbook_url ]]/ConntrackTableFull.md'
+      runbook: '[[ alert_playbook_url ]]/ConntrackTableFull'
- alert: NetworkErrors
expr: instance:node_network_errs_total:rate5m > 1
@@ -19,7 +19,7 @@ groups:
annotations:
summary: 'High rate of packet errors on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet errors on {{ $labels.instance }} device {{ $labels.device }}.'
-      runbook: '[[ alert_playbook_url ]]/NetworkErrors.md'
+      runbook: '[[ alert_playbook_url ]]/NetworkErrors'
- alert: NetworkDrops
expr: instance:node_network_drop_total:rate5m > 1
@@ -29,7 +29,7 @@ groups:
annotations:
summary: 'High rate of packet drops on {{ $labels.instance }}/{{ $labels.device }}'
description: 'High rate of packet drops on {{ $labels.instance }} device {{ $labels.device }}.'
-      runbook: '[[ alert_playbook_url ]]/NetworkDrops.md'
+      runbook: '[[ alert_playbook_url ]]/NetworkDrops'
- alert: HostUnreachable
expr: probe_success{job="blackbox_ping"} < 1
@@ -39,4 +39,4 @@ groups:
annotations:
summary: 'Host {{ $labels.instance }} is unreachable'
description: 'Host {{ $labels.instance }} is unreachable (does not respond to icmp).'
-      runbook: '[[ alert_playbook_url ]]/HostUnreachable.md'
+      runbook: '[[ alert_playbook_url ]]/HostUnreachable'
@@ -12,7 +12,7 @@ groups:
annotations:
summary: 'High HTTP error ratio for {{$labels.vhost}} globally'
description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.'
-      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh'
- alert: HTTPErrorRatioHigh
expr: (host:nginx_http_requests_errs:ratio > 0.2 and host:nginx_http_requests_total:rate5m > 0.1)
@@ -24,5 +24,5 @@ groups:
annotations:
summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}'
description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.'
-      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh.md'
+      runbook: '[[ alert_playbook_url ]]/HTTPErrorRatioHigh'
@@ -49,7 +49,7 @@ groups:
description: '{{$labels.job}} at {{$labels.instance}} has not ingested any samples
in the last 10 minutes.'
summary: '{{$labels.job}} is not ingesting samples'
-      runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples.md'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotIngestingSamples'
- alert: PrometheusPersistErrors
expr: rate(prometheus_local_storage_persist_errors_total{job=~"prometheus.*"}[10m])
@@ -73,7 +73,7 @@ groups:
notifications queue. The queue has not been empty for 10 minutes. Current
queue length: {{$value}}.'
summary: '{{$labels.job}} is backlogging on the notifications queue'
-      runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog.md'
+      runbook: '[[ alert_playbook_url ]]/PrometheusNotificationsBacklog'
- alert: PrometheusScrapingSlowly
expr: prometheus_target_interval_length_seconds{interval!~".*m.*",job=~"prometheus.*",quantile="0.9"}
@@ -123,7 +123,7 @@ groups:
description: '{{$labels.job}} at {{$labels.instance}} is approaching critical
persistence pressure. Throttled ingestion expected within the next 2h.'
summary: '{{$labels.job}} can not keep up persisting'
-      runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh.md'
+      runbook: '[[ alert_playbook_url ]]/PrometheusPersistencePressureTooHigh'
- alert: PrometheusSeriesMaintenanceStalled
expr: prometheus_local_storage_memory_series{job=~"prometheus.*"} / ON(job, instance)
@@ -149,7 +149,7 @@ groups:
description: The configuration file for {{$labels.job}} at {{$labels.instance}}
is invalid and was therefore not reloaded.
summary: '{{$labels.job}} has an invalid config'
-      runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile.md'
+      runbook: '[[ alert_playbook_url ]]/PrometheusInvalidConfigFile'
- alert: PrometheusOutOfOrderSamplesDiscarded
expr: increase(prometheus_local_storage_out_of_order_samples_total{job=~"prometheus.*"}[10m])
@@ -10,7 +10,7 @@ groups:
annotations:
description: 'Availability too low for service {{ $labels.float_service }}'
summary: 'Availability too low for service {{ $labels.float_service }}'
-      runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow.md'
+      runbook: '[[ alert_playbook_url ]]/ServiceAvailabilityTooLow'
- alert: ServiceDegraded
expr: float_service:ok_by_host == 0
@@ -10,5 +10,5 @@ groups:
annotations:
summary: 'SSL certificate about to expire for {{ $labels.job }}@{{ $labels.target }}'
description: 'The "{{ $labels.job }}" prober reports that {{ $labels.target }} is serving a SSL certificate that will expire in {{ $value }} days.'
-      runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire.md'
+      runbook: '[[ alert_playbook_url ]]/SSLCertificateAboutToExpire'
@@ -10,4 +10,4 @@ groups:
annotations:
description: 'Syslog collector is dropping logs on {{ $labels.host }}'
summary: 'Syslog collector is dropping logs'
-      runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs.md'
+      runbook: '[[ alert_playbook_url ]]/SyslogDroppingLogs'
@@ -28,5 +28,5 @@ groups:
annotations:
summary: "A physical component is running too hot on {{ $labels.host }}"
description: "A sensor is reporting that a physical component ({{ $labels.sensor }}/{{ $labels.chip }}) on {{ $labels.host }} has been running very close to the critical level ({{ $value }}) for the last 2 hours."
-      runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot.md'
+      runbook: '[[ alert_playbook_url ]]/PhysicalComponentTooHot'
@@ -19,5 +19,5 @@ groups:
annotations:
summary: 'Service {{ $labels.unit }} is crash-looping on {{ $labels.host }}'
description: 'Systemd unit {{ $labels.unit }} is being restarted repeatedly. Likely a configuration problem.'
-      runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping.md'
+      runbook: '[[ alert_playbook_url ]]/SystemdUnitCrashLooping'