Skip to content
Snippets Groups Projects
Commit 3dd2fe61 authored by ale's avatar ale
Browse files

Convert network alerts to use ratios

parent aaed8153
No related branches found
No related tags found
No related merge requests found
...@@ -12,17 +12,21 @@ groups: ...@@ -12,17 +12,21 @@ groups:
runbook: '[[ alert_runbook_fmt | format("ConntrackTableFull") ]]' runbook: '[[ alert_runbook_fmt | format("ConntrackTableFull") ]]'
- alert: NetworkErrors - alert: NetworkErrors
expr: instance:node_network_errs_total:rate5m > [[ prometheus_network_error_rate_threshold | default(5) ]] expr: >
(instance:node_network_errs:ratio > [[ prometheus_network_error_ratio_threshold | default(0.1) ]]) and
(instance:node_network_packets_total:rate5m > 10)
for: 15m for: 15m
labels: labels:
severity: warn severity: page
annotations: annotations:
summary: 'High rate of packet errors on {{ $labels.host }}/{{ $labels.device }}' summary: 'High rate of packet errors on {{ $labels.host }}/{{ $labels.device }}'
description: 'High rate of packet errors on {{ $labels.host }} device {{ $labels.device }}.' description: 'High rate of packet errors on {{ $labels.host }} device {{ $labels.device }}.'
runbook: '[[ alert_runbook_fmt | format("NetworkErrors") ]]' runbook: '[[ alert_runbook_fmt | format("NetworkErrors") ]]'
- alert: NetworkDrops - alert: NetworkDrops
expr: instance:node_network_drop_total:rate5m > [[ prometheus_network_drop_rate_threshold | default(10) ]] expr: >
(instance:node_network_drop:ratio > [[ prometheus_network_drop_ratio_threshold | default(0.1) ]]) and
(instance:node_network_packets_total:rate5m > 10)
for: 15m for: 15m
labels: labels:
severity: warn severity: warn
......
...@@ -3,14 +3,24 @@ groups: ...@@ -3,14 +3,24 @@ groups:
rules: rules:
- record: instance:conntrack_full:ratio - record: instance:conntrack_full:ratio
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit
- record: instance:node_network_errs_total - record: instance:node_network_errs_total
expr: node_network_receive_errs_total + node_network_transmit_errs_total expr: node_network_receive_errs_total + node_network_transmit_errs_total
- record: instance:node_network_drop_total - record: instance:node_network_drop_total
expr: node_network_receive_drop_total + node_network_transmit_drop_total expr: node_network_receive_drop_total + node_network_transmit_drop_total
# Combined receive + transmit packet counter. The record name must match the
# series read by the instance:node_network_packets_total:rate5m rule, otherwise
# the packet rate, the errs/drop ratios and the alerts built on them stay empty.
- record: instance:node_network_packets_total
expr: node_network_receive_packets_total + node_network_transmit_packets_total
- record: instance:node_network_errs_total:rate5m - record: instance:node_network_errs_total:rate5m
expr: rate(instance:node_network_errs_total[5m]) expr: rate(instance:node_network_errs_total[5m])
- record: instance:node_network_drop_total:rate5m - record: instance:node_network_drop_total:rate5m
expr: rate(instance:node_network_drop_total[5m]) expr: rate(instance:node_network_drop_total[5m])
# Per-second packet rate over a 5m window. Reads the series
# instance:node_network_packets_total, which must be produced by another
# recording rule under exactly that name.
- record: instance:node_network_packets_total:rate5m
expr: rate(instance:node_network_packets_total[5m])
# Error rate divided by the packet rate over the same 5m window.
- record: instance:node_network_errs:ratio
expr: instance:node_network_errs_total:rate5m / instance:node_network_packets_total:rate5m
# Drop rate divided by the packet rate over the same 5m window.
- record: instance:node_network_drop:ratio
expr: instance:node_network_drop_total:rate5m / instance:node_network_packets_total:rate5m
- record: instance:network_transmit_bytes_total:rate5m - record: instance:network_transmit_bytes_total:rate5m
expr: sum(rate(node_network_transmit_bytes_total{device!="lo"}[5m])) without (device) expr: sum(rate(node_network_transmit_bytes_total{device!="lo"}[5m])) without (device)
- record: instance:network_receive_bytes_total:rate5m - record: instance:network_receive_bytes_total:rate5m
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment