From d5c5e2264043830c242ee975b0dbb2950e508523 Mon Sep 17 00:00:00 2001 From: ale Date: Mon, 10 Jun 2019 19:05:50 +0100 Subject: [PATCH] NGINX error metrics should only include 5xx errors Previously we were also counting 4xx errors, which are not relevant as a reliability indicator. We may eventually want to restrict this further to 502/503 (for backend timeouts only). --- roles/prometheus/files/rules/alerts_nginx.conf.yml | 4 ++-- roles/prometheus/files/rules/rules_nginx.conf.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/roles/prometheus/files/rules/alerts_nginx.conf.yml b/roles/prometheus/files/rules/alerts_nginx.conf.yml index 2b5d386..790a122 100644 --- a/roles/prometheus/files/rules/alerts_nginx.conf.yml +++ b/roles/prometheus/files/rules/alerts_nginx.conf.yml @@ -10,7 +10,7 @@ groups: severity: page annotations: summary: 'High HTTP error ratio for {{$labels.vhost}} globally' - description: 'We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on all frontends.' + description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on all frontends.' - alert: HTTPErrorRatioHigh expr: (instance:nginx_http_requests_errs:ratio > 0.02 and instance:nginx_http_requests_total:rate5m > 0.1) for: 10m @@ -20,4 +20,4 @@ groups: severity: page annotations: summary: 'High HTTP error ratio for {{$labels.vhost}} on {{$labels.host}}' - description: 'We are serving lots of 4xx/5xx errors for {{$labels.vhost}} on {{$labels.host}}.' + description: 'We are serving lots of 5xx errors for {{$labels.vhost}} on {{$labels.host}}.' diff --git a/roles/prometheus/files/rules/rules_nginx.conf.yml b/roles/prometheus/files/rules/rules_nginx.conf.yml index a37d670..1714c41 100644 --- a/roles/prometheus/files/rules/rules_nginx.conf.yml +++ b/roles/prometheus/files/rules/rules_nginx.conf.yml @@ -4,14 +4,14 @@ groups: - record: instance:nginx_http_requests_total:rate5m expr: sum(rate(nginx_http_requests[5m])) without (backend, method, code) - record: instance:nginx_http_requests_errs:rate5m - expr: sum(rate(nginx_http_requests{code=~"[45].*"}[5m])) without (backend, method, code) + expr: sum(rate(nginx_http_requests{code=~"5.*"}[5m])) without (backend, method, code) - record: instance:nginx_http_requests_errs:ratio expr: (instance:nginx_http_requests_errs:rate5m / instance:nginx_http_requests_total:rate5m) - record: global:nginx_http_requests_total:rate5m expr: sum(rate(nginx_http_requests[5m])) without (instance, host, exported_instance, exported_host, backend, method, code) - record: global:nginx_http_requests_errs:rate5m - expr: sum(rate(nginx_http_requests{code=~"[45].*"}[5m])) without (instance, host, + expr: sum(rate(nginx_http_requests{code=~"5.*"}[5m])) without (instance, host, exported_instance, exported_host, backend, method, code) - record: global:nginx_http_requests_errs:ratio expr: (global:nginx_http_requests_errs:rate5m / global:nginx_http_requests_total:rate5m) -- GitLab