From 18fdb895548b0c0a9912fbc3c60cbb0b4d640d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pedro=20Ara=C3=BAjo?= Date: Mon, 20 Dec 2021 14:12:37 +0000 Subject: [PATCH] grafana-mixin: Fix `GrafanaRequestsFailing` alert (#43116) Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away. --- grafana-mixin/alerts/alerts.libsonnet | 31 +++++++++++++++++++ grafana-mixin/alerts/alerts.yaml | 14 --------- grafana-mixin/dashboards/dashboards.libsonnet | 5 +++ grafana-mixin/mixin.libsonnet | 18 ++--------- grafana-mixin/rules/rules.libsonnet | 17 ++++++++++ grafana-mixin/rules/rules.yaml | 7 ----- 6 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 grafana-mixin/alerts/alerts.libsonnet delete mode 100644 grafana-mixin/alerts/alerts.yaml create mode 100644 grafana-mixin/dashboards/dashboards.libsonnet create mode 100644 grafana-mixin/rules/rules.libsonnet delete mode 100644 grafana-mixin/rules/rules.yaml diff --git a/grafana-mixin/alerts/alerts.libsonnet b/grafana-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000000..e38f2a4a72a --- /dev/null +++ b/grafana-mixin/alerts/alerts.libsonnet @@ -0,0 +1,31 @@ +{ + _config+:: { + grafanaRequestsFailingThresholdPercent: 50, + }, + + prometheusAlerts+:: { + groups+: [ + { + name: 'GrafanaAlerts', + rules: [ + { + alert: 'GrafanaRequestsFailing', + expr: ||| + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) 
(namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > %(grafanaRequestsFailingThresholdPercent)s + ||| % $._config, + labels: { + severity: 'warning', + }, + annotations: { + message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors', + }, + 'for': '5m', + }, + ], + }, + ], + }, +} diff --git a/grafana-mixin/alerts/alerts.yaml b/grafana-mixin/alerts/alerts.yaml deleted file mode 100644 index 2acc9d0794b..00000000000 --- a/grafana-mixin/alerts/alerts.yaml +++ /dev/null @@ -1,14 +0,0 @@ -groups: - - name: GrafanaAlerts - rules: - - alert: GrafanaRequestsFailing - for: 5m - expr: | - 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} - / - namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"} - > 0.5 - labels: - severity: 'warning' - annotations: - message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors" diff --git a/grafana-mixin/dashboards/dashboards.libsonnet b/grafana-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 00000000000..e7589aaedfb --- /dev/null +++ b/grafana-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,5 @@ +{ + grafanaDashboards+:: { + 'grafana-overview.json': (import 'grafana-overview.json'), + }, +} diff --git a/grafana-mixin/mixin.libsonnet b/grafana-mixin/mixin.libsonnet index c60e0e736c5..4a7437b948c 100644 --- a/grafana-mixin/mixin.libsonnet +++ b/grafana-mixin/mixin.libsonnet @@ -1,15 +1,3 @@ -{ - grafanaDashboards: { - 'grafana-overview.json': (import 'dashboards/grafana-overview.json'), - }, - - // Helper function to ensure that we don't override other 
rules, by forcing - // the patching of the groups list, and not the overall rules object. - local importRules(rules) = { - groups+: std.native('parseYaml')(rules)[0].groups, - }, - - prometheusRules+: importRules(importstr 'rules/rules.yaml'), - - prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'), -} +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/grafana-mixin/rules/rules.libsonnet b/grafana-mixin/rules/rules.libsonnet new file mode 100644 index 00000000000..8cb28433929 --- /dev/null +++ b/grafana-mixin/rules/rules.libsonnet @@ -0,0 +1,17 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'grafana_rules', + rules: [ + { + record: 'namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m', + expr: ||| + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + |||, + }, + ], + }, + ], + }, +} diff --git a/grafana-mixin/rules/rules.yaml b/grafana-mixin/rules/rules.yaml deleted file mode 100644 index f1fe8fb2e94..00000000000 --- a/grafana-mixin/rules/rules.yaml +++ /dev/null @@ -1,7 +0,0 @@ -groups: - - name: grafana_rules - rules: - # Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests - - record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m - expr: | - sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))