grafana-mixin: Fix GrafanaRequestsFailing alert (#43116)

Due to PromQL's label matching, both sides of the division end up with the same series; that means that, whenever there's a 5xx error, both sides have the same value and the division evaluates to `1`. I believe the idea was to get the ratio of 5xx responses compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
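To illustrate with hypothetical rates for a single namespace/job/handler pair (say 0.2 req/s of 5xx and 1.8 req/s of non-5xx recorded by the rule below):

    namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{status_code="200"} = 1.8
    namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{status_code="500"} = 0.2

    # Old expression: one-to-one vector matching on all labels pairs the 5xx series
    # on the left with the identical 5xx series on the right, so the alert evaluates
    # 100 * 0.2 / 0.2 = 100 whenever any 5xx traffic exists at all.

    # New expression: `sum without (status_code)` collapses the denominator to
    # 0.2 + 1.8 = 2.0, and `ignoring (status_code)` lets the two sides match despite
    # the extra label on the numerator, so the alert evaluates 100 * 0.2 / 2.0 = 10,
    # i.e. the actual 10% error share.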
Pedro Araújo 2021-12-20 14:12:37 +00:00 committed by GitHub
parent 4dc63698ac
commit 18fdb89554
6 changed files with 56 additions and 36 deletions

grafana-mixin/alerts/alerts.libsonnet

@@ -0,0 +1,31 @@
{
  _config+:: {
    grafanaRequestsFailingThresholdPercent: 50,
  },

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'GrafanaAlerts',
        rules: [
          {
            alert: 'GrafanaRequestsFailing',
            expr: |||
              100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
              / ignoring (status_code)
              sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
              > %(grafanaRequestsFailingThresholdPercent)s
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
            },
            'for': '5m',
          },
        ],
      },
    ],
  },
}

grafana-mixin/alerts/alerts.yaml

@@ -1,14 +0,0 @@
groups:
  - name: GrafanaAlerts
    rules:
      - alert: GrafanaRequestsFailing
        for: 5m
        expr: |
          100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
          /
          namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}
          > 0.5
        labels:
          severity: 'warning'
        annotations:
          message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"

grafana-mixin/dashboards/dashboards.libsonnet

@@ -0,0 +1,5 @@
{
  grafanaDashboards+:: {
    'grafana-overview.json': (import 'grafana-overview.json'),
  },
}

grafana-mixin/mixin.libsonnet

@@ -1,15 +1,3 @@
-{
-  grafanaDashboards: {
-    'grafana-overview.json': (import 'dashboards/grafana-overview.json'),
-  },
-
-  // Helper function to ensure that we don't override other rules, by forcing
-  // the patching of the groups list, and not the overall rules object.
-  local importRules(rules) = {
-    groups+: std.native('parseYaml')(rules)[0].groups,
-  },
-
-  prometheusRules+: importRules(importstr 'rules/rules.yaml'),
-
-  prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
-}
+(import 'alerts/alerts.libsonnet') +
+(import 'dashboards/dashboards.libsonnet') +
+(import 'rules/rules.libsonnet')

grafana-mixin/rules/rules.libsonnet

@@ -0,0 +1,17 @@
{
  prometheusRules+:: {
    groups+: [
      {
        name: 'grafana_rules',
        rules: [
          {
            record: 'namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m',
            expr: |||
              sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
            |||,
          },
        ],
      },
    ],
  },
}

grafana-mixin/rules/rules.yaml

@@ -1,7 +0,0 @@
groups:
  - name: grafana_rules
    rules:
      # Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests
      - record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
        expr: |
          sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))