grafana-mixin: Fix GrafanaRequestsFailing alert (#43116)

Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx responses compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
This commit is contained in:
Pedro Araújo
2021-12-20 14:12:37 +00:00
committed by GitHub
parent 4dc63698ac
commit 18fdb89554
6 changed files with 56 additions and 36 deletions

View File

@@ -0,0 +1,31 @@
{
  // Tunable settings consumed by the alert rules below.
  _config+:: {
    // Percentage of requests answered with a 5xx status above which
    // GrafanaRequestsFailing fires.
    grafanaRequestsFailingThresholdPercent: 50,
  },
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'GrafanaAlerts',
        rules: [
          {
            // Fires when the share of 5xx responses for a namespace/job/handler
            // stays above the configured percentage for 5 minutes.
            alert: 'GrafanaRequestsFailing',
            // Both sides aggregate `status_code` away so their label sets are
            // identical and a plain one-to-one division applies. Summing the
            // numerator matters: with the 5xx series left un-aggregated, a
            // handler returning several distinct 5xx codes would produce a
            // many-to-one match, which PromQL rejects without `group_left`,
            // and each code would be compared to the threshold on its own
            // instead of the total 5xx share.
            // The datasource-proxy and query handlers are filtered out on both
            // sides by the `handler!~` matcher.
            expr: |||
              100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
              /
              sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
              > %(grafanaRequestsFailingThresholdPercent)s
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
            },
            // `for` is a Jsonnet keyword, hence the quoted field name.
            'for': '5m',
          },
        ],
      },
    ],
  },
}

View File

@@ -1,14 +0,0 @@
# Prometheus alerting rules for Grafana: a single group, GrafanaAlerts,
# containing the GrafanaRequestsFailing alert.
groups:
- name: GrafanaAlerts
rules:
- alert: GrafanaRequestsFailing
# The expression must hold continuously for 5 minutes before the alert fires.
for: 5m
# Divides the 5xx-only recording-rule series by the same series without the
# status_code filter, scaled by 100, excluding the datasource-proxy and query
# handlers on both sides.
# NOTE(review): neither side aggregates status_code away, so the division only
# pairs each 5xx series with itself and yields 1 whenever a 5xx series exists
# (see the commit description); the result is then 100, not an error ratio.
# NOTE(review): the left-hand side is scaled to a percentage, so `> 0.5`
# compares against 0.5 percent, not 50 percent.
expr: |
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
/
namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}
> 0.5
labels:
severity: 'warning'
annotations:
message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"