mirror of
https://github.com/grafana/grafana.git
synced 2024-11-22 08:56:43 -06:00
c792af3ad0
Fix expression for GrafanaRequestsFailing alert. The intent of the alert is to get the ratio of 5xx responses to all status codes [^1]. With the original expression, the left-hand side can have more than one row with the same labels except for the status code. This results in a PromQL error because the division then requires many-to-one matching. Doing a sum on the left-hand side first should preserve the intent of the alert and resolve the issue. [^1]: https://github.com/grafana/grafana/pull/43116
32 lines
1.1 KiB
Plaintext
32 lines
1.1 KiB
Plaintext
// Mixin fragment contributing the 'GrafanaAlerts' rule group.
// Tunables live under the hidden `_config` field so that a consumer can
// override them when merging this object into a larger mixin.

// Builds the GrafanaRequestsFailing alerting rule.
//
// `config` must provide `grafanaRequestsFailingThresholdPercent`; it is
// substituted into the PromQL expression through the `%` format operator.
local requestsFailingRule(config) = {
  alert: 'GrafanaRequestsFailing',

  // Percentage of requests answered with a 5xx status code. Both operands
  // are summed `without (status_code)` so that each side yields one series
  // per remaining label set and the division matches one-to-one.
  // The selector skips the datasource proxy, /api/ds/query and
  // /api/tsdb/query handlers.
  // NOTE(review): the `...:rate5m` series is presumably a recording rule
  // defined elsewhere in the mixin - confirm it exists where this is used.
  expr: |||
    100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
    /
    sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
    > %(grafanaRequestsFailingThresholdPercent)s
  ||| % config,

  // `for` is a Jsonnet keyword, hence the quoted field name.
  'for': '5m',

  labels: {
    severity: 'warning',
  },

  annotations: {
    // Prometheus template text; it is not passed through `%` formatting,
    // so the literal percent sign needs no escaping.
    message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
  },
};

{
  _config+:: {
    // Error percentage above which GrafanaRequestsFailing fires.
    grafanaRequestsFailingThresholdPercent: 50,
  },

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'GrafanaAlerts',
        // `$._config` is resolved against the outermost object, so
        // overrides merged in by the consumer are honoured.
        rules: [requestsFailingRule($._config)],
      },
    ],
  },
}