mirror of
https://github.com/grafana/grafana.git
synced 2025-01-09 15:43:23 -06:00
18fdb89554
Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx responses compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
32 lines
1.1 KiB
Plaintext
32 lines
1.1 KiB
Plaintext
{
  // Prometheus alert rules for failing Grafana HTTP requests.
  _config+:: {
    // Threshold, in percent, that the 5xx share of requests must exceed
    // before GrafanaRequestsFailing fires. Substituted into the alert
    // expression below via string formatting.
    grafanaRequestsFailingThresholdPercent: 50,
  },

  prometheusAlerts+:: {
    groups+: [
      {
        name: 'GrafanaAlerts',
        rules: [
          {
            alert: 'GrafanaRequestsFailing',
            // Percentage of requests answered with a 5xx status code, computed
            // per remaining label set once `status_code` is aggregated away.
            //
            // Both operands are wrapped in `sum without (status_code)` so that:
            //   * all 5xx codes are added together into a single numerator, and
            //   * numerator and denominator carry identical label sets, which
            //     lets plain one-to-one vector matching pair them up.
            // Dividing the raw per-status-code series instead would produce a
            // many-to-one match (and an evaluation error) as soon as a handler
            // returns more than one distinct 5xx code at the same time.
            //
            // The handler!~ matcher excludes the data source proxy and query
            // endpoints listed in the regex from both operands.
            expr: |||
              100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
              /
              sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
              > %(grafanaRequestsFailingThresholdPercent)s
            ||| % $._config,
            labels: {
              severity: 'warning',
            },
            annotations: {
              // `$value` is the percentage produced by the expression above.
              message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
            },
            // The condition must hold continuously for this long before firing.
            'for': '5m',
          },
        ],
      },
    ],
  },
}
|