mirror of
https://github.com/grafana/grafana.git
synced 2024-11-24 18:00:31 -06:00
grafana-mixin: Fix GrafanaRequestsFailing
alert (#43116)
Due to PromQL's label matching, both sides of the division will have the same series; that means that, whenever there's a 5xx error, both sides will have the same value and the division will be `1`. I believe the idea was to get the ratio of 5xx responses compared with all status codes, and to do that, we need to aggregate the `status_code` dimension away.
This commit is contained in:
parent
4dc63698ac
commit
18fdb89554
31
grafana-mixin/alerts/alerts.libsonnet
Normal file
31
grafana-mixin/alerts/alerts.libsonnet
Normal file
@ -0,0 +1,31 @@
|
||||
{
|
||||
_config+:: {
|
||||
grafanaRequestsFailingThresholdPercent: 50,
|
||||
},
|
||||
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'GrafanaAlerts',
|
||||
rules: [
|
||||
{
|
||||
alert: 'GrafanaRequestsFailing',
|
||||
expr: |||
|
||||
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
|
||||
/ ignoring (status_code)
|
||||
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
|
||||
> %(grafanaRequestsFailingThresholdPercent)s
|
||||
||| % $._config,
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: '{{ $labels.namespace }}/{{ $labels.job }}/{{ $labels.handler }} is experiencing {{ $value | humanize }}% errors',
|
||||
},
|
||||
'for': '5m',
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
groups:
|
||||
- name: GrafanaAlerts
|
||||
rules:
|
||||
- alert: GrafanaRequestsFailing
|
||||
for: 5m
|
||||
expr: |
|
||||
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
|
||||
/
|
||||
namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}
|
||||
> 0.5
|
||||
labels:
|
||||
severity: 'warning'
|
||||
annotations:
|
||||
message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors"
|
5
grafana-mixin/dashboards/dashboards.libsonnet
Normal file
5
grafana-mixin/dashboards/dashboards.libsonnet
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
'grafana-overview.json': (import 'grafana-overview.json'),
|
||||
},
|
||||
}
|
@ -1,15 +1,3 @@
|
||||
{
|
||||
grafanaDashboards: {
|
||||
'grafana-overview.json': (import 'dashboards/grafana-overview.json'),
|
||||
},
|
||||
|
||||
// Helper function to ensure that we don't override other rules, by forcing
|
||||
// the patching of the groups list, and not the overall rules object.
|
||||
local importRules(rules) = {
|
||||
groups+: std.native('parseYaml')(rules)[0].groups,
|
||||
},
|
||||
|
||||
prometheusRules+: importRules(importstr 'rules/rules.yaml'),
|
||||
|
||||
prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'),
|
||||
}
|
||||
(import 'alerts/alerts.libsonnet') +
|
||||
(import 'dashboards/dashboards.libsonnet') +
|
||||
(import 'rules/rules.libsonnet')
|
||||
|
17
grafana-mixin/rules/rules.libsonnet
Normal file
17
grafana-mixin/rules/rules.libsonnet
Normal file
@ -0,0 +1,17 @@
|
||||
{
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'grafana_rules',
|
||||
rules: [
|
||||
{
|
||||
record: 'namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m',
|
||||
expr: |||
|
||||
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
|
||||
|||,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
@ -1,7 +0,0 @@
|
||||
groups:
|
||||
- name: grafana_rules
|
||||
rules:
|
||||
# Record error rate of http requests excluding dataproxy, /ds/query and /tsdb/query requests
|
||||
- record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
|
||||
expr: |
|
||||
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
|
Loading…
Reference in New Issue
Block a user