From 6002df580ff22f0f5f6df84c074f39bda67cb7b4 Mon Sep 17 00:00:00 2001 From: Carl Bergquist Date: Thu, 15 Oct 2020 17:05:56 +0200 Subject: [PATCH] Add monitoring mixing for Grafana (#28285) Co-authored-by: Tom Wilkie --- grafana-mixin/.gitignore | 3 + grafana-mixin/Makefile | 21 + grafana-mixin/README.md | 28 + grafana-mixin/alerts/alerts.yaml | 14 + .../dashboards/grafana-overview.json | 528 ++++++++++++++++++ grafana-mixin/mixin.libsonnet | 15 + grafana-mixin/rules/rules.yaml | 7 + scripts/lib.star | 1 + scripts/mixin-check.sh | 7 + 9 files changed, 624 insertions(+) create mode 100644 grafana-mixin/.gitignore create mode 100644 grafana-mixin/Makefile create mode 100644 grafana-mixin/README.md create mode 100644 grafana-mixin/alerts/alerts.yaml create mode 100644 grafana-mixin/dashboards/grafana-overview.json create mode 100644 grafana-mixin/mixin.libsonnet create mode 100644 grafana-mixin/rules/rules.yaml create mode 100755 scripts/mixin-check.sh diff --git a/grafana-mixin/.gitignore b/grafana-mixin/.gitignore new file mode 100644 index 00000000000..1794e36498f --- /dev/null +++ b/grafana-mixin/.gitignore @@ -0,0 +1,3 @@ +alerts.yaml +rules.yaml +dashboards_out \ No newline at end of file diff --git a/grafana-mixin/Makefile b/grafana-mixin/Makefile new file mode 100644 index 00000000000..a2a7145b960 --- /dev/null +++ b/grafana-mixin/Makefile @@ -0,0 +1,21 @@ +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +all: fmt lint build clean + +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +lint: + find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + while read f; do \ + $(JSONNET_FMT) "$$f" | diff -u "$$f" -; \ + done + + mixtool lint mixin.libsonnet + +build: + mixtool generate all mixin.libsonnet + +clean: + rm -rf dashboards_out alerts.yaml rules.yaml \ No newline at end of file diff --git a/grafana-mixin/README.md b/grafana-mixin/README.md new file mode 100644 index 00000000000..60feb1df154 --- /dev/null +++ b/grafana-mixin/README.md @@ -0,0 +1,28 @@ +# Grafana Mixin + +_This is a work in progress. We aim for it to become a good role model for alerts +and dashboards eventually, but it is not quite there yet._ + +The Grafana Mixin is a set of configurable, reusable, and extensible alerts and +dashboards based on the metrics exported by Grafana. The mixin creates +recording and alerting rules for Prometheus and suitable dashboard descriptions +for Grafana. + +To use them, you need to have `mixtool` and `jsonnetfmt` installed. If you +have a working Go development environment, it's easiest to run the following: + +```bash +$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool +$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt +``` + +You can then build the Prometheus rules files `alerts.yaml` and +`rules.yaml` and a directory `dashboards_out` with the JSON dashboard files +for Grafana: + +```bash +$ make build +``` + +For more advanced uses of mixins, see +https://github.com/monitoring-mixins/docs. 
diff --git a/grafana-mixin/alerts/alerts.yaml b/grafana-mixin/alerts/alerts.yaml new file mode 100644 index 00000000000..1464e89db74 --- /dev/null +++ b/grafana-mixin/alerts/alerts.yaml @@ -0,0 +1,14 @@ +groups: +- name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + for: 5m + expr: | + 100 * sum without (statuscode) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query", statuscode=~"5.."}) + / + sum without (statuscode) (namespace_job_handler_statuscode:http_request_total:rate5m{handler!~"/datasources/proxy/:id.*|/ds/query|/tsdb/query"}) + > 0.5 + labels: + severity: 'critical' + annotations: + message: "'{{ $labels.namespace }}' / '{{ $labels.job }}' / '{{ $labels.handler }}' is experiencing {{ $value | humanize }}% errors" diff --git a/grafana-mixin/dashboards/grafana-overview.json b/grafana-mixin/dashboards/grafana-overview.json new file mode 100644 index 00000000000..839e272ab0a --- /dev/null +++ b/grafana-mixin/dashboards/grafana-overview.json @@ -0,0 +1,528 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 35, + "iteration": 1602761142538, + "links": [], + "panels": [ + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.4", + "targets": [ + { + "expr": 
"grafana_alerting_result_total{job=~\"$job\", instance=~\"$instance\", state=\"alerting\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Firing Alerts", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.4", + "targets": [ + { + "expr": "sum(grafana_stat_totals_dashboard{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Dashboards", + "type": "stat" + }, + { + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 10, + "options": { + "showHeader": true + }, + "pluginVersion": "7.0.4", + "targets": [ + { + "expr": "grafana_build_info{job=~\"$job\", instance=~\"$instance\"}", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Build Info", + "transformations": [ + { + "id": "labelsToFields", + "options": {} + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "branch": true, + "container": true, + 
"goversion": true, + "namespace": true, + "pod": true, + "revision": true + }, + "indexByName": { + "Time": 7, + "Value": 11, + "branch": 4, + "container": 8, + "edition": 2, + "goversion": 6, + "instance": 1, + "job": 0, + "namespace": 9, + "pod": 10, + "revision": 5, + "version": 3 + }, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (statuscode) (irate(http_request_total{job=~\"$job\", instance=~\"$instance\"}[1m])) ", + "interval": "", + "legendFormat": "{{statuscode}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "RPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:157", + "format": "reqps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:158", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.99\"})", + "interval": "", + "legendFormat": "max-99th", + "refId": "A" + }, + { + "expr": "max(http_request_duration_milliseconds{job=~\"$job\", instance=~\"$instance\", quantile=\"0.9\"})", + "interval": "", + "legendFormat": "max-90th", + "refId": "B" + }, + { + "expr": "sum(irate(http_request_duration_milliseconds_sum{job=~\"$job\", instance=~\"$instance\"}[$__interval])) / sum(irate(http_request_duration_milliseconds_count{job=~\"$job\", instance=~\"$instance\"}[$__interval])) ", + "interval": "", + "legendFormat": "avg", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Request Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:210", + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:211", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null 
+ } + } + ], + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "tags": [], + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, job)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "job", + "options": [], + "query": "label_values(grafana_build_info, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "$datasource", + "definition": "label_values(grafana_build_info, instance)", + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "instance", + "options": [], + "query": "label_values(grafana_build_info, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Grafana Overview", + "uid": "6be0s85Mk", + "version": 4 +} \ No newline at end of file diff --git a/grafana-mixin/mixin.libsonnet b/grafana-mixin/mixin.libsonnet new file mode 100644 index 00000000000..c60e0e736c5 --- /dev/null +++ b/grafana-mixin/mixin.libsonnet @@ -0,0 +1,15 @@ +{ + 
grafanaDashboards: { + 'grafana-overview.json': (import 'dashboards/grafana-overview.json'), + }, + + // Helper function to ensure that we don't override other rules, by forcing + // the patching of the groups list, and not the overall rules object. + local importRules(rules) = { + groups+: std.native('parseYaml')(rules)[0].groups, + }, + + prometheusRules+: importRules(importstr 'rules/rules.yaml'), + + prometheusAlerts+: importRules(importstr 'alerts/alerts.yaml'), +} diff --git a/grafana-mixin/rules/rules.yaml b/grafana-mixin/rules/rules.yaml new file mode 100644 index 00000000000..9eab65e64db --- /dev/null +++ b/grafana-mixin/rules/rules.yaml @@ -0,0 +1,7 @@ +groups: + - name: grafana_rules + rules: + # Record the 5m rate of all http requests per namespace, job, handler and statuscode; the alert rule filters out dataproxy, /ds/query and /tsdb/query handlers + - record: namespace_job_handler_statuscode:http_request_total:rate5m + expr: | + sum by (namespace, job, handler, statuscode) (rate(http_request_total[5m])) diff --git a/scripts/lib.star b/scripts/lib.star index 5a463efb613..145202dec61 100644 --- a/scripts/lib.star +++ b/scripts/lib.star @@ -224,6 +224,7 @@ def lint_backend_step(edition): 'revive -formatter stylish -config scripts/go/configs/revive.toml ./pkg/...', './scripts/revive-strict', './scripts/tidy-check.sh', + './scripts/mixin-check.sh', ], } diff --git a/scripts/mixin-check.sh b/scripts/mixin-check.sh new file mode 100755 index 00000000000..60919b85e31 --- /dev/null +++ b/scripts/mixin-check.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -eo pipefail + +cd grafana-mixin +go install github.com/monitoring-mixins/mixtool/cmd/mixtool +go install github.com/google/go-jsonnet/cmd/jsonnetfmt +make lint build