From 68691d7775809fc5780033e4e51e6ea097b5a718 Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Wed, 15 Jun 2022 13:19:43 +0200 Subject: [PATCH] Convert some metrics to Histograms (#50420) Because Summary metrics cannot be aggregated, convert them to histograms so that users with HA deployments can use these metrics. * Convert metrics registration to promauto. * Improve help text style. Signed-off-by: SuperQ --- .../datasource_metrics_middleware.go | 80 +++++++++---------- .../instrumentation/instrumentation.go | 22 ++--- pkg/services/ngalert/metrics/ngalert.go | 16 ++-- .../ngalert/schedule/schedule_unit_test.go | 20 ++++- 4 files changed, 70 insertions(+), 68 deletions(-) diff --git a/pkg/infra/httpclient/httpclientprovider/datasource_metrics_middleware.go b/pkg/infra/httpclient/httpclientprovider/datasource_metrics_middleware.go index bfb722ef4cd..3943099f45b 100644 --- a/pkg/infra/httpclient/httpclientprovider/datasource_metrics_middleware.go +++ b/pkg/infra/httpclient/httpclientprovider/datasource_metrics_middleware.go @@ -7,52 +7,48 @@ import ( "github.com/grafana/grafana/pkg/infra/httpclient" "github.com/grafana/grafana/pkg/infra/metrics/metricutil" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/client_golang/prometheus/promhttp" ) -var datasourceRequestCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: "grafana", - Name: "datasource_request_total", - Help: "A counter for outgoing requests for a datasource", - }, - []string{"datasource", "code", "method"}, -) +var ( + datasourceRequestCounter = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "grafana", + Name: "datasource_request_total", + Help: "A counter for outgoing requests for a data source", + }, + []string{"datasource", "code", "method"}, + ) -var datasourceRequestSummary = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: "grafana", - Name: 
"datasource_request_duration_seconds", - Help: "summary of outgoing datasource requests sent from Grafana", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, - }, []string{"datasource", "code", "method"}, -) + datasourceRequestHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "grafana", + Name: "datasource_request_duration_seconds", + Help: "summary of outgoing data source requests sent from Grafana", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100}, + }, []string{"datasource", "code", "method"}, + ) -var datasourceResponseSummary = prometheus.NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: "grafana", - Name: "datasource_response_size_bytes", - Help: "summary of datasource response sizes returned to Grafana", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, - }, []string{"datasource"}, -) + datasourceResponseHistogram = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "grafana", + Name: "datasource_response_size_bytes", + Help: "summary of data source response sizes returned to Grafana", + Buckets: []float64{128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576}, + }, []string{"datasource"}, + ) -var datasourceRequestsInFlight = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "grafana", - Name: "datasource_request_in_flight", - Help: "A gauge of outgoing datasource requests currently being sent by Grafana", - }, - []string{"datasource"}, + datasourceRequestsInFlight = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "grafana", + Name: "datasource_request_in_flight", + Help: "A gauge of outgoing data source requests currently being sent by Grafana", + }, + []string{"datasource"}, + ) ) -func init() { - prometheus.MustRegister(datasourceRequestSummary, - datasourceRequestCounter, - datasourceRequestsInFlight, - datasourceResponseSummary) -} - const DataSourceMetricsMiddlewareName 
= "metrics" var executeMiddlewareFunc = executeMiddleware @@ -84,11 +80,11 @@ func DataSourceMetricsMiddleware() sdkhttpclient.Middleware { func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels) http.RoundTripper { return sdkhttpclient.RoundTripperFunc(func(r *http.Request) (*http.Response, error) { requestCounter := datasourceRequestCounter.MustCurryWith(datasourceLabel) - requestSummary := datasourceRequestSummary.MustCurryWith(datasourceLabel) + requestHistogram := datasourceRequestHistogram.MustCurryWith(datasourceLabel) requestInFlight := datasourceRequestsInFlight.With(datasourceLabel) - responseSizeSummary := datasourceResponseSummary.With(datasourceLabel) + responseSizeHistogram := datasourceResponseHistogram.With(datasourceLabel) - res, err := promhttp.InstrumentRoundTripperDuration(requestSummary, + res, err := promhttp.InstrumentRoundTripperDuration(requestHistogram, promhttp.InstrumentRoundTripperCounter(requestCounter, promhttp.InstrumentRoundTripperInFlight(requestInFlight, next))). 
RoundTrip(r) @@ -98,7 +94,7 @@ func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels if res != nil && res.StatusCode != http.StatusSwitchingProtocols { res.Body = httpclient.CountBytesReader(res.Body, func(bytesRead int64) { - responseSizeSummary.Observe(float64(bytesRead)) + responseSizeHistogram.Observe(float64(bytesRead)) }) } diff --git a/pkg/plugins/backendplugin/instrumentation/instrumentation.go b/pkg/plugins/backendplugin/instrumentation/instrumentation.go index 21831ece3f0..a7bf104e6b9 100644 --- a/pkg/plugins/backendplugin/instrumentation/instrumentation.go +++ b/pkg/plugins/backendplugin/instrumentation/instrumentation.go @@ -5,29 +5,23 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" ) var ( - pluginRequestCounter *prometheus.CounterVec - pluginRequestDuration *prometheus.SummaryVec -) - -func init() { - pluginRequestCounter = prometheus.NewCounterVec(prometheus.CounterOpts{ + pluginRequestCounter = promauto.NewCounterVec(prometheus.CounterOpts{ Namespace: "grafana", Name: "plugin_request_total", Help: "The total amount of plugin requests", }, []string{"plugin_id", "endpoint", "status"}) - pluginRequestDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ - Namespace: "grafana", - Name: "plugin_request_duration_milliseconds", - Help: "Plugin request duration", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + pluginRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "grafana", + Name: "plugin_request_duration_milliseconds", + Help: "Plugin request duration", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100}, }, []string{"plugin_id", "endpoint"}) - - prometheus.MustRegister(pluginRequestCounter, pluginRequestDuration) -} +) // instrumentPluginRequest instruments success rate and latency of `fn` func instrumentPluginRequest(pluginID string, endpoint string, fn 
func() error) error { diff --git a/pkg/services/ngalert/metrics/ngalert.go b/pkg/services/ngalert/metrics/ngalert.go index 3df7fe750aa..b9941400a27 100644 --- a/pkg/services/ngalert/metrics/ngalert.go +++ b/pkg/services/ngalert/metrics/ngalert.go @@ -50,7 +50,7 @@ type Scheduler struct { BehindSeconds prometheus.Gauge EvalTotal *prometheus.CounterVec EvalFailures *prometheus.CounterVec - EvalDuration *prometheus.SummaryVec + EvalDuration *prometheus.HistogramVec SchedulePeriodicDuration prometheus.Histogram SchedulableAlertRules prometheus.Gauge SchedulableAlertRulesHash prometheus.Gauge @@ -156,13 +156,13 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler { }, []string{"org"}, ), - EvalDuration: promauto.With(r).NewSummaryVec( - prometheus.SummaryOpts{ - Namespace: Namespace, - Subsystem: Subsystem, - Name: "rule_evaluation_duration_seconds", - Help: "The duration for a rule to execute.", - Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, + EvalDuration: promauto.With(r).NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "rule_evaluation_duration_seconds", + Help: "The duration for a rule to execute.", + Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100}, }, []string{"org"}, ), diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index adba0bc1e36..958ef2381a5 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -441,10 +441,22 @@ func TestSchedule_ruleRoutine(t *testing.T) { // duration metric has 0 values because of mocked clock that do not advance expectedMetric := fmt.Sprintf( `# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute. 
- # TYPE grafana_alerting_rule_evaluation_duration_seconds summary - grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.5"} 0 - grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.9"} 0 - grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.99"} 0 + # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.