Convert some metrics to Histograms (#50420)

Because Summary metrics cannot be aggregated, convert them to histograms
so that users with HA deployments can use these metrics.
* Convert metrics registration to promauto.
* Improve help text style.

Signed-off-by: SuperQ <superq@gmail.com>
This commit is contained in:
Ben Kochie 2022-06-15 13:19:43 +02:00 committed by GitHub
parent 390b7d084e
commit 68691d7775
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 70 additions and 68 deletions

View File

@ -7,52 +7,48 @@ import (
"github.com/grafana/grafana/pkg/infra/httpclient" "github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/metrics/metricutil" "github.com/grafana/grafana/pkg/infra/metrics/metricutil"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/client_golang/prometheus/promhttp"
) )
var datasourceRequestCounter = prometheus.NewCounterVec( var (
prometheus.CounterOpts{ datasourceRequestCounter = promauto.NewCounterVec(
Namespace: "grafana", prometheus.CounterOpts{
Name: "datasource_request_total", Namespace: "grafana",
Help: "A counter for outgoing requests for a datasource", Name: "datasource_request_total",
}, Help: "A counter for outgoing requests for a data source",
[]string{"datasource", "code", "method"}, },
) []string{"datasource", "code", "method"},
)
var datasourceRequestSummary = prometheus.NewSummaryVec( datasourceRequestHistogram = promauto.NewHistogramVec(
prometheus.SummaryOpts{ prometheus.HistogramOpts{
Namespace: "grafana", Namespace: "grafana",
Name: "datasource_request_duration_seconds", Name: "datasource_request_duration_seconds",
Help: "summary of outgoing datasource requests sent from Grafana", Help: "summary of outgoing data source requests sent from Grafana",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
}, []string{"datasource", "code", "method"}, }, []string{"datasource", "code", "method"},
) )
var datasourceResponseSummary = prometheus.NewSummaryVec( datasourceResponseHistogram = promauto.NewHistogramVec(
prometheus.SummaryOpts{ prometheus.HistogramOpts{
Namespace: "grafana", Namespace: "grafana",
Name: "datasource_response_size_bytes", Name: "datasource_response_size_bytes",
Help: "summary of datasource response sizes returned to Grafana", Help: "summary of data source response sizes returned to Grafana",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Buckets: []float64{128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576},
}, []string{"datasource"}, }, []string{"datasource"},
) )
var datasourceRequestsInFlight = prometheus.NewGaugeVec( datasourceRequestsInFlight = promauto.NewGaugeVec(
prometheus.GaugeOpts{ prometheus.GaugeOpts{
Namespace: "grafana", Namespace: "grafana",
Name: "datasource_request_in_flight", Name: "datasource_request_in_flight",
Help: "A gauge of outgoing datasource requests currently being sent by Grafana", Help: "A gauge of outgoing data source requests currently being sent by Grafana",
}, },
[]string{"datasource"}, []string{"datasource"},
)
) )
func init() {
prometheus.MustRegister(datasourceRequestSummary,
datasourceRequestCounter,
datasourceRequestsInFlight,
datasourceResponseSummary)
}
const DataSourceMetricsMiddlewareName = "metrics" const DataSourceMetricsMiddlewareName = "metrics"
var executeMiddlewareFunc = executeMiddleware var executeMiddlewareFunc = executeMiddleware
@ -84,11 +80,11 @@ func DataSourceMetricsMiddleware() sdkhttpclient.Middleware {
func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels) http.RoundTripper { func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels) http.RoundTripper {
return sdkhttpclient.RoundTripperFunc(func(r *http.Request) (*http.Response, error) { return sdkhttpclient.RoundTripperFunc(func(r *http.Request) (*http.Response, error) {
requestCounter := datasourceRequestCounter.MustCurryWith(datasourceLabel) requestCounter := datasourceRequestCounter.MustCurryWith(datasourceLabel)
requestSummary := datasourceRequestSummary.MustCurryWith(datasourceLabel) requestHistogram := datasourceRequestHistogram.MustCurryWith(datasourceLabel)
requestInFlight := datasourceRequestsInFlight.With(datasourceLabel) requestInFlight := datasourceRequestsInFlight.With(datasourceLabel)
responseSizeSummary := datasourceResponseSummary.With(datasourceLabel) responseSizeHistogram := datasourceResponseHistogram.With(datasourceLabel)
res, err := promhttp.InstrumentRoundTripperDuration(requestSummary, res, err := promhttp.InstrumentRoundTripperDuration(requestHistogram,
promhttp.InstrumentRoundTripperCounter(requestCounter, promhttp.InstrumentRoundTripperCounter(requestCounter,
promhttp.InstrumentRoundTripperInFlight(requestInFlight, next))). promhttp.InstrumentRoundTripperInFlight(requestInFlight, next))).
RoundTrip(r) RoundTrip(r)
@ -98,7 +94,7 @@ func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels
if res != nil && res.StatusCode != http.StatusSwitchingProtocols { if res != nil && res.StatusCode != http.StatusSwitchingProtocols {
res.Body = httpclient.CountBytesReader(res.Body, func(bytesRead int64) { res.Body = httpclient.CountBytesReader(res.Body, func(bytesRead int64) {
responseSizeSummary.Observe(float64(bytesRead)) responseSizeHistogram.Observe(float64(bytesRead))
}) })
} }

View File

@ -5,29 +5,23 @@ import (
"time" "time"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
) )
var ( var (
pluginRequestCounter *prometheus.CounterVec pluginRequestCounter = promauto.NewCounterVec(prometheus.CounterOpts{
pluginRequestDuration *prometheus.SummaryVec
)
func init() {
pluginRequestCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "grafana", Namespace: "grafana",
Name: "plugin_request_total", Name: "plugin_request_total",
Help: "The total amount of plugin requests", Help: "The total amount of plugin requests",
}, []string{"plugin_id", "endpoint", "status"}) }, []string{"plugin_id", "endpoint", "status"})
pluginRequestDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{ pluginRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "grafana", Namespace: "grafana",
Name: "plugin_request_duration_milliseconds", Name: "plugin_request_duration_milliseconds",
Help: "Plugin request duration", Help: "Plugin request duration",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
}, []string{"plugin_id", "endpoint"}) }, []string{"plugin_id", "endpoint"})
)
prometheus.MustRegister(pluginRequestCounter, pluginRequestDuration)
}
// instrumentPluginRequest instruments success rate and latency of `fn` // instrumentPluginRequest instruments success rate and latency of `fn`
func instrumentPluginRequest(pluginID string, endpoint string, fn func() error) error { func instrumentPluginRequest(pluginID string, endpoint string, fn func() error) error {

View File

@ -50,7 +50,7 @@ type Scheduler struct {
BehindSeconds prometheus.Gauge BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec EvalDuration *prometheus.HistogramVec
SchedulePeriodicDuration prometheus.Histogram SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge SchedulableAlertRulesHash prometheus.Gauge
@ -156,13 +156,13 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
}, },
[]string{"org"}, []string{"org"},
), ),
EvalDuration: promauto.With(r).NewSummaryVec( EvalDuration: promauto.With(r).NewHistogramVec(
prometheus.SummaryOpts{ prometheus.HistogramOpts{
Namespace: Namespace, Namespace: Namespace,
Subsystem: Subsystem, Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds", Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.", Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
}, },
[]string{"org"}, []string{"org"},
), ),

View File

@ -441,10 +441,22 @@ func TestSchedule_ruleRoutine(t *testing.T) {
// duration metric has 0 values because of mocked clock that do not advance // duration metric has 0 values because of mocked clock that do not advance
expectedMetric := fmt.Sprintf( expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute. `# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
# TYPE grafana_alerting_rule_evaluation_duration_seconds summary # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.5"} 0 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.9"} 0 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.99"} 0 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures. # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.