Convert some metrics to Histograms (#50420)

Because Summary metrics cannot be aggregated across instances, convert them to histograms
so that users with HA deployments can use these metrics.
* Convert metrics registration to promauto.
* Improve help text style.
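
A minimal sketch of the pattern this change adopts, using a hypothetical metric name
(example_request_duration_seconds) rather than one from this commit: promauto registers
the vector with the default registry at declaration time, and histogram buckets are plain
counters that can be summed across HA replicas.

    package main

    import (
        "time"

        "github.com/prometheus/client_golang/prometheus"
        "github.com/prometheus/client_golang/prometheus/promauto"
    )

    // Registered at declaration time; no init()/MustRegister needed.
    // Buckets are cumulative counters, so an HA pair can be aggregated with e.g.
    //   histogram_quantile(0.9, sum by (le) (rate(example_request_duration_seconds_bucket[5m])))
    var exampleRequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Namespace: "grafana",
            Name:      "example_request_duration_seconds",
            Help:      "Duration of example requests.",
            Buckets:   prometheus.DefBuckets,
        },
        []string{"code"},
    )

    func main() {
        start := time.Now()
        // ... perform the request ...
        exampleRequestDuration.WithLabelValues("200").Observe(time.Since(start).Seconds())
    }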

Signed-off-by: SuperQ <superq@gmail.com>
Ben Kochie 2022-06-15 13:19:43 +02:00 committed by GitHub
parent 390b7d084e
commit 68691d7775
4 changed files with 70 additions and 68 deletions


@@ -7,52 +7,48 @@ import (
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/metrics/metricutil"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
var datasourceRequestCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Name: "datasource_request_total",
Help: "A counter for outgoing requests for a datasource",
},
[]string{"datasource", "code", "method"},
)
var (
datasourceRequestCounter = promauto.NewCounterVec(
prometheus.CounterOpts{
Namespace: "grafana",
Name: "datasource_request_total",
Help: "A counter for outgoing requests for a data source",
},
[]string{"datasource", "code", "method"},
)
var datasourceRequestSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "grafana",
Name: "datasource_request_duration_seconds",
Help: "summary of outgoing datasource requests sent from Grafana",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}, []string{"datasource", "code", "method"},
)
datasourceRequestHistogram = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "grafana",
Name: "datasource_request_duration_seconds",
Help: "summary of outgoing data source requests sent from Grafana",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
}, []string{"datasource", "code", "method"},
)
var datasourceResponseSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "grafana",
Name: "datasource_response_size_bytes",
Help: "summary of datasource response sizes returned to Grafana",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}, []string{"datasource"},
)
datasourceResponseHistogram = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "grafana",
Name: "datasource_response_size_bytes",
Help: "summary of data source response sizes returned to Grafana",
Buckets: []float64{128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576},
}, []string{"datasource"},
)
var datasourceRequestsInFlight = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "grafana",
Name: "datasource_request_in_flight",
Help: "A gauge of outgoing datasource requests currently being sent by Grafana",
},
[]string{"datasource"},
datasourceRequestsInFlight = promauto.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "grafana",
Name: "datasource_request_in_flight",
Help: "A gauge of outgoing data source requests currently being sent by Grafana",
},
[]string{"datasource"},
)
)
func init() {
prometheus.MustRegister(datasourceRequestSummary,
datasourceRequestCounter,
datasourceRequestsInFlight,
datasourceResponseSummary)
}
const DataSourceMetricsMiddlewareName = "metrics"
var executeMiddlewareFunc = executeMiddleware
@@ -84,11 +80,11 @@ func DataSourceMetricsMiddleware() sdkhttpclient.Middleware {
func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels) http.RoundTripper {
return sdkhttpclient.RoundTripperFunc(func(r *http.Request) (*http.Response, error) {
requestCounter := datasourceRequestCounter.MustCurryWith(datasourceLabel)
requestSummary := datasourceRequestSummary.MustCurryWith(datasourceLabel)
requestHistogram := datasourceRequestHistogram.MustCurryWith(datasourceLabel)
requestInFlight := datasourceRequestsInFlight.With(datasourceLabel)
responseSizeSummary := datasourceResponseSummary.With(datasourceLabel)
responseSizeHistogram := datasourceResponseHistogram.With(datasourceLabel)
res, err := promhttp.InstrumentRoundTripperDuration(requestSummary,
res, err := promhttp.InstrumentRoundTripperDuration(requestHistogram,
promhttp.InstrumentRoundTripperCounter(requestCounter,
promhttp.InstrumentRoundTripperInFlight(requestInFlight, next))).
RoundTrip(r)
@@ -98,7 +94,7 @@ func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels
if res != nil && res.StatusCode != http.StatusSwitchingProtocols {
res.Body = httpclient.CountBytesReader(res.Body, func(bytesRead int64) {
responseSizeSummary.Observe(float64(bytesRead))
responseSizeHistogram.Observe(float64(bytesRead))
})
}


@@ -5,29 +5,23 @@ import (
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
var (
pluginRequestCounter *prometheus.CounterVec
pluginRequestDuration *prometheus.SummaryVec
)
func init() {
pluginRequestCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
pluginRequestCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "grafana",
Name: "plugin_request_total",
Help: "The total amount of plugin requests",
}, []string{"plugin_id", "endpoint", "status"})
pluginRequestDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
Namespace: "grafana",
Name: "plugin_request_duration_milliseconds",
Help: "Plugin request duration",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
pluginRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "grafana",
Name: "plugin_request_duration_milliseconds",
Help: "Plugin request duration",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
}, []string{"plugin_id", "endpoint"})
prometheus.MustRegister(pluginRequestCounter, pluginRequestDuration)
}
)
// instrumentPluginRequest instruments success rate and latency of `fn`
func instrumentPluginRequest(pluginID string, endpoint string, fn func() error) error {

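The body of instrumentPluginRequest is not part of this diff; below is a minimal sketch of
how it might observe the converted histogram, reusing the pluginRequestDuration and
pluginRequestCounter vectors declared in the hunk above. The function name and the
"ok"/"error" status values are assumptions for illustration, not the actual implementation.

    func instrumentPluginRequestSketch(pluginID string, endpoint string, fn func() error) error {
        status := "ok"
        start := time.Now()

        err := fn()
        if err != nil {
            status = "error"
        }

        // The metric is named *_milliseconds, so observe the elapsed time in ms.
        elapsed := float64(time.Since(start)) / float64(time.Millisecond)
        pluginRequestDuration.WithLabelValues(pluginID, endpoint).Observe(elapsed)
        pluginRequestCounter.WithLabelValues(pluginID, endpoint, status).Inc()

        return err
    }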

@@ -50,7 +50,7 @@ type Scheduler struct {
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
EvalDuration *prometheus.HistogramVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
@@ -156,13 +156,13 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
EvalDuration: promauto.With(r).NewSummaryVec(
prometheus.SummaryOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
EvalDuration: promauto.With(r).NewHistogramVec(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "rule_evaluation_duration_seconds",
Help: "The duration for a rule to execute.",
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
},
[]string{"org"},
),


@@ -441,10 +441,22 @@ func TestSchedule_ruleRoutine(t *testing.T) {
// duration metric has 0 values because the mocked clock does not advance
expectedMetric := fmt.Sprintf(
`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
# TYPE grafana_alerting_rule_evaluation_duration_seconds summary
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.5"} 0
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.9"} 0
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.99"} 0
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
# HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.