mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Convert some metrics to Histograms (#50420)
Because Summary metrics cannot be aggregated, convert them to histograms so that users with HA deployments can use these metrics. * Convert metrics registration to promauto. * Improve help text style. Signed-off-by: SuperQ <superq@gmail.com>
This commit is contained in:
parent
390b7d084e
commit
68691d7775
@ -7,52 +7,48 @@ import (
|
||||
"github.com/grafana/grafana/pkg/infra/httpclient"
|
||||
"github.com/grafana/grafana/pkg/infra/metrics/metricutil"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
)
|
||||
|
||||
var datasourceRequestCounter = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_total",
|
||||
Help: "A counter for outgoing requests for a datasource",
|
||||
},
|
||||
[]string{"datasource", "code", "method"},
|
||||
)
|
||||
var (
|
||||
datasourceRequestCounter = promauto.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_total",
|
||||
Help: "A counter for outgoing requests for a data source",
|
||||
},
|
||||
[]string{"datasource", "code", "method"},
|
||||
)
|
||||
|
||||
var datasourceRequestSummary = prometheus.NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_duration_seconds",
|
||||
Help: "summary of outgoing datasource requests sent from Grafana",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
}, []string{"datasource", "code", "method"},
|
||||
)
|
||||
datasourceRequestHistogram = promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_duration_seconds",
|
||||
Help: "summary of outgoing data source requests sent from Grafana",
|
||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||
}, []string{"datasource", "code", "method"},
|
||||
)
|
||||
|
||||
var datasourceResponseSummary = prometheus.NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_response_size_bytes",
|
||||
Help: "summary of datasource response sizes returned to Grafana",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
}, []string{"datasource"},
|
||||
)
|
||||
datasourceResponseHistogram = promauto.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_response_size_bytes",
|
||||
Help: "summary of data source response sizes returned to Grafana",
|
||||
Buckets: []float64{128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576},
|
||||
}, []string{"datasource"},
|
||||
)
|
||||
|
||||
var datasourceRequestsInFlight = prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_in_flight",
|
||||
Help: "A gauge of outgoing datasource requests currently being sent by Grafana",
|
||||
},
|
||||
[]string{"datasource"},
|
||||
datasourceRequestsInFlight = promauto.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "datasource_request_in_flight",
|
||||
Help: "A gauge of outgoing data source requests currently being sent by Grafana",
|
||||
},
|
||||
[]string{"datasource"},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(datasourceRequestSummary,
|
||||
datasourceRequestCounter,
|
||||
datasourceRequestsInFlight,
|
||||
datasourceResponseSummary)
|
||||
}
|
||||
|
||||
const DataSourceMetricsMiddlewareName = "metrics"
|
||||
|
||||
var executeMiddlewareFunc = executeMiddleware
|
||||
@ -84,11 +80,11 @@ func DataSourceMetricsMiddleware() sdkhttpclient.Middleware {
|
||||
func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels) http.RoundTripper {
|
||||
return sdkhttpclient.RoundTripperFunc(func(r *http.Request) (*http.Response, error) {
|
||||
requestCounter := datasourceRequestCounter.MustCurryWith(datasourceLabel)
|
||||
requestSummary := datasourceRequestSummary.MustCurryWith(datasourceLabel)
|
||||
requestHistogram := datasourceRequestHistogram.MustCurryWith(datasourceLabel)
|
||||
requestInFlight := datasourceRequestsInFlight.With(datasourceLabel)
|
||||
responseSizeSummary := datasourceResponseSummary.With(datasourceLabel)
|
||||
responseSizeHistogram := datasourceResponseHistogram.With(datasourceLabel)
|
||||
|
||||
res, err := promhttp.InstrumentRoundTripperDuration(requestSummary,
|
||||
res, err := promhttp.InstrumentRoundTripperDuration(requestHistogram,
|
||||
promhttp.InstrumentRoundTripperCounter(requestCounter,
|
||||
promhttp.InstrumentRoundTripperInFlight(requestInFlight, next))).
|
||||
RoundTrip(r)
|
||||
@ -98,7 +94,7 @@ func executeMiddleware(next http.RoundTripper, datasourceLabel prometheus.Labels
|
||||
|
||||
if res != nil && res.StatusCode != http.StatusSwitchingProtocols {
|
||||
res.Body = httpclient.CountBytesReader(res.Body, func(bytesRead int64) {
|
||||
responseSizeSummary.Observe(float64(bytesRead))
|
||||
responseSizeHistogram.Observe(float64(bytesRead))
|
||||
})
|
||||
}
|
||||
|
||||
|
@ -5,29 +5,23 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
)
|
||||
|
||||
var (
|
||||
pluginRequestCounter *prometheus.CounterVec
|
||||
pluginRequestDuration *prometheus.SummaryVec
|
||||
)
|
||||
|
||||
func init() {
|
||||
pluginRequestCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
pluginRequestCounter = promauto.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "plugin_request_total",
|
||||
Help: "The total amount of plugin requests",
|
||||
}, []string{"plugin_id", "endpoint", "status"})
|
||||
|
||||
pluginRequestDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "plugin_request_duration_milliseconds",
|
||||
Help: "Plugin request duration",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
pluginRequestDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "grafana",
|
||||
Name: "plugin_request_duration_milliseconds",
|
||||
Help: "Plugin request duration",
|
||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||
}, []string{"plugin_id", "endpoint"})
|
||||
|
||||
prometheus.MustRegister(pluginRequestCounter, pluginRequestDuration)
|
||||
}
|
||||
)
|
||||
|
||||
// instrumentPluginRequest instruments success rate and latency of `fn`
|
||||
func instrumentPluginRequest(pluginID string, endpoint string, fn func() error) error {
|
||||
|
@ -50,7 +50,7 @@ type Scheduler struct {
|
||||
BehindSeconds prometheus.Gauge
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
EvalDuration *prometheus.SummaryVec
|
||||
EvalDuration *prometheus.HistogramVec
|
||||
SchedulePeriodicDuration prometheus.Histogram
|
||||
SchedulableAlertRules prometheus.Gauge
|
||||
SchedulableAlertRulesHash prometheus.Gauge
|
||||
@ -156,13 +156,13 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
EvalDuration: promauto.With(r).NewSummaryVec(
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_evaluation_duration_seconds",
|
||||
Help: "The duration for a rule to execute.",
|
||||
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
|
||||
EvalDuration: promauto.With(r).NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "rule_evaluation_duration_seconds",
|
||||
Help: "The duration for a rule to execute.",
|
||||
Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
|
@ -441,10 +441,22 @@ func TestSchedule_ruleRoutine(t *testing.T) {
|
||||
// duration metric has 0 values because the mocked clock does not advance
|
||||
expectedMetric := fmt.Sprintf(
|
||||
`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
|
||||
# TYPE grafana_alerting_rule_evaluation_duration_seconds summary
|
||||
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.5"} 0
|
||||
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.9"} 0
|
||||
grafana_alerting_rule_evaluation_duration_seconds{org="%[1]d",quantile="0.99"} 0
|
||||
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
|
||||
grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
|
||||
grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
|
||||
# HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
|
||||
|
Loading…
Reference in New Issue
Block a user