diff --git a/docs/sources/shared/alerts/grafana-managed-alerts.md b/docs/sources/shared/alerts/grafana-managed-alerts.md index 269fe53f5fe..287ebfb5bbf 100644 --- a/docs/sources/shared/alerts/grafana-managed-alerts.md +++ b/docs/sources/shared/alerts/grafana-managed-alerts.md @@ -24,12 +24,15 @@ Alerting rules can only query backend data sources with alerting enabled: The alerting engine publishes some internal metrics about itself. You can read more about how Grafana publishes [internal metrics]({{< relref "../../setup-grafana/set-up-grafana-monitoring/" >}}). See also, [View alert rules and their current state]({{< relref "../../alerting/fundamentals/state-and-health/" >}}). -| Metric Name | Type | Description | -| ------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- | -| `alerting.alerts` | gauge | How many alerts by state | -| `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API | -| `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts | -| `alerting.rule_evaluations_total` | counter | The total number of rule evaluations | -| `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures | -| `alerting.rule_evaluation_duration_seconds` | summary | The duration for a rule to execute | -| `alerting.rule_group_rules` | gauge | The number of rules | +| Metric Name | Type | Description | +| --------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- | +| `alerting.alerts` | gauge | How many alerts by state | +| `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API | +| `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts | +| `alerting.rule_evaluations_total` | counter | The total number of rule evaluations | +| `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures | +| `alerting.rule_evaluation_duration_seconds` | histogram | The time to evaluate a rule | +| `alerting.rule_process_evaluation_duration_seconds` | histogram | The time to process the evaluation results for a rule | +| `alerting.rule_send_alerts_duration_seconds` | histogram | The time to send the alerts to Alertmanager | +| `alerting.rule_group_rules` | gauge | The number of rules | +| `alerting.state_calculation_duration_seconds` | histogram | The duration of calculation of a single state | diff --git a/pkg/services/ngalert/api/api.go b/pkg/services/ngalert/api/api.go index 2f7b106b591..2a59a387a71 100644 --- a/pkg/services/ngalert/api/api.go +++ b/pkg/services/ngalert/api/api.go @@ -7,6 +7,7 @@ import ( "github.com/grafana/grafana/pkg/api/routing" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/accesscontrol" "github.com/grafana/grafana/pkg/services/datasourceproxy" "github.com/grafana/grafana/pkg/services/datasources" @@ -84,8 +85,8 @@ type API struct { EvaluatorFactory eval.EvaluatorFactory FeatureManager featuremgmt.FeatureToggles Historian Historian - - AppUrl *url.URL + Tracer tracing.Tracer + AppUrl *url.URL // Hooks can be used to replace API handlers for specific paths. 
Hooks *Hooks @@ -134,9 +135,10 @@ func (api *API) RegisterAPIEndpoints(m *metrics.API) { accessControl: api.AccessControl, evaluator: api.EvaluatorFactory, cfg: &api.Cfg.UnifiedAlerting, - backtesting: backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory), + backtesting: backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory, api.Tracer), featureManager: api.FeatureManager, appUrl: api.AppUrl, + tracer: api.Tracer, }), m) api.RegisterConfigurationApiEndpoints(NewConfiguration( &ConfigSrv{ diff --git a/pkg/services/ngalert/api/api_testing.go b/pkg/services/ngalert/api/api_testing.go index df0364948ed..fe8d62d0863 100644 --- a/pkg/services/ngalert/api/api_testing.go +++ b/pkg/services/ngalert/api/api_testing.go @@ -15,6 +15,7 @@ import ( "github.com/grafana/grafana/pkg/api/response" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/accesscontrol" contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model" "github.com/grafana/grafana/pkg/services/datasources" @@ -39,6 +40,7 @@ type TestingApiSrv struct { backtesting *backtesting.Engine featureManager featuremgmt.FeatureToggles appUrl *url.URL + tracer tracing.Tracer } // RouteTestGrafanaRuleConfig returns a list of potential alerts for a given rule configuration. This is intended to be @@ -86,6 +88,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext, Clock: clock.New(), Historian: nil, MaxStateSaveConcurrency: 1, + Tracer: srv.tracer, } manager := state.NewManager(cfg) includeFolder := !srv.cfg.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel) diff --git a/pkg/services/ngalert/api/api_testing_test.go b/pkg/services/ngalert/api/api_testing_test.go index 40d16bb4ea3..26a4299ff18 100644 --- a/pkg/services/ngalert/api/api_testing_test.go +++ b/pkg/services/ngalert/api/api_testing_test.go @@ -10,6 +10,7 @@ import ( "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/accesscontrol" acMock "github.com/grafana/grafana/pkg/services/accesscontrol/mock" contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model" @@ -216,6 +217,7 @@ func TestRouteEvalQueries(t *testing.T) { srv := &TestingApiSrv{ accessControl: ac, + tracer: tracing.InitializeTracerForTest(), } response := srv.RouteEvalQueries(rc, definitions.EvalQueriesPayload{ @@ -277,5 +279,6 @@ func createTestingApiSrv(t *testing.T, ds *fakes.FakeCacheService, ac *acMock.Mo accessControl: ac, evaluator: evaluator, cfg: config(t), + tracer: tracing.InitializeTracerForTest(), } } diff --git a/pkg/services/ngalert/backtesting/engine.go b/pkg/services/ngalert/backtesting/engine.go index 0081fd2eb07..bc3e0110ae4 100644 --- a/pkg/services/ngalert/backtesting/engine.go +++ b/pkg/services/ngalert/backtesting/engine.go @@ -12,6 +12,7 @@ import ( "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/state" @@ -40,7 +41,7 @@ type Engine struct { createStateManager func() stateManager } -func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine { +func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer) *Engine { return &Engine{ evalFactory: 
evalFactory, createStateManager: func() stateManager { @@ -52,6 +53,7 @@ func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine { Clock: clock.New(), Historian: nil, MaxStateSaveConcurrency: 1, + Tracer: tracer, } return state.NewManager(cfg) }, diff --git a/pkg/services/ngalert/metrics/scheduler.go b/pkg/services/ngalert/metrics/scheduler.go index 06531db8c69..d3f045a4640 100644 --- a/pkg/services/ngalert/metrics/scheduler.go +++ b/pkg/services/ngalert/metrics/scheduler.go @@ -18,6 +18,8 @@ type Scheduler struct { EvalTotal *prometheus.CounterVec EvalFailures *prometheus.CounterVec EvalDuration *prometheus.HistogramVec + ProcessDuration *prometheus.HistogramVec + SendDuration *prometheus.HistogramVec GroupRules *prometheus.GaugeVec SchedulePeriodicDuration prometheus.Histogram SchedulableAlertRules prometheus.Gauge @@ -63,8 +65,28 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler { Namespace: Namespace, Subsystem: Subsystem, Name: "rule_evaluation_duration_seconds", - Help: "The duration for a rule to execute.", - Buckets: []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100}, + Help: "The time to evaluate a rule.", + Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300}, + }, + []string{"org"}, + ), + ProcessDuration: promauto.With(r).NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "rule_process_evaluation_duration_seconds", + Help: "The time to process the evaluation results for a rule.", + Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300}, + }, + []string{"org"}, + ), + SendDuration: promauto.With(r).NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "rule_send_alerts_duration_seconds", + Help: "The time to send the alerts to Alertmanager.", + Buckets: []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300}, }, []string{"org"}, ), diff --git a/pkg/services/ngalert/metrics/state.go b/pkg/services/ngalert/metrics/state.go index e9bec6464ce..1aae3d1e683 100644 --- a/pkg/services/ngalert/metrics/state.go +++ b/pkg/services/ngalert/metrics/state.go @@ -6,7 +6,8 @@ import ( ) type State struct { - AlertState *prometheus.GaugeVec + AlertState *prometheus.GaugeVec + StateUpdateDuration prometheus.Histogram } func NewStateMetrics(r prometheus.Registerer) *State { @@ -17,5 +18,14 @@ func NewStateMetrics(r prometheus.Registerer) *State { Name: "alerts", Help: "How many alerts by state.", }, []string{"state"}), + StateUpdateDuration: promauto.With(r).NewHistogram( + prometheus.HistogramOpts{ + Namespace: Namespace, + Subsystem: Subsystem, + Name: "state_calculation_duration_seconds", + Help: "The duration of calculation of a single state.", + Buckets: []float64{0.01, 0.1, 1, 2, 5, 10}, + }, + ), } } diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index 7a063df6980..b30b024f725 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -221,6 +221,7 @@ func (ng *AlertNG) init() error { DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState), MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency, ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoDataErrorExecution), + Tracer: ng.tracer, } stateManager := state.NewManager(cfg) scheduler := schedule.NewScheduler(schedCfg, stateManager) @@ -268,6 +269,7 @@ func (ng *AlertNG) init() error { AppUrl: appUrl, 
Historian: history, Hooks: api.NewHooks(ng.Log), + Tracer: ng.tracer, } ng.api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics()) diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go index 0fe3835e7d6..ff13953b432 100644 --- a/pkg/services/ngalert/schedule/schedule.go +++ b/pkg/services/ngalert/schedule/schedule.go @@ -351,6 +351,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID) evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID) evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID) + processDuration := sch.metrics.ProcessDuration.WithLabelValues(orgID) + sendDuration := sch.metrics.SendDuration.WithLabelValues(orgID) notify := func(states []state.StateTransition) { expiredAlerts := state.FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock) @@ -423,6 +425,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR logger.Debug("Skip updating the state because the context has been cancelled") return } + start = sch.clock.Now() processedStates := sch.stateManager.ProcessEvalResults( ctx, e.scheduledAt, @@ -430,6 +433,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR results, state.GetRuleExtraLabels(e.rule, e.folderTitle, !sch.disableGrafanaFolder), ) + processDuration.Observe(sch.clock.Now().Sub(start).Seconds()) + + start = sch.clock.Now() alerts := state.FromStateTransitionToPostableAlerts(processedStates, sch.stateManager, sch.appURL) span.AddEvents( []string{"message", "state_transitions", "alerts_to_send"}, @@ -441,6 +447,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR if len(alerts.PostableAlerts) > 0 { sch.alertsSender.Send(key, alerts) } + sendDuration.Observe(sch.clock.Now().Sub(start).Seconds()) } retryIfError := func(f func(attempt int64) error) error { diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index 94ce4187f14..b59daceafa0 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -82,6 +82,7 @@ func TestProcessTicks(t *testing.T) { Clock: mockedClock, Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: testTracer, } st := state.NewManager(managerCfg) @@ -453,22 +454,21 @@ func TestSchedule_ruleRoutine(t *testing.T) { t.Run("it reports metrics", func(t *testing.T) { // duration metric has 0 values because of mocked clock that do not advance expectedMetric := fmt.Sprintf( - `# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute. + `# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule. 
# TYPE grafana_alerting_rule_evaluation_duration_seconds histogram - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 @@ -478,9 +478,45 @@ func TestSchedule_ruleRoutine(t *testing.T) { # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. # TYPE grafana_alerting_rule_evaluations_total counter grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule. 
+ # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0 + grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1 + # HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager. + # TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0 + grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1 `, rule.OrgID) - err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total") + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", 
"grafana_alerting_rule_send_alerts_duration_seconds") require.NoError(t, err) }) }) @@ -641,34 +677,69 @@ func TestSchedule_ruleRoutine(t *testing.T) { t.Run("it should increase failure counter", func(t *testing.T) { // duration metric has 0 values because of mocked clock that do not advance expectedMetric := fmt.Sprintf( - `# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute. - # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1 - grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 - grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 - grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 - # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures. - # TYPE grafana_alerting_rule_evaluation_failures_total counter - grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1 - # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. - # TYPE grafana_alerting_rule_evaluations_total counter - grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 + `# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule. 
+ # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 + grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0 + grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1 + # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures. + # TYPE grafana_alerting_rule_evaluation_failures_total counter + grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations. + # TYPE grafana_alerting_rule_evaluations_total counter + grafana_alerting_rule_evaluations_total{org="%[1]d"} 1 + # HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule. + # TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 + grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0 + grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1 + # HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager. 
+ # TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1 + grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0 + grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1 `, rule.OrgID) - err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total") + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds") require.NoError(t, err) }) @@ -826,6 +897,7 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor Clock: mockedClock, Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: testTracer, } st := state.NewManager(managerCfg) diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index 04eaa8e0bb5..57db4572e0f 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -8,8 +8,10 @@ import ( "github.com/benbjohnson/clock" "github.com/grafana/dskit/concurrency" "github.com/grafana/grafana-plugin-sdk-go/data" + "go.opentelemetry.io/otel/attribute" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngModels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -30,6 +32,7 @@ type AlertInstanceManager interface { type Manager struct { log log.Logger metrics *metrics.State + tracer tracing.Tracer clock clock.Clock cache *cache @@ -60,6 +63,8 @@ type ManagerCfg struct { // ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error) // to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK` ApplyNoDataAndErrorToAllStates bool + + Tracer tracing.Tracer } func NewManager(cfg ManagerCfg) *Manager { @@ -76,6 +81,7 @@ func NewManager(cfg ManagerCfg) *Manager { doNotSaveNormalState: cfg.DoNotSaveNormalState, 
maxStateSaveConcurrency: cfg.MaxStateSaveConcurrency, applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates, + tracer: cfg.Tracer, } } @@ -251,18 +257,46 @@ func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.Alert // ProcessEvalResults updates the current states that belong to a rule with the evaluation results. // if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []StateTransition { - logger := st.log.FromContext(ctx) + tracingCtx, span := st.tracer.Start(ctx, "alert rule state calculation") + defer span.End() + span.SetAttributes("rule_uid", alertRule.UID, attribute.String("rule_uid", alertRule.UID)) + span.SetAttributes("org_id", alertRule.OrgID, attribute.Int64("org_id", alertRule.OrgID)) + span.SetAttributes("rule_version", alertRule.Version, attribute.Int64("rule_version", alertRule.Version)) + utcTick := evaluatedAt.UTC().Format(time.RFC3339Nano) + span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick)) + span.SetAttributes("results", len(results), attribute.Int("results", len(results))) + + logger := st.log.FromContext(tracingCtx) logger.Debug("State manager processing evaluation results", "resultCount", len(results)) - states := st.setNextStateForRule(ctx, alertRule, results, extraLabels, logger) + states := st.setNextStateForRule(tracingCtx, alertRule, results, extraLabels, logger) + + span.AddEvents([]string{"message", "state_transitions"}, + []tracing.EventValue{ + {Str: "results processed"}, + {Num: int64(len(states))}, + }) staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule) - st.deleteAlertStates(ctx, logger, staleStates) + st.deleteAlertStates(tracingCtx, logger, staleStates) - st.saveAlertStates(ctx, logger, states...) + if len(staleStates) > 0 { + span.AddEvents([]string{"message", "state_transitions"}, + []tracing.EventValue{ + {Str: "deleted stale states"}, + {Num: int64(len(staleStates))}, + }) + } + + st.saveAlertStates(tracingCtx, logger, states...) + + span.AddEvents([]string{"message"}, + []tracing.EventValue{ + {Str: "updated database"}, + }) allChanges := append(states, staleStates...)
if st.historian != nil { - st.historian.Record(ctx, history_model.NewRuleMeta(alertRule, logger), allChanges) + st.historian.Record(tracingCtx, history_model.NewRuleMeta(alertRule, logger), allChanges) } return allChanges } @@ -303,6 +337,7 @@ func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.A // Set the current state based on evaluation results func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, currentState *State, result eval.Result, logger log.Logger) StateTransition { + start := st.clock.Now() currentState.LastEvaluationTime = result.EvaluatedAt currentState.EvaluationDuration = result.EvaluationDuration currentState.Results = append(currentState.Results, Evaluation{ @@ -390,6 +425,10 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu PreviousStateReason: oldReason, } + if st.metrics != nil { + st.metrics.StateUpdateDuration.Observe(st.clock.Now().Sub(start).Seconds()) + } + return nextState } diff --git a/pkg/services/ngalert/state/manager_bench_test.go b/pkg/services/ngalert/state/manager_bench_test.go index 94306c0f8f2..9eda9948cf0 100644 --- a/pkg/services/ngalert/state/manager_bench_test.go +++ b/pkg/services/ngalert/state/manager_bench_test.go @@ -6,14 +6,16 @@ import ( "testing" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/mock" + + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/annotations" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/metrics" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/state" "github.com/grafana/grafana/pkg/services/ngalert/state/historian" - "github.com/prometheus/client_golang/prometheus" - "github.com/stretchr/testify/mock" ) func BenchmarkProcessEvalResults(b *testing.B) { @@ -25,6 +27,7 @@ func BenchmarkProcessEvalResults(b *testing.B) { cfg := state.ManagerCfg{ Historian: hist, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } sut := state.NewManager(cfg) now := time.Now().UTC() diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index 13f71d88609..76985101879 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -19,6 +19,7 @@ import ( "github.com/grafana/grafana/pkg/expr" "github.com/grafana/grafana/pkg/infra/log/logtest" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -311,6 +312,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) { cfg := ManagerCfg{ Metrics: testMetrics, + Tracer: tracing.InitializeTracerForTest(), ExternalURL: nil, InstanceStore: &FakeInstanceStore{}, Images: &NotAvailableImageService{}, diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index 311ada9750a..8380f30ebbf 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -1,6 +1,7 @@ package state_test import ( + "bytes" "context" "errors" "fmt" @@ -15,11 +16,13 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/prometheus/client_golang/prometheus" + 
"github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/exp/slices" "github.com/grafana/grafana/pkg/expr" + "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/annotations" "github.com/grafana/grafana/pkg/services/annotations/annotationstest" "github.com/grafana/grafana/pkg/services/dashboards" @@ -198,6 +201,7 @@ func TestWarmStateCache(t *testing.T) { Clock: clock.NewMock(), Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -234,6 +238,7 @@ func TestDashboardAnnotations(t *testing.T) { Clock: clock.New(), Historian: hist, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) @@ -1198,18 +1203,21 @@ func TestProcessEvalResults(t *testing.T) { for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo() - m := metrics.NewHistorianMetrics(prometheus.NewRegistry()) - store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, m) - hist := historian.NewAnnotationBackend(store, nil, m) + reg := prometheus.NewPedanticRegistry() + stateMetrics := metrics.NewStateMetrics(reg) + metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry()) + store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, metrics) + hist := historian.NewAnnotationBackend(store, nil, metrics) clk := clock.NewMock() cfg := state.ManagerCfg{ - Metrics: testMetrics.GetStateMetrics(), + Metrics: stateMetrics, ExternalURL: nil, InstanceStore: &state.FakeInstanceStore{}, Images: &state.NotAvailableImageService{}, Clock: clk, Historian: hist, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) @@ -1220,7 +1228,7 @@ func TestProcessEvalResults(t *testing.T) { slices.SortFunc(evals, func(a, b time.Time) bool { return a.Before(b) }) - + results := 0 for _, evalTime := range evals { res := tc.evalResults[evalTime] for i := 0; i < len(res); i++ { @@ -1228,6 +1236,7 @@ func TestProcessEvalResults(t *testing.T) { } clk.Set(evalTime) _ = st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels) + results += len(res) } states := st.GetStatesForRuleUID(tc.alertRule.OrgID, tc.alertRule.UID) @@ -1278,6 +1287,22 @@ func TestProcessEvalResults(t *testing.T) { require.Eventuallyf(t, func() bool { return tc.expectedAnnotations == fakeAnnoRepo.Len() }, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items())) + + expectedMetric := fmt.Sprintf( + `# HELP grafana_alerting_state_calculation_duration_seconds The duration of calculation of a single state. 
+ # TYPE grafana_alerting_state_calculation_duration_seconds histogram + grafana_alerting_state_calculation_duration_seconds_bucket{le="0.01"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="0.1"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="1"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="2"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="5"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="10"} %[1]d + grafana_alerting_state_calculation_duration_seconds_bucket{le="+Inf"} %[1]d + grafana_alerting_state_calculation_duration_seconds_sum 0 + grafana_alerting_state_calculation_duration_seconds_count %[1]d + `, results) + err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_state_calculation_duration_seconds", "grafana_alerting_state_calculation_total") + require.NoError(t, err) }) } @@ -1292,6 +1317,7 @@ func TestProcessEvalResults(t *testing.T) { Clock: clk, Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) rule := models.AlertRuleGen()() @@ -1442,6 +1468,7 @@ func TestStaleResultsHandler(t *testing.T) { Clock: clock.New(), Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -1523,6 +1550,7 @@ func TestStaleResults(t *testing.T) { Clock: clk, Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) @@ -1695,6 +1723,7 @@ func TestDeleteStateByRuleUID(t *testing.T) { Clock: clk, Historian: &state.FakeHistorian{}, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) st.Warm(ctx, dbstore) @@ -1835,6 +1864,7 @@ func TestResetStateByRuleUID(t *testing.T) { Clock: clk, Historian: fakeHistorian, MaxStateSaveConcurrency: 1, + Tracer: tracing.InitializeTracerForTest(), } st := state.NewManager(cfg) st.Warm(ctx, dbstore)
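Reviewer note: the scheduler hunks above split each evaluation tick into three timed phases — evaluating the rule, processing the evaluation results, and sending alerts to the Alertmanager. The following standalone sketch is illustrative only (the `evaluate`, `process`, and `send` closures are placeholders for the real `ruleRoutine` logic); it uses only the `Scheduler` metrics fields introduced or touched in this diff.

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

func main() {
	m := metrics.NewSchedulerMetrics(prometheus.NewRegistry())
	org := "1" // the histograms are partitioned by org, as in the scheduler

	evalDuration := m.EvalDuration.WithLabelValues(org)       // rule_evaluation_duration_seconds
	processDuration := m.ProcessDuration.WithLabelValues(org) // rule_process_evaluation_duration_seconds
	sendDuration := m.SendDuration.WithLabelValues(org)       // rule_send_alerts_duration_seconds

	// Placeholders standing in for the real evaluation, state processing, and delivery steps.
	evaluate := func() {}
	process := func() {}
	send := func() {}

	start := time.Now()
	evaluate()
	evalDuration.Observe(time.Since(start).Seconds())

	start = time.Now()
	process()
	processDuration.Observe(time.Since(start).Seconds())

	start = time.Now()
	send()
	sendDuration.Observe(time.Since(start).Seconds())
}
```

Note that all three histograms share buckets topping out at 300 s, which is why the updated test expectations list buckets up to `le="300"` before `+Inf`.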
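For anyone constructing the state manager outside the files touched here, a minimal sketch of the new wiring (not part of this change): it only uses constructors and fakes that already appear in the diff (`metrics.NewStateMetrics`, `tracing.InitializeTracerForTest`, `state.FakeInstanceStore`, `state.FakeHistorian`); the helper name `newTestManager` is invented for illustration.

```go
package state_test

import (
	"testing"

	"github.com/benbjohnson/clock"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/grafana/pkg/infra/tracing"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/grafana/grafana/pkg/services/ngalert/state"
)

// newTestManager is a hypothetical helper mirroring the test setup used in this
// diff: every dependency is a fake, and the new Tracer field is satisfied with
// the test tracer.
func newTestManager(t *testing.T, reg *prometheus.Registry) *state.Manager {
	t.Helper()
	cfg := state.ManagerCfg{
		Metrics:                 metrics.NewStateMetrics(reg), // registers state_calculation_duration_seconds
		InstanceStore:           &state.FakeInstanceStore{},
		Images:                  &state.NotAvailableImageService{},
		Clock:                   clock.NewMock(),
		Historian:               &state.FakeHistorian{},
		MaxStateSaveConcurrency: 1,
		Tracer:                  tracing.InitializeTracerForTest(), // new field introduced by this change
	}
	return state.NewManager(cfg)
}
```

Because `ProcessEvalResults` now starts a span unconditionally, leaving `Tracer` unset would panic at runtime, which is why every test configuration in this diff passes `tracing.InitializeTracerForTest()`.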