Alerting: Add new metrics and tracings to state manager and scheduler (#71398)

* add metrics and tracing to state manager
* propagate tracer to state manager
* add scheduler metrics
* fix backtesting
* add test for state metrics
* remove StateUpdateCount
* update docs
* metrics can be null
* add tracer to new tests
@@ -24,12 +24,15 @@ Alerting rules can only query backend data sources with alerting enabled:
 
 The alerting engine publishes some internal metrics about itself. You can read more about how Grafana publishes [internal metrics]({{< relref "../../setup-grafana/set-up-grafana-monitoring/" >}}). See also, [View alert rules and their current state]({{< relref "../../alerting/fundamentals/state-and-health/" >}}).
 
 | Metric Name | Type | Description |
-| ------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
+| --------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
 | `alerting.alerts` | gauge | How many alerts by state |
 | `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API |
 | `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts |
 | `alerting.rule_evaluations_total` | counter | The total number of rule evaluations |
 | `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
-| `alerting.rule_evaluation_duration_seconds` | summary | The duration for a rule to execute |
-| `alerting.rule_group_rules` | gauge | The number of rules |
+| `alerting.rule_evaluation_duration_seconds` | histogram | The time to evaluate a rule |
+| `alerting.rule_process_evaluation_duration_seconds` | histogram | The time to process the evaluation results for a rule |
+| `alerting.rule_send_alerts_duration_seconds` | histogram | The time to send the alerts to Alertmanager |
+| `alerting.rule_group_rules` | gauge | The number of rules |
+| `alerting.state_calculation_duration_seconds` | histogram | The duration of calculation of a single state |
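The dotted names above are the documentation's logical names; on a Prometheus scrape they are exposed with the `grafana` namespace and `alerting` subsystem prefixes, which is how the collectors later in this diff are registered. A minimal, illustrative sketch (standalone, not Grafana code) of that naming convention:

```go
// Sketch: collectors registered with Namespace "grafana" and Subsystem
// "alerting" expose `alerting.rule_evaluations_total` as the series
// `grafana_alerting_rule_evaluations_total`.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	evals := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"org"})
	evals.WithLabelValues("1").Inc()

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName()) // grafana_alerting_rule_evaluations_total
	}
}
```

Running it prints the fully qualified series name, matching the names asserted in the tests further down in this diff.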
@@ -7,6 +7,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/api/routing"
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	"github.com/grafana/grafana/pkg/services/datasourceproxy"
 	"github.com/grafana/grafana/pkg/services/datasources"
@@ -84,8 +85,8 @@ type API struct {
 	EvaluatorFactory eval.EvaluatorFactory
 	FeatureManager   featuremgmt.FeatureToggles
 	Historian        Historian
+	Tracer           tracing.Tracer
 	AppUrl           *url.URL
 
 	// Hooks can be used to replace API handlers for specific paths.
 	Hooks *Hooks
@@ -134,9 +135,10 @@ func (api *API) RegisterAPIEndpoints(m *metrics.API) {
 		accessControl:  api.AccessControl,
 		evaluator:      api.EvaluatorFactory,
 		cfg:            &api.Cfg.UnifiedAlerting,
-		backtesting:    backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory),
+		backtesting:    backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory, api.Tracer),
 		featureManager: api.FeatureManager,
 		appUrl:         api.AppUrl,
+		tracer:         api.Tracer,
 	}), m)
 	api.RegisterConfigurationApiEndpoints(NewConfiguration(
 		&ConfigSrv{
@@ -15,6 +15,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/api/response"
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
 	"github.com/grafana/grafana/pkg/services/datasources"
@@ -39,6 +40,7 @@ type TestingApiSrv struct {
 	backtesting    *backtesting.Engine
 	featureManager featuremgmt.FeatureToggles
 	appUrl         *url.URL
+	tracer         tracing.Tracer
 }
 
 // RouteTestGrafanaRuleConfig returns a list of potential alerts for a given rule configuration. This is intended to be
@@ -86,6 +88,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext,
 		Clock:                   clock.New(),
 		Historian:               nil,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  srv.tracer,
 	}
 	manager := state.NewManager(cfg)
 	includeFolder := !srv.cfg.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel)
@@ -10,6 +10,7 @@ import (
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	acMock "github.com/grafana/grafana/pkg/services/accesscontrol/mock"
 	contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
@@ -216,6 +217,7 @@ func TestRouteEvalQueries(t *testing.T) {
 
 		srv := &TestingApiSrv{
 			accessControl: ac,
+			tracer:        tracing.InitializeTracerForTest(),
 		}
 
 		response := srv.RouteEvalQueries(rc, definitions.EvalQueriesPayload{
@@ -277,5 +279,6 @@ func createTestingApiSrv(t *testing.T, ds *fakes.FakeCacheService, ac *acMock.Mo
 		accessControl: ac,
 		evaluator:     evaluator,
 		cfg:           config(t),
+		tracer:        tracing.InitializeTracerForTest(),
 	}
 }
@@ -12,6 +12,7 @@ import (
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 	"github.com/grafana/grafana/pkg/services/ngalert/state"
@@ -40,7 +41,7 @@ type Engine struct {
 	createStateManager func() stateManager
 }
 
-func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
+func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer) *Engine {
 	return &Engine{
 		evalFactory: evalFactory,
 		createStateManager: func() stateManager {
@@ -52,6 +53,7 @@ func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
 				Clock:                   clock.New(),
 				Historian:               nil,
 				MaxStateSaveConcurrency: 1,
+				Tracer:                  tracer,
 			}
 			return state.NewManager(cfg)
 		},
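The `NewEngine` change above threads the tracer through a closure, so state managers created later during a backtest carry the same tracer as the rest of the system. The sketch below uses hypothetical stand-in types (`Tracer`, `Manager`) to show only the capture pattern, not the real `backtesting` package:

```go
// Sketch of the injection pattern: the constructor argument is captured by
// the factory closure, so every manager the engine builds later is wired
// with the same tracer instance. All types here are stand-ins.
package sketch

type Tracer interface{} // stand-in for tracing.Tracer

type ManagerCfg struct {
	Tracer Tracer
	// ... other fields elided
}

type Manager struct{ cfg ManagerCfg }

func NewManager(cfg ManagerCfg) *Manager { return &Manager{cfg: cfg} }

type Engine struct {
	createStateManager func() *Manager
}

func NewEngine(tracer Tracer) *Engine {
	return &Engine{
		// The closure captures the tracer; no global state is needed.
		createStateManager: func() *Manager {
			return NewManager(ManagerCfg{Tracer: tracer})
		},
	}
}
```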
@@ -18,6 +18,8 @@ type Scheduler struct {
 	EvalTotal                *prometheus.CounterVec
 	EvalFailures             *prometheus.CounterVec
 	EvalDuration             *prometheus.HistogramVec
+	ProcessDuration          *prometheus.HistogramVec
+	SendDuration             *prometheus.HistogramVec
 	GroupRules               *prometheus.GaugeVec
 	SchedulePeriodicDuration prometheus.Histogram
 	SchedulableAlertRules    prometheus.Gauge
@@ -63,8 +65,28 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			Namespace: Namespace,
 			Subsystem: Subsystem,
 			Name:      "rule_evaluation_duration_seconds",
-			Help:      "The duration for a rule to execute.",
-			Buckets:   []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
+			Help:      "The time to evaluate a rule.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
+		},
+		[]string{"org"},
+	),
+	ProcessDuration: promauto.With(r).NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "rule_process_evaluation_duration_seconds",
+			Help:      "The time to process the evaluation results for a rule.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
+		},
+		[]string{"org"},
+	),
+	SendDuration: promauto.With(r).NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "rule_send_alerts_duration_seconds",
+			Help:      "The time to send the alerts to Alertmanager.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
 		},
 		[]string{"org"},
 	),
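Both new scheduler histograms follow the same shape: a `HistogramVec` partitioned by an `org` label, with the per-org child resolved once via `WithLabelValues` and then observed on the hot path. An illustrative sketch under those assumptions (the registry and label value here are placeholders):

```go
// Sketch of the per-org histogram pattern used by ProcessDuration and
// SendDuration: resolve the labelled child once, observe repeatedly.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	processDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_process_evaluation_duration_seconds",
		Help:      "The time to process the evaluation results for a rule.",
		Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
	}, []string{"org"})

	// Resolve the child once per rule routine instead of on every tick.
	perOrg := processDuration.WithLabelValues("1")

	start := time.Now()
	// ... process evaluation results here ...
	perOrg.Observe(time.Since(start).Seconds())
}
```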
@@ -6,7 +6,8 @@ import (
 )
 
 type State struct {
 	AlertState          *prometheus.GaugeVec
+	StateUpdateDuration prometheus.Histogram
 }
 
 func NewStateMetrics(r prometheus.Registerer) *State {
@@ -17,5 +18,14 @@ func NewStateMetrics(r prometheus.Registerer) *State {
 			Name: "alerts",
 			Help: "How many alerts by state.",
 		}, []string{"state"}),
+		StateUpdateDuration: promauto.With(r).NewHistogram(
+			prometheus.HistogramOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "state_calculation_duration_seconds",
+				Help:      "The duration of calculation of a single state.",
+				Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10},
+			},
+		),
 	}
 }
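`StateUpdateDuration` is a plain, unlabelled histogram with the explicit buckets shown above. Prometheus `le` buckets are cumulative, so one observation increments every bucket at or above its value plus `+Inf`, which is why the test expectations later in this diff show the same count on every bucket line. A small standalone sketch of that behavior:

```go
// Sketch: one Observe(0.05) lands in le="0.1" and every larger bucket.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	h := promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "state_calculation_duration_seconds",
		Help:      "The duration of calculation of a single state.",
		Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10},
	})
	h.Observe(0.05)

	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			for _, b := range m.GetHistogram().GetBucket() {
				// Cumulative counts: 0 for le=0.01, then 1 for every bucket above.
				fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
		}
	}
}
```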
@@ -221,6 +221,7 @@ func (ng *AlertNG) init() error {
 		DoNotSaveNormalState:           ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState),
 		MaxStateSaveConcurrency:        ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
 		ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoDataErrorExecution),
+		Tracer:                         ng.tracer,
 	}
 	stateManager := state.NewManager(cfg)
 	scheduler := schedule.NewScheduler(schedCfg, stateManager)
@@ -268,6 +269,7 @@ func (ng *AlertNG) init() error {
 		AppUrl:    appUrl,
 		Historian: history,
 		Hooks:     api.NewHooks(ng.Log),
+		Tracer:    ng.tracer,
 	}
 	ng.api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())
 
@@ -351,6 +351,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 	evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
 	evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
 	evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
+	processDuration := sch.metrics.ProcessDuration.WithLabelValues(orgID)
+	sendDuration := sch.metrics.SendDuration.WithLabelValues(orgID)
 
 	notify := func(states []state.StateTransition) {
 		expiredAlerts := state.FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
@@ -423,6 +425,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			logger.Debug("Skip updating the state because the context has been cancelled")
 			return
 		}
+		start = sch.clock.Now()
 		processedStates := sch.stateManager.ProcessEvalResults(
 			ctx,
 			e.scheduledAt,
@@ -430,6 +433,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			results,
 			state.GetRuleExtraLabels(e.rule, e.folderTitle, !sch.disableGrafanaFolder),
 		)
+		processDuration.Observe(sch.clock.Now().Sub(start).Seconds())
+
+		start = sch.clock.Now()
 		alerts := state.FromStateTransitionToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
 		span.AddEvents(
 			[]string{"message", "state_transitions", "alerts_to_send"},
@@ -441,6 +447,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		if len(alerts.PostableAlerts) > 0 {
 			sch.alertsSender.Send(key, alerts)
 		}
+		sendDuration.Observe(sch.clock.Now().Sub(start).Seconds())
 	}
 
 	retryIfError := func(f func(attempt int64) error) error {
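All the new duration observations take their timestamps from the injected `clock` rather than `time.Now`, which is what lets the tests below pin durations to zero with a mock. A short sketch of that pattern using github.com/benbjohnson/clock:

```go
// Sketch: durations measured on an injected clock are deterministic in
// tests; a mock clock reads 0 elapsed time unless the test advances it.
package main

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
)

func main() {
	clk := clock.NewMock()

	start := clk.Now()
	clk.Add(150 * time.Millisecond) // a real routine would do work here
	elapsed := clk.Now().Sub(start).Seconds()

	fmt.Println(elapsed) // 0.15, and 0 if the mock is never advanced
}
```

This is why the test comments in this diff note that the duration metrics show 0 values: the mocked clock does not advance during the run.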
@@ -82,6 +82,7 @@ func TestProcessTicks(t *testing.T) {
 		Clock:                   mockedClock,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  testTracer,
 	}
 	st := state.NewManager(managerCfg)
 
@@ -453,22 +454,21 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 		t.Run("it reports metrics", func(t *testing.T) {
 			// duration metric has 0 values because of mocked clock that do not advance
 			expectedMetric := fmt.Sprintf(
-				`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
+				`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
 # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
@@ -478,9 +478,45 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
 grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
+# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
+# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
+# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
 			require.NoError(t, err)
 		})
 	})
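The assertions above rely on `testutil.GatherAndCompare`, which gathers from a registry, renders the text exposition format, and diffs it against the expected string for only the named metric families. A self-contained sketch in the same style (the metric setup here is hypothetical):

```go
// Sketch of the GatherAndCompare test style used throughout this diff.
package sketch

import (
	"bytes"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

func TestCounterExposition(t *testing.T) {
	reg := prometheus.NewPedanticRegistry()
	c := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"org"})
	c.WithLabelValues("1").Inc()

	expected := `# HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
# TYPE grafana_alerting_rule_evaluations_total counter
grafana_alerting_rule_evaluations_total{org="1"} 1
`
	// Only the listed metric families are compared; others are ignored.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected),
		"grafana_alerting_rule_evaluations_total")
	require.NoError(t, err)
}
```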
@@ -641,34 +677,69 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 		t.Run("it should increase failure counter", func(t *testing.T) {
 			// duration metric has 0 values because of mocked clock that do not advance
 			expectedMetric := fmt.Sprintf(
-				`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
+				`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
 # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
 # TYPE grafana_alerting_rule_evaluation_failures_total counter
 grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
 grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
+# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
+# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
+# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
 			require.NoError(t, err)
 		})
 
@@ -826,6 +897,7 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor
 		Clock:                   mockedClock,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  testTracer,
 	}
 	st := state.NewManager(managerCfg)
 
@@ -8,8 +8,10 @@ import (
 	"github.com/benbjohnson/clock"
 	"github.com/grafana/dskit/concurrency"
 	"github.com/grafana/grafana-plugin-sdk-go/data"
+	"go.opentelemetry.io/otel/attribute"
 
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -30,6 +32,7 @@ type AlertInstanceManager interface {
 type Manager struct {
 	log     log.Logger
 	metrics *metrics.State
+	tracer  tracing.Tracer
 
 	clock clock.Clock
 	cache *cache
@@ -60,6 +63,8 @@ type ManagerCfg struct {
 	// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
 	// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
 	ApplyNoDataAndErrorToAllStates bool
+
+	Tracer tracing.Tracer
 }
 
 func NewManager(cfg ManagerCfg) *Manager {
@@ -76,6 +81,7 @@ func NewManager(cfg ManagerCfg) *Manager {
 		doNotSaveNormalState:           cfg.DoNotSaveNormalState,
 		maxStateSaveConcurrency:        cfg.MaxStateSaveConcurrency,
 		applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
+		tracer:                         cfg.Tracer,
 	}
 }
 
@@ -251,18 +257,46 @@ func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.Alert
 // ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
 // if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
 func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []StateTransition {
-	logger := st.log.FromContext(ctx)
+	tracingCtx, span := st.tracer.Start(ctx, "alert rule state calculation")
+	defer span.End()
+	span.SetAttributes("rule_uid", alertRule.UID, attribute.String("rule_uid", alertRule.UID))
+	span.SetAttributes("org_id", alertRule.OrgID, attribute.Int64("org_id", alertRule.OrgID))
+	span.SetAttributes("rule_version", alertRule.Version, attribute.Int64("rule_version", alertRule.Version))
+	utcTick := evaluatedAt.UTC().Format(time.RFC3339Nano)
+	span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))
+	span.SetAttributes("results", len(results), attribute.Int("tick", len(results)))
+
+	logger := st.log.FromContext(tracingCtx)
 	logger.Debug("State manager processing evaluation results", "resultCount", len(results))
-	states := st.setNextStateForRule(ctx, alertRule, results, extraLabels, logger)
+	states := st.setNextStateForRule(tracingCtx, alertRule, results, extraLabels, logger)
+
+	span.AddEvents([]string{"message", "state_transitions"},
+		[]tracing.EventValue{
+			{Str: "results processed"},
+			{Num: int64(len(states))},
+		})
+
 	staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule)
-	st.deleteAlertStates(ctx, logger, staleStates)
-
-	st.saveAlertStates(ctx, logger, states...)
+	st.deleteAlertStates(tracingCtx, logger, staleStates)
+	if len(staleStates) > 0 {
+		span.AddEvents([]string{"message", "state_transitions"},
+			[]tracing.EventValue{
+				{Str: "deleted stale states"},
+				{Num: int64(len(staleStates))},
+			})
+	}
+
+	st.saveAlertStates(tracingCtx, logger, states...)
+
+	span.AddEvents([]string{"message"},
+		[]tracing.EventValue{
+			{Str: "updated database"},
+		})
 
 	allChanges := append(states, staleStates...)
 	if st.historian != nil {
-		st.historian.Record(ctx, history_model.NewRuleMeta(alertRule, logger), allChanges)
+		st.historian.Record(tracingCtx, history_model.NewRuleMeta(alertRule, logger), allChanges)
 	}
 	return allChanges
 }
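`tracing.Tracer`, `span.SetAttributes`, and `span.AddEvents` here are Grafana's internal wrappers. Expressed against the standard OpenTelemetry API, the instrumentation added to `ProcessEvalResults` looks roughly like this sketch (the function and placeholder values are assumptions, not Grafana code):

```go
// Sketch in plain OpenTelemetry: one span per call, rule metadata as
// attributes, and progress reported as span events. Without a configured
// tracer provider, otel.Tracer returns a no-op tracer, so this runs as-is.
package main

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

func processEvalResults(ctx context.Context, ruleUID string, orgID int64, results int) {
	tracer := otel.Tracer("ngalert/state")
	ctx, span := tracer.Start(ctx, "alert rule state calculation")
	defer span.End()

	span.SetAttributes(
		attribute.String("rule_uid", ruleUID),
		attribute.Int64("org_id", orgID),
		attribute.Int("results", results),
	)

	// ... compute state transitions from the evaluation results ...
	transitions := results // placeholder count

	span.AddEvent("results processed",
		trace.WithAttributes(attribute.Int("state_transitions", transitions)))
}

func main() {
	processEvalResults(context.Background(), "abc123", 1, 5)
}
```

Note that the child context returned by `Start` is passed onward (as `tracingCtx` in the hunk above), which is what links downstream spans such as state saves and history records to this one.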
@@ -303,6 +337,7 @@ func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.A
 
 // Set the current state based on evaluation results
 func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, currentState *State, result eval.Result, logger log.Logger) StateTransition {
+	start := st.clock.Now()
 	currentState.LastEvaluationTime = result.EvaluatedAt
 	currentState.EvaluationDuration = result.EvaluationDuration
 	currentState.Results = append(currentState.Results, Evaluation{
@@ -390,6 +425,10 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
 		PreviousStateReason: oldReason,
 	}
 
+	if st.metrics != nil {
+		st.metrics.StateUpdateDuration.Observe(st.clock.Now().Sub(start).Seconds())
+	}
+
 	return nextState
 }
 
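Per the commit message note that "metrics can be null", the manager only observes the state-calculation histogram when a collector was actually injected. A minimal sketch of that optional-metrics pattern (types here are stand-ins):

```go
// Sketch: guard metric observations behind a nil check so callers that
// do not wire metrics (e.g. backtesting, rule testing) do not panic.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

type stateMetrics struct {
	StateUpdateDuration prometheus.Histogram
}

type manager struct {
	metrics *stateMetrics // nil when the caller did not wire metrics
}

func (m *manager) setNextState(start time.Time) {
	// ... state transition logic ...

	if m.metrics != nil {
		m.metrics.StateUpdateDuration.Observe(time.Since(start).Seconds())
	}
}

func main() {
	m := &manager{} // no metrics wired; setNextState must not panic
	m.setNextState(time.Now())
}
```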
@@ -6,14 +6,16 @@ import (
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/annotations"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 	"github.com/grafana/grafana/pkg/services/ngalert/state"
 	"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/stretchr/testify/mock"
 )
 
 func BenchmarkProcessEvalResults(b *testing.B) {
@@ -25,6 +27,7 @@ func BenchmarkProcessEvalResults(b *testing.B) {
 	cfg := state.ManagerCfg{
 		Historian:               hist,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	sut := state.NewManager(cfg)
 	now := time.Now().UTC()
@@ -19,6 +19,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/expr"
 	"github.com/grafana/grafana/pkg/infra/log/logtest"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -311,6 +312,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
 
 	cfg := ManagerCfg{
 		Metrics:       testMetrics,
+		Tracer:        tracing.InitializeTracerForTest(),
 		ExternalURL:   nil,
 		InstanceStore: &FakeInstanceStore{},
 		Images:        &NotAvailableImageService{},
@@ -1,6 +1,7 @@
 package state_test
 
 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -15,11 +16,13 @@ import (
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/exp/slices"
 
 	"github.com/grafana/grafana/pkg/expr"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/annotations"
 	"github.com/grafana/grafana/pkg/services/annotations/annotationstest"
 	"github.com/grafana/grafana/pkg/services/dashboards"
@@ -198,6 +201,7 @@ func TestWarmStateCache(t *testing.T) {
 		Clock:                   clock.NewMock(),
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -234,6 +238,7 @@ func TestDashboardAnnotations(t *testing.T) {
 		Clock:                   clock.New(),
 		Historian:               hist,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 
@@ -1198,18 +1203,21 @@ func TestProcessEvalResults(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.desc, func(t *testing.T) {
 			fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
-			m := metrics.NewHistorianMetrics(prometheus.NewRegistry())
-			store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, m)
-			hist := historian.NewAnnotationBackend(store, nil, m)
+			reg := prometheus.NewPedanticRegistry()
+			stateMetrics := metrics.NewStateMetrics(reg)
+			metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
+			store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, metrics)
+			hist := historian.NewAnnotationBackend(store, nil, metrics)
 			clk := clock.NewMock()
 			cfg := state.ManagerCfg{
-				Metrics:                 testMetrics.GetStateMetrics(),
+				Metrics:                 stateMetrics,
 				ExternalURL:             nil,
 				InstanceStore:           &state.FakeInstanceStore{},
 				Images:                  &state.NotAvailableImageService{},
 				Clock:                   clk,
 				Historian:               hist,
 				MaxStateSaveConcurrency: 1,
+				Tracer:                  tracing.InitializeTracerForTest(),
 			}
 			st := state.NewManager(cfg)
 
@@ -1220,7 +1228,7 @@ func TestProcessEvalResults(t *testing.T) {
 			slices.SortFunc(evals, func(a, b time.Time) bool {
 				return a.Before(b)
 			})
+			results := 0
 			for _, evalTime := range evals {
 				res := tc.evalResults[evalTime]
 				for i := 0; i < len(res); i++ {
@@ -1228,6 +1236,7 @@ func TestProcessEvalResults(t *testing.T) {
 				}
 				clk.Set(evalTime)
 				_ = st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels)
+				results += len(res)
 			}
 
 			states := st.GetStatesForRuleUID(tc.alertRule.OrgID, tc.alertRule.UID)
@@ -1278,6 +1287,22 @@ func TestProcessEvalResults(t *testing.T) {
 			require.Eventuallyf(t, func() bool {
 				return tc.expectedAnnotations == fakeAnnoRepo.Len()
 			}, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items()))
+
+			expectedMetric := fmt.Sprintf(
+				`# HELP grafana_alerting_state_calculation_duration_seconds The duration of calculation of a single state.
+# TYPE grafana_alerting_state_calculation_duration_seconds histogram
+grafana_alerting_state_calculation_duration_seconds_bucket{le="0.01"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="0.1"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="1"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="2"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="5"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="10"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="+Inf"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_sum 0
+grafana_alerting_state_calculation_duration_seconds_count %[1]d
+`, results)
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_state_calculation_duration_seconds", "grafana_alerting_state_calculation_total")
+			require.NoError(t, err)
 		})
 	}
 
@@ -1292,6 +1317,7 @@ func TestProcessEvalResults(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	rule := models.AlertRuleGen()()
@@ -1442,6 +1468,7 @@ func TestStaleResultsHandler(t *testing.T) {
 		Clock:                   clock.New(),
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -1523,6 +1550,7 @@ func TestStaleResults(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 
@@ -1695,6 +1723,7 @@ func TestDeleteStateByRuleUID(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -1835,6 +1864,7 @@ func TestResetStateByRuleUID(t *testing.T) {
 		Clock:                   clk,
 		Historian:               fakeHistorian,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)