Alerting: Add new metrics and tracings to state manager and scheduler (#71398)

* add metrics and tracing to state manager
* propagate tracer to state manager
* add scheduler metrics
* fix backtesting
* add test for state metrics
* remove StateUpdateCount
* update docs
* metrics can be null
* add tracer to new tests
@@ -24,12 +24,15 @@ Alerting rules can only query backend data sources with alerting enabled:
 
 The alerting engine publishes some internal metrics about itself. You can read more about how Grafana publishes [internal metrics]({{< relref "../../setup-grafana/set-up-grafana-monitoring/" >}}). See also, [View alert rules and their current state]({{< relref "../../alerting/fundamentals/state-and-health/" >}}).
 
 | Metric Name | Type | Description |
-| ------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
+| --------------------------------------------------- | --------- | ---------------------------------------------------------------------------------------- |
 | `alerting.alerts` | gauge | How many alerts by state |
 | `alerting.request_duration_seconds` | histogram | Histogram of requests to the Alerting API |
 | `alerting.active_configurations` | gauge | The number of active, non default alertmanager configurations for grafana managed alerts |
 | `alerting.rule_evaluations_total` | counter | The total number of rule evaluations |
 | `alerting.rule_evaluation_failures_total` | counter | The total number of rule evaluation failures |
-| `alerting.rule_evaluation_duration_seconds` | summary | The duration for a rule to execute |
-| `alerting.rule_group_rules` | gauge | The number of rules |
+| `alerting.rule_evaluation_duration_seconds` | histogram | The time to evaluate a rule |
+| `alerting.rule_process_evaluation_duration_seconds` | histogram | The time to process the evaluation results for a rule |
+| `alerting.rule_send_alerts_duration_seconds` | histogram | The time to send the alerts to Alertmanager |
+| `alerting.rule_group_rules` | gauge | The number of rules |
+| `alerting.state_calculation_duration_seconds` | histogram | The duration of calculation of a single state |
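The dotted names above are the documentation's logical names; on a Prometheus scrape they are exposed with the `grafana` namespace and `alerting` subsystem prefixes, which is how the collectors later in this diff are registered. A minimal, illustrative sketch (standalone, not Grafana code) of that naming convention:

```go
// Sketch: collectors registered with Namespace "grafana" and Subsystem
// "alerting" expose `alerting.rule_evaluations_total` as the series
// `grafana_alerting_rule_evaluations_total`.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	evals := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"org"})
	evals.WithLabelValues("1").Inc()

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName()) // grafana_alerting_rule_evaluations_total
	}
}
```

Running it prints the fully qualified series name, matching the names asserted in the tests further down in this diff.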
@@ -7,6 +7,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/api/routing"
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	"github.com/grafana/grafana/pkg/services/datasourceproxy"
 	"github.com/grafana/grafana/pkg/services/datasources"
@@ -84,8 +85,8 @@ type API struct {
 	EvaluatorFactory eval.EvaluatorFactory
 	FeatureManager   featuremgmt.FeatureToggles
 	Historian        Historian
+	Tracer           tracing.Tracer
 	AppUrl           *url.URL
 
 	// Hooks can be used to replace API handlers for specific paths.
 	Hooks *Hooks
@@ -134,9 +135,10 @@ func (api *API) RegisterAPIEndpoints(m *metrics.API) {
 		accessControl:  api.AccessControl,
 		evaluator:      api.EvaluatorFactory,
 		cfg:            &api.Cfg.UnifiedAlerting,
-		backtesting:    backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory),
+		backtesting:    backtesting.NewEngine(api.AppUrl, api.EvaluatorFactory, api.Tracer),
 		featureManager: api.FeatureManager,
 		appUrl:         api.AppUrl,
+		tracer:         api.Tracer,
 	}), m)
 	api.RegisterConfigurationApiEndpoints(NewConfiguration(
 		&ConfigSrv{
@@ -15,6 +15,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/api/response"
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
 	"github.com/grafana/grafana/pkg/services/datasources"
@@ -39,6 +40,7 @@ type TestingApiSrv struct {
 	backtesting    *backtesting.Engine
 	featureManager featuremgmt.FeatureToggles
 	appUrl         *url.URL
+	tracer         tracing.Tracer
 }
 
 // RouteTestGrafanaRuleConfig returns a list of potential alerts for a given rule configuration. This is intended to be
@@ -86,6 +88,7 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext,
 		Clock:                   clock.New(),
 		Historian:               nil,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  srv.tracer,
 	}
 	manager := state.NewManager(cfg)
 	includeFolder := !srv.cfg.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel)
@@ -10,6 +10,7 @@ import (
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/accesscontrol"
 	acMock "github.com/grafana/grafana/pkg/services/accesscontrol/mock"
 	contextmodel "github.com/grafana/grafana/pkg/services/contexthandler/model"
@@ -216,6 +217,7 @@ func TestRouteEvalQueries(t *testing.T) {
 
 		srv := &TestingApiSrv{
 			accessControl: ac,
+			tracer:        tracing.InitializeTracerForTest(),
 		}
 
 		response := srv.RouteEvalQueries(rc, definitions.EvalQueriesPayload{
@@ -277,5 +279,6 @@ func createTestingApiSrv(t *testing.T, ds *fakes.FakeCacheService, ac *acMock.Mo
 		accessControl: ac,
 		evaluator:     evaluator,
 		cfg:           config(t),
+		tracer:        tracing.InitializeTracerForTest(),
 	}
 }
@@ -12,6 +12,7 @@ import (
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 	"github.com/grafana/grafana/pkg/services/ngalert/state"
@@ -40,7 +41,7 @@ type Engine struct {
 	createStateManager func() stateManager
 }
 
-func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
+func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory, tracer tracing.Tracer) *Engine {
 	return &Engine{
 		evalFactory: evalFactory,
 		createStateManager: func() stateManager {
@@ -52,6 +53,7 @@ func NewEngine(appUrl *url.URL, evalFactory eval.EvaluatorFactory) *Engine {
 				Clock:                   clock.New(),
 				Historian:               nil,
 				MaxStateSaveConcurrency: 1,
+				Tracer:                  tracer,
 			}
 			return state.NewManager(cfg)
 		},
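The `NewEngine` change above threads the tracer through a closure, so state managers created later during a backtest carry the same tracer as the rest of the system. The sketch below uses hypothetical stand-in types (`Tracer`, `Manager`) to show only the capture pattern, not the real `backtesting` package:

```go
// Sketch of the injection pattern: the constructor argument is captured by
// the factory closure, so every manager the engine builds later is wired
// with the same tracer instance. All types here are stand-ins.
package sketch

type Tracer interface{} // stand-in for tracing.Tracer

type ManagerCfg struct {
	Tracer Tracer
	// ... other fields elided
}

type Manager struct{ cfg ManagerCfg }

func NewManager(cfg ManagerCfg) *Manager { return &Manager{cfg: cfg} }

type Engine struct {
	createStateManager func() *Manager
}

func NewEngine(tracer Tracer) *Engine {
	return &Engine{
		// The closure captures the tracer; no global state is needed.
		createStateManager: func() *Manager {
			return NewManager(ManagerCfg{Tracer: tracer})
		},
	}
}
```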
@@ -18,6 +18,8 @@ type Scheduler struct {
 	EvalTotal                *prometheus.CounterVec
 	EvalFailures             *prometheus.CounterVec
 	EvalDuration             *prometheus.HistogramVec
+	ProcessDuration          *prometheus.HistogramVec
+	SendDuration             *prometheus.HistogramVec
 	GroupRules               *prometheus.GaugeVec
 	SchedulePeriodicDuration prometheus.Histogram
 	SchedulableAlertRules    prometheus.Gauge
@@ -63,8 +65,28 @@ func NewSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 			Namespace: Namespace,
 			Subsystem: Subsystem,
 			Name:      "rule_evaluation_duration_seconds",
-			Help:      "The duration for a rule to execute.",
-			Buckets:   []float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10, 25, 50, 100},
+			Help:      "The time to evaluate a rule.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
+		},
+		[]string{"org"},
+	),
+	ProcessDuration: promauto.With(r).NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "rule_process_evaluation_duration_seconds",
+			Help:      "The time to process the evaluation results for a rule.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
+		},
+		[]string{"org"},
+	),
+	SendDuration: promauto.With(r).NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "rule_send_alerts_duration_seconds",
+			Help:      "The time to send the alerts to Alertmanager.",
+			Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
 		},
 		[]string{"org"},
 	),
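Both new scheduler histograms follow the same shape: a `HistogramVec` partitioned by an `org` label, with the per-org child resolved once via `WithLabelValues` and then observed on the hot path. An illustrative sketch under those assumptions (the registry and label value here are placeholders):

```go
// Sketch of the per-org histogram pattern used by ProcessDuration and
// SendDuration: resolve the labelled child once, observe repeatedly.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	processDuration := promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_process_evaluation_duration_seconds",
		Help:      "The time to process the evaluation results for a rule.",
		Buckets:   []float64{.01, .1, .5, 1, 5, 10, 15, 30, 60, 120, 180, 240, 300},
	}, []string{"org"})

	// Resolve the child once per rule routine instead of on every tick.
	perOrg := processDuration.WithLabelValues("1")

	start := time.Now()
	// ... process evaluation results here ...
	perOrg.Observe(time.Since(start).Seconds())
}
```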
@@ -6,7 +6,8 @@ import (
 )
 
 type State struct {
 	AlertState          *prometheus.GaugeVec
+	StateUpdateDuration prometheus.Histogram
 }
 
 func NewStateMetrics(r prometheus.Registerer) *State {
@@ -17,5 +18,14 @@ func NewStateMetrics(r prometheus.Registerer) *State {
 			Name: "alerts",
 			Help: "How many alerts by state.",
 		}, []string{"state"}),
+		StateUpdateDuration: promauto.With(r).NewHistogram(
+			prometheus.HistogramOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "state_calculation_duration_seconds",
+				Help:      "The duration of calculation of a single state.",
+				Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10},
+			},
+		),
 	}
 }
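`StateUpdateDuration` is a plain, unlabelled histogram with the explicit buckets shown above. Prometheus `le` buckets are cumulative, so one observation increments every bucket at or above its value plus `+Inf`, which is why the test expectations later in this diff show the same count on every bucket line. A small standalone sketch of that behavior:

```go
// Sketch: one Observe(0.05) lands in le="0.1" and every larger bucket.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	reg := prometheus.NewRegistry()
	h := promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "state_calculation_duration_seconds",
		Help:      "The duration of calculation of a single state.",
		Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10},
	})
	h.Observe(0.05)

	mfs, _ := reg.Gather()
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			for _, b := range m.GetHistogram().GetBucket() {
				// Cumulative counts: 0 for le=0.01, then 1 for every bucket above.
				fmt.Printf("le=%g count=%d\n", b.GetUpperBound(), b.GetCumulativeCount())
			}
		}
	}
}
```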
@@ -221,6 +221,7 @@ func (ng *AlertNG) init() error {
 		DoNotSaveNormalState:           ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState),
 		MaxStateSaveConcurrency:        ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
 		ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoDataErrorExecution),
+		Tracer:                         ng.tracer,
 	}
 	stateManager := state.NewManager(cfg)
 	scheduler := schedule.NewScheduler(schedCfg, stateManager)
@@ -268,6 +269,7 @@ func (ng *AlertNG) init() error {
 		AppUrl:    appUrl,
 		Historian: history,
 		Hooks:     api.NewHooks(ng.Log),
+		Tracer:    ng.tracer,
 	}
 	ng.api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())
 
@@ -351,6 +351,8 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 	evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
 	evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
 	evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
+	processDuration := sch.metrics.ProcessDuration.WithLabelValues(orgID)
+	sendDuration := sch.metrics.SendDuration.WithLabelValues(orgID)
 
 	notify := func(states []state.StateTransition) {
 		expiredAlerts := state.FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
@@ -423,6 +425,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			logger.Debug("Skip updating the state because the context has been cancelled")
 			return
 		}
+		start = sch.clock.Now()
 		processedStates := sch.stateManager.ProcessEvalResults(
 			ctx,
 			e.scheduledAt,
@@ -430,6 +433,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			results,
 			state.GetRuleExtraLabels(e.rule, e.folderTitle, !sch.disableGrafanaFolder),
 		)
+		processDuration.Observe(sch.clock.Now().Sub(start).Seconds())
+
+		start = sch.clock.Now()
 		alerts := state.FromStateTransitionToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
 		span.AddEvents(
 			[]string{"message", "state_transitions", "alerts_to_send"},
@@ -441,6 +447,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		if len(alerts.PostableAlerts) > 0 {
 			sch.alertsSender.Send(key, alerts)
 		}
+		sendDuration.Observe(sch.clock.Now().Sub(start).Seconds())
 	}
 
 	retryIfError := func(f func(attempt int64) error) error {
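All the new duration observations take their timestamps from the injected `clock` rather than `time.Now`, which is what lets the tests below pin durations to zero with a mock. A short sketch of that pattern using github.com/benbjohnson/clock:

```go
// Sketch: durations measured on an injected clock are deterministic in
// tests; a mock clock reads 0 elapsed time unless the test advances it.
package main

import (
	"fmt"
	"time"

	"github.com/benbjohnson/clock"
)

func main() {
	clk := clock.NewMock()

	start := clk.Now()
	clk.Add(150 * time.Millisecond) // a real routine would do work here
	elapsed := clk.Now().Sub(start).Seconds()

	fmt.Println(elapsed) // 0.15, and 0 if the mock is never advanced
}
```

This is why the test comments in this diff note that the duration metrics show 0 values: the mocked clock does not advance during the run.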
@@ -82,6 +82,7 @@ func TestProcessTicks(t *testing.T) {
 		Clock:                   mockedClock,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  testTracer,
 	}
 	st := state.NewManager(managerCfg)
 
@@ -453,22 +454,21 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 		t.Run("it reports metrics", func(t *testing.T) {
 			// duration metric has 0 values because of mocked clock that do not advance
 			expectedMetric := fmt.Sprintf(
-				`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
+				`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
 # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
@@ -478,9 +478,45 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
 grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
+# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
+# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
+# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
 			require.NoError(t, err)
 		})
 	})
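The assertions above rely on `testutil.GatherAndCompare`, which gathers from a registry, renders the text exposition format, and diffs it against the expected string for only the named metric families. A self-contained sketch in the same style (the metric setup here is hypothetical):

```go
// Sketch of the GatherAndCompare test style used throughout this diff.
package sketch

import (
	"bytes"
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/prometheus/client_golang/prometheus/testutil"
	"github.com/stretchr/testify/require"
)

func TestCounterExposition(t *testing.T) {
	reg := prometheus.NewPedanticRegistry()
	c := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "rule_evaluations_total",
		Help:      "The total number of rule evaluations.",
	}, []string{"org"})
	c.WithLabelValues("1").Inc()

	expected := `# HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
# TYPE grafana_alerting_rule_evaluations_total counter
grafana_alerting_rule_evaluations_total{org="1"} 1
`
	// Only the listed metric families are compared; others are ignored.
	err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expected),
		"grafana_alerting_rule_evaluations_total")
	require.NoError(t, err)
}
```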
@@ -641,34 +677,69 @@ func TestSchedule_ruleRoutine(t *testing.T) {
 		t.Run("it should increase failure counter", func(t *testing.T) {
 			// duration metric has 0 values because of mocked clock that do not advance
 			expectedMetric := fmt.Sprintf(
-				`# HELP grafana_alerting_rule_evaluation_duration_seconds The duration for a rule to execute.
+				`# HELP grafana_alerting_rule_evaluation_duration_seconds The time to evaluate a rule.
 # TYPE grafana_alerting_rule_evaluation_duration_seconds histogram
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.005"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.025"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.05"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="2.5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="25"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="50"} 1
-grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="100"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
 grafana_alerting_rule_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
 grafana_alerting_rule_evaluation_duration_seconds_sum{org="%[1]d"} 0
 grafana_alerting_rule_evaluation_duration_seconds_count{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluation_failures_total The total number of rule evaluation failures.
 # TYPE grafana_alerting_rule_evaluation_failures_total counter
 grafana_alerting_rule_evaluation_failures_total{org="%[1]d"} 1
 # HELP grafana_alerting_rule_evaluations_total The total number of rule evaluations.
 # TYPE grafana_alerting_rule_evaluations_total counter
 grafana_alerting_rule_evaluations_total{org="%[1]d"} 1
+# HELP grafana_alerting_rule_process_evaluation_duration_seconds The time to process the evaluation results for a rule.
+# TYPE grafana_alerting_rule_process_evaluation_duration_seconds histogram
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_process_evaluation_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_process_evaluation_duration_seconds_count{org="%[1]d"} 1
+# HELP grafana_alerting_rule_send_alerts_duration_seconds The time to send the alerts to Alertmanager.
+# TYPE grafana_alerting_rule_send_alerts_duration_seconds histogram
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.01"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="0.5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="1"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="5"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="10"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="15"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="30"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="60"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="120"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="180"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="240"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="300"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_bucket{org="%[1]d",le="+Inf"} 1
+grafana_alerting_rule_send_alerts_duration_seconds_sum{org="%[1]d"} 0
+grafana_alerting_rule_send_alerts_duration_seconds_count{org="%[1]d"} 1
 `, rule.OrgID)
 
-			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total")
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_rule_evaluation_duration_seconds", "grafana_alerting_rule_evaluations_total", "grafana_alerting_rule_evaluation_failures_total", "grafana_alerting_rule_process_evaluation_duration_seconds", "grafana_alerting_rule_send_alerts_duration_seconds")
 			require.NoError(t, err)
 		})
 
@@ -826,6 +897,7 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor
 		Clock:                   mockedClock,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  testTracer,
 	}
 	st := state.NewManager(managerCfg)
 
@@ -8,8 +8,10 @@ import (
 	"github.com/benbjohnson/clock"
 	"github.com/grafana/dskit/concurrency"
 	"github.com/grafana/grafana-plugin-sdk-go/data"
+	"go.opentelemetry.io/otel/attribute"
 
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -30,6 +32,7 @@ type AlertInstanceManager interface {
 type Manager struct {
 	log     log.Logger
 	metrics *metrics.State
+	tracer  tracing.Tracer
 
 	clock clock.Clock
 	cache *cache
@@ -60,6 +63,8 @@ type ManagerCfg struct {
 	// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
 	// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
 	ApplyNoDataAndErrorToAllStates bool
+
+	Tracer tracing.Tracer
 }
 
 func NewManager(cfg ManagerCfg) *Manager {
@@ -76,6 +81,7 @@ func NewManager(cfg ManagerCfg) *Manager {
 		doNotSaveNormalState:           cfg.DoNotSaveNormalState,
 		maxStateSaveConcurrency:        cfg.MaxStateSaveConcurrency,
 		applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
+		tracer:                         cfg.Tracer,
 	}
 }
 
@@ -251,18 +257,46 @@ func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.Alert
 // ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
 // if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
 func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []StateTransition {
-	logger := st.log.FromContext(ctx)
+	tracingCtx, span := st.tracer.Start(ctx, "alert rule state calculation")
+	defer span.End()
+	span.SetAttributes("rule_uid", alertRule.UID, attribute.String("rule_uid", alertRule.UID))
+	span.SetAttributes("org_id", alertRule.OrgID, attribute.Int64("org_id", alertRule.OrgID))
+	span.SetAttributes("rule_version", alertRule.Version, attribute.Int64("rule_version", alertRule.Version))
+	utcTick := evaluatedAt.UTC().Format(time.RFC3339Nano)
+	span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))
+	span.SetAttributes("results", len(results), attribute.Int("tick", len(results)))
+
+	logger := st.log.FromContext(tracingCtx)
 	logger.Debug("State manager processing evaluation results", "resultCount", len(results))
-	states := st.setNextStateForRule(ctx, alertRule, results, extraLabels, logger)
+	states := st.setNextStateForRule(tracingCtx, alertRule, results, extraLabels, logger)
+
+	span.AddEvents([]string{"message", "state_transitions"},
+		[]tracing.EventValue{
+			{Str: "results processed"},
+			{Num: int64(len(states))},
+		})
+
 	staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule)
-	st.deleteAlertStates(ctx, logger, staleStates)
-
-	st.saveAlertStates(ctx, logger, states...)
+	st.deleteAlertStates(tracingCtx, logger, staleStates)
+	if len(staleStates) > 0 {
+		span.AddEvents([]string{"message", "state_transitions"},
+			[]tracing.EventValue{
+				{Str: "deleted stale states"},
+				{Num: int64(len(staleStates))},
+			})
+	}
+
+	st.saveAlertStates(tracingCtx, logger, states...)
+
+	span.AddEvents([]string{"message"},
+		[]tracing.EventValue{
+			{Str: "updated database"},
+		})
 
 	allChanges := append(states, staleStates...)
 	if st.historian != nil {
-		st.historian.Record(ctx, history_model.NewRuleMeta(alertRule, logger), allChanges)
+		st.historian.Record(tracingCtx, history_model.NewRuleMeta(alertRule, logger), allChanges)
 	}
 	return allChanges
 }
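`tracing.Tracer`, `span.SetAttributes`, and `span.AddEvents` here are Grafana's internal wrappers. Expressed against the standard OpenTelemetry API, the instrumentation added to `ProcessEvalResults` looks roughly like this sketch (the function and placeholder values are assumptions, not Grafana code):

```go
// Sketch in plain OpenTelemetry: one span per call, rule metadata as
// attributes, and progress reported as span events. Without a configured
// tracer provider, otel.Tracer returns a no-op tracer, so this runs as-is.
package main

import (
	"context"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"
)

func processEvalResults(ctx context.Context, ruleUID string, orgID int64, results int) {
	tracer := otel.Tracer("ngalert/state")
	ctx, span := tracer.Start(ctx, "alert rule state calculation")
	defer span.End()

	span.SetAttributes(
		attribute.String("rule_uid", ruleUID),
		attribute.Int64("org_id", orgID),
		attribute.Int("results", results),
	)

	// ... compute state transitions from the evaluation results ...
	transitions := results // placeholder count

	span.AddEvent("results processed",
		trace.WithAttributes(attribute.Int("state_transitions", transitions)))
}

func main() {
	processEvalResults(context.Background(), "abc123", 1, 5)
}
```

Note that the child context returned by `Start` is passed onward (as `tracingCtx` in the hunk above), which is what links downstream spans such as state saves and history records to this one.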
@@ -303,6 +337,7 @@ func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.A
 
 // Set the current state based on evaluation results
 func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, currentState *State, result eval.Result, logger log.Logger) StateTransition {
+	start := st.clock.Now()
 	currentState.LastEvaluationTime = result.EvaluatedAt
 	currentState.EvaluationDuration = result.EvaluationDuration
 	currentState.Results = append(currentState.Results, Evaluation{
@@ -390,6 +425,10 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
 		PreviousStateReason: oldReason,
 	}
 
+	if st.metrics != nil {
+		st.metrics.StateUpdateDuration.Observe(st.clock.Now().Sub(start).Seconds())
+	}
+
 	return nextState
 }
 
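Per the commit message note that "metrics can be null", the manager only observes the state-calculation histogram when a collector was actually injected. A minimal sketch of that optional-metrics pattern (types here are stand-ins):

```go
// Sketch: guard metric observations behind a nil check so callers that
// do not wire metrics (e.g. backtesting, rule testing) do not panic.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

type stateMetrics struct {
	StateUpdateDuration prometheus.Histogram
}

type manager struct {
	metrics *stateMetrics // nil when the caller did not wire metrics
}

func (m *manager) setNextState(start time.Time) {
	// ... state transition logic ...

	if m.metrics != nil {
		m.metrics.StateUpdateDuration.Observe(time.Since(start).Seconds())
	}
}

func main() {
	m := &manager{} // no metrics wired; setNextState must not panic
	m.setNextState(time.Now())
}
```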
@@ -6,14 +6,16 @@ import (
 	"testing"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/annotations"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 	"github.com/grafana/grafana/pkg/services/ngalert/state"
 	"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/stretchr/testify/mock"
 )
 
 func BenchmarkProcessEvalResults(b *testing.B) {
@@ -25,6 +27,7 @@ func BenchmarkProcessEvalResults(b *testing.B) {
 	cfg := state.ManagerCfg{
 		Historian:               hist,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	sut := state.NewManager(cfg)
 	now := time.Now().UTC()
@@ -19,6 +19,7 @@ import (
 
 	"github.com/grafana/grafana/pkg/expr"
 	"github.com/grafana/grafana/pkg/infra/log/logtest"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -311,6 +312,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
 
 	cfg := ManagerCfg{
 		Metrics:       testMetrics,
+		Tracer:        tracing.InitializeTracerForTest(),
 		ExternalURL:   nil,
 		InstanceStore: &FakeInstanceStore{},
 		Images:        &NotAvailableImageService{},
@@ -1,6 +1,7 @@
 package state_test
 
 import (
+	"bytes"
 	"context"
 	"errors"
 	"fmt"
@@ -15,11 +16,13 @@ import (
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/grafana/grafana-plugin-sdk-go/data"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/exp/slices"
 
 	"github.com/grafana/grafana/pkg/expr"
+	"github.com/grafana/grafana/pkg/infra/tracing"
 	"github.com/grafana/grafana/pkg/services/annotations"
 	"github.com/grafana/grafana/pkg/services/annotations/annotationstest"
 	"github.com/grafana/grafana/pkg/services/dashboards"
@@ -198,6 +201,7 @@ func TestWarmStateCache(t *testing.T) {
 		Clock:                   clock.NewMock(),
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -234,6 +238,7 @@ func TestDashboardAnnotations(t *testing.T) {
 		Clock:                   clock.New(),
 		Historian:               hist,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 
@@ -1198,18 +1203,21 @@ func TestProcessEvalResults(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.desc, func(t *testing.T) {
 			fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo()
-			m := metrics.NewHistorianMetrics(prometheus.NewRegistry())
-			store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, m)
-			hist := historian.NewAnnotationBackend(store, nil, m)
+			reg := prometheus.NewPedanticRegistry()
+			stateMetrics := metrics.NewStateMetrics(reg)
+			metrics := metrics.NewHistorianMetrics(prometheus.NewRegistry())
+			store := historian.NewAnnotationStore(fakeAnnoRepo, &dashboards.FakeDashboardService{}, metrics)
+			hist := historian.NewAnnotationBackend(store, nil, metrics)
 			clk := clock.NewMock()
 			cfg := state.ManagerCfg{
-				Metrics:                 testMetrics.GetStateMetrics(),
+				Metrics:                 stateMetrics,
 				ExternalURL:             nil,
 				InstanceStore:           &state.FakeInstanceStore{},
 				Images:                  &state.NotAvailableImageService{},
 				Clock:                   clk,
 				Historian:               hist,
 				MaxStateSaveConcurrency: 1,
+				Tracer:                  tracing.InitializeTracerForTest(),
 			}
 			st := state.NewManager(cfg)
 
@@ -1220,7 +1228,7 @@ func TestProcessEvalResults(t *testing.T) {
 			slices.SortFunc(evals, func(a, b time.Time) bool {
 				return a.Before(b)
 			})
+			results := 0
 			for _, evalTime := range evals {
 				res := tc.evalResults[evalTime]
 				for i := 0; i < len(res); i++ {
@@ -1228,6 +1236,7 @@ func TestProcessEvalResults(t *testing.T) {
 				}
 				clk.Set(evalTime)
 				_ = st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels)
+				results += len(res)
 			}
 
 			states := st.GetStatesForRuleUID(tc.alertRule.OrgID, tc.alertRule.UID)
@@ -1278,6 +1287,22 @@ func TestProcessEvalResults(t *testing.T) {
 			require.Eventuallyf(t, func() bool {
 				return tc.expectedAnnotations == fakeAnnoRepo.Len()
 			}, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items()))
+
+			expectedMetric := fmt.Sprintf(
+				`# HELP grafana_alerting_state_calculation_duration_seconds The duration of calculation of a single state.
+# TYPE grafana_alerting_state_calculation_duration_seconds histogram
+grafana_alerting_state_calculation_duration_seconds_bucket{le="0.01"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="0.1"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="1"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="2"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="5"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="10"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_bucket{le="+Inf"} %[1]d
+grafana_alerting_state_calculation_duration_seconds_sum 0
+grafana_alerting_state_calculation_duration_seconds_count %[1]d
+`, results)
+			err := testutil.GatherAndCompare(reg, bytes.NewBufferString(expectedMetric), "grafana_alerting_state_calculation_duration_seconds", "grafana_alerting_state_calculation_total")
+			require.NoError(t, err)
 		})
 	}
 
@@ -1292,6 +1317,7 @@ func TestProcessEvalResults(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	rule := models.AlertRuleGen()()
@@ -1442,6 +1468,7 @@ func TestStaleResultsHandler(t *testing.T) {
 		Clock:                   clock.New(),
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -1523,6 +1550,7 @@ func TestStaleResults(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 
@@ -1695,6 +1723,7 @@ func TestDeleteStateByRuleUID(t *testing.T) {
 		Clock:                   clk,
 		Historian:               &state.FakeHistorian{},
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)
@@ -1835,6 +1864,7 @@ func TestResetStateByRuleUID(t *testing.T) {
 		Clock:                   clk,
 		Historian:               fakeHistorian,
 		MaxStateSaveConcurrency: 1,
+		Tracer:                  tracing.InitializeTracerForTest(),
 	}
 	st := state.NewManager(cfg)
 	st.Warm(ctx, dbstore)