diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index c045f441322..42d3cda8c40 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -29,6 +29,7 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/schedule" "github.com/grafana/grafana/pkg/services/ngalert/sender" "github.com/grafana/grafana/pkg/services/ngalert/state" + "github.com/grafana/grafana/pkg/services/ngalert/state/historian" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/services/notifications" "github.com/grafana/grafana/pkg/services/quota" @@ -167,7 +168,8 @@ func (ng *AlertNG) init() error { AlertSender: alertsRouter, } - stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), appUrl, store, store, ng.dashboardService, ng.imageService, clk, ng.annotationsRepo) + historian := historian.NewAnnotationHistorian(ng.annotationsRepo, ng.dashboardService, ng.Log) + stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), appUrl, store, store, ng.imageService, clk, historian) scheduler := schedule.NewScheduler(schedCfg, appUrl, stateManager) // if it is required to include folder title to the alerts, we need to subscribe to changes of alert title diff --git a/pkg/services/ngalert/schedule/schedule_test.go b/pkg/services/ngalert/schedule/schedule_test.go index 496a2d18d29..ff36b53d869 100644 --- a/pkg/services/ngalert/schedule/schedule_test.go +++ b/pkg/services/ngalert/schedule/schedule_test.go @@ -19,8 +19,6 @@ import ( "github.com/stretchr/testify/require" "github.com/grafana/grafana/pkg/infra/log" - "github.com/grafana/grafana/pkg/services/annotations/annotationstest" - "github.com/grafana/grafana/pkg/services/dashboards" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/image" "github.com/grafana/grafana/pkg/services/ngalert/metrics" @@ -112,7 +110,7 @@ func TestWarmStateCache(t *testing.T) { RuleStore: dbstore, Metrics: testMetrics.GetSchedulerMetrics(), } - st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clock.NewMock(), annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{}) st.Warm(ctx) t.Run("instance cache has expected entries", func(t *testing.T) { @@ -165,7 +163,7 @@ func TestAlertingTicker(t *testing.T) { Metrics: testMetrics.GetSchedulerMetrics(), AlertSender: notifier, } - st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clock.NewMock(), annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{}) appUrl := &url.URL{ Scheme: "http", Host: "localhost", diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go index d41b35e9cc2..657a8e8a155 100644 --- a/pkg/services/ngalert/schedule/schedule_unit_test.go +++ b/pkg/services/ngalert/schedule/schedule_unit_test.go @@ -21,8 +21,6 @@ import ( "github.com/grafana/grafana/pkg/expr" "github.com/grafana/grafana/pkg/infra/log" - "github.com/grafana/grafana/pkg/services/annotations/annotationstest" - "github.com/grafana/grafana/pkg/services/dashboards" "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/image" @@ -530,7 +528,7 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor } stateRs := state.FakeRuleReader{} - st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), nil, &stateRs, is, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, mockedClock, annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), nil, &stateRs, is, &image.NoopImageService{}, mockedClock, &state.FakeHistorian{}) return NewScheduler(schedCfg, appUrl, st) } diff --git a/pkg/services/ngalert/state/historian/annotation.go b/pkg/services/ngalert/state/historian/annotation.go new file mode 100644 index 00000000000..739fa901ce3 --- /dev/null +++ b/pkg/services/ngalert/state/historian/annotation.go @@ -0,0 +1,88 @@ +package historian + +import ( + "context" + "fmt" + "strconv" + "strings" + "time" + + "github.com/grafana/grafana-plugin-sdk-go/data" + "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/models" + "github.com/grafana/grafana/pkg/services/annotations" + "github.com/grafana/grafana/pkg/services/dashboards" + ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" + "github.com/grafana/grafana/pkg/services/ngalert/state" +) + +// AnnotationStateHistorian is an implementation of state.Historian that uses Grafana Annotations as the backing datastore. +type AnnotationStateHistorian struct { + annotations annotations.Repository + dashboards dashboards.DashboardService + log log.Logger +} + +func NewAnnotationHistorian(annotations annotations.Repository, dashboards dashboards.DashboardService, log log.Logger) *AnnotationStateHistorian { + return &AnnotationStateHistorian{ + annotations: annotations, + dashboards: dashboards, + log: log, + } +} + +func (h *AnnotationStateHistorian) RecordState(ctx context.Context, rule *ngmodels.AlertRule, labels data.Labels, evaluatedAt time.Time, currentData, previousData state.InstanceStateAndReason) { + h.log.Debug("alert state changed creating annotation", "alertRuleUID", rule.UID, "newState", currentData.String(), "oldState", previousData.String()) + + labels = removePrivateLabels(labels) + annotationText := fmt.Sprintf("%s {%s} - %s", rule.Title, labels.String(), currentData.String()) + + item := &annotations.Item{ + AlertId: rule.ID, + OrgId: rule.OrgID, + PrevState: previousData.String(), + NewState: currentData.String(), + Text: annotationText, + Epoch: evaluatedAt.UnixNano() / int64(time.Millisecond), + } + + dashUid, ok := rule.Annotations[ngmodels.DashboardUIDAnnotation] + if ok { + panelUid := rule.Annotations[ngmodels.PanelIDAnnotation] + + panelId, err := strconv.ParseInt(panelUid, 10, 64) + if err != nil { + h.log.Error("error parsing panelUID for alert annotation", "panelUID", panelUid, "alertRuleUID", rule.UID, "err", err.Error()) + return + } + + query := &models.GetDashboardQuery{ + Uid: dashUid, + OrgId: rule.OrgID, + } + + err = h.dashboards.GetDashboard(ctx, query) + if err != nil { + h.log.Error("error getting dashboard for alert annotation", "dashboardUID", dashUid, "alertRuleUID", rule.UID, "err", err.Error()) + return + } + + item.PanelId = panelId + item.DashboardId = query.Result.Id + } + + if err := h.annotations.Save(ctx, item); err != nil { + h.log.Error("error saving alert annotation", "alertRuleUID", rule.UID, "err", err.Error()) + return + } +} + +func removePrivateLabels(labels data.Labels) data.Labels { + result := make(data.Labels) + for k, v := range labels { + if !strings.HasPrefix(k, "__") && !strings.HasSuffix(k, "__") { + result[k] = v + } + } + return result +} diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index 02857ecbc41..69ff45e0930 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -5,17 +5,12 @@ import ( "errors" "fmt" "net/url" - "strconv" - "strings" "time" "github.com/benbjohnson/clock" "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/infra/log" - "github.com/grafana/grafana/pkg/models" - "github.com/grafana/grafana/pkg/services/annotations" - "github.com/grafana/grafana/pkg/services/dashboards" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/image" "github.com/grafana/grafana/pkg/services/ngalert/metrics" @@ -41,28 +36,25 @@ type Manager struct { quit chan struct{} ResendDelay time.Duration - ruleStore RuleReader - instanceStore InstanceStore - dashboardService dashboards.DashboardService - imageService image.ImageService - AnnotationsRepo annotations.Repository + ruleStore RuleReader + instanceStore InstanceStore + imageService image.ImageService + historian Historian } func NewManager(logger log.Logger, metrics *metrics.State, externalURL *url.URL, - ruleStore RuleReader, instanceStore InstanceStore, - dashboardService dashboards.DashboardService, imageService image.ImageService, clock clock.Clock, annotationsRepo annotations.Repository) *Manager { + ruleStore RuleReader, instanceStore InstanceStore, imageService image.ImageService, clock clock.Clock, historian Historian) *Manager { manager := &Manager{ - cache: newCache(logger, metrics, externalURL), - quit: make(chan struct{}), - ResendDelay: ResendDelay, // TODO: make this configurable - log: logger, - metrics: metrics, - ruleStore: ruleStore, - instanceStore: instanceStore, - dashboardService: dashboardService, - imageService: imageService, - clock: clock, - AnnotationsRepo: annotationsRepo, + cache: newCache(logger, metrics, externalURL), + quit: make(chan struct{}), + ResendDelay: ResendDelay, // TODO: make this configurable + log: logger, + metrics: metrics, + ruleStore: ruleStore, + instanceStore: instanceStore, + imageService: imageService, + historian: historian, + clock: clock, } go manager.recordMetrics() return manager @@ -281,7 +273,7 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu shouldUpdateAnnotation := oldState != currentState.State || oldReason != currentState.StateReason if shouldUpdateAnnotation { - go st.annotateState(ctx, alertRule, currentState.Labels, result.EvaluatedAt, InstanceStateAndReason{State: currentState.State, Reason: currentState.StateReason}, InstanceStateAndReason{State: oldState, Reason: oldReason}) + go st.historian.RecordState(ctx, alertRule, currentState.Labels, result.EvaluatedAt, InstanceStateAndReason{State: currentState.State, Reason: currentState.StateReason}, InstanceStateAndReason{State: oldState, Reason: oldReason}) } return currentState } @@ -358,52 +350,6 @@ func (i InstanceStateAndReason) String() string { return s } -func (st *Manager) annotateState(ctx context.Context, alertRule *ngModels.AlertRule, labels data.Labels, evaluatedAt time.Time, currentData, previousData InstanceStateAndReason) { - st.log.Debug("alert state changed creating annotation", "alertRuleUID", alertRule.UID, "newState", currentData.String(), "oldState", previousData.String()) - - labels = removePrivateLabels(labels) - annotationText := fmt.Sprintf("%s {%s} - %s", alertRule.Title, labels.String(), currentData.String()) - - item := &annotations.Item{ - AlertId: alertRule.ID, - OrgId: alertRule.OrgID, - PrevState: previousData.String(), - NewState: currentData.String(), - Text: annotationText, - Epoch: evaluatedAt.UnixNano() / int64(time.Millisecond), - } - - dashUid, ok := alertRule.Annotations[ngModels.DashboardUIDAnnotation] - if ok { - panelUid := alertRule.Annotations[ngModels.PanelIDAnnotation] - - panelId, err := strconv.ParseInt(panelUid, 10, 64) - if err != nil { - st.log.Error("error parsing panelUID for alert annotation", "panelUID", panelUid, "alertRuleUID", alertRule.UID, "err", err.Error()) - return - } - - query := &models.GetDashboardQuery{ - Uid: dashUid, - OrgId: alertRule.OrgID, - } - - err = st.dashboardService.GetDashboard(ctx, query) - if err != nil { - st.log.Error("error getting dashboard for alert annotation", "dashboardUID", dashUid, "alertRuleUID", alertRule.UID, "err", err.Error()) - return - } - - item.PanelId = panelId - item.DashboardId = query.Result.Id - } - - if err := st.AnnotationsRepo.Save(ctx, item); err != nil { - st.log.Error("error saving alert annotation", "alertRuleUID", alertRule.UID, "err", err.Error()) - return - } -} - func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State { var resolvedStates []*State allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID) @@ -428,7 +374,7 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim s.StateReason = ngModels.StateReasonMissingSeries s.EndsAt = evaluatedAt s.Resolved = true - st.annotateState(ctx, alertRule, s.Labels, evaluatedAt, + st.historian.RecordState(ctx, alertRule, s.Labels, evaluatedAt, InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason}, previousState, ) @@ -442,13 +388,3 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool { return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt) } - -func removePrivateLabels(labels data.Labels) data.Labels { - result := make(data.Labels) - for k, v := range labels { - if !strings.HasPrefix(k, "__") && !strings.HasSuffix(k, "__") { - result[k] = v - } - } - return result -} diff --git a/pkg/services/ngalert/state/manager_private_test.go b/pkg/services/ngalert/state/manager_private_test.go index b3f7dc7cad0..9dbe3f42e6b 100644 --- a/pkg/services/ngalert/state/manager_private_test.go +++ b/pkg/services/ngalert/state/manager_private_test.go @@ -12,8 +12,6 @@ import ( "github.com/benbjohnson/clock" "github.com/grafana/grafana/pkg/infra/log" - "github.com/grafana/grafana/pkg/services/annotations/annotationstest" - "github.com/grafana/grafana/pkg/services/dashboards" "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -95,7 +93,7 @@ func Test_maybeNewImage(t *testing.T) { imageService := &CountingImageService{} mgr := NewManager(log.NewNopLogger(), &metrics.State{}, nil, &FakeRuleReader{}, &FakeInstanceStore{}, - &dashboards.FakeDashboardService{}, imageService, clock.NewMock(), annotationstest.NewFakeAnnotationsRepo()) + imageService, clock.NewMock(), &FakeHistorian{}) err := mgr.maybeTakeScreenshot(context.Background(), &ngmodels.AlertRule{}, test.state, test.oldState) require.NoError(t, err) if !test.shouldScreenshot { diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index 676305787da..95cce8bbc40 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -25,6 +25,7 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/metrics" "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/state" + "github.com/grafana/grafana/pkg/services/ngalert/state/historian" "github.com/grafana/grafana/pkg/services/ngalert/tests" ) @@ -38,7 +39,8 @@ func TestDashboardAnnotations(t *testing.T) { _, dbstore := tests.SetupTestEnv(t, 1) fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo() - st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clock.New(), fakeAnnoRepo) + hist := historian.NewAnnotationHistorian(fakeAnnoRepo, &dashboards.FakeDashboardService{}, log.NewNopLogger()) + st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.New(), hist) const mainOrgID int64 = 1 @@ -1980,7 +1982,8 @@ func TestProcessEvalResults(t *testing.T) { for _, tc := range testCases { fakeAnnoRepo := annotationstest.NewFakeAnnotationsRepo() - st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, &state.FakeInstanceStore{}, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clock.New(), fakeAnnoRepo) + hist := historian.NewAnnotationHistorian(fakeAnnoRepo, &dashboards.FakeDashboardService{}, log.NewNopLogger()) + st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, &state.FakeInstanceStore{}, &image.NotAvailableImageService{}, clock.New(), hist) t.Run(tc.desc, func(t *testing.T) { for _, res := range tc.evalResults { _ = st.ProcessEvalResults(context.Background(), evaluationTime, tc.alertRule, res, data.Labels{ @@ -2008,7 +2011,7 @@ func TestProcessEvalResults(t *testing.T) { t.Run("should save state to database", func(t *testing.T) { instanceStore := &state.FakeInstanceStore{} clk := clock.New() - st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, instanceStore, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clk, annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, instanceStore, &image.NotAvailableImageService{}, clk, &state.FakeHistorian{}) rule := models.AlertRuleGen()() var results = eval.GenerateResults(rand.Intn(4)+1, eval.ResultGen(eval.WithEvaluatedAt(clk.Now()))) @@ -2125,7 +2128,7 @@ func TestStaleResultsHandler(t *testing.T) { for _, tc := range testCases { ctx := context.Background() - st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clock.New(), annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.New(), &state.FakeHistorian{}) st.Warm(ctx) existingStatesForRule := st.GetStatesForRuleUID(rule.OrgID, rule.UID) @@ -2188,7 +2191,7 @@ func TestStaleResults(t *testing.T) { clk := clock.NewMock() clk.Set(time.Now()) - st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &dashboards.FakeDashboardService{}, &image.NoopImageService{}, clk, annotationstest.NewFakeAnnotationsRepo()) + st := state.NewManager(log.New("test_stale_results_handler"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clk, &state.FakeHistorian{}) orgID := rand.Int63() rule := tests.CreateTestAlertRule(t, ctx, dbstore, 10, orgID) diff --git a/pkg/services/ngalert/state/persist.go b/pkg/services/ngalert/state/persist.go index b433aef29f8..6b3bf1a8732 100644 --- a/pkg/services/ngalert/state/persist.go +++ b/pkg/services/ngalert/state/persist.go @@ -2,7 +2,9 @@ package state import ( "context" + "time" + "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/services/ngalert/models" ) @@ -19,3 +21,8 @@ type InstanceStore interface { type RuleReader interface { ListAlertRules(ctx context.Context, query *models.ListAlertRulesQuery) error } + +// Historian maintains an audit log of alert state history. +type Historian interface { + RecordState(ctx context.Context, rule *models.AlertRule, labels data.Labels, evaluatedAt time.Time, currentData, previousData InstanceStateAndReason) +} diff --git a/pkg/services/ngalert/state/testing.go b/pkg/services/ngalert/state/testing.go index ad7e6dc53f2..9b04dfaa0cd 100644 --- a/pkg/services/ngalert/state/testing.go +++ b/pkg/services/ngalert/state/testing.go @@ -3,7 +3,9 @@ package state import ( "context" "sync" + "time" + "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/services/ngalert/models" ) @@ -41,3 +43,8 @@ type FakeRuleReader struct{} func (f *FakeRuleReader) ListAlertRules(_ context.Context, q *models.ListAlertRulesQuery) error { return nil } + +type FakeHistorian struct{} + +func (f *FakeHistorian) RecordState(ctx context.Context, rule *models.AlertRule, labels data.Labels, evaluatedAt time.Time, currentData, previousData InstanceStateAndReason) { +}