mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: State manager to use InstanceStore (#53852)
* move saving the state to state manager when scheduler stops
* move saving state to ProcessEvalResults
* add GetRuleKey to State
* add LogContext to AlertRuleKey
This commit is contained in:
@@ -66,8 +66,9 @@ func NewManager(logger log.Logger, metrics *metrics.State, externalURL *url.URL,
|
||||
return manager
|
||||
}
|
||||
|
||||
func (st *Manager) Close() {
|
||||
func (st *Manager) Close(ctx context.Context) {
|
||||
st.quit <- struct{}{}
|
||||
st.flushState(ctx)
|
||||
}
|
||||
|
||||
func (st *Manager) Warm(ctx context.Context) {
|
||||
@@ -161,7 +162,8 @@ func (st *Manager) RemoveByRuleUID(orgID int64, ruleUID string) {
|
||||
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
|
||||
// If extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels.
|
||||
func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []*State {
|
||||
st.log.Debug("state manager processing evaluation results", "uid", alertRule.UID, "resultCount", len(results))
|
||||
logger := st.log.New(alertRule.GetKey().LogContext())
|
||||
logger.Debug("state manager processing evaluation results", "resultCount", len(results))
|
||||
var states []*State
|
||||
processedResults := make(map[string]*State, len(results))
|
||||
for _, result := range results {
|
||||
@@ -170,6 +172,14 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
||||
processedResults[s.CacheId] = s
|
||||
}
|
||||
st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
||||
if len(states) > 0 {
|
||||
logger.Debug("saving new states to the database", "count", len(states))
|
||||
for _, state := range states {
|
||||
if err := st.saveState(ctx, state); err != nil {
|
||||
logger.Error("failed to save alert state", "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
return states
|
||||
}
|
||||
|
||||
@@ -297,6 +307,42 @@ func (st *Manager) Put(states []*State) {
|
||||
}
|
||||
}
|
||||
|
||||
// flushState dumps the entire state to the database
|
||||
func (st *Manager) flushState(ctx context.Context) {
|
||||
t := st.clock.Now()
|
||||
st.log.Info("flushing the state")
|
||||
st.cache.mtxStates.Lock()
|
||||
defer st.cache.mtxStates.Unlock()
|
||||
totalStates, errorsCnt := 0, 0
|
||||
for _, orgStates := range st.cache.states {
|
||||
for _, ruleStates := range orgStates {
|
||||
for _, state := range ruleStates {
|
||||
err := st.saveState(ctx, state)
|
||||
totalStates++
|
||||
if err != nil {
|
||||
st.log.Error("failed to save alert state", append(state.GetRuleKey().LogContext(), "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error()))
|
||||
errorsCnt++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
st.log.Info("the state has been flushed", "total_instances", totalStates, "errors", errorsCnt, "took", st.clock.Since(t))
|
||||
}
|
||||
|
||||
func (st *Manager) saveState(ctx context.Context, s *State) error {
|
||||
cmd := ngModels.SaveAlertInstanceCommand{
|
||||
RuleOrgID: s.OrgID,
|
||||
RuleUID: s.AlertRuleUID,
|
||||
Labels: ngModels.InstanceLabels(s.Labels),
|
||||
State: ngModels.InstanceStateType(s.State.String()),
|
||||
StateReason: s.StateReason,
|
||||
LastEvalTime: s.LastEvaluationTime,
|
||||
CurrentStateSince: s.StartsAt,
|
||||
CurrentStateEnd: s.EndsAt,
|
||||
}
|
||||
return st.instanceStore.SaveAlertInstance(ctx, &cmd)
|
||||
}
|
||||
|
||||
// TODO: why wouldn't you allow other types like NoData or Error?
|
||||
func translateInstanceState(state ngModels.InstanceStateType) eval.State {
|
||||
switch {
|
||||
|
||||
@@ -7,13 +7,16 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/benbjohnson/clock"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/annotations"
|
||||
"github.com/grafana/grafana/pkg/services/dashboards"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/image"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
@@ -149,3 +152,45 @@ func TestIsItStale(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestClose(t *testing.T) {
|
||||
instanceStore := &store.FakeInstanceStore{}
|
||||
clk := clock.New()
|
||||
st := NewManager(log.New("test_state_manager"), metrics.NewNGAlert(prometheus.NewPedanticRegistry()).GetStateMetrics(), nil, nil, instanceStore, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clk)
|
||||
fakeAnnoRepo := store.NewFakeAnnotationsRepo()
|
||||
annotations.SetRepository(fakeAnnoRepo)
|
||||
|
||||
_, rules := ngmodels.GenerateUniqueAlertRules(10, ngmodels.AlertRuleGen())
|
||||
for _, rule := range rules {
|
||||
results := eval.GenerateResults(rand.Intn(4)+1, eval.ResultGen(eval.WithEvaluatedAt(clk.Now())))
|
||||
_ = st.ProcessEvalResults(context.Background(), clk.Now(), rule, results, ngmodels.GenerateAlertLabels(rand.Intn(4), "extra_"))
|
||||
}
|
||||
var states []*State
|
||||
for _, org := range st.cache.states {
|
||||
for _, rule := range org {
|
||||
for _, state := range rule {
|
||||
states = append(states, state)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
instanceStore.RecordedOps = nil
|
||||
st.Close(context.Background())
|
||||
|
||||
t.Run("should flush the state to store", func(t *testing.T) {
|
||||
savedStates := make(map[string]ngmodels.SaveAlertInstanceCommand)
|
||||
for _, op := range instanceStore.RecordedOps {
|
||||
switch q := op.(type) {
|
||||
case ngmodels.SaveAlertInstanceCommand:
|
||||
cacheId, err := q.Labels.StringKey()
|
||||
require.NoError(t, err)
|
||||
savedStates[cacheId] = q
|
||||
}
|
||||
}
|
||||
|
||||
require.Len(t, savedStates, len(states))
|
||||
for _, s := range states {
|
||||
require.Contains(t, savedStates, s.CacheId)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"sort"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -2007,6 +2008,34 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
}, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items))
|
||||
})
|
||||
}
|
||||
|
||||
t.Run("should save state to database", func(t *testing.T) {
|
||||
fakeAnnoRepo := store.NewFakeAnnotationsRepo()
|
||||
annotations.SetRepository(fakeAnnoRepo)
|
||||
instanceStore := &store.FakeInstanceStore{}
|
||||
clk := clock.New()
|
||||
st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, instanceStore, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clk)
|
||||
rule := models.AlertRuleGen()()
|
||||
var results = eval.GenerateResults(rand.Intn(4)+1, eval.ResultGen(eval.WithEvaluatedAt(clk.Now())))
|
||||
|
||||
states := st.ProcessEvalResults(context.Background(), clk.Now(), rule, results, make(data.Labels))
|
||||
|
||||
require.NotEmpty(t, states)
|
||||
|
||||
savedStates := make(map[string]models.SaveAlertInstanceCommand)
|
||||
for _, op := range instanceStore.RecordedOps {
|
||||
switch q := op.(type) {
|
||||
case models.SaveAlertInstanceCommand:
|
||||
cacheId, err := q.Labels.StringKey()
|
||||
require.NoError(t, err)
|
||||
savedStates[cacheId] = q
|
||||
}
|
||||
}
|
||||
require.Len(t, savedStates, len(states))
|
||||
for _, s := range states {
|
||||
require.Contains(t, savedStates, s.CacheId)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func printAllAnnotations(annos []*annotations.Item) string {
|
||||
|
||||
@@ -36,6 +36,13 @@ type State struct {
|
||||
Error error
|
||||
}
|
||||
|
||||
func (a *State) GetRuleKey() models.AlertRuleKey {
|
||||
return models.AlertRuleKey{
|
||||
OrgID: a.OrgID,
|
||||
UID: a.AlertRuleUID,
|
||||
}
|
||||
}
|
||||
|
||||
type Evaluation struct {
|
||||
EvaluationTime time.Time
|
||||
EvaluationState eval.State
|
||||
|
||||
Reference in New Issue
Block a user