Alerting: State manager to use InstanceStore (#53852)

* Move saving the state to the state manager when the scheduler stops
* Move saving the state to ProcessEvalResults

* Add GetRuleKey to State
* Add LogContext to AlertRuleKey
Yuriy Tseretyan, 2022-08-18 09:40:33 -04:00, committed by GitHub
commit 9f90a7b54d (parent 86de94cbfa)
8 changed files with 218 additions and 45 deletions
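The commit message also mentions a new LogContext helper on AlertRuleKey, but that file's diff is not among the hunks shown below. A minimal sketch of what it plausibly looks like, inferred from how manager.go consumes it; the exact log key names are an assumption:

    // Sketch only: returns key/value pairs usable both as logger context
    // (spread into log.New) and as a base slice for append, which is why
    // the return type must be []interface{}. The keys "rule_uid" and
    // "org_id" are assumptions, not confirmed by this diff.
    func (k AlertRuleKey) LogContext() []interface{} {
        return []interface{}{"rule_uid", k.UID, "org_id", k.OrgID}
    }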


@@ -66,8 +66,9 @@ func NewManager(logger log.Logger, metrics *metrics.State, externalURL *url.URL,
 	return manager
 }
 
-func (st *Manager) Close() {
+func (st *Manager) Close(ctx context.Context) {
 	st.quit <- struct{}{}
+	st.flushState(ctx)
 }
 
 func (st *Manager) Warm(ctx context.Context) {
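The caller side of the new Close(ctx) contract lives in the scheduler, whose diff is in one of the changed files not shown here. A hedged sketch of the shutdown path, with hypothetical names, just to illustrate the contract:

    // Hypothetical wiring: on shutdown the scheduler hands its context to
    // Close so the final flush can run (and be cancelled) before exit.
    func (sch *schedule) Stop(ctx context.Context) {
        close(sch.done)             // hypothetical: stop the evaluation loops
        sch.stateManager.Close(ctx) // flushes every cached state to the store
    }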
@@ -161,7 +162,8 @@ func (st *Manager) RemoveByRuleUID(orgID int64, ruleUID string) {
 // ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
 // if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
 func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []*State {
-	st.log.Debug("state manager processing evaluation results", "uid", alertRule.UID, "resultCount", len(results))
+	logger := st.log.New(alertRule.GetKey().LogContext()...)
+	logger.Debug("state manager processing evaluation results", "resultCount", len(results))
 	var states []*State
 	processedResults := make(map[string]*State, len(results))
 	for _, result := range results {
@@ -170,6 +172,14 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
 		processedResults[s.CacheId] = s
 	}
 	st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
+	if len(states) > 0 {
+		logger.Debug("saving new states to the database", "count", len(states))
+		for _, state := range states {
+			if err := st.saveState(ctx, state); err != nil {
+				logger.Error("failed to save alert state", "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error())
+			}
+		}
+	}
 	return states
 }
@@ -297,6 +307,42 @@ func (st *Manager) Put(states []*State) {
 	}
 }
 
+// flushState dumps the entire state to the database
+func (st *Manager) flushState(ctx context.Context) {
+	t := st.clock.Now()
+	st.log.Info("flushing the state")
+	st.cache.mtxStates.Lock()
+	defer st.cache.mtxStates.Unlock()
+
+	totalStates, errorsCnt := 0, 0
+	for _, orgStates := range st.cache.states {
+		for _, ruleStates := range orgStates {
+			for _, state := range ruleStates {
+				err := st.saveState(ctx, state)
+				totalStates++
+				if err != nil {
+					st.log.Error("failed to save alert state", append(state.GetRuleKey().LogContext(), "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error())...)
+					errorsCnt++
+				}
+			}
+		}
+	}
+	st.log.Info("the state has been flushed", "total_instances", totalStates, "errors", errorsCnt, "took", st.clock.Since(t))
+}
+
+func (st *Manager) saveState(ctx context.Context, s *State) error {
+	cmd := ngModels.SaveAlertInstanceCommand{
+		RuleOrgID:         s.OrgID,
+		RuleUID:           s.AlertRuleUID,
+		Labels:            ngModels.InstanceLabels(s.Labels),
+		State:             ngModels.InstanceStateType(s.State.String()),
+		StateReason:       s.StateReason,
+		LastEvalTime:      s.LastEvaluationTime,
+		CurrentStateSince: s.StartsAt,
+		CurrentStateEnd:   s.EndsAt,
+	}
+	return st.instanceStore.SaveAlertInstance(ctx, &cmd)
+}
+
 // TODO: why wouldn't you allow other types like NoData or Error?
 func translateInstanceState(state ngModels.InstanceStateType) eval.State {
 	switch {
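saveState is the single point where the manager talks to the store; the InstanceStore interface itself is not part of this diff. A sketch of the one method the code above requires, inferred from the call in saveState (the real interface in the ngalert store package likely carries more methods, e.g. for reading instances during Warm):

    // Inferred from st.instanceStore.SaveAlertInstance(ctx, &cmd) above;
    // shown only to make the new dependency of Manager explicit.
    type InstanceStore interface {
        SaveAlertInstance(ctx context.Context, cmd *ngModels.SaveAlertInstanceCommand) error
    }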


@@ -7,13 +7,16 @@ import (
 	"testing"
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/stretchr/testify/require"
 
+	"github.com/benbjohnson/clock"
 	"github.com/grafana/grafana/pkg/infra/log"
 	"github.com/grafana/grafana/pkg/services/annotations"
+	"github.com/grafana/grafana/pkg/services/dashboards"
 	"github.com/grafana/grafana/pkg/services/ngalert/eval"
+	"github.com/grafana/grafana/pkg/services/ngalert/image"
 	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
 	"github.com/grafana/grafana/pkg/services/ngalert/store"
@@ -149,3 +152,45 @@ func TestIsItStale(t *testing.T) {
 		})
 	}
 }
+
+func TestClose(t *testing.T) {
+	instanceStore := &store.FakeInstanceStore{}
+	clk := clock.New()
+	st := NewManager(log.New("test_state_manager"), metrics.NewNGAlert(prometheus.NewPedanticRegistry()).GetStateMetrics(), nil, nil, instanceStore, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clk)
+
+	fakeAnnoRepo := store.NewFakeAnnotationsRepo()
+	annotations.SetRepository(fakeAnnoRepo)
+
+	_, rules := ngmodels.GenerateUniqueAlertRules(10, ngmodels.AlertRuleGen())
+	for _, rule := range rules {
+		results := eval.GenerateResults(rand.Intn(4)+1, eval.ResultGen(eval.WithEvaluatedAt(clk.Now())))
+		_ = st.ProcessEvalResults(context.Background(), clk.Now(), rule, results, ngmodels.GenerateAlertLabels(rand.Intn(4), "extra_"))
+	}
+
+	var states []*State
+	for _, org := range st.cache.states {
+		for _, rule := range org {
+			for _, state := range rule {
+				states = append(states, state)
+			}
+		}
+	}
+
+	instanceStore.RecordedOps = nil
+	st.Close(context.Background())
+
+	t.Run("should flush the state to store", func(t *testing.T) {
+		savedStates := make(map[string]ngmodels.SaveAlertInstanceCommand)
+		for _, op := range instanceStore.RecordedOps {
+			switch q := op.(type) {
+			case ngmodels.SaveAlertInstanceCommand:
+				cacheId, err := q.Labels.StringKey()
+				require.NoError(t, err)
+				savedStates[cacheId] = q
+			}
+		}
+		require.Len(t, savedStates, len(states))
+		for _, s := range states {
+			require.Contains(t, savedStates, s.CacheId)
+		}
+	})
+}
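The assertions above read instanceStore.RecordedOps directly. A sketch of how such a fake presumably records writes, assuming the shape the type switch in the test implies (commands stored by value); the mutex field and exact definition of store.FakeInstanceStore are assumptions:

    // Assumed shape of store.FakeInstanceStore: every command it receives
    // is appended to RecordedOps so tests can assert on what would have
    // been persisted, without touching a real database.
    func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, cmd *ngmodels.SaveAlertInstanceCommand) error {
        f.mtx.Lock()
        defer f.mtx.Unlock()
        f.RecordedOps = append(f.RecordedOps, *cmd)
        return nil
    }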


@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"math/rand"
 	"sort"
 	"testing"
 	"time"
@@ -2007,6 +2008,34 @@ func TestProcessEvalResults(t *testing.T) {
 			}, time.Second, 100*time.Millisecond, "%d annotations are present, expected %d. We have %+v", fakeAnnoRepo.Len(), tc.expectedAnnotations, printAllAnnotations(fakeAnnoRepo.Items))
 		})
 	}
+
+	t.Run("should save state to database", func(t *testing.T) {
+		fakeAnnoRepo := store.NewFakeAnnotationsRepo()
+		annotations.SetRepository(fakeAnnoRepo)
+		instanceStore := &store.FakeInstanceStore{}
+		clk := clock.New()
+		st := state.NewManager(log.New("test_state_manager"), testMetrics.GetStateMetrics(), nil, nil, instanceStore, &dashboards.FakeDashboardService{}, &image.NotAvailableImageService{}, clk)
+
+		rule := models.AlertRuleGen()()
+		var results = eval.GenerateResults(rand.Intn(4)+1, eval.ResultGen(eval.WithEvaluatedAt(clk.Now())))
+
+		states := st.ProcessEvalResults(context.Background(), clk.Now(), rule, results, make(data.Labels))
+		require.NotEmpty(t, states)
+
+		savedStates := make(map[string]models.SaveAlertInstanceCommand)
+		for _, op := range instanceStore.RecordedOps {
+			switch q := op.(type) {
+			case models.SaveAlertInstanceCommand:
+				cacheId, err := q.Labels.StringKey()
+				require.NoError(t, err)
+				savedStates[cacheId] = q
+			}
+		}
+		require.Len(t, savedStates, len(states))
+		for _, s := range states {
+			require.Contains(t, savedStates, s.CacheId)
+		}
+	})
 }
 
 func printAllAnnotations(annos []*annotations.Item) string {


@@ -36,6 +36,13 @@ type State struct {
 	Error              error
 }
 
+func (a *State) GetRuleKey() models.AlertRuleKey {
+	return models.AlertRuleKey{
+		OrgID: a.OrgID,
+		UID:   a.AlertRuleUID,
+	}
+}
+
 type Evaluation struct {
 	EvaluationTime  time.Time
 	EvaluationState eval.State