Alerting: Update handling of stale state (#58276)

* delete all stale states in one lock
* do not use touched states to detect stale rely only on LastEvaluationTime maintained correctly
* fix tests to use correct eval time
* delete unused method
This commit is contained in:
Yuri Tseretyan 2022-11-07 11:03:53 -05:00 committed by GitHub
parent db1fd10ff1
commit 3621cf5a12
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 64 additions and 55 deletions

View File

@ -169,7 +169,7 @@ func TestSchedule_ruleRoutine(t *testing.T) {
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
rule := models.AlertRuleGen()()
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil)
expectedStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
require.NotEmpty(t, expectedStates)
@ -189,7 +189,7 @@ func TestSchedule_ruleRoutine(t *testing.T) {
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
rule := models.AlertRuleGen()()
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil)
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
ctx, cancel := util.WithCancelCause(context.Background())

View File

@ -156,6 +156,27 @@ func (rs *ruleStates) expandRuleLabelsAndAnnotations(ctx context.Context, log lo
return expand(alertRule.Labels), expand(alertRule.Annotations)
}
func (rs *ruleStates) deleteStates(predicate func(s *State) bool) []*State {
deleted := make([]*State, 0)
for id, state := range rs.states {
if predicate(state) {
delete(rs.states, id)
deleted = append(deleted, state)
}
}
return deleted
}
func (c *cache) deleteRuleStates(ruleKey ngModels.AlertRuleKey, predicate func(s *State) bool) []*State {
c.mtxStates.Lock()
defer c.mtxStates.Unlock()
ruleStates, ok := c.states[ruleKey.OrgID][ruleKey.UID]
if ok {
return ruleStates.deleteStates(predicate)
}
return nil
}
func (c *cache) setAllStates(newStates map[int64]map[string]*ruleStates) {
c.mtxStates.Lock()
defer c.mtxStates.Unlock()
@ -283,13 +304,3 @@ func mergeLabels(a, b data.Labels) data.Labels {
}
return newLbs
}
func (c *cache) deleteEntry(orgID int64, alertRuleUID, cacheID string) {
c.mtxStates.Lock()
defer c.mtxStates.Unlock()
ruleStates, ok := c.states[orgID][alertRuleUID]
if !ok {
return
}
delete(ruleStates.states, cacheID)
}

View File

@ -172,13 +172,12 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
logger := st.log.FromContext(ctx)
logger.Debug("State manager processing evaluation results", "resultCount", len(results))
var states []StateTransition
processedResults := make(map[string]*State, len(results))
for _, result := range results {
s := st.setNextState(ctx, alertRule, result, extraLabels, logger)
states = append(states, s)
processedResults[s.State.CacheID] = s.State
}
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults, logger)
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, logger)
st.saveAlertStates(ctx, logger, states...)
@ -333,58 +332,57 @@ func translateInstanceState(state ngModels.InstanceStateType) eval.State {
}
}
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State, logger log.Logger) []StateTransition {
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, logger log.Logger) []StateTransition {
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
var resolvedImage *ngModels.Image
var resolvedStates []StateTransition
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
staleStates := st.cache.deleteRuleStates(alertRule.GetKey(), func(s *State) bool {
return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds)
})
toDelete := make([]ngModels.AlertInstanceKey, 0)
for _, s := range allStates {
// Is the cached state in our recently processed results? If not, is it stale?
if _, ok := states[s.CacheID]; !ok && stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
logger.Info("Removing stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
st.cache.deleteEntry(s.OrgID, s.AlertRuleUID, s.CacheID)
for _, s := range staleStates {
logger.Info("Detected stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
key, err := s.GetAlertInstanceKey()
if err != nil {
logger.Error("Unable to get alert instance key to delete it from database. Ignoring", "error", err.Error())
} else {
toDelete = append(toDelete, key)
key, err := s.GetAlertInstanceKey()
if err != nil {
logger.Error("Unable to get alert instance key to delete it from database. Ignoring", "error", err.Error())
} else {
toDelete = append(toDelete, key)
}
if s.State == eval.Alerting {
oldState := s.State
oldReason := s.StateReason
s.State = eval.Normal
s.StateReason = ngModels.StateReasonMissingSeries
s.EndsAt = evaluatedAt
s.Resolved = true
s.LastEvaluationTime = evaluatedAt
record := StateTransition{
State: s,
PreviousState: oldState,
PreviousStateReason: oldReason,
}
if s.State == eval.Alerting {
oldState := s.State
oldReason := s.StateReason
s.State = eval.Normal
s.StateReason = ngModels.StateReasonMissingSeries
s.EndsAt = evaluatedAt
s.Resolved = true
s.LastEvaluationTime = evaluatedAt
record := StateTransition{
State: s,
PreviousState: oldState,
PreviousStateReason: oldReason,
// If there is no resolved image for this rule then take one
if resolvedImage == nil {
image, err := takeImage(ctx, st.imageService, alertRule)
if err != nil {
logger.Warn("Failed to take an image",
"dashboard", alertRule.DashboardUID,
"panel", alertRule.PanelID,
"error", err)
} else if image != nil {
resolvedImage = image
}
// If there is no resolved image for this rule then take one
if resolvedImage == nil {
image, err := takeImage(ctx, st.imageService, alertRule)
if err != nil {
logger.Warn("Failed to take an image",
"dashboard", alertRule.DashboardUID,
"panel", alertRule.PanelID,
"error", err)
} else if image != nil {
resolvedImage = image
}
}
s.Image = resolvedImage
resolvedStates = append(resolvedStates, record)
}
s.Image = resolvedImage
resolvedStates = append(resolvedStates, record)
}
}