mirror of
https://github.com/grafana/grafana.git
synced 2025-01-27 16:57:14 -06:00
Alerting: Update handling of stale state (#58276)
* delete all stale states in one lock * do not use touched states to detect stale rely only on LastEvaluationTime maintained correctly * fix tests to use correct eval time * delete unused method
This commit is contained in:
parent
db1fd10ff1
commit
3621cf5a12
@ -169,7 +169,7 @@ func TestSchedule_ruleRoutine(t *testing.T) {
|
||||
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
|
||||
|
||||
rule := models.AlertRuleGen()()
|
||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
|
||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil)
|
||||
expectedStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
|
||||
require.NotEmpty(t, expectedStates)
|
||||
|
||||
@ -189,7 +189,7 @@ func TestSchedule_ruleRoutine(t *testing.T) {
|
||||
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
|
||||
|
||||
rule := models.AlertRuleGen()()
|
||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen()), nil)
|
||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil)
|
||||
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||
|
||||
ctx, cancel := util.WithCancelCause(context.Background())
|
||||
|
@ -156,6 +156,27 @@ func (rs *ruleStates) expandRuleLabelsAndAnnotations(ctx context.Context, log lo
|
||||
return expand(alertRule.Labels), expand(alertRule.Annotations)
|
||||
}
|
||||
|
||||
func (rs *ruleStates) deleteStates(predicate func(s *State) bool) []*State {
|
||||
deleted := make([]*State, 0)
|
||||
for id, state := range rs.states {
|
||||
if predicate(state) {
|
||||
delete(rs.states, id)
|
||||
deleted = append(deleted, state)
|
||||
}
|
||||
}
|
||||
return deleted
|
||||
}
|
||||
|
||||
func (c *cache) deleteRuleStates(ruleKey ngModels.AlertRuleKey, predicate func(s *State) bool) []*State {
|
||||
c.mtxStates.Lock()
|
||||
defer c.mtxStates.Unlock()
|
||||
ruleStates, ok := c.states[ruleKey.OrgID][ruleKey.UID]
|
||||
if ok {
|
||||
return ruleStates.deleteStates(predicate)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *cache) setAllStates(newStates map[int64]map[string]*ruleStates) {
|
||||
c.mtxStates.Lock()
|
||||
defer c.mtxStates.Unlock()
|
||||
@ -283,13 +304,3 @@ func mergeLabels(a, b data.Labels) data.Labels {
|
||||
}
|
||||
return newLbs
|
||||
}
|
||||
|
||||
func (c *cache) deleteEntry(orgID int64, alertRuleUID, cacheID string) {
|
||||
c.mtxStates.Lock()
|
||||
defer c.mtxStates.Unlock()
|
||||
ruleStates, ok := c.states[orgID][alertRuleUID]
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
delete(ruleStates.states, cacheID)
|
||||
}
|
||||
|
@ -172,13 +172,12 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
||||
logger := st.log.FromContext(ctx)
|
||||
logger.Debug("State manager processing evaluation results", "resultCount", len(results))
|
||||
var states []StateTransition
|
||||
processedResults := make(map[string]*State, len(results))
|
||||
|
||||
for _, result := range results {
|
||||
s := st.setNextState(ctx, alertRule, result, extraLabels, logger)
|
||||
states = append(states, s)
|
||||
processedResults[s.State.CacheID] = s.State
|
||||
}
|
||||
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults, logger)
|
||||
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, logger)
|
||||
|
||||
st.saveAlertStates(ctx, logger, states...)
|
||||
|
||||
@ -333,58 +332,57 @@ func translateInstanceState(state ngModels.InstanceStateType) eval.State {
|
||||
}
|
||||
}
|
||||
|
||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State, logger log.Logger) []StateTransition {
|
||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, logger log.Logger) []StateTransition {
|
||||
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
|
||||
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
|
||||
var resolvedImage *ngModels.Image
|
||||
|
||||
var resolvedStates []StateTransition
|
||||
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
|
||||
staleStates := st.cache.deleteRuleStates(alertRule.GetKey(), func(s *State) bool {
|
||||
return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds)
|
||||
})
|
||||
|
||||
toDelete := make([]ngModels.AlertInstanceKey, 0)
|
||||
|
||||
for _, s := range allStates {
|
||||
// Is the cached state in our recently processed results? If not, is it stale?
|
||||
if _, ok := states[s.CacheID]; !ok && stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
|
||||
logger.Info("Removing stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
|
||||
st.cache.deleteEntry(s.OrgID, s.AlertRuleUID, s.CacheID)
|
||||
for _, s := range staleStates {
|
||||
logger.Info("Detected stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
|
||||
|
||||
key, err := s.GetAlertInstanceKey()
|
||||
if err != nil {
|
||||
logger.Error("Unable to get alert instance key to delete it from database. Ignoring", "error", err.Error())
|
||||
} else {
|
||||
toDelete = append(toDelete, key)
|
||||
key, err := s.GetAlertInstanceKey()
|
||||
if err != nil {
|
||||
logger.Error("Unable to get alert instance key to delete it from database. Ignoring", "error", err.Error())
|
||||
} else {
|
||||
toDelete = append(toDelete, key)
|
||||
}
|
||||
|
||||
if s.State == eval.Alerting {
|
||||
oldState := s.State
|
||||
oldReason := s.StateReason
|
||||
|
||||
s.State = eval.Normal
|
||||
s.StateReason = ngModels.StateReasonMissingSeries
|
||||
s.EndsAt = evaluatedAt
|
||||
s.Resolved = true
|
||||
s.LastEvaluationTime = evaluatedAt
|
||||
record := StateTransition{
|
||||
State: s,
|
||||
PreviousState: oldState,
|
||||
PreviousStateReason: oldReason,
|
||||
}
|
||||
|
||||
if s.State == eval.Alerting {
|
||||
oldState := s.State
|
||||
oldReason := s.StateReason
|
||||
|
||||
s.State = eval.Normal
|
||||
s.StateReason = ngModels.StateReasonMissingSeries
|
||||
s.EndsAt = evaluatedAt
|
||||
s.Resolved = true
|
||||
s.LastEvaluationTime = evaluatedAt
|
||||
record := StateTransition{
|
||||
State: s,
|
||||
PreviousState: oldState,
|
||||
PreviousStateReason: oldReason,
|
||||
// If there is no resolved image for this rule then take one
|
||||
if resolvedImage == nil {
|
||||
image, err := takeImage(ctx, st.imageService, alertRule)
|
||||
if err != nil {
|
||||
logger.Warn("Failed to take an image",
|
||||
"dashboard", alertRule.DashboardUID,
|
||||
"panel", alertRule.PanelID,
|
||||
"error", err)
|
||||
} else if image != nil {
|
||||
resolvedImage = image
|
||||
}
|
||||
|
||||
// If there is no resolved image for this rule then take one
|
||||
if resolvedImage == nil {
|
||||
image, err := takeImage(ctx, st.imageService, alertRule)
|
||||
if err != nil {
|
||||
logger.Warn("Failed to take an image",
|
||||
"dashboard", alertRule.DashboardUID,
|
||||
"panel", alertRule.PanelID,
|
||||
"error", err)
|
||||
} else if image != nil {
|
||||
resolvedImage = image
|
||||
}
|
||||
}
|
||||
s.Image = resolvedImage
|
||||
resolvedStates = append(resolvedStates, record)
|
||||
}
|
||||
s.Image = resolvedImage
|
||||
resolvedStates = append(resolvedStates, record)
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user