Alerting: Update state manager to save state transitions in one batch (#58358)

* change stale results handler to not update database but return transitions
* save all transitions in one call
This commit is contained in:
Yuri Tseretyan 2022-11-14 10:57:51 -05:00 committed by GitHub
parent 4915d21c25
commit 28d39d35fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -176,26 +176,23 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
s := st.setNextState(ctx, alertRule, result, extraLabels, logger)
states = append(states, s)
}
resolvedStates := st.staleResultsHandler(ctx, logger, alertRule, evaluatedAt)
staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule)
st.deleteAlertStates(ctx, logger, staleStates)
st.saveAlertStates(ctx, logger, states...)
changedStates := make([]StateTransition, 0, len(states))
for _, s := range states {
if s.changed() {
changedStates = append(changedStates, s)
}
}
st.logStateTransitions(ctx, alertRule, states, staleStates)
if st.historian != nil {
st.historian.RecordStates(ctx, alertRule, changedStates)
}
deltas := append(states, resolvedStates...)
nextStates := make([]*State, 0, len(states))
for _, s := range deltas {
for _, s := range states {
nextStates = append(nextStates, s.State)
}
// TODO refactor further. Do not filter because it will be filtered downstream
for _, s := range staleStates {
if s.PreviousState == eval.Alerting {
nextStates = append(nextStates, s.State)
}
}
return nextStates
}
@ -281,7 +278,7 @@ func (st *Manager) Put(states []*State) {
// TODO: Is the `State` type necessary? Should it embed the instance?
func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, states ...StateTransition) {
if st.instanceStore == nil {
if st.instanceStore == nil || len(states) == 0 {
return
}
@ -319,6 +316,49 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state
}
}
func (st *Manager) logStateTransitions(ctx context.Context, alertRule *ngModels.AlertRule, newStates, staleStates []StateTransition) {
if st.historian == nil {
return
}
changedStates := make([]StateTransition, 0, len(staleStates))
for _, s := range newStates {
if s.changed() {
changedStates = append(changedStates, s)
}
}
// TODO refactor further. Let historian decide what to log. Current logic removes states `Normal (reason-X) -> Normal (reason-Y)`
for _, t := range staleStates {
if t.PreviousState == eval.Alerting {
changedStates = append(changedStates, t)
}
}
st.historian.RecordStates(ctx, alertRule, changedStates)
}
func (st *Manager) deleteAlertStates(ctx context.Context, logger log.Logger, states []StateTransition) {
if st.instanceStore == nil || len(states) == 0 {
return
}
logger.Debug("Deleting alert states", "count", len(states))
toDelete := make([]ngModels.AlertInstanceKey, 0, len(states))
for _, s := range states {
key, err := s.GetAlertInstanceKey()
if err != nil {
logger.Error("Failed to delete alert instance with invalid labels", "cacheID", s.CacheID, "error", err)
continue
}
toDelete = append(toDelete, key)
}
err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...)
if err != nil {
logger.Error("Failed to delete stale states", "error", err)
}
}
// TODO: why wouldn't you allow other types like NoData or Error?
func translateInstanceState(state ngModels.InstanceStateType) eval.State {
switch {
@ -331,72 +371,50 @@ func translateInstanceState(state ngModels.InstanceStateType) eval.State {
}
}
func (st *Manager) staleResultsHandler(ctx context.Context, logger log.Logger, r *ngModels.AlertRule, evaluatedAt time.Time) []StateTransition {
var (
// resolvedImage contains the image for all stale states that are resolved. The resolved image is shared between
// all resolved states as the alert rule is the same. TODO: We will need to change this when we support images
// without screenshots as each state will have a different image
resolvedImage *ngModels.Image
func (st *Manager) deleteStaleStatesFromCache(ctx context.Context, logger log.Logger, evaluatedAt time.Time, alertRule *ngModels.AlertRule) []StateTransition {
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
var resolvedImage *ngModels.Image
// resolvedStates contains the stale states that were resolved
resolvedStates []StateTransition
// staleStates contains the current set of stale states from the state cache
staleStates []*State
// toDelete contains the stale states to delete
toDelete []ngModels.AlertInstanceKey
)
staleStates = st.cache.deleteRuleStates(r.GetKey(), func(s *State) bool {
return stateIsStale(evaluatedAt, s.LastEvaluationTime, r.IntervalSeconds)
var resolvedStates []StateTransition
staleStates := st.cache.deleteRuleStates(alertRule.GetKey(), func(s *State) bool {
return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds)
})
for _, s := range staleStates {
logger.Info("Detected stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
oldState := s.State
oldReason := s.StateReason
key, err := s.GetAlertInstanceKey()
if err != nil {
logger.Error("Unable to get alert instance key to delete it from database. Ignoring", "error", err)
} else {
toDelete = append(toDelete, key)
}
// If the stale state is alerting then it should first be resolved
if s.State == eval.Alerting {
t := StateTransition{PreviousState: s.State, PreviousStateReason: s.StateReason}
s.Resolve(ngModels.StateReasonMissingSeries, evaluatedAt)
s.LastEvaluationTime = evaluatedAt
s.State = eval.Normal
s.StateReason = ngModels.StateReasonMissingSeries
s.EndsAt = evaluatedAt
s.LastEvaluationTime = evaluatedAt
if oldState == eval.Alerting {
s.Resolved = true
// If there is no resolved image for this rule then take one
if resolvedImage == nil {
image, err := takeImage(ctx, st.images, r)
image, err := takeImage(ctx, st.images, alertRule)
if err != nil {
logger.Warn("Failed to take an image",
"dashboard", r.GetDashboardUID(),
"panel", r.GetPanelID(),
"dashboard", alertRule.GetDashboardUID(),
"panel", alertRule.GetPanelID(),
"error", err)
} else if image != nil {
resolvedImage = image
}
}
s.Image = resolvedImage
t.State = s
resolvedStates = append(resolvedStates, t)
}
}
if st.historian != nil {
st.historian.RecordStates(ctx, r, resolvedStates)
}
if st.instanceStore != nil {
if err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...); err != nil {
logger.Error("Unable to delete stale instances from database", "error", err, "count", len(toDelete))
record := StateTransition{
State: s,
PreviousState: oldState,
PreviousStateReason: oldReason,
}
resolvedStates = append(resolvedStates, record)
}
return resolvedStates
}