Alerting: Add debug logs for EndsAt timestamp (#70336)

This commit adds debug logs for previous_ends_at and next_ends_at
to state.go to help us debug issues where alerts are resolved in
Alertmanager due to expiration. This change is in response to a
support escalation where this information was needed but unavailable.
This commit is contained in:
George Robinson 2023-06-20 10:13:38 +01:00 committed by GitHub
parent fc39dadf62
commit 815e98ed95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -190,31 +190,73 @@ func resultNormal(state *State, _ *models.AlertRule, result eval.Result, logger
if state.State == eval.Normal {
logger.Debug("Keeping state", "state", state.State)
} else {
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.Normal)
nextEndsAt := result.EvaluatedAt
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.Normal,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
// Normal states have the same start and end timestamps
state.SetNormal("", result.EvaluatedAt, result.EvaluatedAt)
state.SetNormal("", nextEndsAt, nextEndsAt)
}
}
func resultAlerting(state *State, rule *models.AlertRule, result eval.Result, logger log.Logger) {
switch state.State {
case eval.Alerting:
logger.Debug("Keeping state", "state", state.State)
prevEndsAt := state.EndsAt
state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Keeping state",
"state",
state.State,
"previous_ends_at",
prevEndsAt,
"next_ends_at",
state.EndsAt)
case eval.Pending:
// If the previous state is Pending then check if the For duration has been observed
if result.EvaluatedAt.Sub(state.StartsAt) >= rule.For {
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.Alerting)
state.SetAlerting("", result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.Alerting,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
state.SetAlerting("", result.EvaluatedAt, nextEndsAt)
}
default:
nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
if rule.For > 0 {
// If the alert rule has a For duration that should be observed then the state should be set to Pending
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.Pending)
state.SetPending("", result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.Pending,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
state.SetPending("", result.EvaluatedAt, nextEndsAt)
} else {
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.Alerting)
state.SetAlerting("", result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.Alerting,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
state.SetAlerting("", result.EvaluatedAt, nextEndsAt)
}
}
}
@ -228,12 +270,28 @@ func resultError(state *State, rule *models.AlertRule, result eval.Result, logge
state.StateReason = "error"
case models.ErrorErrState:
if state.State == eval.Error {
logger.Debug("Keeping state", "state", state.State)
prevEndsAt := state.EndsAt
state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Keeping state",
"state",
state.State,
"previous_ends_at",
prevEndsAt,
"next_ends_at",
state.EndsAt)
} else {
nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
// This is the first occurrence of an error
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.Error)
state.SetError(result.Error, result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.Error,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
state.SetError(result.Error, result.EvaluatedAt, nextEndsAt)
if result.Error != nil {
state.Annotations["Error"] = result.Error.Error()
@ -269,12 +327,28 @@ func resultNoData(state *State, rule *models.AlertRule, result eval.Result, logg
state.StateReason = models.NoData.String()
case models.NoData:
if state.State == eval.NoData {
logger.Debug("Keeping state", "state", state.State)
prevEndsAt := state.EndsAt
state.Maintain(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Keeping state",
"state",
state.State,
"previous_ends_at",
prevEndsAt,
"next_ends_at",
state.EndsAt)
} else {
// This is the first occurrence of no data
logger.Debug("Changing state", "previous_state", state.State, "next_state", eval.NoData)
state.SetNoData("", result.EvaluatedAt, nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt))
nextEndsAt := nextEndsTime(rule.IntervalSeconds, result.EvaluatedAt)
logger.Debug("Changing state",
"previous_state",
state.State,
"next_state",
eval.NoData,
"previous_ends_at",
state.EndsAt,
"next_ends_at",
nextEndsAt)
state.SetNoData("", result.EvaluatedAt, nextEndsAt)
}
case models.OK:
logger.Debug("Execution no data state is Normal", "handler", "resultNormal", "previous_handler", "resultNoData")