mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Extract large closures in ruleRoutine (#84035)
* extract notify
* extract resetState
* move evaluate metrics inside evaluate
* split out evaluate
This commit is contained in:
@@ -180,12 +180,122 @@ func (a *alertRuleInfo) stop(reason error) {
|
|||||||
a.stopFn(reason)
|
a.stopFn(reason)
|
||||||
}
|
}
|
||||||
|
|
||||||
//nolint:gocyclo
|
|
||||||
func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
|
func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
|
||||||
grafanaCtx := ngmodels.WithRuleKey(a.ctx, key)
|
grafanaCtx := ngmodels.WithRuleKey(a.ctx, key)
|
||||||
logger := a.logger.FromContext(grafanaCtx)
|
logger := a.logger.FromContext(grafanaCtx)
|
||||||
logger.Debug("Alert rule routine started")
|
logger.Debug("Alert rule routine started")
|
||||||
|
|
||||||
|
evalRunning := false
|
||||||
|
var currentFingerprint fingerprint
|
||||||
|
defer a.stopApplied(key)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
// used by external services (API) to notify that rule is updated.
|
||||||
|
case ctx := <-a.updateCh:
|
||||||
|
if currentFingerprint == ctx.Fingerprint {
|
||||||
|
logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
|
||||||
|
// clear the state. So the next evaluation will start from the scratch.
|
||||||
|
a.resetState(grafanaCtx, key, ctx.IsPaused)
|
||||||
|
currentFingerprint = ctx.Fingerprint
|
||||||
|
// evalCh - used by the scheduler to signal that evaluation is needed.
|
||||||
|
case ctx, ok := <-a.evalCh:
|
||||||
|
if !ok {
|
||||||
|
logger.Debug("Evaluation channel has been closed. Exiting")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if evalRunning {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
func() {
|
||||||
|
evalRunning = true
|
||||||
|
defer func() {
|
||||||
|
evalRunning = false
|
||||||
|
a.evalApplied(key, ctx.scheduledAt)
|
||||||
|
}()
|
||||||
|
|
||||||
|
for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
|
||||||
|
isPaused := ctx.rule.IsPaused
|
||||||
|
f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
|
||||||
|
// Do not clean up state if the eval loop has just started.
|
||||||
|
var needReset bool
|
||||||
|
if currentFingerprint != 0 && currentFingerprint != f {
|
||||||
|
logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
|
||||||
|
needReset = true
|
||||||
|
}
|
||||||
|
// We need to reset state if the loop has started and the alert is already paused. It can happen,
|
||||||
|
// if we have an alert with state and we do file provision with stateful Grafana, that state
|
||||||
|
// lingers in DB and won't be cleaned up until next alert rule update.
|
||||||
|
needReset = needReset || (currentFingerprint == 0 && isPaused)
|
||||||
|
if needReset {
|
||||||
|
a.resetState(grafanaCtx, key, isPaused)
|
||||||
|
}
|
||||||
|
currentFingerprint = f
|
||||||
|
if isPaused {
|
||||||
|
logger.Debug("Skip rule evaluation because it is paused")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fpStr := currentFingerprint.String()
|
||||||
|
utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
|
||||||
|
tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes(
|
||||||
|
attribute.String("rule_uid", ctx.rule.UID),
|
||||||
|
attribute.Int64("org_id", ctx.rule.OrgID),
|
||||||
|
attribute.Int64("rule_version", ctx.rule.Version),
|
||||||
|
attribute.String("rule_fingerprint", fpStr),
|
||||||
|
attribute.String("tick", utcTick),
|
||||||
|
))
|
||||||
|
|
||||||
|
// Check before any execution if the context was cancelled so that we don't do any evaluations.
|
||||||
|
if tracingCtx.Err() != nil {
|
||||||
|
span.SetStatus(codes.Error, "rule evaluation cancelled")
|
||||||
|
span.End()
|
||||||
|
logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
retry := attempt < a.maxAttempts
|
||||||
|
err := a.evaluate(tracingCtx, key, f, attempt, ctx, span, retry)
|
||||||
|
// This is extremely confusing - when we exhaust all retry attempts, or we have no retryable errors
|
||||||
|
// we return nil - so technically, this is meaningless to know whether the evaluation has errors or not.
|
||||||
|
span.End()
|
||||||
|
if err == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
|
||||||
|
select {
|
||||||
|
case <-tracingCtx.Done():
|
||||||
|
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
||||||
|
return
|
||||||
|
case <-time.After(retryDelay):
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
case <-grafanaCtx.Done():
|
||||||
|
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
|
||||||
|
if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
|
||||||
|
// We do not want a context to be unbounded which could potentially cause a go routine running
|
||||||
|
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
|
||||||
|
// cases.
|
||||||
|
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
|
||||||
|
defer cancelFunc()
|
||||||
|
states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
|
||||||
|
a.notify(grafanaCtx, key, states)
|
||||||
|
}
|
||||||
|
logger.Debug("Stopping alert rule routine")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *alertRuleInfo) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
|
||||||
orgID := fmt.Sprint(key.OrgID)
|
orgID := fmt.Sprint(key.OrgID)
|
||||||
evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
|
evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
|
||||||
evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
|
evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
|
||||||
@@ -193,25 +303,7 @@ func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
|
|||||||
processDuration := a.metrics.ProcessDuration.WithLabelValues(orgID)
|
processDuration := a.metrics.ProcessDuration.WithLabelValues(orgID)
|
||||||
sendDuration := a.metrics.SendDuration.WithLabelValues(orgID)
|
sendDuration := a.metrics.SendDuration.WithLabelValues(orgID)
|
||||||
|
|
||||||
notify := func(states []state.StateTransition) {
|
logger := a.logger.FromContext(ctx).New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
|
||||||
expiredAlerts := state.FromAlertsStateToStoppedAlert(states, a.appURL, a.clock)
|
|
||||||
if len(expiredAlerts.PostableAlerts) > 0 {
|
|
||||||
a.sender.Send(grafanaCtx, key, expiredAlerts)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
resetState := func(ctx context.Context, isPaused bool) {
|
|
||||||
rule := a.ruleProvider.get(key)
|
|
||||||
reason := ngmodels.StateReasonUpdated
|
|
||||||
if isPaused {
|
|
||||||
reason = ngmodels.StateReasonPaused
|
|
||||||
}
|
|
||||||
states := a.stateManager.ResetStateByRuleUID(ctx, rule, reason)
|
|
||||||
notify(states)
|
|
||||||
}
|
|
||||||
|
|
||||||
evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
|
|
||||||
logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
|
|
||||||
start := a.clock.Now()
|
start := a.clock.Now()
|
||||||
|
|
||||||
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), a.newLoadedMetricsReader(e.rule))
|
evalCtx := eval.NewContextWithPreviousResults(ctx, SchedulerUserFor(e.rule.OrgID), a.newLoadedMetricsReader(e.rule))
|
||||||
@@ -301,114 +393,21 @@ func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
evalRunning := false
|
func (a *alertRuleInfo) notify(ctx context.Context, key ngmodels.AlertRuleKey, states []state.StateTransition) {
|
||||||
var currentFingerprint fingerprint
|
expiredAlerts := state.FromAlertsStateToStoppedAlert(states, a.appURL, a.clock)
|
||||||
defer a.stopApplied(key)
|
if len(expiredAlerts.PostableAlerts) > 0 {
|
||||||
for {
|
a.sender.Send(ctx, key, expiredAlerts)
|
||||||
select {
|
}
|
||||||
// used by external services (API) to notify that rule is updated.
|
|
||||||
case ctx := <-a.updateCh:
|
|
||||||
if currentFingerprint == ctx.Fingerprint {
|
|
||||||
logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
|
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
|
func (a *alertRuleInfo) resetState(ctx context.Context, key ngmodels.AlertRuleKey, isPaused bool) {
|
||||||
// clear the state. So the next evaluation will start from the scratch.
|
rule := a.ruleProvider.get(key)
|
||||||
resetState(grafanaCtx, ctx.IsPaused)
|
reason := ngmodels.StateReasonUpdated
|
||||||
currentFingerprint = ctx.Fingerprint
|
|
||||||
// evalCh - used by the scheduler to signal that evaluation is needed.
|
|
||||||
case ctx, ok := <-a.evalCh:
|
|
||||||
if !ok {
|
|
||||||
logger.Debug("Evaluation channel has been closed. Exiting")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
if evalRunning {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
func() {
|
|
||||||
evalRunning = true
|
|
||||||
defer func() {
|
|
||||||
evalRunning = false
|
|
||||||
a.evalApplied(key, ctx.scheduledAt)
|
|
||||||
}()
|
|
||||||
|
|
||||||
for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
|
|
||||||
isPaused := ctx.rule.IsPaused
|
|
||||||
f := ruleWithFolder{ctx.rule, ctx.folderTitle}.Fingerprint()
|
|
||||||
// Do not clean up state if the eval loop has just started.
|
|
||||||
var needReset bool
|
|
||||||
if currentFingerprint != 0 && currentFingerprint != f {
|
|
||||||
logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
|
|
||||||
needReset = true
|
|
||||||
}
|
|
||||||
// We need to reset state if the loop has started and the alert is already paused. It can happen,
|
|
||||||
// if we have an alert with state and we do file provision with stateful Grafana, that state
|
|
||||||
// lingers in DB and won't be cleaned up until next alert rule update.
|
|
||||||
needReset = needReset || (currentFingerprint == 0 && isPaused)
|
|
||||||
if needReset {
|
|
||||||
resetState(grafanaCtx, isPaused)
|
|
||||||
}
|
|
||||||
currentFingerprint = f
|
|
||||||
if isPaused {
|
if isPaused {
|
||||||
logger.Debug("Skip rule evaluation because it is paused")
|
reason = ngmodels.StateReasonPaused
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
fpStr := currentFingerprint.String()
|
|
||||||
utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
|
|
||||||
tracingCtx, span := a.tracer.Start(grafanaCtx, "alert rule execution", trace.WithAttributes(
|
|
||||||
attribute.String("rule_uid", ctx.rule.UID),
|
|
||||||
attribute.Int64("org_id", ctx.rule.OrgID),
|
|
||||||
attribute.Int64("rule_version", ctx.rule.Version),
|
|
||||||
attribute.String("rule_fingerprint", fpStr),
|
|
||||||
attribute.String("tick", utcTick),
|
|
||||||
))
|
|
||||||
|
|
||||||
// Check before any execution if the context was cancelled so that we don't do any evaluations.
|
|
||||||
if tracingCtx.Err() != nil {
|
|
||||||
span.SetStatus(codes.Error, "rule evaluation cancelled")
|
|
||||||
span.End()
|
|
||||||
logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
retry := attempt < a.maxAttempts
|
|
||||||
err := evaluate(tracingCtx, f, attempt, ctx, span, retry)
|
|
||||||
// This is extremely confusing - when we exhaust all retry attempts, or we have no retryable errors
|
|
||||||
// we return nil - so technically, this is meaningless to know whether the evaluation has errors or not.
|
|
||||||
span.End()
|
|
||||||
if err == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
|
|
||||||
select {
|
|
||||||
case <-tracingCtx.Done():
|
|
||||||
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
|
|
||||||
return
|
|
||||||
case <-time.After(retryDelay):
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
case <-grafanaCtx.Done():
|
|
||||||
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
|
|
||||||
if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
|
|
||||||
// We do not want a context to be unbounded which could potentially cause a go routine running
|
|
||||||
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
|
|
||||||
// cases.
|
|
||||||
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
|
|
||||||
defer cancelFunc()
|
|
||||||
states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
|
|
||||||
notify(states)
|
|
||||||
}
|
|
||||||
logger.Debug("Stopping alert rule routine")
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
states := a.stateManager.ResetStateByRuleUID(ctx, rule, reason)
|
||||||
|
a.notify(ctx, key, states)
|
||||||
}
|
}
|
||||||
|
|
||||||
// evalApplied is only used on tests.
|
// evalApplied is only used on tests.
|
||||||
|
|||||||
Reference in New Issue
Block a user