mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Clear the state cache when the alert routine stops (#99681)
This commit is contained in:
parent
97f4a164d1
commit
a0bf9202f5
@ -352,16 +352,23 @@ func (a *alertRule) Run() error {
|
|||||||
|
|
||||||
case <-grafanaCtx.Done():
|
case <-grafanaCtx.Done():
|
||||||
reason := grafanaCtx.Err()
|
reason := grafanaCtx.Err()
|
||||||
// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
|
|
||||||
|
// We do not want a context to be unbounded which could potentially cause a go routine running
|
||||||
|
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
|
||||||
|
// cases.
|
||||||
|
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
|
||||||
|
defer cancelFunc()
|
||||||
|
|
||||||
if errors.Is(reason, errRuleDeleted) {
|
if errors.Is(reason, errRuleDeleted) {
|
||||||
// We do not want a context to be unbounded which could potentially cause a go routine running
|
// Clean up the state and send resolved notifications for firing alerts only if the reason for stopping
|
||||||
// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
|
// the evaluation loop is that the rule was deleted.
|
||||||
// cases.
|
stateTransitions := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, a.key.AlertRuleKey), a.key, ngmodels.StateReasonRuleDeleted)
|
||||||
ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
|
a.expireAndSend(grafanaCtx, stateTransitions)
|
||||||
defer cancelFunc()
|
} else {
|
||||||
states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, a.key.AlertRuleKey), a.key, ngmodels.StateReasonRuleDeleted)
|
// Otherwise, just clean up the cache.
|
||||||
a.expireAndSend(grafanaCtx, states)
|
a.stateManager.ForgetStateByRuleUID(ngmodels.WithRuleKey(ctx, a.key.AlertRuleKey), a.key)
|
||||||
}
|
}
|
||||||
|
|
||||||
a.logger.Debug("Stopping alert rule routine", "reason", reason)
|
a.logger.Debug("Stopping alert rule routine", "reason", reason)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ package schedule
|
|||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
@ -278,7 +279,12 @@ func TestAlertRuleIdentifier(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func blankRuleForTests(ctx context.Context, key models.AlertRuleKeyWithGroup) *alertRule {
|
func blankRuleForTests(ctx context.Context, key models.AlertRuleKeyWithGroup) *alertRule {
|
||||||
return newAlertRule(ctx, key, nil, false, 0, nil, nil, nil, nil, nil, nil, log.NewNopLogger(), nil, nil, nil)
|
managerCfg := state.ManagerCfg{
|
||||||
|
Historian: &state.FakeHistorian{},
|
||||||
|
Log: log.NewNopLogger(),
|
||||||
|
}
|
||||||
|
st := state.NewManager(managerCfg, state.NewNoopPersister())
|
||||||
|
return newAlertRule(ctx, key, nil, false, 0, nil, st, nil, nil, nil, nil, log.NewNopLogger(), nil, nil, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRuleRoutine(t *testing.T) {
|
func TestRuleRoutine(t *testing.T) {
|
||||||
@ -477,12 +483,26 @@ func TestRuleRoutine(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
t.Run("should exit", func(t *testing.T) {
|
t.Run("should exit", func(t *testing.T) {
|
||||||
t.Run("and not clear the state if parent context is cancelled", func(t *testing.T) {
|
rule := gen.With(withQueryForState(t, eval.Alerting)).GenerateRef()
|
||||||
stoppedChan := make(chan error)
|
genEvalResults := func(now time.Time) eval.Results {
|
||||||
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
|
return eval.GenerateResults(
|
||||||
|
rand.Intn(5)+1,
|
||||||
|
eval.ResultGen(
|
||||||
|
eval.WithEvaluatedAt(now),
|
||||||
|
// State should be alerting to test resolved notifications in some cases.
|
||||||
|
// When the alert rule is firing and is deleted, we should send
|
||||||
|
// resolved notifications.
|
||||||
|
eval.WithState(eval.Alerting),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
rule := gen.GenerateRef()
|
t.Run("and clean up the state if parent context is cancelled", func(t *testing.T) {
|
||||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil, nil)
|
stoppedChan := make(chan error)
|
||||||
|
sender := NewSyncAlertsSenderMock()
|
||||||
|
sch, _, _, _ := createSchedule(make(chan time.Time), sender)
|
||||||
|
|
||||||
|
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, genEvalResults(sch.clock.Now()), nil, nil)
|
||||||
expectedStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
|
expectedStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
|
||||||
require.NotEmpty(t, expectedStates)
|
require.NotEmpty(t, expectedStates)
|
||||||
|
|
||||||
@ -497,14 +517,40 @@ func TestRuleRoutine(t *testing.T) {
|
|||||||
cancel()
|
cancel()
|
||||||
err := waitForErrChannel(t, stoppedChan)
|
err := waitForErrChannel(t, stoppedChan)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.Equal(t, len(expectedStates), len(sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)))
|
require.Empty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||||
|
sender.AlertsSenderMock.AssertNotCalled(t, "Send")
|
||||||
})
|
})
|
||||||
t.Run("and clean up the state if delete is cancellation reason for inner context", func(t *testing.T) {
|
|
||||||
stoppedChan := make(chan error)
|
|
||||||
sch, _, _, _ := createSchedule(make(chan time.Time), nil)
|
|
||||||
|
|
||||||
rule := gen.GenerateRef()
|
t.Run("and clean up the state but not send anything if the reason is not rule deleted", func(t *testing.T) {
|
||||||
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, eval.GenerateResults(rand.Intn(5)+1, eval.ResultGen(eval.WithEvaluatedAt(sch.clock.Now()))), nil, nil)
|
stoppedChan := make(chan error)
|
||||||
|
sender := NewSyncAlertsSenderMock()
|
||||||
|
sch, _, _, _ := createSchedule(make(chan time.Time), sender)
|
||||||
|
|
||||||
|
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, genEvalResults(sch.clock.Now()), nil, nil)
|
||||||
|
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||||
|
|
||||||
|
factory := ruleFactoryFromScheduler(sch)
|
||||||
|
ruleInfo := factory.new(context.Background(), rule)
|
||||||
|
go func() {
|
||||||
|
err := ruleInfo.Run()
|
||||||
|
stoppedChan <- err
|
||||||
|
}()
|
||||||
|
|
||||||
|
ruleInfo.Stop(errors.New("some reason"))
|
||||||
|
err := waitForErrChannel(t, stoppedChan)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
require.Empty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||||
|
sender.AlertsSenderMock.AssertNotCalled(t, "Send")
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("and send resolved notifications if errRuleDeleted is the reason for stopping", func(t *testing.T) {
|
||||||
|
stoppedChan := make(chan error)
|
||||||
|
sender := NewSyncAlertsSenderMock()
|
||||||
|
sender.EXPECT().Send(mock.Anything, mock.Anything, mock.Anything).Times(1)
|
||||||
|
sch, _, _, _ := createSchedule(make(chan time.Time), sender)
|
||||||
|
|
||||||
|
_ = sch.stateManager.ProcessEvalResults(context.Background(), sch.clock.Now(), rule, genEvalResults(sch.clock.Now()), nil, nil)
|
||||||
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||||
|
|
||||||
factory := ruleFactoryFromScheduler(sch)
|
factory := ruleFactoryFromScheduler(sch)
|
||||||
@ -519,6 +565,7 @@ func TestRuleRoutine(t *testing.T) {
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
|
|
||||||
require.Empty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
require.Empty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
|
||||||
|
sender.AlertsSenderMock.AssertExpectations(t)
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -242,7 +242,7 @@ func (st *Manager) DeleteStateByRuleUID(ctx context.Context, ruleKey ngModels.Al
|
|||||||
logger := st.log.FromContext(ctx)
|
logger := st.log.FromContext(ctx)
|
||||||
logger.Debug("Resetting state of the rule")
|
logger.Debug("Resetting state of the rule")
|
||||||
|
|
||||||
states := st.cache.removeByRuleUID(ruleKey.OrgID, ruleKey.UID)
|
states := st.ForgetStateByRuleUID(ctx, ruleKey)
|
||||||
|
|
||||||
if len(states) == 0 {
|
if len(states) == 0 {
|
||||||
return nil
|
return nil
|
||||||
@ -280,11 +280,19 @@ func (st *Manager) DeleteStateByRuleUID(ctx context.Context, ruleKey ngModels.Al
|
|||||||
logger.Error("Failed to delete states that belong to a rule from database", "error", err)
|
logger.Error("Failed to delete states that belong to a rule from database", "error", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.Info("Rules state was reset", "states", len(states))
|
logger.Info("Rules state was reset", "states", len(states))
|
||||||
|
|
||||||
return transitions
|
return transitions
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (st *Manager) ForgetStateByRuleUID(ctx context.Context, ruleKey ngModels.AlertRuleKeyWithGroup) []*State {
|
||||||
|
logger := st.log.FromContext(ctx)
|
||||||
|
logger.Debug("Removing rule state from cache")
|
||||||
|
|
||||||
|
return st.cache.removeByRuleUID(ruleKey.OrgID, ruleKey.UID)
|
||||||
|
}
|
||||||
|
|
||||||
// ResetStateByRuleUID removes the rule instances from cache and instanceStore and saves state history. If the state
|
// ResetStateByRuleUID removes the rule instances from cache and instanceStore and saves state history. If the state
|
||||||
// history has to be saved, rule must not be nil.
|
// history has to be saved, rule must not be nil.
|
||||||
func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.AlertRule, reason string) []StateTransition {
|
func (st *Manager) ResetStateByRuleUID(ctx context.Context, rule *ngModels.AlertRule, reason string) []StateTransition {
|
||||||
|
Loading…
Reference in New Issue
Block a user