Alerting: Scheduler and registry handle rules by an interface (#84044)

* Export Evaluation

* Export RuleVersionAndPauseStatus

* Export Eval, create interface

* Export update and add to interface

* Export Stop and Run and add to interface

* Registry and scheduler use rule by interface and not concrete type

* Update factory to use interface, update tests to work over public API rather than writing to channels directly

* Rename map in registry

* Rename getOrCreateInfo to not reference a specific implementation

* Genericize alertRuleInfoRegistry into ruleRegistry

* Rename alertRuleInfo to alertRule

* Comments on interface

* Update pkg/services/ngalert/schedule/schedule.go

Co-authored-by: Jean-Philippe Quéméner <JohnnyQQQQ@users.noreply.github.com>

Alexander Weaver 2024-03-11 15:57:38 -05:00 committed by GitHub
parent 0b2640e9ff
commit 6c5e94095d
6 changed files with 142 additions and 124 deletions


@@ -23,9 +23,22 @@ import (
"go.opentelemetry.io/otel/trace"
)
type ruleFactoryFunc func(context.Context) *alertRuleInfo
// Rule represents a single piece of work that is executed periodically by the ruler.
type Rule interface {
// Run creates the resources that will perform the rule's work, and starts it. It blocks indefinitely until Stop is called or the parent context is canceled.
Run(key ngmodels.AlertRuleKey) error
// Stop shuts down the rule's execution with an optional reason. It has no effect if the rule has not yet been Run.
Stop(reason error)
// Eval sends a signal to execute the work represented by the rule, exactly one time.
// It has no effect if the rule has not yet been Run, or if the rule is Stopped.
Eval(eval *Evaluation) (bool, *Evaluation)
// Update sends a signal to change the definition of the rule.
Update(lastVersion RuleVersionAndPauseStatus) bool
}
func (f ruleFactoryFunc) new(ctx context.Context) *alertRuleInfo {
type ruleFactoryFunc func(context.Context) Rule
func (f ruleFactoryFunc) new(ctx context.Context) Rule {
return f(ctx)
}
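
For orientation, here is a minimal sketch of the lifecycle this interface implies, as calling code inside the schedule package might drive it. The helper name exampleLifecycle and the literal values are illustrative only, not part of this change:

// exampleLifecycle is a hypothetical illustration of how a Rule is driven.
func exampleLifecycle(ctx context.Context, key ngmodels.AlertRuleKey, factory ruleFactory) {
	rule := factory.new(ctx)

	// Run blocks until Stop is called or the context is canceled,
	// so it gets its own goroutine.
	go func() {
		_ = rule.Run(key)
	}()

	// Ask the routine to evaluate once. If a previously queued evaluation
	// has not been consumed yet, it comes back as dropped.
	if sent, dropped := rule.Eval(&Evaluation{scheduledAt: time.Now()}); sent && dropped != nil {
		_ = dropped // the dropped evaluation was superseded by this one
	}

	// Tell the routine the rule definition changed.
	rule.Update(RuleVersionAndPauseStatus{IsPaused: false})

	// Shut the routine down with a reason.
	rule.Stop(errRuleDeleted)
}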
@@ -44,8 +57,8 @@ func newRuleFactory(
evalAppliedHook evalAppliedFunc,
stopAppliedHook stopAppliedFunc,
) ruleFactoryFunc {
return func(ctx context.Context) *alertRuleInfo {
return newAlertRuleInfo(
return func(ctx context.Context) Rule {
return newAlertRule(
ctx,
appURL,
disableGrafanaFolder,
@@ -71,9 +84,9 @@ type ruleProvider interface {
get(ngmodels.AlertRuleKey) *ngmodels.AlertRule
}
type alertRuleInfo struct {
evalCh chan *evaluation
updateCh chan ruleVersionAndPauseStatus
type alertRule struct {
evalCh chan *Evaluation
updateCh chan RuleVersionAndPauseStatus
ctx context.Context
stopFn util.CancelCauseFunc
@@ -96,7 +109,7 @@ type alertRuleInfo struct {
tracer tracing.Tracer
}
func newAlertRuleInfo(
func newAlertRule(
parent context.Context,
appURL *url.URL,
disableGrafanaFolder bool,
@@ -111,11 +124,11 @@ func newAlertRuleInfo(
tracer tracing.Tracer,
evalAppliedHook func(ngmodels.AlertRuleKey, time.Time),
stopAppliedHook func(ngmodels.AlertRuleKey),
) *alertRuleInfo {
) *alertRule {
ctx, stop := util.WithCancelCause(parent)
return &alertRuleInfo{
evalCh: make(chan *evaluation),
updateCh: make(chan ruleVersionAndPauseStatus),
return &alertRule{
evalCh: make(chan *Evaluation),
updateCh: make(chan RuleVersionAndPauseStatus),
ctx: ctx,
stopFn: stop,
appURL: appURL,
@@ -141,9 +154,9 @@ func newAlertRuleInfo(
// - false when the send operation is stopped
//
// the second element contains a dropped message that was sent by a concurrent sender.
func (a *alertRuleInfo) eval(eval *evaluation) (bool, *evaluation) {
func (a *alertRule) Eval(eval *Evaluation) (bool, *Evaluation) {
// read the channel in a non-blocking manner to make sure that there is no concurrent send operation.
var droppedMsg *evaluation
var droppedMsg *Evaluation
select {
case droppedMsg = <-a.evalCh:
default:
@@ -158,7 +171,7 @@ func (a *alertRuleInfo) eval(eval *evaluation) (bool, *evaluation) {
}
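
The shape of Eval above — drain the channel with a non-blocking receive, then attempt the real send guarded by the context — is what lets a fresh evaluation supersede one that no receiver has picked up yet. Below is a self-contained sketch of the same pattern with generic names rather than the actual types; note the real evalCh is unbuffered, and the one-slot buffer here only makes the demo deterministic:

package main

import (
	"context"
	"fmt"
)

// trySend delivers msg on ch, first discarding any message still queued,
// and gives back whatever it displaced. It returns false if ctx is done
// before the send completes.
func trySend[T any](ctx context.Context, ch chan T, msg T) (bool, *T) {
	// Non-blocking receive: clear out a stale message, if any.
	var dropped *T
	select {
	case old := <-ch:
		dropped = &old
	default:
	}
	// Blocking send, abandoned if the routine has been stopped.
	select {
	case ch <- msg:
		return true, dropped
	case <-ctx.Done():
		return false, dropped
	}
}

func main() {
	ch := make(chan int, 1)
	trySend(context.Background(), ch, 1)                  // queued
	sent, dropped := trySend(context.Background(), ch, 2) // displaces 1
	fmt.Println(sent, *dropped)                           // true 1
}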
// update sends an instruction to the rule evaluation routine to update the scheduled rule to the specified version. The specified version must be later than the current version; otherwise, no update will happen.
func (a *alertRuleInfo) update(lastVersion ruleVersionAndPauseStatus) bool {
func (a *alertRule) Update(lastVersion RuleVersionAndPauseStatus) bool {
// check if the channel is not empty.
select {
case <-a.updateCh:
@@ -176,11 +189,13 @@ func (a *alertRuleInfo) update(lastVersion ruleVersionAndPauseStatus) bool {
}
// stop sends an instruction to the rule evaluation routine to shut down. An optional shutdown reason can be given.
func (a *alertRuleInfo) stop(reason error) {
a.stopFn(reason)
func (a *alertRule) Stop(reason error) {
if a.stopFn != nil {
a.stopFn(reason)
}
}
func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
grafanaCtx := ngmodels.WithRuleKey(a.ctx, key)
logger := a.logger.FromContext(grafanaCtx)
logger.Debug("Alert rule routine started")
@@ -295,7 +310,7 @@ func (a *alertRuleInfo) run(key ngmodels.AlertRuleKey) error {
}
}
func (a *alertRuleInfo) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f fingerprint, attempt int64, e *Evaluation, span trace.Span, retry bool) error {
orgID := fmt.Sprint(key.OrgID)
evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)
evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
@@ -393,14 +408,14 @@ func (a *alertRuleInfo) evaluate(ctx context.Context, key ngmodels.AlertRuleKey,
return nil
}
func (a *alertRuleInfo) notify(ctx context.Context, key ngmodels.AlertRuleKey, states []state.StateTransition) {
func (a *alertRule) notify(ctx context.Context, key ngmodels.AlertRuleKey, states []state.StateTransition) {
expiredAlerts := state.FromAlertsStateToStoppedAlert(states, a.appURL, a.clock)
if len(expiredAlerts.PostableAlerts) > 0 {
a.sender.Send(ctx, key, expiredAlerts)
}
}
func (a *alertRuleInfo) resetState(ctx context.Context, key ngmodels.AlertRuleKey, isPaused bool) {
func (a *alertRule) resetState(ctx context.Context, key ngmodels.AlertRuleKey, isPaused bool) {
rule := a.ruleProvider.get(key)
reason := ngmodels.StateReasonUpdated
if isPaused {
@@ -411,7 +426,7 @@ func (a *alertRuleInfo) resetState(ctx context.Context, key ngmodels.AlertRuleKe
}
// evalApplied is only used in tests.
func (a *alertRuleInfo) evalApplied(alertDefKey ngmodels.AlertRuleKey, now time.Time) {
func (a *alertRule) evalApplied(alertDefKey ngmodels.AlertRuleKey, now time.Time) {
if a.evalAppliedHook == nil {
return
}
@@ -420,7 +435,7 @@ func (a *alertRuleInfo) evalApplied(alertDefKey ngmodels.AlertRuleKey, now time.
}
// stopApplied is only used in tests.
func (a *alertRuleInfo) stopApplied(alertDefKey ngmodels.AlertRuleKey) {
func (a *alertRule) stopApplied(alertDefKey ngmodels.AlertRuleKey) {
if a.stopAppliedHook == nil {
return
}


@@ -26,18 +26,18 @@ import (
"github.com/stretchr/testify/require"
)
func TestAlertRuleInfo(t *testing.T) {
func TestAlertRule(t *testing.T) {
type evalResponse struct {
success bool
droppedEval *evaluation
droppedEval *Evaluation
}
t.Run("when rule evaluation is not stopped", func(t *testing.T) {
t.Run("update should send to updateCh", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r := blankRuleForTests(context.Background())
resultCh := make(chan bool)
go func() {
resultCh <- r.update(ruleVersionAndPauseStatus{fingerprint(rand.Uint64()), false})
resultCh <- r.Update(RuleVersionAndPauseStatus{fingerprint(rand.Uint64()), false})
}()
select {
case <-r.updateCh:
@@ -47,22 +47,22 @@ func TestAlertRuleInfo(t *testing.T) {
}
})
t.Run("update should drop any concurrent sending to updateCh", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
version1 := ruleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}
version2 := ruleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}
r := blankRuleForTests(context.Background())
version1 := RuleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}
version2 := RuleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
wg.Done()
r.update(version1)
r.Update(version1)
wg.Done()
}()
wg.Wait()
wg.Add(2) // one when version1 is sent, another when the goroutine for version2 has started
go func() {
wg.Done()
r.update(version2)
r.Update(version2)
}()
wg.Wait() // at this point tick 1 has already been dropped
select {
@@ -73,16 +73,16 @@ func TestAlertRuleInfo(t *testing.T) {
}
})
t.Run("eval should send to evalCh", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r := blankRuleForTests(context.Background())
expected := time.Now()
resultCh := make(chan evalResponse)
data := &evaluation{
data := &Evaluation{
scheduledAt: expected,
rule: models.AlertRuleGen()(),
folderTitle: util.GenerateShortUID(),
}
go func() {
result, dropped := r.eval(data)
result, dropped := r.Eval(data)
resultCh <- evalResponse{result, dropped}
}()
select {
@@ -96,17 +96,17 @@ func TestAlertRuleInfo(t *testing.T) {
}
})
t.Run("eval should drop any concurrent sending to evalCh", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r := blankRuleForTests(context.Background())
time1 := time.UnixMilli(rand.Int63n(math.MaxInt64))
time2 := time.UnixMilli(rand.Int63n(math.MaxInt64))
resultCh1 := make(chan evalResponse)
resultCh2 := make(chan evalResponse)
data := &evaluation{
data := &Evaluation{
scheduledAt: time1,
rule: models.AlertRuleGen()(),
folderTitle: util.GenerateShortUID(),
}
data2 := &evaluation{
data2 := &Evaluation{
scheduledAt: time2,
rule: data.rule,
folderTitle: data.folderTitle,
@@ -115,7 +115,7 @@ func TestAlertRuleInfo(t *testing.T) {
wg.Add(1)
go func() {
wg.Done()
result, dropped := r.eval(data)
result, dropped := r.Eval(data)
wg.Done()
resultCh1 <- evalResponse{result, dropped}
}()
@@ -123,7 +123,7 @@ func TestAlertRuleInfo(t *testing.T) {
wg.Add(2) // one when time1 is sent, another when go-routine for time2 has started
go func() {
wg.Done()
result, dropped := r.eval(data2)
result, dropped := r.Eval(data2)
resultCh2 <- evalResponse{result, dropped}
}()
wg.Wait() // at this point tick 1 has already been dropped
@@ -142,19 +142,19 @@ func TestAlertRuleInfo(t *testing.T) {
}
})
t.Run("eval should exit when context is cancelled", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r := blankRuleForTests(context.Background())
resultCh := make(chan evalResponse)
data := &evaluation{
data := &Evaluation{
scheduledAt: time.Now(),
rule: models.AlertRuleGen()(),
folderTitle: util.GenerateShortUID(),
}
go func() {
result, dropped := r.eval(data)
result, dropped := r.Eval(data)
resultCh <- evalResponse{result, dropped}
}()
runtime.Gosched()
r.stop(nil)
r.Stop(nil)
select {
case result := <-resultCh:
require.False(t, result.success)
@@ -166,37 +166,37 @@ func TestAlertRuleInfo(t *testing.T) {
})
t.Run("when rule evaluation is stopped", func(t *testing.T) {
t.Run("Update should do nothing", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r.stop(errRuleDeleted)
r := blankRuleForTests(context.Background())
r.Stop(errRuleDeleted)
require.ErrorIs(t, r.ctx.Err(), errRuleDeleted)
require.False(t, r.update(ruleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}))
require.False(t, r.Update(RuleVersionAndPauseStatus{fingerprint(rand.Uint64()), false}))
})
t.Run("eval should do nothing", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r.stop(nil)
data := &evaluation{
r := blankRuleForTests(context.Background())
r.Stop(nil)
data := &Evaluation{
scheduledAt: time.Now(),
rule: models.AlertRuleGen()(),
folderTitle: util.GenerateShortUID(),
}
success, dropped := r.eval(data)
success, dropped := r.Eval(data)
require.False(t, success)
require.Nilf(t, dropped, "expected no dropped evaluations but got one")
})
t.Run("stop should do nothing", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r.stop(nil)
r.stop(nil)
r := blankRuleForTests(context.Background())
r.Stop(nil)
r.Stop(nil)
})
t.Run("stop should do nothing if parent context stopped", func(t *testing.T) {
ctx, cancelFn := context.WithCancel(context.Background())
r := blankRuleInfoForTests(ctx)
r := blankRuleForTests(ctx)
cancelFn()
r.stop(nil)
r.Stop(nil)
})
})
t.Run("should be thread-safe", func(t *testing.T) {
r := blankRuleInfoForTests(context.Background())
r := blankRuleForTests(context.Background())
wg := sync.WaitGroup{}
go func() {
for {
@@ -221,15 +221,15 @@ func TestAlertRuleInfo(t *testing.T) {
}
switch rand.Intn(max) + 1 {
case 1:
r.update(ruleVersionAndPauseStatus{fingerprint(rand.Uint64()), false})
r.Update(RuleVersionAndPauseStatus{fingerprint(rand.Uint64()), false})
case 2:
r.eval(&evaluation{
r.Eval(&Evaluation{
scheduledAt: time.Now(),
rule: models.AlertRuleGen()(),
folderTitle: util.GenerateShortUID(),
})
case 3:
r.stop(nil)
r.Stop(nil)
}
}
wg.Done()
@@ -240,9 +240,8 @@ func TestAlertRuleInfo(t *testing.T) {
})
}
func blankRuleInfoForTests(ctx context.Context) *alertRuleInfo {
factory := newRuleFactory(nil, false, 0, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil)
return factory.new(context.Background())
func blankRuleForTests(ctx context.Context) *alertRule {
return newAlertRule(context.Background(), nil, false, 0, nil, nil, nil, nil, nil, nil, nil, nil, nil, nil)
}
func TestRuleRoutine(t *testing.T) {
@@ -279,16 +278,16 @@ func TestRuleRoutine(t *testing.T) {
t.Cleanup(cancel)
ruleInfo := factory.new(ctx)
go func() {
_ = ruleInfo.run(rule.GetKey())
_ = ruleInfo.Run(rule.GetKey())
}()
expectedTime := time.UnixMicro(rand.Int63())
ruleInfo.evalCh <- &evaluation{
ruleInfo.Eval(&Evaluation{
scheduledAt: expectedTime,
rule: rule,
folderTitle: folderTitle,
}
})
actualTime := waitForTimeChannel(t, evalAppliedChan)
require.Equal(t, expectedTime, actualTime)
@@ -428,7 +427,7 @@ func TestRuleRoutine(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
ruleInfo := factory.new(ctx)
go func() {
err := ruleInfo.run(models.AlertRuleKey{})
err := ruleInfo.Run(models.AlertRuleKey{})
stoppedChan <- err
}()
@@ -448,11 +447,11 @@ func TestRuleRoutine(t *testing.T) {
factory := ruleFactoryFromScheduler(sch)
ruleInfo := factory.new(context.Background())
go func() {
err := ruleInfo.run(rule.GetKey())
err := ruleInfo.Run(rule.GetKey())
stoppedChan <- err
}()
ruleInfo.stop(errRuleDeleted)
ruleInfo.Stop(errRuleDeleted)
err := waitForErrChannel(t, stoppedChan)
require.NoError(t, err)
@@ -479,15 +478,15 @@ func TestRuleRoutine(t *testing.T) {
ruleInfo := factory.new(ctx)
go func() {
_ = ruleInfo.run(rule.GetKey())
_ = ruleInfo.Run(rule.GetKey())
}()
// init the evaluation loop so it gets the rule version
ruleInfo.evalCh <- &evaluation{
ruleInfo.Eval(&Evaluation{
scheduledAt: sch.clock.Now(),
rule: rule,
folderTitle: folderTitle,
}
})
waitForTimeChannel(t, evalAppliedChan)
@@ -519,8 +518,8 @@ func TestRuleRoutine(t *testing.T) {
require.Greaterf(t, expectedToBeSent, 0, "State manager was expected to return at least one state that can be expired")
t.Run("should do nothing if version in channel is the same", func(t *testing.T) {
ruleInfo.updateCh <- ruleVersionAndPauseStatus{ruleFp, false}
ruleInfo.updateCh <- ruleVersionAndPauseStatus{ruleFp, false} // second time just to make sure that previous messages were handled
ruleInfo.Update(RuleVersionAndPauseStatus{ruleFp, false})
ruleInfo.Update(RuleVersionAndPauseStatus{ruleFp, false}) // second time just to make sure that previous messages were handled
actualStates := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
require.Len(t, actualStates, len(states))
@@ -529,7 +528,7 @@ func TestRuleRoutine(t *testing.T) {
})
t.Run("should clear the state and expire firing alerts if version in channel is greater", func(t *testing.T) {
ruleInfo.updateCh <- ruleVersionAndPauseStatus{ruleFp + 1, false}
ruleInfo.Update(RuleVersionAndPauseStatus{ruleFp + 1, false})
require.Eventually(t, func() bool {
return len(sender.Calls()) > 0
@@ -561,13 +560,13 @@ func TestRuleRoutine(t *testing.T) {
ruleInfo := factory.new(ctx)
go func() {
_ = ruleInfo.run(rule.GetKey())
_ = ruleInfo.Run(rule.GetKey())
}()
ruleInfo.evalCh <- &evaluation{
ruleInfo.Eval(&Evaluation{
scheduledAt: sch.clock.Now(),
rule: rule,
}
})
waitForTimeChannel(t, evalAppliedChan)
@@ -667,13 +666,13 @@ func TestRuleRoutine(t *testing.T) {
ruleInfo := factory.new(ctx)
go func() {
_ = ruleInfo.run(rule.GetKey())
_ = ruleInfo.Run(rule.GetKey())
}()
ruleInfo.evalCh <- &evaluation{
ruleInfo.Eval(&Evaluation{
scheduledAt: sch.clock.Now(),
rule: rule,
}
})
waitForTimeChannel(t, evalAppliedChan)
@@ -701,13 +700,13 @@ func TestRuleRoutine(t *testing.T) {
ruleInfo := factory.new(ctx)
go func() {
_ = ruleInfo.run(rule.GetKey())
_ = ruleInfo.Run(rule.GetKey())
}()
ruleInfo.evalCh <- &evaluation{
ruleInfo.Eval(&Evaluation{
scheduledAt: sch.clock.Now(),
rule: rule,
}
})
waitForTimeChannel(t, evalAppliedChan)


@@ -10,7 +10,7 @@ import (
var _ eval.AlertingResultsReader = AlertingResultsFromRuleState{}
func (a *alertRuleInfo) newLoadedMetricsReader(rule *ngmodels.AlertRule) eval.AlertingResultsReader {
func (a *alertRule) newLoadedMetricsReader(rule *ngmodels.AlertRule) eval.AlertingResultsReader {
return &AlertingResultsFromRuleState{
Manager: a.stateManager,
Rule: rule,


@@ -18,65 +18,69 @@ import (
var errRuleDeleted = errors.New("rule deleted")
type ruleFactory interface {
new(context.Context) *alertRuleInfo
new(context.Context) Rule
}
type alertRuleInfoRegistry struct {
mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
type ruleRegistry struct {
mu sync.Mutex
rules map[models.AlertRuleKey]Rule
}
// getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
// Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
func (r *alertRuleInfoRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey, factory ruleFactory) (*alertRuleInfo, bool) {
func newRuleRegistry() ruleRegistry {
return ruleRegistry{rules: make(map[models.AlertRuleKey]Rule)}
}
// getOrCreate gets a rule routine from the registry by key. If it does not exist, it creates a new one.
// Returns the rule routine and a flag that indicates whether it was newly created.
func (r *ruleRegistry) getOrCreate(context context.Context, key models.AlertRuleKey, factory ruleFactory) (Rule, bool) {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key]
rule, ok := r.rules[key]
if !ok {
info = factory.new(context)
r.alertRuleInfo[key] = info
rule = factory.new(context)
r.rules[key] = rule
}
return info, !ok
return rule, !ok
}
func (r *alertRuleInfoRegistry) exists(key models.AlertRuleKey) bool {
func (r *ruleRegistry) exists(key models.AlertRuleKey) bool {
r.mu.Lock()
defer r.mu.Unlock()
_, ok := r.alertRuleInfo[key]
_, ok := r.rules[key]
return ok
}
// del removes pair that has specific key from alertRuleInfo.
// del removes the pair with the specified key from the registry.
// Returns a 2-tuple where the first element is the value of the removed pair
// and the second element indicates whether an element with the specified key existed.
func (r *alertRuleInfoRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
func (r *ruleRegistry) del(key models.AlertRuleKey) (Rule, bool) {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key]
rule, ok := r.rules[key]
if ok {
delete(r.alertRuleInfo, key)
delete(r.rules, key)
}
return info, ok
return rule, ok
}
func (r *alertRuleInfoRegistry) keyMap() map[models.AlertRuleKey]struct{} {
func (r *ruleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
r.mu.Lock()
defer r.mu.Unlock()
definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.alertRuleInfo))
for k := range r.alertRuleInfo {
definitionsIDs := make(map[models.AlertRuleKey]struct{}, len(r.rules))
for k := range r.rules {
definitionsIDs[k] = struct{}{}
}
return definitionsIDs
}
type ruleVersionAndPauseStatus struct {
type RuleVersionAndPauseStatus struct {
Fingerprint fingerprint
IsPaused bool
}
type evaluation struct {
type Evaluation struct {
scheduledAt time.Time
rule *models.AlertRule
folderTitle string


@@ -47,8 +47,8 @@ type schedule struct {
// base tick rate (fastest possible configured check)
baseInterval time.Duration
// each alert rule gets its own channel and routine
registry alertRuleInfoRegistry
// each rule gets its own channel and routine
registry ruleRegistry
maxAttempts int64
@@ -116,7 +116,7 @@ func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
}
sch := schedule{
registry: alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
registry: newRuleRegistry(),
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
@@ -165,13 +165,13 @@ func (sch *schedule) deleteAlertRule(keys ...ngmodels.AlertRuleKey) {
sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", key.LogContext()...)
}
// Delete the rule routine
ruleInfo, ok := sch.registry.del(key)
ruleRoutine, ok := sch.registry.del(key)
if !ok {
sch.log.Info("Alert rule cannot be stopped as it is not running", key.LogContext()...)
continue
}
// stop rule evaluation
ruleInfo.stop(errRuleDeleted)
ruleRoutine.Stop(errRuleDeleted)
}
// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
alertRules, _ := sch.schedulableAlertRules.all()
@@ -202,8 +202,8 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
}
type readyToRunItem struct {
ruleInfo *alertRuleInfo
evaluation
ruleRoutine Rule
Evaluation
}
// TODO refactor to accept a callback for tests that will be called with things that are returned currently, and return nothing.
@@ -252,7 +252,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
)
for _, item := range alertRules {
key := item.GetKey()
ruleInfo, newRoutine := sch.registry.getOrCreateInfo(ctx, key, ruleFactory)
ruleRoutine, newRoutine := sch.registry.getOrCreate(ctx, key, ruleFactory)
// enforce minimum evaluation interval
if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
@@ -264,7 +264,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
if newRoutine && !invalidInterval {
dispatcherGroup.Go(func() error {
return ruleInfo.run(key)
return ruleRoutine.Run(key)
})
}
@@ -291,7 +291,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
if isReadyToRun {
sch.log.Debug("Rule is ready to run on the current tick", "uid", item.UID, "tick", tickNum, "frequency", itemFrequency, "offset", offset)
readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
readyToRun = append(readyToRun, readyToRunItem{ruleRoutine: ruleRoutine, Evaluation: Evaluation{
scheduledAt: tick,
rule: item,
folderTitle: folderTitle,
@@ -300,12 +300,12 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
if _, isUpdated := updated[key]; isUpdated && !isReadyToRun {
// if we do not need to eval the rule, check whether the rule was just updated and, if it was, notify the evaluation routine
sch.log.Debug("Rule has been updated. Notifying evaluation routine", key.LogContext()...)
go func(ri *alertRuleInfo, rule *ngmodels.AlertRule) {
ri.update(ruleVersionAndPauseStatus{
go func(routine Rule, rule *ngmodels.AlertRule) {
routine.Update(RuleVersionAndPauseStatus{
Fingerprint: ruleWithFolder{rule: rule, folderTitle: folderTitle}.Fingerprint(),
IsPaused: rule.IsPaused,
})
}(ruleInfo, item)
}(ruleRoutine, item)
updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
Version: item.Version,
AlertRuleKey: item.GetKey(),
@@ -330,7 +330,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
time.AfterFunc(time.Duration(int64(i)*step), func() {
key := item.rule.GetKey()
success, dropped := item.ruleInfo.eval(&item.evaluation)
success, dropped := item.ruleRoutine.Eval(&item.Evaluation)
if !success {
sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
return


@@ -363,9 +363,9 @@ func TestSchedule_deleteAlertRule(t *testing.T) {
ruleFactory := ruleFactoryFromScheduler(sch)
rule := models.AlertRuleGen()()
key := rule.GetKey()
info, _ := sch.registry.getOrCreateInfo(context.Background(), key, ruleFactory)
info, _ := sch.registry.getOrCreate(context.Background(), key, ruleFactory)
sch.deleteAlertRule(key)
require.ErrorIs(t, info.ctx.Err(), errRuleDeleted)
require.ErrorIs(t, info.(*alertRule).ctx.Err(), errRuleDeleted)
require.False(t, sch.registry.exists(key))
})
})