Alerting: make alert rule routine evaluation control be thread-safe (#41220)

* change registry.delete to return deleted struct
* use pointer to alertRuleInfo instead copying.
* do not access evaluation channel when routine is stopped
* remove stopCh and use context cancellation
* do not return ctx.Err when channel is cancelled because it cancels all other routines
* make alertRuleInfo fields and functions package private
This commit is contained in:
Yuriy Tseretyan
2021-12-16 14:52:47 -05:00
committed by GitHub
parent 56921b205d
commit 1a762083d7
2 changed files with 158 additions and 92 deletions

View File

@@ -120,7 +120,7 @@ func NewScheduler(cfg SchedulerCfg, expressionService *expr.Service, appURL *url
ticker := alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds())) ticker := alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch := schedule{ sch := schedule{
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]alertRuleInfo)}, registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]*alertRuleInfo)},
maxAttempts: cfg.MaxAttempts, maxAttempts: cfg.MaxAttempts,
clock: cfg.C, clock: cfg.C,
baseInterval: cfg.BaseInterval, baseInterval: cfg.BaseInterval,
@@ -349,15 +349,15 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
type readyToRunItem struct { type readyToRunItem struct {
key models.AlertRuleKey key models.AlertRuleKey
ruleInfo alertRuleInfo ruleInfo *alertRuleInfo
version int64
} }
readyToRun := make([]readyToRunItem, 0) readyToRun := make([]readyToRunItem, 0)
for _, item := range alertRules { for _, item := range alertRules {
key := item.GetKey() key := item.GetKey()
itemVersion := item.Version itemVersion := item.Version
newRoutine := !sch.registry.exists(key) ruleInfo, newRoutine := sch.registry.getOrCreateInfo(ctx, key)
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
// enforce minimum evaluation interval // enforce minimum evaluation interval
if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) { if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
@@ -369,7 +369,7 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
if newRoutine && !invalidInterval { if newRoutine && !invalidInterval {
dispatcherGroup.Go(func() error { dispatcherGroup.Go(func() error {
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh) return sch.ruleRoutine(ruleInfo.ctx, key, ruleInfo.evalCh)
}) })
} }
@@ -382,7 +382,7 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds()) itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 { if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo}) readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo, version: itemVersion})
} }
// remove the alert rule from the registered alert rules // remove the alert rule from the registered alert rules
@@ -398,19 +398,21 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
item := readyToRun[i] item := readyToRun[i]
time.AfterFunc(time.Duration(int64(i)*step), func() { time.AfterFunc(time.Duration(int64(i)*step), func() {
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version} success := item.ruleInfo.eval(tick, item.version)
if !success {
sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", "uid", item.key.UID, "org", item.key.OrgID, "time", tick)
}
}) })
} }
// unregister and stop routines of the deleted alert rules // unregister and stop routines of the deleted alert rules
for key := range registeredDefinitions { for key := range registeredDefinitions {
ruleInfo, err := sch.registry.get(key) ruleInfo, ok := sch.registry.del(key)
if err != nil { if !ok {
sch.log.Error("failed to get alert rule routine information", "err", err) sch.log.Error("unable to delete alert rule routine information because it did not exist", "uid", key.UID, "org_id", key.OrgID)
continue continue
} }
ruleInfo.stopCh <- struct{}{} ruleInfo.stop()
sch.registry.del(key)
} }
case <-ctx.Done(): case <-ctx.Done():
waitErr := dispatcherGroup.Wait() waitErr := dispatcherGroup.Wait()
@@ -430,7 +432,7 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
} }
} }
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error { func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext) error {
logger := sch.log.New("uid", key.UID, "org", key.OrgID) logger := sch.log.New("uid", key.UID, "org", key.OrgID)
logger.Debug("alert rule routine started") logger.Debug("alert rule routine started")
@@ -525,9 +527,14 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
evalRunning := false evalRunning := false
var currentRule *models.AlertRule var currentRule *models.AlertRule
defer sch.stopApplied(key)
for { for {
select { select {
case ctx := <-evalCh: case ctx, ok := <-evalCh:
if !ok {
logger.Debug("Evaluation channel has been closed. Exiting")
return nil
}
if evalRunning { if evalRunning {
continue continue
} }
@@ -555,13 +562,9 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
logger.Error("evaluation failed after all retries", "err", err) logger.Error("evaluation failed after all retries", "err", err)
} }
}() }()
case <-stopCh:
sch.stopApplied(key)
logger.Debug("stopping alert rule routine")
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done(): case <-grafanaCtx.Done():
return grafanaCtx.Err() logger.Debug("stopping alert rule routine")
return nil
} }
} }
} }
@@ -587,51 +590,34 @@ func (sch *schedule) saveAlertStates(states []*state.State) {
type alertRuleRegistry struct { type alertRuleRegistry struct {
mu sync.Mutex mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]alertRuleInfo alertRuleInfo map[models.AlertRuleKey]*alertRuleInfo
} }
// getOrCreateInfo returns the channel for the specific alert rule // getOrCreateInfo gets rule routine information from registry by the key. If it does not exist, it creates a new one.
// if it does not exists creates one and returns it // Returns a pointer to the rule routine information and a flag that indicates whether it is a new struct or not.
func (r *alertRuleRegistry) getOrCreateInfo(key models.AlertRuleKey, ruleVersion int64) alertRuleInfo { func (r *alertRuleRegistry) getOrCreateInfo(context context.Context, key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock() r.mu.Lock()
defer r.mu.Unlock() defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key] info, ok := r.alertRuleInfo[key]
if !ok { if !ok {
r.alertRuleInfo[key] = alertRuleInfo{evalCh: make(chan *evalContext), stopCh: make(chan struct{}), version: ruleVersion} info = newAlertRuleInfo(context)
return r.alertRuleInfo[key] r.alertRuleInfo[key] = info
} }
info.version = ruleVersion return info, !ok
r.alertRuleInfo[key] = info
return info
} }
// get returns the channel for the specific alert rule // del removes pair that has specific key from alertRuleInfo.
// if the key does not exist returns an error // Returns 2-tuple where the first element is value of the removed pair
func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) { // and the second element indicates whether element with the specified key existed.
func (r *alertRuleRegistry) del(key models.AlertRuleKey) (*alertRuleInfo, bool) {
r.mu.Lock() r.mu.Lock()
defer r.mu.Unlock() defer r.mu.Unlock()
info, ok := r.alertRuleInfo[key] info, ok := r.alertRuleInfo[key]
if !ok { if ok {
return nil, fmt.Errorf("%v key not found", key) delete(r.alertRuleInfo, key)
} }
return &info, nil return info, ok
}
func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
r.mu.Lock()
defer r.mu.Unlock()
_, ok := r.alertRuleInfo[key]
return ok
}
func (r *alertRuleRegistry) del(key models.AlertRuleKey) {
r.mu.Lock()
defer r.mu.Unlock()
delete(r.alertRuleInfo, key)
} }
func (r *alertRuleRegistry) iter() <-chan models.AlertRuleKey { func (r *alertRuleRegistry) iter() <-chan models.AlertRuleKey {
@@ -660,9 +646,27 @@ func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
} }
type alertRuleInfo struct { type alertRuleInfo struct {
evalCh chan *evalContext evalCh chan *evalContext
stopCh chan struct{} ctx context.Context
version int64 stop context.CancelFunc
}
func newAlertRuleInfo(parent context.Context) *alertRuleInfo {
ctx, cancel := context.WithCancel(parent)
return &alertRuleInfo{evalCh: make(chan *evalContext), ctx: ctx, stop: cancel}
}
// eval signals the rule evaluation routine to perform the evaluation of the rule. Does nothing if the loop is stopped
func (a *alertRuleInfo) eval(t time.Time, version int64) bool {
select {
case a.evalCh <- &evalContext{
now: t,
version: version,
}:
return true
case <-a.ctx.Done():
return false
}
} }
type evalContext struct { type evalContext struct {

View File

@@ -7,6 +7,8 @@ import (
"fmt" "fmt"
"math/rand" "math/rand"
"net/url" "net/url"
"runtime"
"sync"
"testing" "testing"
"time" "time"
@@ -272,11 +274,9 @@ func TestSchedule_ruleRoutine(t *testing.T) {
rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), evalState) rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), evalState)
go func() { go func() {
stop := make(chan struct{}) ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(func() { t.Cleanup(cancel)
close(stop) _ = sch.ruleRoutine(ctx, rule.GetKey(), evalChan)
})
_ = sch.ruleRoutine(context.Background(), rule.GetKey(), evalChan, stop)
}() }()
expectedTime := time.UnixMicro(rand.Int63()) expectedTime := time.UnixMicro(rand.Int63())
@@ -367,35 +367,19 @@ func TestSchedule_ruleRoutine(t *testing.T) {
} }
t.Run("should exit", func(t *testing.T) { t.Run("should exit", func(t *testing.T) {
t.Run("when we signal it to stop", func(t *testing.T) {
stopChan := make(chan struct{})
stoppedChan := make(chan error)
sch, _, _, _, _ := createSchedule(make(chan time.Time))
go func() {
err := sch.ruleRoutine(context.Background(), models.AlertRuleKey{}, make(chan *evalContext), stopChan)
stoppedChan <- err
}()
stopChan <- struct{}{}
err := waitForErrChannel(t, stoppedChan)
require.NoError(t, err)
})
t.Run("when context is cancelled", func(t *testing.T) { t.Run("when context is cancelled", func(t *testing.T) {
stoppedChan := make(chan error) stoppedChan := make(chan error)
sch, _, _, _, _ := createSchedule(make(chan time.Time)) sch, _, _, _, _ := createSchedule(make(chan time.Time))
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
go func() { go func() {
err := sch.ruleRoutine(ctx, models.AlertRuleKey{}, make(chan *evalContext), make(chan struct{})) err := sch.ruleRoutine(ctx, models.AlertRuleKey{}, make(chan *evalContext))
stoppedChan <- err stoppedChan <- err
}() }()
cancel() cancel()
err := waitForErrChannel(t, stoppedChan) err := waitForErrChannel(t, stoppedChan)
require.ErrorIs(t, err, context.Canceled) require.NoError(t, err)
}) })
}) })
@@ -408,11 +392,9 @@ func TestSchedule_ruleRoutine(t *testing.T) {
rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), randomNormalState()) rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), randomNormalState())
go func() { go func() {
stop := make(chan struct{}) ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(func() { t.Cleanup(cancel)
close(stop) _ = sch.ruleRoutine(ctx, rule.GetKey(), evalChan)
})
_ = sch.ruleRoutine(context.Background(), rule.GetKey(), evalChan, stop)
}() }()
expectedTime := time.UnixMicro(rand.Int63()) expectedTime := time.UnixMicro(rand.Int63())
@@ -462,11 +444,9 @@ func TestSchedule_ruleRoutine(t *testing.T) {
rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), randomNormalState()) rule := CreateTestAlertRule(t, ruleStore, 10, rand.Int63(), randomNormalState())
go func() { go func() {
stop := make(chan struct{}) ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(func() { t.Cleanup(cancel)
close(stop) _ = sch.ruleRoutine(ctx, rule.GetKey(), evalChan)
})
_ = sch.ruleRoutine(context.Background(), rule.GetKey(), evalChan, stop)
}() }()
expectedTime := time.UnixMicro(rand.Int63()) expectedTime := time.UnixMicro(rand.Int63())
@@ -547,11 +527,9 @@ func TestSchedule_ruleRoutine(t *testing.T) {
rule := CreateTestAlertRule(t, ruleStore, 10, orgID, eval.Alerting) rule := CreateTestAlertRule(t, ruleStore, 10, orgID, eval.Alerting)
go func() { go func() {
stop := make(chan struct{}) ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(func() { t.Cleanup(cancel)
close(stop) _ = sch.ruleRoutine(ctx, rule.GetKey(), evalChan)
})
_ = sch.ruleRoutine(context.Background(), rule.GetKey(), evalChan, stop)
}() }()
evalChan <- &evalContext{ evalChan <- &evalContext{
@@ -574,6 +552,90 @@ func TestSchedule_ruleRoutine(t *testing.T) {
}) })
} }
func TestSchedule_alertRuleInfo(t *testing.T) {
t.Run("when rule evaluation is not stopped", func(t *testing.T) {
t.Run("eval should send to evalCh", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
expected := time.Now()
resultCh := make(chan bool)
version := rand.Int63()
go func() {
resultCh <- r.eval(expected, version)
}()
select {
case ctx := <-r.evalCh:
require.Equal(t, version, ctx.version)
require.Equal(t, expected, ctx.now)
require.True(t, <-resultCh)
case <-time.After(5 * time.Second):
t.Fatal("No message was received on eval channel")
}
})
t.Run("eval should exit when context is cancelled", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
resultCh := make(chan bool)
go func() {
resultCh <- r.eval(time.Now(), rand.Int63())
}()
runtime.Gosched()
r.stop()
select {
case result := <-resultCh:
require.False(t, result)
case <-time.After(5 * time.Second):
t.Fatal("No message was received on eval channel")
}
})
})
t.Run("when rule evaluation is stopped", func(t *testing.T) {
t.Run("eval should do nothing", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
r.stop()
require.False(t, r.eval(time.Now(), rand.Int63()))
})
t.Run("stop should do nothing", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
r.stop()
r.stop()
})
})
t.Run("should be thread-safe", func(t *testing.T) {
r := newAlertRuleInfo(context.Background())
wg := sync.WaitGroup{}
go func() {
for {
select {
case <-r.evalCh:
time.Sleep(time.Millisecond)
case <-r.ctx.Done():
return
}
}
}()
for i := 0; i < 10; i++ {
wg.Add(1)
go func() {
for i := 0; i < 20; i++ {
max := 2
if i <= 10 {
max = 1
}
switch rand.Intn(max) + 1 {
case 1:
r.eval(time.Now(), rand.Int63())
case 2:
r.stop()
}
}
wg.Done()
}()
}
wg.Wait()
})
}
func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, acs store.AdminConfigurationStore, registry *prometheus.Registry) (*schedule, *clock.Mock) { func setupScheduler(t *testing.T, rs store.RuleStore, is store.InstanceStore, acs store.AdminConfigurationStore, registry *prometheus.Registry) (*schedule, *clock.Mock) {
t.Helper() t.Helper()