mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Refactor Run
of the scheduler (#37157)
* Alerting: Refactor `Run` of the scheduler A bit of a refactor to make the diff easier to read for supporting external Alertmanagers. We'll introduce another routine that checks the database for configuration and spawns other routines accordingly. * Block the wait. * Fix test
This commit is contained in:
parent
0c804df763
commit
442a6677fc
@ -110,14 +110,14 @@ func (ng *AlertNG) Init() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run starts the scheduler.
|
// Run starts the scheduler and Alertmanager.
|
||||||
func (ng *AlertNG) Run(ctx context.Context) error {
|
func (ng *AlertNG) Run(ctx context.Context) error {
|
||||||
ng.Log.Debug("ngalert starting")
|
ng.Log.Debug("ngalert starting")
|
||||||
ng.stateManager.Warm()
|
ng.stateManager.Warm()
|
||||||
|
|
||||||
children, subCtx := errgroup.WithContext(ctx)
|
children, subCtx := errgroup.WithContext(ctx)
|
||||||
children.Go(func() error {
|
children.Go(func() error {
|
||||||
return ng.schedule.Ticker(subCtx)
|
return ng.schedule.Run(subCtx)
|
||||||
})
|
})
|
||||||
children.Go(func() error {
|
children.Go(func() error {
|
||||||
return ng.Alertmanager.Run(subCtx)
|
return ng.Alertmanager.Run(subCtx)
|
||||||
|
@ -25,7 +25,7 @@ var timeNow = time.Now
|
|||||||
|
|
||||||
// ScheduleService handles scheduling
|
// ScheduleService handles scheduling
|
||||||
type ScheduleService interface {
|
type ScheduleService interface {
|
||||||
Ticker(context.Context) error
|
Run(context.Context) error
|
||||||
Pause() error
|
Pause() error
|
||||||
Unpause() error
|
Unpause() error
|
||||||
|
|
||||||
@ -41,6 +41,8 @@ type Notifier interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type schedule struct {
|
type schedule struct {
|
||||||
|
wg sync.WaitGroup
|
||||||
|
|
||||||
// base tick rate (fastest possible configured check)
|
// base tick rate (fastest possible configured check)
|
||||||
baseInterval time.Duration
|
baseInterval time.Duration
|
||||||
|
|
||||||
@ -68,9 +70,7 @@ type schedule struct {
|
|||||||
evaluator eval.Evaluator
|
evaluator eval.Evaluator
|
||||||
|
|
||||||
ruleStore store.RuleStore
|
ruleStore store.RuleStore
|
||||||
|
|
||||||
instanceStore store.InstanceStore
|
instanceStore store.InstanceStore
|
||||||
|
|
||||||
dataService *tsdb.Service
|
dataService *tsdb.Service
|
||||||
|
|
||||||
stateManager *state.Manager
|
stateManager *state.Manager
|
||||||
@ -120,30 +120,6 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
|
|||||||
return &sch
|
return &sch
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
|
|
||||||
sch.clock = cfg.C
|
|
||||||
sch.baseInterval = cfg.BaseInterval
|
|
||||||
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
|
|
||||||
sch.evalAppliedFunc = cfg.EvalAppliedFunc
|
|
||||||
sch.stopAppliedFunc = cfg.StopAppliedFunc
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
|
|
||||||
if sch.evalAppliedFunc == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
sch.evalAppliedFunc(alertDefKey, now)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
|
|
||||||
if sch.stopAppliedFunc == nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
sch.stopAppliedFunc(alertDefKey)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sch *schedule) Pause() error {
|
func (sch *schedule) Pause() error {
|
||||||
if sch == nil {
|
if sch == nil {
|
||||||
return fmt.Errorf("scheduler is not initialised")
|
return fmt.Errorf("scheduler is not initialised")
|
||||||
@ -162,6 +138,111 @@ func (sch *schedule) Unpause() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (sch *schedule) Run(ctx context.Context) error {
|
||||||
|
sch.wg.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
if err := sch.ruleEvaluationLoop(ctx); err != nil {
|
||||||
|
sch.log.Error("failure while running the rule evaluation loop", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
sch.wg.Wait()
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
|
||||||
|
defer sch.wg.Done()
|
||||||
|
|
||||||
|
dispatcherGroup, ctx := errgroup.WithContext(ctx)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case tick := <-sch.heartbeat.C:
|
||||||
|
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
|
||||||
|
alertRules := sch.fetchAllDetails()
|
||||||
|
sch.log.Debug("alert rules fetched", "count", len(alertRules))
|
||||||
|
|
||||||
|
// registeredDefinitions is a map used for finding deleted alert rules
|
||||||
|
// initially it is assigned to all known alert rules from the previous cycle
|
||||||
|
// each alert rule found also in this cycle is removed
|
||||||
|
// so, at the end, the remaining registered alert rules are the deleted ones
|
||||||
|
registeredDefinitions := sch.registry.keyMap()
|
||||||
|
|
||||||
|
type readyToRunItem struct {
|
||||||
|
key models.AlertRuleKey
|
||||||
|
ruleInfo alertRuleInfo
|
||||||
|
}
|
||||||
|
readyToRun := make([]readyToRunItem, 0)
|
||||||
|
for _, item := range alertRules {
|
||||||
|
key := item.GetKey()
|
||||||
|
itemVersion := item.Version
|
||||||
|
newRoutine := !sch.registry.exists(key)
|
||||||
|
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
|
||||||
|
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
|
||||||
|
|
||||||
|
if newRoutine && !invalidInterval {
|
||||||
|
dispatcherGroup.Go(func() error {
|
||||||
|
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
if invalidInterval {
|
||||||
|
// this is expected to be always false
|
||||||
|
// give that we validate interval during alert rule updates
|
||||||
|
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
|
||||||
|
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
|
||||||
|
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo})
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove the alert rule from the registered alert rules
|
||||||
|
delete(registeredDefinitions, key)
|
||||||
|
}
|
||||||
|
|
||||||
|
var step int64 = 0
|
||||||
|
if len(readyToRun) > 0 {
|
||||||
|
step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range readyToRun {
|
||||||
|
item := readyToRun[i]
|
||||||
|
|
||||||
|
time.AfterFunc(time.Duration(int64(i)*step), func() {
|
||||||
|
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// unregister and stop routines of the deleted alert rules
|
||||||
|
for key := range registeredDefinitions {
|
||||||
|
ruleInfo, err := sch.registry.get(key)
|
||||||
|
if err != nil {
|
||||||
|
sch.log.Error("failed to get alert rule routine information", "err", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ruleInfo.stopCh <- struct{}{}
|
||||||
|
sch.registry.del(key)
|
||||||
|
}
|
||||||
|
case <-ctx.Done():
|
||||||
|
waitErr := dispatcherGroup.Wait()
|
||||||
|
|
||||||
|
orgIds, err := sch.instanceStore.FetchOrgIds()
|
||||||
|
if err != nil {
|
||||||
|
sch.log.Error("unable to fetch orgIds", "msg", err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, v := range orgIds {
|
||||||
|
sch.saveAlertStates(sch.stateManager.GetAll(v))
|
||||||
|
}
|
||||||
|
|
||||||
|
sch.stateManager.Close()
|
||||||
|
return waitErr
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
|
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
|
||||||
sch.log.Debug("alert rule routine started", "key", key)
|
sch.log.Debug("alert rule routine started", "key", key)
|
||||||
|
|
||||||
@ -248,96 +329,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sch *schedule) Ticker(grafanaCtx context.Context) error {
|
|
||||||
dispatcherGroup, ctx := errgroup.WithContext(grafanaCtx)
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case tick := <-sch.heartbeat.C:
|
|
||||||
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
|
|
||||||
alertRules := sch.fetchAllDetails()
|
|
||||||
sch.log.Debug("alert rules fetched", "count", len(alertRules))
|
|
||||||
|
|
||||||
// registeredDefinitions is a map used for finding deleted alert rules
|
|
||||||
// initially it is assigned to all known alert rules from the previous cycle
|
|
||||||
// each alert rule found also in this cycle is removed
|
|
||||||
// so, at the end, the remaining registered alert rules are the deleted ones
|
|
||||||
registeredDefinitions := sch.registry.keyMap()
|
|
||||||
|
|
||||||
type readyToRunItem struct {
|
|
||||||
key models.AlertRuleKey
|
|
||||||
ruleInfo alertRuleInfo
|
|
||||||
}
|
|
||||||
readyToRun := make([]readyToRunItem, 0)
|
|
||||||
for _, item := range alertRules {
|
|
||||||
key := item.GetKey()
|
|
||||||
itemVersion := item.Version
|
|
||||||
newRoutine := !sch.registry.exists(key)
|
|
||||||
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
|
|
||||||
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
|
|
||||||
|
|
||||||
if newRoutine && !invalidInterval {
|
|
||||||
dispatcherGroup.Go(func() error {
|
|
||||||
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
if invalidInterval {
|
|
||||||
// this is expected to be always false
|
|
||||||
// give that we validate interval during alert rule updates
|
|
||||||
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
|
|
||||||
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
|
|
||||||
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo})
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove the alert rule from the registered alert rules
|
|
||||||
delete(registeredDefinitions, key)
|
|
||||||
}
|
|
||||||
|
|
||||||
var step int64 = 0
|
|
||||||
if len(readyToRun) > 0 {
|
|
||||||
step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
|
|
||||||
}
|
|
||||||
|
|
||||||
for i := range readyToRun {
|
|
||||||
item := readyToRun[i]
|
|
||||||
|
|
||||||
time.AfterFunc(time.Duration(int64(i)*step), func() {
|
|
||||||
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
// unregister and stop routines of the deleted alert rules
|
|
||||||
for key := range registeredDefinitions {
|
|
||||||
ruleInfo, err := sch.registry.get(key)
|
|
||||||
if err != nil {
|
|
||||||
sch.log.Error("failed to get alert rule routine information", "err", err)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ruleInfo.stopCh <- struct{}{}
|
|
||||||
sch.registry.del(key)
|
|
||||||
}
|
|
||||||
case <-grafanaCtx.Done():
|
|
||||||
waitErr := dispatcherGroup.Wait()
|
|
||||||
|
|
||||||
orgIds, err := sch.instanceStore.FetchOrgIds()
|
|
||||||
if err != nil {
|
|
||||||
sch.log.Error("unable to fetch orgIds", "msg", err.Error())
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, v := range orgIds {
|
|
||||||
sch.saveAlertStates(sch.stateManager.GetAll(v))
|
|
||||||
}
|
|
||||||
|
|
||||||
sch.stateManager.Close()
|
|
||||||
return waitErr
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
|
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
|
||||||
return sch.notifier.PutAlerts(alerts)
|
return sch.notifier.PutAlerts(alerts)
|
||||||
}
|
}
|
||||||
@ -445,3 +436,30 @@ type evalContext struct {
|
|||||||
now time.Time
|
now time.Time
|
||||||
version int64
|
version int64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// overrideCfg is only used on tests.
|
||||||
|
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
|
||||||
|
sch.clock = cfg.C
|
||||||
|
sch.baseInterval = cfg.BaseInterval
|
||||||
|
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
|
||||||
|
sch.evalAppliedFunc = cfg.EvalAppliedFunc
|
||||||
|
sch.stopAppliedFunc = cfg.StopAppliedFunc
|
||||||
|
}
|
||||||
|
|
||||||
|
// evalApplied is only used on tests.
|
||||||
|
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
|
||||||
|
if sch.evalAppliedFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
sch.evalAppliedFunc(alertDefKey, now)
|
||||||
|
}
|
||||||
|
|
||||||
|
// stopApplied is only used on tests.
|
||||||
|
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
|
||||||
|
if sch.stopAppliedFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
sch.stopAppliedFunc(alertDefKey)
|
||||||
|
}
|
||||||
|
@ -158,7 +158,7 @@ func TestAlertingTicker(t *testing.T) {
|
|||||||
ctx := context.Background()
|
ctx := context.Background()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
err := sched.Ticker(ctx)
|
err := sched.Run(ctx)
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
}()
|
}()
|
||||||
runtime.Gosched()
|
runtime.Gosched()
|
||||||
|
Loading…
Reference in New Issue
Block a user