Alerting: Refactor Run of the scheduler (#37157)

* Alerting: Refactor `Run` of the scheduler

A small refactor to make the upcoming diff for supporting
external Alertmanagers easier to read.

We'll introduce another routine that checks the database for
configuration and spawns other routines accordingly.

* Block the wait.

* Fix test
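
"Block the wait" refers to the pattern this diff introduces: `Run` parks on a `sync.WaitGroup` until the evaluation loop exits, so the errgroup in `AlertNG.Run` can treat the scheduler like any other long-running child. A minimal, self-contained sketch of that pattern (the `worker` type below is illustrative, not Grafana's):

package main

import (
	"context"
	"log"
	"sync"
	"time"

	"golang.org/x/sync/errgroup"
)

// worker mirrors the scheduler's new shape: Run spawns its loop in a
// goroutine and then blocks on a WaitGroup until the loop returns.
type worker struct{ wg sync.WaitGroup }

func (w *worker) Run(ctx context.Context) error {
	w.wg.Add(1)
	go func() {
		defer w.wg.Done()
		<-ctx.Done() // stand-in for the rule evaluation loop
	}()
	w.wg.Wait() // "block the wait": Run returns only once the loop is done
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	children, subCtx := errgroup.WithContext(ctx)
	w := &worker{}
	children.Go(func() error { return w.Run(subCtx) })
	if err := children.Wait(); err != nil {
		log.Fatal(err)
	}
}
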
gotjosh authored 2021-07-27 11:52:59 +01:00, committed by GitHub
parent 0c804df763
commit 442a6677fc
3 changed files with 140 additions and 122 deletions


@@ -110,14 +110,14 @@ func (ng *AlertNG) Init() error {
return nil
}
-// Run starts the scheduler.
+// Run starts the scheduler and Alertmanager.
func (ng *AlertNG) Run(ctx context.Context) error {
ng.Log.Debug("ngalert starting")
ng.stateManager.Warm()
children, subCtx := errgroup.WithContext(ctx)
children.Go(func() error {
-return ng.schedule.Ticker(subCtx)
+return ng.schedule.Run(subCtx)
})
children.Go(func() error {
return ng.Alertmanager.Run(subCtx)


@@ -25,7 +25,7 @@ var timeNow = time.Now
// ScheduleService handles scheduling
type ScheduleService interface {
-Ticker(context.Context) error
+Run(context.Context) error
Pause() error
Unpause() error
@@ -41,6 +41,8 @@ type Notifier interface {
}
type schedule struct {
+wg sync.WaitGroup
// base tick rate (fastest possible configured check)
baseInterval time.Duration
@@ -67,11 +69,9 @@ type schedule struct {
evaluator eval.Evaluator
ruleStore store.RuleStore
instanceStore store.InstanceStore
dataService *tsdb.Service
stateManager *state.Manager
@@ -120,30 +120,6 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
return &sch
}
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}
func (sch *schedule) Pause() error {
if sch == nil {
return fmt.Errorf("scheduler is not initialised")
@@ -162,6 +138,111 @@ func (sch *schedule) Unpause() error {
return nil
}
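// Run starts the rule evaluation loop and blocks until it exits.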
func (sch *schedule) Run(ctx context.Context) error {
sch.wg.Add(1)
go func() {
if err := sch.ruleEvaluationLoop(ctx); err != nil {
sch.log.Error("failure while running the rule evaluation loop", "err", err)
}
}()
sch.wg.Wait()
return nil
}
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
defer sch.wg.Done()
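// dispatcherGroup owns the per-rule goroutines; its context is cancelled as soon as any of them returns an error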
dispatcherGroup, ctx := errgroup.WithContext(ctx)
for {
select {
case tick := <-sch.heartbeat.C:
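// tickNum counts base intervals elapsed since the Unix epoch and determines which rules are due on this tick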
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
alertRules := sch.fetchAllDetails()
sch.log.Debug("alert rules fetched", "count", len(alertRules))
// registeredDefinitions is a map used for finding deleted alert rules
// initially it is assigned to all known alert rules from the previous cycle
// each alert rule found also in this cycle is removed
// so, at the end, the remaining registered alert rules are the deleted ones
registeredDefinitions := sch.registry.keyMap()
type readyToRunItem struct {
key models.AlertRuleKey
ruleInfo alertRuleInfo
}
readyToRun := make([]readyToRunItem, 0)
for _, item := range alertRules {
key := item.GetKey()
itemVersion := item.Version
newRoutine := !sch.registry.exists(key)
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
if newRoutine && !invalidInterval {
dispatcherGroup.Go(func() error {
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh)
})
}
if invalidInterval {
// this branch is not expected to be hit,
// given that we validate the interval during alert rule updates
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
continue
}
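// e.g. with a 10s base interval, a rule with IntervalSeconds=60 has itemFrequency 6 and is evaluated on every 6th tick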
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo})
}
// remove the alert rule from the registered alert rules
delete(registeredDefinitions, key)
}
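// spread the due evaluations evenly across the base interval: with 5 rules ready and a 10s base interval, they are kicked off 2s apart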
var step int64 = 0
if len(readyToRun) > 0 {
step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
}
for i := range readyToRun {
item := readyToRun[i]
time.AfterFunc(time.Duration(int64(i)*step), func() {
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version}
})
}
// unregister and stop routines of the deleted alert rules
for key := range registeredDefinitions {
ruleInfo, err := sch.registry.get(key)
if err != nil {
sch.log.Error("failed to get alert rule routine information", "err", err)
continue
}
ruleInfo.stopCh <- struct{}{}
sch.registry.del(key)
}
case <-ctx.Done():
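// on shutdown: wait for the per-rule routines to finish, flush the alert states of every org, then close the state manager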
waitErr := dispatcherGroup.Wait()
orgIds, err := sch.instanceStore.FetchOrgIds()
if err != nil {
sch.log.Error("unable to fetch orgIds", "msg", err.Error())
}
for _, v := range orgIds {
sch.saveAlertStates(sch.stateManager.GetAll(v))
}
sch.stateManager.Close()
return waitErr
}
}
}
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}) error {
sch.log.Debug("alert rule routine started", "key", key)
@@ -248,96 +329,6 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
}
}
func (sch *schedule) Ticker(grafanaCtx context.Context) error {
dispatcherGroup, ctx := errgroup.WithContext(grafanaCtx)
for {
select {
case tick := <-sch.heartbeat.C:
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
alertRules := sch.fetchAllDetails()
sch.log.Debug("alert rules fetched", "count", len(alertRules))
// registeredDefinitions is a map used for finding deleted alert rules
// initially it is assigned to all known alert rules from the previous cycle
// each alert rule found also in this cycle is removed
// so, at the end, the remaining registered alert rules are the deleted ones
registeredDefinitions := sch.registry.keyMap()
type readyToRunItem struct {
key models.AlertRuleKey
ruleInfo alertRuleInfo
}
readyToRun := make([]readyToRunItem, 0)
for _, item := range alertRules {
key := item.GetKey()
itemVersion := item.Version
newRoutine := !sch.registry.exists(key)
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
if newRoutine && !invalidInterval {
dispatcherGroup.Go(func() error {
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh)
})
}
if invalidInterval {
// this branch is not expected to be hit,
// given that we validate the interval during alert rule updates
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
continue
}
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo})
}
// remove the alert rule from the registered alert rules
delete(registeredDefinitions, key)
}
var step int64 = 0
if len(readyToRun) > 0 {
step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
}
for i := range readyToRun {
item := readyToRun[i]
time.AfterFunc(time.Duration(int64(i)*step), func() {
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version}
})
}
// unregister and stop routines of the deleted alert rules
for key := range registeredDefinitions {
ruleInfo, err := sch.registry.get(key)
if err != nil {
sch.log.Error("failed to get alert rule routine information", "err", err)
continue
}
ruleInfo.stopCh <- struct{}{}
sch.registry.del(key)
}
case <-grafanaCtx.Done():
waitErr := dispatcherGroup.Wait()
orgIds, err := sch.instanceStore.FetchOrgIds()
if err != nil {
sch.log.Error("unable to fetch orgIds", "msg", err.Error())
}
for _, v := range orgIds {
sch.saveAlertStates(sch.stateManager.GetAll(v))
}
sch.stateManager.Close()
return waitErr
}
}
}
func (sch *schedule) sendAlerts(alerts apimodels.PostableAlerts) error {
return sch.notifier.PutAlerts(alerts)
}
@@ -445,3 +436,30 @@ type evalContext struct {
now time.Time
version int64
}
// overrideCfg is only used on tests.
func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.clock = cfg.C
sch.baseInterval = cfg.BaseInterval
sch.heartbeat = alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch.evalAppliedFunc = cfg.EvalAppliedFunc
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
// evalApplied is only used on tests.
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
sch.evalAppliedFunc(alertDefKey, now)
}
// stopApplied is only used on tests.
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
sch.stopAppliedFunc(alertDefKey)
}


@@ -158,7 +158,7 @@ func TestAlertingTicker(t *testing.T) {
ctx := context.Background()
go func() {
-err := sched.Ticker(ctx)
+err := sched.Run(ctx)
require.NoError(t, err)
}()
runtime.Gosched()