From f3c219a980ae3816061a2834d200be568f9c3756 Mon Sep 17 00:00:00 2001
From: Yuriy Tseretyan
Date: Thu, 20 Oct 2022 13:43:48 -0400
Subject: [PATCH] Alerting: update format of logs in scheduler (#57302)

* Change the severity level of the log messages
---
 pkg/services/ngalert/ngalert.go            | 11 +++--
 pkg/services/ngalert/schedule/schedule.go  | 41 +++++++++----------
 .../ngalert/schedule/schedule_test.go      | 18 +-------
 .../ngalert/schedule/schedule_unit_test.go |  3 +-
 4 files changed, 28 insertions(+), 45 deletions(-)

diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go
index 6c77da9f59d..5cc94a47237 100644
--- a/pkg/services/ngalert/ngalert.go
+++ b/pkg/services/ngalert/ngalert.go
@@ -183,7 +183,6 @@ func (ng *AlertNG) init() error {
     schedCfg := schedule.SchedulerCfg{
         Cfg:       ng.Cfg.UnifiedAlerting,
         C:         clk,
-        Logger:    ng.Log,
         Evaluator: eval.NewEvaluator(ng.Cfg, ng.Log, ng.DataSourceCache, ng.ExpressionService),
         RuleStore: store,
         Metrics:   ng.Metrics.GetSchedulerMetrics(),
@@ -246,19 +245,19 @@ func subscribeToFolderChanges(logger log.Logger, bus bus.Bus, dbStore api.RuleSt
     bus.AddEventListener(func(ctx context.Context, e *events.FolderTitleUpdated) error {
         // do not block the upstream execution
         go func(evt *events.FolderTitleUpdated) {
-            logger.Debug("Got folder title updated event. updating rules in the folder", "folder_uid", evt.UID)
+            logger.Info("Got folder title updated event. updating rules in the folder", "folderUID", evt.UID)
             updated, err := dbStore.IncreaseVersionForAllRulesInNamespace(context.Background(), evt.OrgID, evt.UID)
             if err != nil {
-                logger.Error("Failed to update alert rules in the folder after its title was changed", "error", err, "folder_uid", evt.UID, "folder", evt.Title)
+                logger.Error("Failed to update alert rules in the folder after its title was changed", "error", err, "folderUID", evt.UID, "folder", evt.Title)
                 return
             }
             if len(updated) > 0 {
-                logger.Debug("rules that belong to the folder have been updated successfully. clearing their status", "updated_rules", len(updated))
+                logger.Info("Rules that belong to the folder have been updated successfully. Clearing their status", "folderUID", evt.UID, "updatedRules", len(updated))
                 for _, key := range updated {
                     scheduler.UpdateAlertRule(key.AlertRuleKey, key.Version)
                 }
             } else {
-                logger.Debug("no alert rules found in the folder. nothing to update", "folder_uid", evt.UID, "folder", evt.Title)
+                logger.Debug("No alert rules found in the folder. nothing to update", "folderUID", evt.UID, "folder", evt.Title)
             }
         }(e)
         return nil
@@ -267,7 +266,7 @@ func subscribeToFolderChanges(logger log.Logger, bus bus.Bus, dbStore api.RuleSt
 
 // Run starts the scheduler and Alertmanager.
 func (ng *AlertNG) Run(ctx context.Context) error {
-    ng.Log.Debug("ngalert starting")
+    ng.Log.Debug("Starting")
     ng.stateManager.Warm(ctx)
 
     children, subCtx := errgroup.WithContext(ctx)
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index 15f392d7d53..6325955221a 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -103,7 +103,6 @@ type schedule struct {
 type SchedulerCfg struct {
     Cfg             setting.UnifiedAlertingSettings
     C               clock.Clock
-    Logger          log.Logger
     EvalAppliedFunc func(ngmodels.AlertRuleKey, time.Time)
     StopAppliedFunc func(ngmodels.AlertRuleKey)
     Evaluator       eval.Evaluator
@@ -119,7 +118,7 @@ func NewScheduler(cfg SchedulerCfg, appURL *url.URL, stateManager *state.Manager
         maxAttempts:     cfg.Cfg.MaxAttempts,
         clock:           cfg.C,
         baseInterval:    cfg.Cfg.BaseInterval,
-        log:             cfg.Logger,
+        log:             log.New("ngalert.scheduler"),
         evalAppliedFunc: cfg.EvalAppliedFunc,
         stopAppliedFunc: cfg.StopAppliedFunc,
         evaluator:       cfg.Evaluator,
@@ -141,7 +140,7 @@ func (sch *schedule) Run(ctx context.Context) error {
     defer t.Stop()
 
     if err := sch.schedulePeriodic(ctx, t); err != nil {
-        sch.log.Error("failure while running the rule evaluation loop", "error", err)
+        sch.log.Error("Failure while running the rule evaluation loop", "error", err)
     }
     return nil
 }
@@ -162,12 +161,12 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
         // Ruler API has called DeleteAlertRule. This can happen as requests to
         // the Ruler API do not hold an exclusive lock over all scheduler operations.
         if _, ok := sch.schedulableAlertRules.del(key); !ok {
-            sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+            sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", key.LogContext()...)
         }
         // Delete the rule routine
         ruleInfo, ok := sch.registry.del(key)
         if !ok {
-            sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
+            sch.log.Info("Alert rule cannot be stopped as it is not running", key.LogContext()...)
             continue
         }
         // stop rule evaluation
@@ -194,7 +193,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
             tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
 
             if err := sch.updateSchedulableAlertRules(ctx); err != nil {
-                sch.log.Error("scheduler failed to update alert rules", "error", err)
+                sch.log.Error("Failed to update alert rules", "error", err)
             }
             alertRules, folderTitles := sch.schedulableAlertRules.all()
 
@@ -222,7 +221,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 
                 // enforce minimum evaluation interval
                 if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
-                    sch.log.Debug("interval adjusted", "rule_interval_seconds", item.IntervalSeconds, "min_interval_seconds", sch.minRuleInterval.Seconds(), "key", key)
+                    sch.log.Debug("Interval adjusted", append(key.LogContext(), "originalInterval", item.IntervalSeconds, "adjustedInterval", sch.minRuleInterval.Seconds())...)
                     item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
                 }
 
@@ -237,7 +236,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
                 if invalidInterval {
                     // this is expected to be always false
                     // given that we validate interval during alert rule updates
-                    sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
+                    sch.log.Warn("Rule has an invalid interval and will be ignored. Interval should be divided exactly by scheduler interval", append(key.LogContext(), "ruleInterval", time.Duration(item.IntervalSeconds)*time.Second, "schedulerInterval", sch.baseInterval)...)
                     continue
                 }
 
@@ -264,7 +263,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
             }
 
             if len(missingFolder) > 0 { // if this happens then there can be problems with fetching folders from the database.
-                sch.log.Warn("unable to find obtain folder titles for some rules", "folder_to_rule_map", missingFolder)
+                sch.log.Warn("Unable to obtain folder titles for some rules", "missingFolderUIDToRuleUID", missingFolder)
             }
 
             var step int64 = 0
@@ -279,11 +278,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
                     key := item.rule.GetKey()
                     success, dropped := item.ruleInfo.eval(&item.evaluation)
                     if !success {
-                        sch.log.Debug("scheduled evaluation was canceled because evaluation routine was stopped", "uid", key.UID, "org", key.OrgID, "time", tick)
+                        sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
                         return
                     }
                     if dropped != nil {
-                        sch.log.Warn("Alert rule evaluation is too slow - dropped tick", "uid", key.UID, "org", key.OrgID, "time", tick)
+                        sch.log.Warn("Tick dropped because alert rule evaluation is too slow", append(key.LogContext(), "time", tick)...)
                         orgID := fmt.Sprint(key.OrgID)
                         sch.metrics.EvaluationMissed.WithLabelValues(orgID, item.rule.Title).Inc()
                     }
@@ -308,7 +307,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 
 func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
     logger := sch.log.New(key.LogContext()...)
-    logger.Debug("alert rule routine started")
+    logger.Debug("Alert rule routine started")
 
     orgID := fmt.Sprint(key.OrgID)
     evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
@@ -348,12 +347,12 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
         evalDuration.Observe(dur.Seconds())
         if results.HasErrors() {
             evalTotalFailures.Inc()
-            logger.Error("failed to evaluate alert rule", "results", results, "duration", dur)
+            logger.Error("Failed to evaluate alert rule", "results", results, "duration", dur)
         } else {
-            logger.Debug("alert rule evaluated", "results", results, "duration", dur)
+            logger.Debug("Alert rule evaluated", "results", results, "duration", dur)
         }
         if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task.
-            logger.Debug("skip updating the state because the context has been cancelled")
+            logger.Debug("Skip updating the state because the context has been cancelled")
             return
         }
         processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, e.rule, results, sch.getRuleExtraLabels(e))
@@ -387,16 +386,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
             // therefore, at the time when message from updateCh is processed the current rule will have
             // at least the same version (or greater) and the state created for the new version of the rule.
             if currentRuleVersion >= int64(lastVersion) {
-                logger.Info("skip updating rule because its current version is actual", "version", currentRuleVersion, "new_version", lastVersion)
+                logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", lastVersion)
                 continue
             }
-            logger.Info("clearing the state of the rule because version has changed", "version", currentRuleVersion, "new_version", lastVersion)
+            logger.Info("Clearing the state of the rule because version has changed", "version", currentRuleVersion, "newVersion", lastVersion)
             // clear the state. So the next evaluation will start from the scratch.
             clearState()
         // evalCh - used by the scheduler to signal that evaluation is needed.
         case ctx, ok := <-evalCh:
             if !ok {
-                logger.Debug("evaluation channel has been closed. Exiting")
+                logger.Debug("Evaluation channel has been closed. Exiting")
                 return nil
             }
             if evalRunning {
@@ -415,7 +414,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
                     // fetch latest alert rule version
                     if currentRuleVersion != newVersion {
                         if currentRuleVersion > 0 { // do not clean up state if the eval loop has just started.
-                            logger.Debug("got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "new_version", newVersion)
+                            logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
                             clearState()
                         }
                         currentRuleVersion = newVersion
@@ -424,7 +423,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
                     return nil
                 })
                 if err != nil {
-                    logger.Error("evaluation failed after all retries", "error", err)
+                    logger.Error("Evaluation failed after all retries", "error", err)
                 }
             }()
         case <-grafanaCtx.Done():
@@ -432,7 +431,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
             if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
                 clearState()
             }
-            logger.Debug("stopping alert rule routine")
+            logger.Debug("Stopping alert rule routine")
             return nil
         }
     }
diff --git a/pkg/services/ngalert/schedule/schedule_test.go b/pkg/services/ngalert/schedule/schedule_test.go
index d9dbb546d20..05ccd77fc34 100644
--- a/pkg/services/ngalert/schedule/schedule_test.go
+++ b/pkg/services/ngalert/schedule/schedule_test.go
@@ -107,20 +107,7 @@ func TestWarmStateCache(t *testing.T) {
         Labels: labels,
     }
     _ = dbstore.SaveAlertInstances(ctx, instance2)
-
-    cfg := setting.UnifiedAlertingSettings{
-        BaseInterval:            time.Second,
-        AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
-    }
-
-    schedCfg := schedule.SchedulerCfg{
-        Cfg:       cfg,
-        C:         clock.NewMock(),
-        Logger:    log.New("ngalert cache warming test"),
-        RuleStore: dbstore,
-        Metrics:   testMetrics.GetSchedulerMetrics(),
-    }
-    st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
+    st := state.NewManager(log.New("test"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
     st.Warm(ctx)
 
     t.Run("instance cache has expected entries", func(t *testing.T) {
@@ -168,11 +155,10 @@ func TestAlertingTicker(t *testing.T) {
             stopAppliedCh <- alertDefKey
         },
         RuleStore:   dbstore,
-        Logger:      log.New("ngalert schedule test"),
         Metrics:     testMetrics.GetSchedulerMetrics(),
         AlertSender: notifier,
     }
-    st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
+    st := state.NewManager(log.New("test"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
     appUrl := &url.URL{
         Scheme: "http",
         Host:   "localhost",
diff --git a/pkg/services/ngalert/schedule/schedule_unit_test.go b/pkg/services/ngalert/schedule/schedule_unit_test.go
index 9922ee9dbad..7ce5f83e59e 100644
--- a/pkg/services/ngalert/schedule/schedule_unit_test.go
+++ b/pkg/services/ngalert/schedule/schedule_unit_test.go
@@ -522,13 +522,12 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor
         C:           mockedClock,
         Evaluator:   evaluator,
         RuleStore:   rs,
-        Logger:      logger,
         Metrics:     m.GetSchedulerMetrics(),
         AlertSender: senderMock,
     }
 
     stateRs := state.FakeRuleReader{}
-    st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), nil, &stateRs, is, &image.NoopImageService{}, mockedClock, &state.FakeHistorian{})
+    st := state.NewManager(log.New("test"), m.GetStateMetrics(), nil, &stateRs, is, &image.NoopImageService{}, mockedClock, &state.FakeHistorian{})
 
     return NewScheduler(schedCfg, appUrl, st)
 }
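
The substance of the patch is a logging convention change: the scheduler now constructs its own component-scoped logger via log.New("ngalert.scheduler") instead of receiving one through SchedulerCfg, and rule-level messages carry the rule identity through key.LogContext() rather than ad-hoc "uid"/"org_id" pairs. Below is a minimal, self-contained sketch of that pattern; it deliberately uses the Go standard library's log/slog instead of Grafana's pkg/infra/log, and the AlertRuleKey type and its field names here are simplified stand-ins rather than the real ngmodels types.

package main

import (
    "log/slog"
    "os"
)

// AlertRuleKey is a simplified stand-in for ngmodels.AlertRuleKey.
type AlertRuleKey struct {
    OrgID int64
    UID   string
}

// LogContext returns the rule identity as structured key/value pairs,
// mirroring the key.LogContext() calls used throughout the patch.
func (k AlertRuleKey) LogContext() []any {
    return []any{"org_id", k.OrgID, "rule_uid", k.UID}
}

func main() {
    // Component-scoped logger: the scheduler names its own logger instead of
    // taking a generic one from its configuration.
    handler := slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug})
    schedLogger := slog.New(handler).With("logger", "ngalert.scheduler")

    // Each rule routine derives a child logger that carries the rule identity,
    // so individual messages stay short and field names stay consistent.
    key := AlertRuleKey{OrgID: 1, UID: "demo-rule-uid"}
    ruleLogger := schedLogger.With(key.LogContext()...)

    ruleLogger.Debug("Alert rule routine started")
    ruleLogger.Warn("Tick dropped because alert rule evaluation is too slow", "time", "2022-10-20T13:43:48Z")
}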