Alerting: update format of logs in scheduler (#57302)
* Change the severity level of the log messages
parent 8cebaf345d
commit f3c219a980
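The diff below applies a consistent set of conventions to the scheduler's log calls: messages start with a capital letter, context keys use camelCase instead of snake_case, rule identity is attached via key.LogContext() rather than ad-hoc "uid"/"org_id" pairs, and several messages move to a more appropriate severity level. A minimal, self-contained sketch of the same before/after convention, written against Go's standard log/slog package rather than Grafana's internal logger (the logContext helper is hypothetical, standing in for ngmodels.AlertRuleKey.LogContext()):

package main

import (
	"log/slog"
	"os"
)

// logContext is a hypothetical stand-in for ngmodels.AlertRuleKey.LogContext():
// it returns the rule identity as a reusable list of key/value pairs.
func logContext(orgID int64, ruleUID string) []any {
	return []any{"org", orgID, "ruleUID", ruleUID}
}

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug}))

	// Before: lowercase message, snake_case keys, ad-hoc identity fields.
	logger.Debug("got folder title updated event. updating rules in the folder", "folder_uid", "abc123")

	// After: capitalized message, camelCase keys, and (where relevant) a higher severity.
	logger.Info("Got folder title updated event. updating rules in the folder", "folderUID", "abc123")

	// Rule identity attached once via the shared helper instead of repeated "uid"/"org_id" pairs.
	logger.Info("Alert rule cannot be stopped as it is not running", logContext(1, "abc123")...)
}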
@@ -183,7 +183,6 @@ func (ng *AlertNG) init() error {
 	schedCfg := schedule.SchedulerCfg{
 		Cfg: ng.Cfg.UnifiedAlerting,
 		C: clk,
-		Logger: ng.Log,
 		Evaluator: eval.NewEvaluator(ng.Cfg, ng.Log, ng.DataSourceCache, ng.ExpressionService),
 		RuleStore: store,
 		Metrics: ng.Metrics.GetSchedulerMetrics(),
@@ -246,19 +245,19 @@ func subscribeToFolderChanges(logger log.Logger, bus bus.Bus, dbStore api.RuleSt
 	bus.AddEventListener(func(ctx context.Context, e *events.FolderTitleUpdated) error {
 		// do not block the upstream execution
 		go func(evt *events.FolderTitleUpdated) {
-			logger.Debug("Got folder title updated event. updating rules in the folder", "folder_uid", evt.UID)
+			logger.Info("Got folder title updated event. updating rules in the folder", "folderUID", evt.UID)
 			updated, err := dbStore.IncreaseVersionForAllRulesInNamespace(context.Background(), evt.OrgID, evt.UID)
 			if err != nil {
-				logger.Error("Failed to update alert rules in the folder after its title was changed", "error", err, "folder_uid", evt.UID, "folder", evt.Title)
+				logger.Error("Failed to update alert rules in the folder after its title was changed", "error", err, "folderUID", evt.UID, "folder", evt.Title)
 				return
 			}
 			if len(updated) > 0 {
-				logger.Debug("rules that belong to the folder have been updated successfully. clearing their status", "updated_rules", len(updated))
+				logger.Info("Rules that belong to the folder have been updated successfully. Clearing their status", "folderUID", evt.UID, "updatedRules", len(updated))
 				for _, key := range updated {
 					scheduler.UpdateAlertRule(key.AlertRuleKey, key.Version)
 				}
 			} else {
-				logger.Debug("no alert rules found in the folder. nothing to update", "folder_uid", evt.UID, "folder", evt.Title)
+				logger.Debug("No alert rules found in the folder. nothing to update", "folderUID", evt.UID, "folder", evt.Title)
 			}
 		}(e)
 		return nil
@@ -267,7 +266,7 @@ func subscribeToFolderChanges(logger log.Logger, bus bus.Bus, dbStore api.RuleSt
 
 // Run starts the scheduler and Alertmanager.
 func (ng *AlertNG) Run(ctx context.Context) error {
-	ng.Log.Debug("ngalert starting")
+	ng.Log.Debug("Starting")
 	ng.stateManager.Warm(ctx)
 
 	children, subCtx := errgroup.WithContext(ctx)
@@ -103,7 +103,6 @@ type schedule struct {
 type SchedulerCfg struct {
 	Cfg setting.UnifiedAlertingSettings
 	C clock.Clock
-	Logger log.Logger
 	EvalAppliedFunc func(ngmodels.AlertRuleKey, time.Time)
 	StopAppliedFunc func(ngmodels.AlertRuleKey)
 	Evaluator eval.Evaluator
@@ -119,7 +118,7 @@ func NewScheduler(cfg SchedulerCfg, appURL *url.URL, stateManager *state.Manager
 		maxAttempts: cfg.Cfg.MaxAttempts,
 		clock: cfg.C,
 		baseInterval: cfg.Cfg.BaseInterval,
-		log: cfg.Logger,
+		log: log.New("ngalert.scheduler"),
 		evalAppliedFunc: cfg.EvalAppliedFunc,
 		stopAppliedFunc: cfg.StopAppliedFunc,
 		evaluator: cfg.Evaluator,
@@ -141,7 +140,7 @@ func (sch *schedule) Run(ctx context.Context) error {
 	defer t.Stop()
 
 	if err := sch.schedulePeriodic(ctx, t); err != nil {
-		sch.log.Error("failure while running the rule evaluation loop", "error", err)
+		sch.log.Error("Failure while running the rule evaluation loop", "error", err)
 	}
 	return nil
 }
@@ -162,12 +161,12 @@ func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
 		// Ruler API has called DeleteAlertRule. This can happen as requests to
 		// the Ruler API do not hold an exclusive lock over all scheduler operations.
 		if _, ok := sch.schedulableAlertRules.del(key); !ok {
-			sch.log.Info("alert rule cannot be removed from the scheduler as it is not scheduled", "uid", key.UID, "org_id", key.OrgID)
+			sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", key.LogContext()...)
 		}
 		// Delete the rule routine
 		ruleInfo, ok := sch.registry.del(key)
 		if !ok {
-			sch.log.Info("alert rule cannot be stopped as it is not running", "uid", key.UID, "org_id", key.OrgID)
+			sch.log.Info("Alert rule cannot be stopped as it is not running", key.LogContext()...)
 			continue
 		}
 		// stop rule evaluation
@@ -194,7 +193,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 		tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
 
 		if err := sch.updateSchedulableAlertRules(ctx); err != nil {
-			sch.log.Error("scheduler failed to update alert rules", "error", err)
+			sch.log.Error("Failed to update alert rules", "error", err)
 		}
 		alertRules, folderTitles := sch.schedulableAlertRules.all()
 
@@ -222,7 +221,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 
 			// enforce minimum evaluation interval
 			if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
-				sch.log.Debug("interval adjusted", "rule_interval_seconds", item.IntervalSeconds, "min_interval_seconds", sch.minRuleInterval.Seconds(), "key", key)
+				sch.log.Debug("Interval adjusted", append(key.LogContext(), "originalInterval", item.IntervalSeconds, "adjustedInterval", sch.minRuleInterval.Seconds())...)
 				item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
 			}
 
@@ -237,7 +236,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 			if invalidInterval {
 				// this is expected to be always false
 				// given that we validate interval during alert rule updates
-				sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
+				sch.log.Warn("Rule has an invalid interval and will be ignored. Interval should be divided exactly by scheduler interval", append(key.LogContext(), "ruleInterval", time.Duration(item.IntervalSeconds)*time.Second, "schedulerInterval", sch.baseInterval)...)
 				continue
 			}
 
@@ -264,7 +263,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 		}
 
 		if len(missingFolder) > 0 { // if this happens then there can be problems with fetching folders from the database.
-			sch.log.Warn("unable to find obtain folder titles for some rules", "folder_to_rule_map", missingFolder)
+			sch.log.Warn("Unable to obtain folder titles for some rules", "missingFolderUIDToRuleUID", missingFolder)
 		}
 
 		var step int64 = 0
@@ -279,11 +278,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 			key := item.rule.GetKey()
 			success, dropped := item.ruleInfo.eval(&item.evaluation)
 			if !success {
-				sch.log.Debug("scheduled evaluation was canceled because evaluation routine was stopped", "uid", key.UID, "org", key.OrgID, "time", tick)
+				sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
 				return
 			}
 			if dropped != nil {
-				sch.log.Warn("Alert rule evaluation is too slow - dropped tick", "uid", key.UID, "org", key.OrgID, "time", tick)
+				sch.log.Warn("Tick dropped because alert rule evaluation is too slow", append(key.LogContext(), "time", tick)...)
 				orgID := fmt.Sprint(key.OrgID)
 				sch.metrics.EvaluationMissed.WithLabelValues(orgID, item.rule.Title).Inc()
 			}
@@ -308,7 +307,7 @@ func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
 
 func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
 	logger := sch.log.New(key.LogContext()...)
-	logger.Debug("alert rule routine started")
+	logger.Debug("Alert rule routine started")
 
 	orgID := fmt.Sprint(key.OrgID)
 	evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
@@ -348,12 +347,12 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 		evalDuration.Observe(dur.Seconds())
 		if results.HasErrors() {
 			evalTotalFailures.Inc()
-			logger.Error("failed to evaluate alert rule", "results", results, "duration", dur)
+			logger.Error("Failed to evaluate alert rule", "results", results, "duration", dur)
 		} else {
-			logger.Debug("alert rule evaluated", "results", results, "duration", dur)
+			logger.Debug("Alert rule evaluated", "results", results, "duration", dur)
 		}
 		if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task.
-			logger.Debug("skip updating the state because the context has been cancelled")
+			logger.Debug("Skip updating the state because the context has been cancelled")
 			return
 		}
 		processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, e.rule, results, sch.getRuleExtraLabels(e))
@@ -387,16 +386,16 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			// therefore, at the time when message from updateCh is processed the current rule will have
 			// at least the same version (or greater) and the state created for the new version of the rule.
 			if currentRuleVersion >= int64(lastVersion) {
-				logger.Info("skip updating rule because its current version is actual", "version", currentRuleVersion, "new_version", lastVersion)
+				logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", lastVersion)
 				continue
 			}
-			logger.Info("clearing the state of the rule because version has changed", "version", currentRuleVersion, "new_version", lastVersion)
+			logger.Info("Clearing the state of the rule because version has changed", "version", currentRuleVersion, "newVersion", lastVersion)
 			// clear the state. So the next evaluation will start from the scratch.
 			clearState()
 		// evalCh - used by the scheduler to signal that evaluation is needed.
 		case ctx, ok := <-evalCh:
 			if !ok {
-				logger.Debug("evaluation channel has been closed. Exiting")
+				logger.Debug("Evaluation channel has been closed. Exiting")
 				return nil
 			}
 			if evalRunning {
@@ -415,7 +414,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 				// fetch latest alert rule version
 				if currentRuleVersion != newVersion {
 					if currentRuleVersion > 0 { // do not clean up state if the eval loop has just started.
-						logger.Debug("got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "new_version", newVersion)
+						logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
						clearState()
					}
					currentRuleVersion = newVersion
@@ -424,7 +423,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
					return nil
				})
				if err != nil {
-					logger.Error("evaluation failed after all retries", "error", err)
+					logger.Error("Evaluation failed after all retries", "error", err)
				}
			}()
		case <-grafanaCtx.Done():
@@ -432,7 +431,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
			if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
				clearState()
			}
-			logger.Debug("stopping alert rule routine")
+			logger.Debug("Stopping alert rule routine")
			return nil
		}
	}
@@ -107,20 +107,7 @@ func TestWarmStateCache(t *testing.T) {
 		Labels: labels,
 	}
 	_ = dbstore.SaveAlertInstances(ctx, instance2)
-
-	cfg := setting.UnifiedAlertingSettings{
-		BaseInterval: time.Second,
-		AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
-	}
-
-	schedCfg := schedule.SchedulerCfg{
-		Cfg: cfg,
-		C: clock.NewMock(),
-		Logger: log.New("ngalert cache warming test"),
-		RuleStore: dbstore,
-		Metrics: testMetrics.GetSchedulerMetrics(),
-	}
-	st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
+	st := state.NewManager(log.New("test"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
 	st.Warm(ctx)
 
 	t.Run("instance cache has expected entries", func(t *testing.T) {
@@ -168,11 +155,10 @@ func TestAlertingTicker(t *testing.T) {
 			stopAppliedCh <- alertDefKey
 		},
 		RuleStore: dbstore,
-		Logger: log.New("ngalert schedule test"),
 		Metrics: testMetrics.GetSchedulerMetrics(),
 		AlertSender: notifier,
 	}
-	st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
+	st := state.NewManager(log.New("test"), testMetrics.GetStateMetrics(), nil, dbstore, dbstore, &image.NoopImageService{}, clock.NewMock(), &state.FakeHistorian{})
 	appUrl := &url.URL{
 		Scheme: "http",
 		Host: "localhost",
@@ -522,13 +522,12 @@ func setupScheduler(t *testing.T, rs *fakeRulesStore, is *state.FakeInstanceStor
 		C: mockedClock,
 		Evaluator: evaluator,
 		RuleStore: rs,
-		Logger: logger,
 		Metrics: m.GetSchedulerMetrics(),
 		AlertSender: senderMock,
 	}
 
 	stateRs := state.FakeRuleReader{}
-	st := state.NewManager(schedCfg.Logger, m.GetStateMetrics(), nil, &stateRs, is, &image.NoopImageService{}, mockedClock, &state.FakeHistorian{})
+	st := state.NewManager(log.New("test"), m.GetStateMetrics(), nil, &stateRs, is, &image.NoopImageService{}, mockedClock, &state.FakeHistorian{})
 	return NewScheduler(schedCfg, appUrl, st)
 }
 
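Beyond the message wording, the diff also removes the Logger field from SchedulerCfg: the scheduler now constructs its own named logger (log.New("ngalert.scheduler")), and the tests pass log.New("test") directly to the state manager. A rough sketch of that ownership pattern, again using Go's standard log/slog instead of Grafana's infra/log package and a hypothetical scheduler type:

package main

import (
	"log/slog"
	"os"
)

// scheduler is a hypothetical component that owns its logger instead of
// receiving one through its config struct, mirroring the pattern in this commit.
type scheduler struct {
	log *slog.Logger
}

func newScheduler() *scheduler {
	base := slog.New(slog.NewTextHandler(os.Stdout, nil))
	return &scheduler{
		// The component name is fixed at construction time, not injected by callers.
		log: base.With("logger", "ngalert.scheduler"),
	}
}

func main() {
	sch := newScheduler()
	sch.log.Info("Starting")
}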