package schedule

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"time"

	prometheusModel "github.com/prometheus/common/model"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/datasources"
	"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
	"github.com/grafana/grafana/pkg/services/ngalert/eval"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/ngalert/state"
	"github.com/grafana/grafana/pkg/services/org"
	"github.com/grafana/grafana/pkg/services/user"
	"github.com/grafana/grafana/pkg/setting"
	"github.com/grafana/grafana/pkg/util/ticker"

	"github.com/benbjohnson/clock"
	"golang.org/x/sync/errgroup"
)

// ScheduleService is an interface for a service that schedules the evaluation
// of alert rules.
//
//go:generate mockery --name ScheduleService --structname FakeScheduleService --inpackage --filename schedule_mock.go --unroll-variadic=False
type ScheduleService interface {
	// Run the scheduler until the context is canceled or the scheduler returns
	// an error. The scheduler is terminated when this function returns.
	Run(context.Context) error
	// UpdateAlertRule notifies the scheduler that a rule has been changed.
	UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64)
	// DeleteAlertRule notifies the scheduler that rules have been deleted.
	DeleteAlertRule(keys ...ngmodels.AlertRuleKey)

	// the following are used by tests only
	evalApplied(ngmodels.AlertRuleKey, time.Time)
	stopApplied(ngmodels.AlertRuleKey)
}

// AlertsSender is an interface for a service that is responsible for sending notifications to the end-user.
//
//go:generate mockery --name AlertsSender --structname AlertsSenderMock --inpackage --filename alerts_sender_mock.go --with-expecter
type AlertsSender interface {
	Send(key ngmodels.AlertRuleKey, alerts definitions.PostableAlerts)
}

// RulesStore is a store that provides alert rules for scheduling
type RulesStore interface {
	GetAlertRulesKeysForScheduling(ctx context.Context) ([]ngmodels.AlertRuleKeyWithVersion, error)
	GetAlertRulesForScheduling(ctx context.Context, query *ngmodels.GetAlertRulesForSchedulingQuery) error
}

type schedule struct {
	// base tick rate (fastest possible configured check)
	baseInterval time.Duration

	// each alert rule gets its own channel and routine
	registry alertRuleInfoRegistry

	maxAttempts int64

	clock clock.Clock

	// evalApplied is only used for tests: test code can set it to non-nil
	// function, and then it'll be called from the event loop whenever the
	// message from evalApplied is handled.
	evalAppliedFunc func(ngmodels.AlertRuleKey, time.Time)

	// stopApplied is only used for tests: test code can set it to non-nil
	// function, and then it'll be called from the event loop whenever the
	// message from stopApplied is handled.
	stopAppliedFunc func(ngmodels.AlertRuleKey)

	log log.Logger

	evaluatorFactory eval.EvaluatorFactory

	ruleStore RulesStore

	stateManager *state.Manager

	appURL               *url.URL
	disableGrafanaFolder bool

	metrics *metrics.Scheduler

	alertsSender    AlertsSender
	minRuleInterval time.Duration

	// schedulableAlertRules contains the alert rules that are considered for
	// evaluation in the current tick. The evaluation of an alert rule in the
	// current tick depends on its evaluation interval and when it was
	// last evaluated.
	schedulableAlertRules alertRulesRegistry
}

// SchedulerCfg is the scheduler configuration.
type SchedulerCfg struct {
	Cfg              setting.UnifiedAlertingSettings
	C                clock.Clock
	EvalAppliedFunc  func(ngmodels.AlertRuleKey, time.Time)
	StopAppliedFunc  func(ngmodels.AlertRuleKey)
	EvaluatorFactory eval.EvaluatorFactory
	RuleStore        RulesStore
	Metrics          *metrics.Scheduler
	AlertSender      AlertsSender
}

// NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, appURL *url.URL, stateManager *state.Manager) *schedule {
	sch := schedule{
		registry:              alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
		maxAttempts:           cfg.Cfg.MaxAttempts,
		clock:                 cfg.C,
		baseInterval:          cfg.Cfg.BaseInterval,
		log:                   log.New("ngalert.scheduler"),
		evalAppliedFunc:       cfg.EvalAppliedFunc,
		stopAppliedFunc:       cfg.StopAppliedFunc,
		evaluatorFactory:      cfg.EvaluatorFactory,
		ruleStore:             cfg.RuleStore,
		metrics:               cfg.Metrics,
		appURL:                appURL,
		disableGrafanaFolder:  cfg.Cfg.ReservedLabels.IsReservedLabelDisabled(ngmodels.FolderTitleLabel),
		stateManager:          stateManager,
		minRuleInterval:       cfg.Cfg.MinInterval,
		schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
		alertsSender:          cfg.AlertSender,
	}

	return &sch
}
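
// Run starts the scheduler's ticker and blocks in the periodic scheduling loop until
// the context is canceled.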
func (sch *schedule) Run(ctx context.Context) error {
	t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
	defer t.Stop()

	if err := sch.schedulePeriodic(ctx, t); err != nil {
		sch.log.Error("Failure while running the rule evaluation loop", "error", err)
	}
	return nil
}

// UpdateAlertRule looks for the active rule evaluation and commands it to update the rule
func (sch *schedule) UpdateAlertRule(key ngmodels.AlertRuleKey, lastVersion int64) {
	ruleInfo, err := sch.registry.get(key)
	if err != nil {
		return
	}
	ruleInfo.update(ruleVersion(lastVersion))
}

// DeleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
func (sch *schedule) DeleteAlertRule(keys ...ngmodels.AlertRuleKey) {
	for _, key := range keys {
		// It can happen that the scheduler has deleted the alert rule before the
		// Ruler API has called DeleteAlertRule. This can happen as requests to
		// the Ruler API do not hold an exclusive lock over all scheduler operations.
		if _, ok := sch.schedulableAlertRules.del(key); !ok {
			sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", key.LogContext()...)
		}
		// Delete the rule routine
		ruleInfo, ok := sch.registry.del(key)
		if !ok {
			sch.log.Info("Alert rule cannot be stopped as it is not running", key.LogContext()...)
			continue
		}
		// stop rule evaluation
		ruleInfo.stop(errRuleDeleted)
	}
	// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
	alertRules, _ := sch.schedulableAlertRules.all()
	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
}
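
// schedulePeriodic is the scheduler's main loop: on every tick of the base interval it
// refreshes the set of schedulable rules, starts evaluation routines for new rules,
// sends evaluation requests to the rules that are due in this tick, and stops the
// routines of rules that have been deleted.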
func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
	dispatcherGroup, ctx := errgroup.WithContext(ctx)
	for {
		select {
		case tick := <-t.C:
			// We use Round(0) on the start time to remove the monotonic clock.
			// This is required as ticks from the ticker and time.Now() can have
			// a monotonic clock that when subtracted do not represent the delta
			// in wall clock time.
			start := time.Now().Round(0)
			sch.metrics.BehindSeconds.Set(start.Sub(tick).Seconds())

			tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())

			if err := sch.updateSchedulableAlertRules(ctx); err != nil {
				sch.log.Error("Failed to update alert rules", "error", err)
			}
			alertRules, folderTitles := sch.schedulableAlertRules.all()

			// registeredDefinitions is a map used for finding deleted alert rules
			// initially it is assigned to all known alert rules from the previous cycle
			// each alert rule found also in this cycle is removed
			// so, at the end, the remaining registered alert rules are the deleted ones
			registeredDefinitions := sch.registry.keyMap()

			// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
			// scheduled as rules could be removed before we get a chance to evaluate them.
			sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
			sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))

			type readyToRunItem struct {
				ruleInfo *alertRuleInfo
				evaluation
			}

			readyToRun := make([]readyToRunItem, 0)
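			// missingFolder collects, per folder UID, the UIDs of the rules whose folder
			// title could not be resolved; it is only used for the warning after the loop.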
			missingFolder := make(map[string][]string)
			for _, item := range alertRules {
				key := item.GetKey()
				ruleInfo, newRoutine := sch.registry.getOrCreateInfo(ctx, key)

				// enforce minimum evaluation interval
				if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
					sch.log.Debug("Interval adjusted", append(key.LogContext(), "originalInterval", item.IntervalSeconds, "adjustedInterval", sch.minRuleInterval.Seconds())...)
					item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
				}

				invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0

				if newRoutine && !invalidInterval {
					dispatcherGroup.Go(func() error {
						return sch.ruleRoutine(ruleInfo.ctx, key, ruleInfo.evalCh, ruleInfo.updateCh)
					})
				}

				if invalidInterval {
					// this is expected to be always false
					// given that we validate interval during alert rule updates
					sch.log.Warn("Rule has an invalid interval and will be ignored. Interval should be divided exactly by scheduler interval", append(key.LogContext(), "ruleInterval", time.Duration(item.IntervalSeconds)*time.Second, "schedulerInterval", sch.baseInterval)...)
					continue
				}
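
				// A rule is due in this tick when the tick counter is a multiple of the
				// rule's interval expressed in base intervals (itemFrequency).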
				itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
				if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
					var folderTitle string
					if !sch.disableGrafanaFolder {
						title, ok := folderTitles[item.NamespaceUID]
						if ok {
							folderTitle = title
						} else {
							missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
						}
					}
					readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
						scheduledAt: tick,
						rule:        item,
						folderTitle: folderTitle,
					}})
				}

				// remove the alert rule from the registered alert rules
				delete(registeredDefinitions, key)
			}

			if len(missingFolder) > 0 { // if this happens then there can be problems with fetching folders from the database.
				sch.log.Warn("Unable to obtain folder titles for some rules", "missingFolderUIDToRuleUID", missingFolder)
			}
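
			// Spread the evaluations of the due rules evenly across the base interval
			// so they do not all start at the same instant.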
			var step int64 = 0
			if len(readyToRun) > 0 {
				step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
			}

			for i := range readyToRun {
				item := readyToRun[i]
				time.AfterFunc(time.Duration(int64(i)*step), func() {
					key := item.rule.GetKey()
					success, dropped := item.ruleInfo.eval(&item.evaluation)
					if !success {
						sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
						return
					}
					if dropped != nil {
						sch.log.Warn("Tick dropped because alert rule evaluation is too slow", append(key.LogContext(), "time", tick)...)
						orgID := fmt.Sprint(key.OrgID)
						sch.metrics.EvaluationMissed.WithLabelValues(orgID, item.rule.Title).Inc()
					}
				})
			}

			// unregister and stop routines of the deleted alert rules
			for key := range registeredDefinitions {
				sch.DeleteAlertRule(key)
			}

			sch.metrics.SchedulePeriodicDuration.Observe(time.Since(start).Seconds())
		case <-ctx.Done():
			// waiting for all rule evaluation routines to stop
			waitErr := dispatcherGroup.Wait()
			// close the state manager and flush the state
			sch.stateManager.Close()
			return waitErr
		}
	}
}
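
// ruleRoutine is the evaluation loop of a single rule. It runs until the rule context
// is canceled, reacting to messages on updateCh (the rule was changed, so its state is
// cleared) and evalCh (an evaluation is due).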
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersion) error {
	grafanaCtx = ngmodels.WithRuleKey(grafanaCtx, key)
	logger := sch.log.FromContext(grafanaCtx)
	logger.Debug("Alert rule routine started")

	orgID := fmt.Sprint(key.OrgID)
	evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
	evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
	evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
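
	// clearState resets the rule's state in the state manager and notifies the alert
	// sender about the alerts that are resolved (expired) as a result.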
	clearState := func() {
		states := sch.stateManager.ResetStateByRuleUID(grafanaCtx, key)
		expiredAlerts := FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
		if len(expiredAlerts.PostableAlerts) > 0 {
			sch.alertsSender.Send(key, expiredAlerts)
		}
	}
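
	// evaluate builds an evaluator for the rule's condition, evaluates it for the
	// scheduled time, records evaluation metrics, feeds the results to the state
	// manager and forwards the resulting alerts to the sender.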
	evaluate := func(ctx context.Context, attempt int64, e *evaluation) {
		logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
		start := sch.clock.Now()
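
		// Evaluation queries run under a built-in "scheduler" identity that is allowed
		// to query every data source in the rule's organization.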
		schedulerUser := &user.SignedInUser{
			UserID:  -1,
			Login:   "grafana_scheduler",
			OrgID:   e.rule.OrgID,
			OrgRole: org.RoleAdmin,
			Permissions: map[int64]map[string][]string{
				e.rule.OrgID: {
					datasources.ActionQuery: []string{
						datasources.ScopeAll,
					},
				},
			},
		}

		evalCtx := eval.Context(ctx, schedulerUser)
		ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
		var results eval.Results
		var dur time.Duration
		if err == nil {
			results, err = ruleEval.Evaluate(ctx, e.scheduledAt)
			dur = sch.clock.Now().Sub(start)
			if err != nil {
				logger.Error("Failed to evaluate rule", "error", err, "duration", dur)
			}
		} else {
			dur = sch.clock.Now().Sub(start)
			logger.Error("Failed to build rule evaluator", "error", err)
		}

		evalTotal.Inc()
		evalDuration.Observe(dur.Seconds())
		if err != nil || results.HasErrors() {
			evalTotalFailures.Inc()
			if results == nil {
				results = append(results, eval.NewResultFromError(err, e.scheduledAt, dur))
			}
		} else {
			logger.Debug("Alert rule evaluated", "results", results, "duration", dur)
		}
		if ctx.Err() != nil { // check if the context is not cancelled. The evaluation can be a long-running task.
			logger.Debug("Skip updating the state because the context has been cancelled")
			return
		}
		processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, e.rule, results, sch.getRuleExtraLabels(e))
		alerts := FromAlertStateToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
		if len(alerts.PostableAlerts) > 0 {
			sch.alertsSender.Send(key, alerts)
		}
	}
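
	// retryIfError calls f up to sch.maxAttempts times and returns the last error if
	// no attempt succeeds.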
	retryIfError := func(f func(attempt int64) error) error {
		var attempt int64
		var err error
		for attempt = 0; attempt < sch.maxAttempts; attempt++ {
			err = f(attempt)
			if err == nil {
				return nil
			}
		}
		return err
	}
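
	// evalRunning marks that an evaluation is in progress; evaluation messages that
	// arrive while it is set are skipped.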
	evalRunning := false
	var currentRuleVersion int64 = 0
	defer sch.stopApplied(key)
	for {
		select {
		// used by external services (API) to notify that rule is updated.
		case lastVersion := <-updateCh:
			// sometimes it can happen when, for example, the rule evaluation took so long,
			// and there were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
			// therefore, at the time when message from updateCh is processed the current rule will have
			// at least the same version (or greater) and the state created for the new version of the rule.
			if currentRuleVersion >= int64(lastVersion) {
				logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", lastVersion)
				continue
			}
			logger.Info("Clearing the state of the rule because version has changed", "version", currentRuleVersion, "newVersion", lastVersion)
			// clear the state. So the next evaluation will start from scratch.
			clearState()
		// evalCh - used by the scheduler to signal that evaluation is needed.
		case ctx, ok := <-evalCh:
			if !ok {
				logger.Debug("Evaluation channel has been closed. Exiting")
				return nil
			}
			if evalRunning {
				continue
			}

			func() {
				evalRunning = true
				defer func() {
					evalRunning = false
					sch.evalApplied(key, ctx.scheduledAt)
				}()

				err := retryIfError(func(attempt int64) error {
					newVersion := ctx.rule.Version
					// the evaluation request carries the latest version of the rule;
					// if it differs from the one evaluated previously, reset the state first
					if currentRuleVersion != newVersion {
						if currentRuleVersion > 0 { // do not clean up state if the eval loop has just started.
							logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
							clearState()
						}
						currentRuleVersion = newVersion
					}
					evaluate(grafanaCtx, attempt, ctx)
					return nil
				})
				if err != nil {
					logger.Error("Evaluation failed after all retries", "error", err)
				}
			}()
		case <-grafanaCtx.Done():
			// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
			if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
				clearState()
			}
			logger.Debug("Stopping alert rule routine")
			return nil
		}
	}
}

// evalApplied is only used in tests.
func (sch *schedule) evalApplied(alertDefKey ngmodels.AlertRuleKey, now time.Time) {
	if sch.evalAppliedFunc == nil {
		return
	}
	sch.evalAppliedFunc(alertDefKey, now)
}

// stopApplied is only used in tests.
func (sch *schedule) stopApplied(alertDefKey ngmodels.AlertRuleKey) {
	if sch.stopAppliedFunc == nil {
		return
	}
	sch.stopAppliedFunc(alertDefKey)
}
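
// getRuleExtraLabels returns the reserved labels attached to every evaluation of the
// rule: the namespace UID, the rule UID, the alert name and, unless disabled, the
// folder title.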
func (sch *schedule) getRuleExtraLabels(evalCtx *evaluation) map[string]string {
	extraLabels := make(map[string]string, 4)

	extraLabels[ngmodels.NamespaceUIDLabel] = evalCtx.rule.NamespaceUID
	extraLabels[prometheusModel.AlertNameLabel] = evalCtx.rule.Title
	extraLabels[ngmodels.RuleUIDLabel] = evalCtx.rule.UID

	if !sch.disableGrafanaFolder {
		extraLabels[ngmodels.FolderTitleLabel] = evalCtx.folderTitle
	}
	return extraLabels
}