package schedule

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"time"

	"github.com/benbjohnson/clock"
	alertingModels "github.com/grafana/alerting/models"
	"github.com/hashicorp/go-multierror"
	prometheusModel "github.com/prometheus/common/model"
	"go.opentelemetry.io/otel/attribute"
	"golang.org/x/sync/errgroup"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/infra/tracing"
	"github.com/grafana/grafana/pkg/services/datasources"
	"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
	"github.com/grafana/grafana/pkg/services/ngalert/eval"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/ngalert/state"
	"github.com/grafana/grafana/pkg/services/org"
	"github.com/grafana/grafana/pkg/services/user"
	"github.com/grafana/grafana/pkg/util/ticker"
)

// ScheduleService is an interface for a service that schedules the evaluation
// of alert rules.
type ScheduleService interface {
	// Run the scheduler until the context is canceled or the scheduler returns
	// an error. The scheduler is terminated when this function returns.
	Run(context.Context) error
}

// AlertsSender is an interface for a service that is responsible for sending notifications to the end-user.
//
//go:generate mockery --name AlertsSender --structname AlertsSenderMock --inpackage --filename alerts_sender_mock.go --with-expecter
type AlertsSender interface {
	Send(key ngmodels.AlertRuleKey, alerts definitions.PostableAlerts)
}

// RulesStore is a store that provides alert rules for scheduling.
type RulesStore interface {
	GetAlertRulesKeysForScheduling(ctx context.Context) ([]ngmodels.AlertRuleKeyWithVersion, error)
	GetAlertRulesForScheduling(ctx context.Context, query *ngmodels.GetAlertRulesForSchedulingQuery) error
}
type schedule struct {
	// base tick rate (fastest possible configured check)
	baseInterval time.Duration

	// each alert rule gets its own channel and routine
	registry alertRuleInfoRegistry

	maxAttempts int64

	clock clock.Clock

	// evalApplied is only used for tests: test code can set it to a non-nil
	// function, and then it'll be called from the event loop whenever the
	// message from evalApplied is handled.
	evalAppliedFunc func(ngmodels.AlertRuleKey, time.Time)

	// stopApplied is only used for tests: test code can set it to a non-nil
	// function, and then it'll be called from the event loop whenever the
	// message from stopApplied is handled.
	stopAppliedFunc func(ngmodels.AlertRuleKey)

	log log.Logger

	evaluatorFactory eval.EvaluatorFactory

	ruleStore RulesStore

	stateManager *state.Manager

	appURL               *url.URL
	disableGrafanaFolder bool

	metrics *metrics.Scheduler

	alertsSender    AlertsSender
	minRuleInterval time.Duration

	// schedulableAlertRules contains the alert rules that are considered for
	// evaluation in the current tick. The evaluation of an alert rule in the
	// current tick depends on its evaluation interval and when it was
	// last evaluated.
	schedulableAlertRules alertRulesRegistry

	tracer tracing.Tracer
}

// SchedulerCfg is the scheduler configuration.
type SchedulerCfg struct {
	MaxAttempts          int64
	BaseInterval         time.Duration
	C                    clock.Clock
	MinRuleInterval      time.Duration
	DisableGrafanaFolder bool
	AppURL               *url.URL
	EvaluatorFactory     eval.EvaluatorFactory
	RuleStore            RulesStore
	Metrics              *metrics.Scheduler
	AlertSender          AlertsSender
	Tracer               tracing.Tracer
}

// NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, stateManager *state.Manager) *schedule {
	sch := schedule{
		registry:              alertRuleInfoRegistry{alertRuleInfo: make(map[ngmodels.AlertRuleKey]*alertRuleInfo)},
		maxAttempts:           cfg.MaxAttempts,
		clock:                 cfg.C,
		baseInterval:          cfg.BaseInterval,
		log:                   log.New("ngalert.scheduler"),
		evaluatorFactory:      cfg.EvaluatorFactory,
		ruleStore:             cfg.RuleStore,
		metrics:               cfg.Metrics,
		appURL:                cfg.AppURL,
		disableGrafanaFolder:  cfg.DisableGrafanaFolder,
		stateManager:          stateManager,
		minRuleInterval:       cfg.MinRuleInterval,
		schedulableAlertRules: alertRulesRegistry{rules: make(map[ngmodels.AlertRuleKey]*ngmodels.AlertRule)},
		alertsSender:          cfg.AlertSender,
		tracer:                cfg.Tracer,
	}

	return &sch
}
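
// Run starts the scheduler's evaluation loop and blocks until the provided
// context is canceled; errors from the loop are logged and nil is returned.
// A minimal usage sketch, assuming an already populated SchedulerCfg and
// state.Manager (cfg, stateManager and ctx here are illustrative):
//
//	sch := NewScheduler(cfg, stateManager)
//	_ = sch.Run(ctx)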
func (sch *schedule) Run(ctx context.Context) error {
	t := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
	defer t.Stop()

	if err := sch.schedulePeriodic(ctx, t); err != nil {
		sch.log.Error("Failure while running the rule evaluation loop", "error", err)
	}
	return nil
}

// deleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
func (sch *schedule) deleteAlertRule(keys ...ngmodels.AlertRuleKey) {
	for _, key := range keys {
		// It can happen that the scheduler has deleted the alert rule before the
		// Ruler API has called DeleteAlertRule. This can happen as requests to
		// the Ruler API do not hold an exclusive lock over all scheduler operations.
		if _, ok := sch.schedulableAlertRules.del(key); !ok {
			sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", key.LogContext()...)
		}
		// Delete the rule routine
		ruleInfo, ok := sch.registry.del(key)
		if !ok {
			sch.log.Info("Alert rule cannot be stopped as it is not running", key.LogContext()...)
			continue
		}
		// stop rule evaluation
		ruleInfo.stop(errRuleDeleted)
	}
	// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
	alertRules, _ := sch.schedulableAlertRules.all()
	sch.updateRulesMetrics(alertRules)
}
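
// schedulePeriodic is the scheduler's main loop: it calls processTick on every
// base-interval tick and returns once the context is canceled and all rule
// evaluation routines have stopped.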
func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
	dispatcherGroup, ctx := errgroup.WithContext(ctx)
	for {
		select {
		case tick := <-t.C:
			// We use Round(0) on the start time to remove the monotonic clock.
			// This is required as ticks from the ticker and time.Now() can have
			// a monotonic clock that when subtracted do not represent the delta
			// in wall clock time.
			start := time.Now().Round(0)
			sch.metrics.BehindSeconds.Set(start.Sub(tick).Seconds())

			sch.processTick(ctx, dispatcherGroup, tick)

			sch.metrics.SchedulePeriodicDuration.Observe(time.Since(start).Seconds())
		case <-ctx.Done():
			// waiting for all rule evaluation routines to stop
			waitErr := dispatcherGroup.Wait()
			return waitErr
		}
	}
}
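
// readyToRunItem pairs a rule's evaluation routine with the evaluation payload
// that will be sent to it in the current tick.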
type readyToRunItem struct {
	ruleInfo *alertRuleInfo
	evaluation
}
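
// updateRulesMetrics updates the per-org gauges of active and paused rules as
// well as the total number and hash of schedulable rules.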
func (sch *schedule) updateRulesMetrics(alertRules []*ngmodels.AlertRule) {
	orgs := make(map[int64]int64, len(alertRules))
	orgsPaused := make(map[int64]int64, len(alertRules))
	for _, rule := range alertRules {
		orgs[rule.OrgID]++
		if rule.IsPaused {
			orgsPaused[rule.OrgID]++
		}
	}

	for orgID, numRules := range orgs {
		numRulesPaused := orgsPaused[orgID]
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRuleActiveLabelValue).Set(float64(numRules - numRulesPaused))
		sch.metrics.GroupRules.WithLabelValues(fmt.Sprint(orgID), metrics.AlertRulePausedLabelValue).Set(float64(numRulesPaused))
	}

	// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
	// scheduled as rules could be removed before we get a chance to evaluate them.
	sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
	sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
}

// TODO refactor to accept a callback for tests that will be called with things that are returned currently, and return nothing.
// Returns a slice of rules that were scheduled for evaluation, a map of stopped rules, and a slice of updated rules.
func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}, []ngmodels.AlertRuleKeyWithVersion) {
	tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())

	// Update the local registry. If there was a difference between the previous state and the current one,
	// rulesDiff will contain the keys of the rules that were updated.
	rulesDiff, err := sch.updateSchedulableAlertRules(ctx)
	updated := rulesDiff.updated
	if updated == nil { // make sure map is not nil
		updated = map[ngmodels.AlertRuleKey]struct{}{}
	}
	if err != nil {
		sch.log.Error("Failed to update alert rules", "error", err)
	}

	// This is the new current state. rulesDiff contains the previously existing rules that differ between this
	// state and the previous one.
	alertRules, folderTitles := sch.schedulableAlertRules.all()

	// registeredDefinitions is a map used for finding deleted alert rules.
	// Initially it is assigned all known alert rules from the previous cycle;
	// each alert rule found also in this cycle is removed from it,
	// so, at the end, the remaining registered alert rules are the deleted ones.
	registeredDefinitions := sch.registry.keyMap()

	sch.updateRulesMetrics(alertRules)

	readyToRun := make([]readyToRunItem, 0)
	updatedRules := make([]ngmodels.AlertRuleKeyWithVersion, 0, len(updated)) // this is needed for tests only
	missingFolder := make(map[string][]string)
	for _, item := range alertRules {
		key := item.GetKey()
		ruleInfo, newRoutine := sch.registry.getOrCreateInfo(ctx, key)

		// enforce minimum evaluation interval
		if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
			sch.log.Debug("Interval adjusted", append(key.LogContext(), "originalInterval", item.IntervalSeconds, "adjustedInterval", sch.minRuleInterval.Seconds())...)
			item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
		}

		invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0

		if newRoutine && !invalidInterval {
			dispatcherGroup.Go(func() error {
				return sch.ruleRoutine(ruleInfo.ctx, key, ruleInfo.evalCh, ruleInfo.updateCh)
			})
		}

		if invalidInterval {
			// this is expected to always be false
			// given that we validate the interval during alert rule updates
			sch.log.Warn("Rule has an invalid interval and will be ignored. Interval should be divided exactly by scheduler interval", append(key.LogContext(), "ruleInterval", time.Duration(item.IntervalSeconds)*time.Second, "schedulerInterval", sch.baseInterval)...)
			continue
		}

		itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
		isReadyToRun := item.IntervalSeconds != 0 && tickNum%itemFrequency == 0
		if isReadyToRun {
			var folderTitle string
			if !sch.disableGrafanaFolder {
				title, ok := folderTitles[item.NamespaceUID]
				if ok {
					folderTitle = title
				} else {
					missingFolder[item.NamespaceUID] = append(missingFolder[item.NamespaceUID], item.UID)
				}
			}
			readyToRun = append(readyToRun, readyToRunItem{ruleInfo: ruleInfo, evaluation: evaluation{
				scheduledAt: tick,
				rule:        item,
				folderTitle: folderTitle,
			}})
		}
		if _, isUpdated := updated[key]; isUpdated && !isReadyToRun {
			// If the rule is not due for evaluation, check whether it was just updated and, if so,
			// notify its evaluation routine about the update.
			sch.log.Debug("Rule has been updated. Notifying evaluation routine", key.LogContext()...)
			go func(ri *alertRuleInfo, rule *ngmodels.AlertRule) {
				ri.update(ruleVersionAndPauseStatus{
					Version:  ruleVersion(rule.Version),
					IsPaused: rule.IsPaused,
				})
			}(ruleInfo, item)
			updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
				Version:      item.Version,
				AlertRuleKey: item.GetKey(),
			})
		}

		// remove the alert rule from the registered alert rules
		delete(registeredDefinitions, key)
	}

	if len(missingFolder) > 0 { // if this happens then there can be problems with fetching folders from the database.
		sch.log.Warn("Unable to obtain folder titles for some rules", "missingFolderUIDToRuleUID", missingFolder)
	}
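
	// Spread the evaluations of this tick evenly across the base interval so that
	// rules due in the same tick do not all start at the same instant.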
	var step int64 = 0
	if len(readyToRun) > 0 {
		step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
	}

	for i := range readyToRun {
		item := readyToRun[i]

		time.AfterFunc(time.Duration(int64(i)*step), func() {
			key := item.rule.GetKey()
			success, dropped := item.ruleInfo.eval(&item.evaluation)
			if !success {
				sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
				return
			}
			if dropped != nil {
				sch.log.Warn("Tick dropped because alert rule evaluation is too slow", append(key.LogContext(), "time", tick)...)
				orgID := fmt.Sprint(key.OrgID)
				sch.metrics.EvaluationMissed.WithLabelValues(orgID, item.rule.Title).Inc()
			}
		})
	}

	// unregister and stop routines of the deleted alert rules
	toDelete := make([]ngmodels.AlertRuleKey, 0, len(registeredDefinitions))
	for key := range registeredDefinitions {
		toDelete = append(toDelete, key)
	}
	sch.deleteAlertRule(toDelete...)
	return readyToRun, registeredDefinitions, updatedRules
}
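
// ruleRoutine is the per-rule evaluation loop. It consumes evaluation requests from evalCh,
// reacts to rule updates arriving on updateCh, and exits when its context is canceled.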
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertRuleKey, evalCh <-chan *evaluation, updateCh <-chan ruleVersionAndPauseStatus) error {
	grafanaCtx = ngmodels.WithRuleKey(grafanaCtx, key)
	logger := sch.log.FromContext(grafanaCtx)
	logger.Debug("Alert rule routine started")

	orgID := fmt.Sprint(key.OrgID)
	evalTotal := sch.metrics.EvalTotal.WithLabelValues(orgID)
	evalDuration := sch.metrics.EvalDuration.WithLabelValues(orgID)
	evalTotalFailures := sch.metrics.EvalFailures.WithLabelValues(orgID)
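
	// notify converts the given state transitions into expired (resolved) alerts and sends them, if any.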
	notify := func(states []state.StateTransition) {
		expiredAlerts := FromAlertsStateToStoppedAlert(states, sch.appURL, sch.clock)
		if len(expiredAlerts.PostableAlerts) > 0 {
			sch.alertsSender.Send(key, expiredAlerts)
		}
	}
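
	// resetState clears the rule's state in the state manager, marking it as updated or paused,
	// and notifies about the cleared alerts.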
	resetState := func(ctx context.Context, isPaused bool) {
		rule := sch.schedulableAlertRules.get(key)
		reason := ngmodels.StateReasonUpdated
		if isPaused {
			reason = ngmodels.StateReasonPaused
		}
		states := sch.stateManager.ResetStateByRuleUID(ctx, rule, reason)
		notify(states)
	}
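
	// evaluate runs a single evaluation attempt of the rule: it builds the evaluator, executes the
	// rule condition, processes the results through the state manager, and sends any resulting alerts.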
	evaluate := func(ctx context.Context, attempt int64, e *evaluation, span tracing.Span) {
		logger := logger.New("version", e.rule.Version, "attempt", attempt, "now", e.scheduledAt)
		start := sch.clock.Now()

		schedulerUser := &user.SignedInUser{
			UserID:           -1,
			IsServiceAccount: true,
			Login:            "grafana_scheduler",
			OrgID:            e.rule.OrgID,
			OrgRole:          org.RoleAdmin,
			Permissions: map[int64]map[string][]string{
				e.rule.OrgID: {
					datasources.ActionQuery: []string{
						datasources.ScopeAll,
					},
				},
			},
		}

		evalCtx := eval.Context(ctx, schedulerUser)
		ruleEval, err := sch.evaluatorFactory.Create(evalCtx, e.rule.GetEvalCondition())
		var results eval.Results
		var dur time.Duration
		if err != nil {
			dur = sch.clock.Now().Sub(start)
			logger.Error("Failed to build rule evaluator", "error", err)
		} else {
			results, err = ruleEval.Evaluate(ctx, e.scheduledAt)
			dur = sch.clock.Now().Sub(start)
			if err != nil {
				logger.Error("Failed to evaluate rule", "error", err, "duration", dur)
			}
		}

		evalTotal.Inc()
		evalDuration.Observe(dur.Seconds())
		if err != nil || results.HasErrors() {
			evalTotalFailures.Inc()
			if results == nil {
				results = append(results, eval.NewResultFromError(err, e.scheduledAt, dur))
			}
			if err == nil {
				for _, result := range results {
					if result.Error != nil {
						err = multierror.Append(err, result.Error)
					}
				}
			}
			span.RecordError(err)
			span.AddEvents(
				[]string{"error", "message"},
				[]tracing.EventValue{
					{Str: fmt.Sprintf("%v", err)},
					{Str: "rule evaluation failed"},
				})
		} else {
			logger.Debug("Alert rule evaluated", "results", results, "duration", dur)
			span.AddEvents(
				[]string{"message", "results"},
				[]tracing.EventValue{
					{Str: "rule evaluated"},
					{Num: int64(len(results))},
				})
		}
		if ctx.Err() != nil { // check whether the context has been cancelled; the evaluation can be a long-running task.
			logger.Debug("Skip updating the state because the context has been cancelled")
			return
		}
		processedStates := sch.stateManager.ProcessEvalResults(ctx, e.scheduledAt, e.rule, results, sch.getRuleExtraLabels(e))
		alerts := FromStateTransitionToPostableAlerts(processedStates, sch.stateManager, sch.appURL)
		span.AddEvents(
			[]string{"message", "state_transitions", "alerts_to_send"},
			[]tracing.EventValue{
				{Str: "results processed"},
				{Num: int64(len(processedStates))},
				{Num: int64(len(alerts.PostableAlerts))},
			})
		if len(alerts.PostableAlerts) > 0 {
			sch.alertsSender.Send(key, alerts)
		}
	}
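
	// retryIfError calls f up to maxAttempts times and returns the last error if every attempt fails.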
	retryIfError := func(f func(attempt int64) error) error {
		var attempt int64
		var err error
		for attempt = 0; attempt < sch.maxAttempts; attempt++ {
			err = f(attempt)
			if err == nil {
				return nil
			}
		}
		return err
	}

	evalRunning := false
	var currentRuleVersion int64 = 0
	defer sch.stopApplied(key)

	for {
		select {
		// used by external services (API) to notify that the rule is updated.
		case ctx := <-updateCh:
			// Sometimes this can happen when, for example, a rule evaluation took so long that there
			// were two concurrent messages in updateCh and evalCh, and the eval's one got processed first.
			// Therefore, at the time the message from updateCh is processed, the current rule already has
			// at least the same version (or greater) and the state created for the new version of the rule.
			if currentRuleVersion >= int64(ctx.Version) {
				logger.Info("Skip updating rule because its current version is actual", "version", currentRuleVersion, "newVersion", ctx.Version)
				continue
			}

			logger.Info("Clearing the state of the rule because it was updated", "version", currentRuleVersion, "newVersion", ctx.Version, "isPaused", ctx.IsPaused)
			// clear the state so the next evaluation starts from scratch.
			resetState(grafanaCtx, ctx.IsPaused)
		// evalCh - used by the scheduler to signal that evaluation is needed.
		case ctx, ok := <-evalCh:
			if !ok {
				logger.Debug("Evaluation channel has been closed. Exiting")
				return nil
			}
			if evalRunning {
				continue
			}

			func() {
				evalRunning = true
				defer func() {
					evalRunning = false
					sch.evalApplied(key, ctx.scheduledAt)
				}()

				err := retryIfError(func(attempt int64) error {
					newVersion := ctx.rule.Version
					isPaused := ctx.rule.IsPaused
					// fetch latest alert rule version
					if currentRuleVersion != newVersion {
						// Do not clean up state if the eval loop has just started.
						// We need to reset state if the loop has started and the alert is already paused. It can happen,
						// if we have an alert with state and we do file provisioning with stateful Grafana, that the state
						// lingers in the DB and won't be cleaned up until the next alert rule update.
						if currentRuleVersion > 0 || isPaused {
							logger.Debug("Got a new version of alert rule. Clear up the state and refresh extra labels", "version", currentRuleVersion, "newVersion", newVersion)
							resetState(grafanaCtx, isPaused)
						}
						currentRuleVersion = newVersion
					}

					if isPaused {
						return nil
					}

					tracingCtx, span := sch.tracer.Start(grafanaCtx, "alert rule execution")
					defer span.End()

					span.SetAttributes("rule_uid", ctx.rule.UID, attribute.String("rule_uid", ctx.rule.UID))
					span.SetAttributes("org_id", ctx.rule.OrgID, attribute.Int64("org_id", ctx.rule.OrgID))
					span.SetAttributes("rule_version", ctx.rule.Version, attribute.Int64("rule_version", ctx.rule.Version))
					utcTick := ctx.scheduledAt.UTC().Format(time.RFC3339Nano)
					span.SetAttributes("tick", utcTick, attribute.String("tick", utcTick))

					evaluate(tracingCtx, attempt, ctx, span)
					return nil
				})
				if err != nil {
					logger.Error("Evaluation failed after all retries", "error", err)
				}
			}()
		case <-grafanaCtx.Done():
			// clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
			if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
				// We do not want a context to be unbounded which could potentially cause a go routine running
				// indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
				// cases.
				ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
				defer cancelFunc()
				states := sch.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
				notify(states)
			}
			logger.Debug("Stopping alert rule routine")
			return nil
		}
	}
}

// evalApplied is only used in tests.
func (sch *schedule) evalApplied(alertDefKey ngmodels.AlertRuleKey, now time.Time) {
	if sch.evalAppliedFunc == nil {
		return
	}
	sch.evalAppliedFunc(alertDefKey, now)
}

// stopApplied is only used in tests.
func (sch *schedule) stopApplied(alertDefKey ngmodels.AlertRuleKey) {
	if sch.stopAppliedFunc == nil {
		return
	}
	sch.stopAppliedFunc(alertDefKey)
}
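
// getRuleExtraLabels returns the labels that the scheduler attaches to every evaluation result:
// the namespace UID, the rule title and UID and, unless disabled, the folder title.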
func (sch *schedule) getRuleExtraLabels(evalCtx *evaluation) map[string]string {
	extraLabels := make(map[string]string, 4)

	extraLabels[alertingModels.NamespaceUIDLabel] = evalCtx.rule.NamespaceUID
	extraLabels[prometheusModel.AlertNameLabel] = evalCtx.rule.Title
	extraLabels[alertingModels.RuleUIDLabel] = evalCtx.rule.UID

	if !sch.disableGrafanaFolder {
		extraLabels[ngmodels.FolderTitleLabel] = evalCtx.folderTitle
	}
	return extraLabels
}