2021-04-23 14:32:25 -05:00
package state
import (
2021-09-14 09:08:04 -05:00
"context"
2021-10-04 13:04:37 -05:00
"net/url"
2021-04-23 14:32:25 -05:00
"time"
2022-06-22 11:18:42 -05:00
"github.com/benbjohnson/clock"
2023-06-23 05:36:07 -05:00
"github.com/grafana/dskit/concurrency"
2022-02-24 04:58:54 -06:00
"github.com/grafana/grafana-plugin-sdk-go/data"
2023-08-16 02:04:18 -05:00
"go.opentelemetry.io/otel/attribute"
2023-01-27 02:46:21 -06:00
2021-04-23 14:32:25 -05:00
"github.com/grafana/grafana/pkg/infra/log"
2023-08-16 02:04:18 -05:00
"github.com/grafana/grafana/pkg/infra/tracing"
2021-04-23 14:32:25 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/eval"
2021-04-30 11:28:06 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
2021-04-23 14:32:25 -05:00
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
2023-01-25 11:29:57 -06:00
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
2021-04-23 14:32:25 -05:00
)
2022-11-04 16:06:47 -05:00
var (
	// ResendDelay is the value NewManager assigns to Manager.ResendDelay.
	ResendDelay = 30 * time.Second

	// MetricsScrapeInterval is how often Manager.Run records the state cache metrics.
	// It is set to a reasonable default scrape interval for Prometheus.
	// TODO: parameterize?
	MetricsScrapeInterval = 15 * time.Second
)
2021-09-02 10:22:59 -05:00
2022-03-09 12:20:29 -06:00
// AlertInstanceManager is the read-only view of the state manager: it exposes the queries
// for the alert instances (states) that are currently known.
type AlertInstanceManager interface {
	// GetAll returns the states that belong to the organization orgID.
	GetAll(orgID int64) []*State
	// GetStatesForRuleUID returns the states of the rule alertRuleUID in the organization orgID.
	GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State
}
2021-04-23 14:32:25 -05:00
type Manager struct {
2021-07-07 11:18:31 -05:00
log log . Logger
2021-09-14 06:55:01 -05:00
metrics * metrics . State
2023-08-16 02:04:18 -05:00
tracer tracing . Tracer
2021-07-07 11:18:31 -05:00
2022-06-22 11:18:42 -05:00
clock clock . Clock
2021-05-19 15:15:09 -05:00
cache * cache
ResendDelay time . Duration
2021-07-07 11:18:31 -05:00
2022-10-05 15:32:20 -05:00
instanceStore InstanceStore
2022-11-09 15:06:49 -06:00
images ImageCapturer
2022-10-05 15:32:20 -05:00
historian Historian
2022-10-06 14:30:12 -05:00
externalURL * url . URL
2023-01-13 17:29:29 -06:00
2023-08-15 09:27:15 -05:00
doNotSaveNormalState bool
maxStateSaveConcurrency int
applyNoDataAndErrorToAllStates bool
2021-04-23 14:32:25 -05:00
}
2023-01-10 15:26:15 -06:00
type ManagerCfg struct {
Metrics * metrics . State
ExternalURL * url . URL
InstanceStore InstanceStore
Images ImageCapturer
Clock clock . Clock
Historian Historian
2023-01-13 17:29:29 -06:00
// DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods
DoNotSaveNormalState bool
2023-06-23 05:36:07 -05:00
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
MaxStateSaveConcurrency int
2023-08-15 09:27:15 -05:00
// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
ApplyNoDataAndErrorToAllStates bool
2023-08-16 02:04:18 -05:00
Tracer tracing . Tracer
2023-09-20 08:07:02 -05:00
Log log . Logger
2023-01-10 15:26:15 -06:00
}
func NewManager ( cfg ManagerCfg ) * Manager {
2022-11-04 16:06:47 -05:00
return & Manager {
2023-08-15 09:27:15 -05:00
cache : newCache ( ) ,
ResendDelay : ResendDelay , // TODO: make this configurable
2023-09-20 08:07:02 -05:00
log : cfg . Log ,
2023-08-15 09:27:15 -05:00
metrics : cfg . Metrics ,
instanceStore : cfg . InstanceStore ,
images : cfg . Images ,
historian : cfg . Historian ,
clock : cfg . Clock ,
externalURL : cfg . ExternalURL ,
doNotSaveNormalState : cfg . DoNotSaveNormalState ,
maxStateSaveConcurrency : cfg . MaxStateSaveConcurrency ,
applyNoDataAndErrorToAllStates : cfg . ApplyNoDataAndErrorToAllStates ,
2023-08-16 02:04:18 -05:00
tracer : cfg . Tracer ,
2021-04-23 14:32:25 -05:00
}
}
2022-11-04 16:06:47 -05:00
func ( st * Manager ) Run ( ctx context . Context ) error {
2023-08-15 09:27:15 -05:00
if st . applyNoDataAndErrorToAllStates {
st . log . Info ( "Running in alternative execution of Error/NoData mode" )
}
2022-11-04 16:06:47 -05:00
ticker := st . clock . Ticker ( MetricsScrapeInterval )
for {
select {
case <- ticker . C :
st . log . Debug ( "Recording state cache metrics" , "now" , st . clock . Now ( ) )
st . cache . recordMetrics ( st . metrics )
case <- ctx . Done ( ) :
st . log . Debug ( "Stopping" )
ticker . Stop ( )
return ctx . Err ( )
}
}
2021-04-23 14:32:25 -05:00
}
2022-11-04 13:23:08 -05:00
func ( st * Manager ) Warm ( ctx context . Context , rulesReader RuleReader ) {
2022-10-28 12:10:28 -05:00
if st . instanceStore == nil {
st . log . Info ( "Skip warming the state because instance store is not configured" )
2022-11-04 13:23:08 -05:00
return
2022-10-28 12:10:28 -05:00
}
2022-10-06 14:30:12 -05:00
startTime := time . Now ( )
st . log . Info ( "Warming state cache for startup" )
2021-07-07 11:18:31 -05:00
2022-02-08 07:49:04 -06:00
orgIds , err := st . instanceStore . FetchOrgIds ( ctx )
2021-07-07 11:18:31 -05:00
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch orgIds" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2022-10-06 14:30:12 -05:00
statesCount := 0
states := make ( map [ int64 ] map [ string ] * ruleStates , len ( orgIds ) )
2021-07-07 11:18:31 -05:00
for _ , orgId := range orgIds {
// Get Rules
ruleCmd := ngModels . ListAlertRulesQuery {
OrgID : orgId ,
}
2023-03-28 03:34:35 -05:00
alertRules , err := rulesReader . ListAlertRules ( ctx , & ruleCmd )
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch previous state" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2023-03-28 03:34:35 -05:00
ruleByUID := make ( map [ string ] * ngModels . AlertRule , len ( alertRules ) )
for _ , rule := range alertRules {
2021-07-07 11:18:31 -05:00
ruleByUID [ rule . UID ] = rule
}
2022-10-06 14:30:12 -05:00
orgStates := make ( map [ string ] * ruleStates , len ( ruleByUID ) )
states [ orgId ] = orgStates
2021-07-07 11:18:31 -05:00
// Get Instances
cmd := ngModels . ListAlertInstancesQuery {
RuleOrgID : orgId ,
}
2023-03-28 03:34:35 -05:00
alertInstances , err := st . instanceStore . ListAlertInstances ( ctx , & cmd )
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch previous state" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2023-03-28 03:34:35 -05:00
for _ , entry := range alertInstances {
2021-07-07 11:18:31 -05:00
ruleForEntry , ok := ruleByUID [ entry . RuleUID ]
if ! ok {
2022-10-06 14:30:12 -05:00
// TODO Should we delete the orphaned state from the db?
2021-07-07 11:18:31 -05:00
continue
}
2022-10-06 14:30:12 -05:00
rulesStates , ok := orgStates [ entry . RuleUID ]
if ! ok {
rulesStates = & ruleStates { states : make ( map [ string ] * State ) }
orgStates [ entry . RuleUID ] = rulesStates
}
2021-07-07 11:18:31 -05:00
lbs := map [ string ] string ( entry . Labels )
2022-10-11 03:30:33 -05:00
cacheID , err := entry . Labels . StringKey ( )
2021-07-07 11:18:31 -05:00
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Error getting cacheId for entry" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2022-10-11 03:30:33 -05:00
rulesStates . states [ cacheID ] = & State {
2022-02-02 12:18:20 -06:00
AlertRuleUID : entry . RuleUID ,
OrgID : entry . RuleOrgID ,
2022-10-11 03:30:33 -05:00
CacheID : cacheID ,
2022-02-02 12:18:20 -06:00
Labels : lbs ,
State : translateInstanceState ( entry . CurrentState ) ,
2022-05-23 03:49:49 -05:00
StateReason : entry . CurrentReason ,
2022-02-02 12:18:20 -06:00
LastEvaluationString : "" ,
StartsAt : entry . CurrentStateSince ,
EndsAt : entry . CurrentStateEnd ,
LastEvaluationTime : entry . LastEvalTime ,
Annotations : ruleForEntry . Annotations ,
2021-07-07 11:18:31 -05:00
}
2022-10-06 14:30:12 -05:00
statesCount ++
2021-07-07 11:18:31 -05:00
}
}
2022-10-06 14:30:12 -05:00
st . cache . setAllStates ( states )
2022-10-21 16:16:51 -05:00
st . log . Info ( "State cache has been initialized" , "states" , statesCount , "duration" , time . Since ( startTime ) )
2021-04-23 14:32:25 -05:00
}
2022-10-06 14:30:12 -05:00
func ( st * Manager ) Get ( orgID int64 , alertRuleUID , stateId string ) * State {
2021-05-04 11:57:50 -05:00
return st . cache . get ( orgID , alertRuleUID , stateId )
2021-04-23 14:32:25 -05:00
}
2023-01-26 11:29:10 -06:00
// DeleteStateByRuleUID removes the rule instances from cache and instanceStore. A closed channel is returned to be able
// to gracefully handle the clear state step in scheduler in case we do not need to use the historian to save state
// history.
2023-01-27 02:46:21 -06:00
func ( st * Manager ) DeleteStateByRuleUID ( ctx context . Context , ruleKey ngModels . AlertRuleKey , reason string ) [ ] StateTransition {
logger := st . log . FromContext ( ctx )
2022-10-21 16:16:51 -05:00
logger . Debug ( "Resetting state of the rule" )
2023-01-26 11:29:10 -06:00
2022-08-25 13:12:22 -05:00
states := st . cache . removeByRuleUID ( ruleKey . OrgID , ruleKey . UID )
2023-01-27 02:46:21 -06:00
2023-01-26 11:29:10 -06:00
if len ( states ) == 0 {
2023-01-27 02:46:21 -06:00
return nil
}
now := st . clock . Now ( )
transitions := make ( [ ] StateTransition , 0 , len ( states ) )
for _ , s := range states {
oldState := s . State
oldReason := s . StateReason
startsAt := s . StartsAt
if s . State != eval . Normal {
startsAt = now
}
s . SetNormal ( reason , startsAt , now )
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
s . Resolved = oldState == eval . Alerting
s . LastEvaluationTime = now
s . Values = map [ string ] float64 { }
transitions = append ( transitions , StateTransition {
State : s ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
} )
2023-01-26 11:29:10 -06:00
}
2023-01-27 02:46:21 -06:00
2023-01-26 11:29:10 -06:00
if st . instanceStore != nil {
2022-08-25 13:12:22 -05:00
err := st . instanceStore . DeleteAlertInstancesByRule ( ctx , ruleKey )
if err != nil {
2022-10-21 16:16:51 -05:00
logger . Error ( "Failed to delete states that belong to a rule from database" , "error" , err )
2022-08-25 13:12:22 -05:00
}
}
2022-10-21 16:16:51 -05:00
logger . Info ( "Rules state was reset" , "states" , len ( states ) )
2023-01-26 11:29:10 -06:00
2023-01-27 02:46:21 -06:00
return transitions
2023-01-26 11:29:10 -06:00
}
// ResetStateByRuleUID removes the rule instances from cache and instanceStore and saves state history. If the state
// history has to be saved, rule must not be nil.
2023-01-27 02:46:21 -06:00
func ( st * Manager ) ResetStateByRuleUID ( ctx context . Context , rule * ngModels . AlertRule , reason string ) [ ] StateTransition {
2023-01-26 11:29:10 -06:00
ruleKey := rule . GetKey ( )
2023-01-27 02:46:21 -06:00
transitions := st . DeleteStateByRuleUID ( ctx , ruleKey , reason )
2023-01-26 11:29:10 -06:00
2023-01-27 02:46:21 -06:00
if rule == nil || st . historian == nil || len ( transitions ) == 0 {
return transitions
2023-01-26 11:29:10 -06:00
}
ruleMeta := history_model . NewRuleMeta ( rule , st . log )
2023-03-17 12:41:18 -05:00
errCh := st . historian . Record ( ctx , ruleMeta , transitions )
2023-01-26 11:29:10 -06:00
go func ( ) {
err := <- errCh
if err != nil {
st . log . FromContext ( ctx ) . Error ( "Error updating historian state reset transitions" , append ( ruleKey . LogContext ( ) , "reason" , reason , "error" , err ) ... )
}
} ( )
2023-01-27 02:46:21 -06:00
return transitions
2021-05-03 13:01:33 -05:00
}
2022-07-14 14:59:59 -05:00
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
// if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
2022-12-06 12:07:39 -06:00
func ( st * Manager ) ProcessEvalResults ( ctx context . Context , evaluatedAt time . Time , alertRule * ngModels . AlertRule , results eval . Results , extraLabels data . Labels ) [ ] StateTransition {
2023-08-16 02:04:18 -05:00
tracingCtx , span := st . tracer . Start ( ctx , "alert rule state calculation" )
defer span . End ( )
span . SetAttributes ( "rule_uid" , alertRule . UID , attribute . String ( "rule_uid" , alertRule . UID ) )
span . SetAttributes ( "org_id" , alertRule . OrgID , attribute . Int64 ( "org_id" , alertRule . OrgID ) )
span . SetAttributes ( "rule_version" , alertRule . Version , attribute . Int64 ( "rule_version" , alertRule . Version ) )
utcTick := evaluatedAt . UTC ( ) . Format ( time . RFC3339Nano )
span . SetAttributes ( "tick" , utcTick , attribute . String ( "tick" , utcTick ) )
span . SetAttributes ( "results" , len ( results ) , attribute . Int ( "tick" , len ( results ) ) )
logger := st . log . FromContext ( tracingCtx )
2022-10-21 16:16:51 -05:00
logger . Debug ( "State manager processing evaluation results" , "resultCount" , len ( results ) )
2023-08-16 02:04:18 -05:00
states := st . setNextStateForRule ( tracingCtx , alertRule , results , extraLabels , logger )
span . AddEvents ( [ ] string { "message" , "state_transitions" } ,
[ ] tracing . EventValue {
{ Str : "results processed" } ,
{ Num : int64 ( len ( states ) ) } ,
} )
2022-11-07 10:03:53 -06:00
2022-11-14 09:57:51 -06:00
staleStates := st . deleteStaleStatesFromCache ( ctx , logger , evaluatedAt , alertRule )
2023-08-16 02:04:18 -05:00
st . deleteAlertStates ( tracingCtx , logger , staleStates )
if len ( staleStates ) > 0 {
span . AddEvents ( [ ] string { "message" , "state_transitions" } ,
[ ] tracing . EventValue {
{ Str : "deleted stale states" } ,
{ Num : int64 ( len ( staleStates ) ) } ,
} )
}
2022-11-07 08:09:19 -06:00
2023-08-16 02:04:18 -05:00
st . saveAlertStates ( tracingCtx , logger , states ... )
span . AddEvents ( [ ] string { "message" } ,
[ ] tracing . EventValue {
{ Str : "updated database" } ,
} )
2022-11-04 10:39:26 -05:00
2022-12-06 11:33:15 -06:00
allChanges := append ( states , staleStates ... )
if st . historian != nil {
2023-08-16 02:04:18 -05:00
st . historian . Record ( tracingCtx , history_model . NewRuleMeta ( alertRule , logger ) , allChanges )
2022-12-06 11:33:15 -06:00
}
2022-12-06 12:07:39 -06:00
return allChanges
2021-04-23 14:32:25 -05:00
}
2023-08-15 09:27:15 -05:00
// setNextStateForRule computes the next state for every state affected by the evaluation
// results of a rule and returns the resulting transitions.
//
// When applyNoDataAndErrorToAllStates is enabled and the results are NoData (or Error)
// while the rule maps that outcome to Alerting or OK, the first result is applied to all
// states currently cached for the rule. If the rule has no cached states yet, or that mode
// does not apply, one state per result is fetched or created and updated.
func (st *Manager) setNextStateForRule(ctx context.Context, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels, logger log.Logger) []StateTransition {
	// results[0] is used below, so the alternative mode is only considered when there is
	// at least one result; indexing an empty slice would panic.
	applyToAll := st.applyNoDataAndErrorToAllStates && len(results) > 0

	// If it is no data, check the mapping and switch all results to the new state
	if applyToAll && results.IsNoData() && (alertRule.NoDataState == ngModels.Alerting || alertRule.NoDataState == ngModels.OK) {
		// TODO aggregate UID of datasources that returned NoData into one and provide as auxiliary info, probably annotation
		transitions := st.setNextStateForAll(ctx, alertRule, results[0], logger)
		if len(transitions) > 0 {
			return transitions
		}
		// There are no current states for the rule: fall through and create one for each result.
	}

	if applyToAll && results.IsError() && (alertRule.ExecErrState == ngModels.AlertingErrState || alertRule.ExecErrState == ngModels.OkErrState) {
		// TODO squash all errors into one, and provide as annotation
		transitions := st.setNextStateForAll(ctx, alertRule, results[0], logger)
		if len(transitions) > 0 {
			return transitions
		}
		// There are no current states for the rule: fall through and create one for each result.
	}

	transitions := make([]StateTransition, 0, len(results))
	for _, result := range results {
		currentState := st.cache.getOrCreate(ctx, logger, alertRule, result, extraLabels, st.externalURL)
		s := st.setNextState(ctx, alertRule, currentState, result, logger)
		transitions = append(transitions, s)
	}
	return transitions
}
2021-04-23 14:32:25 -05:00
2023-08-15 09:27:15 -05:00
// setNextStateForAll applies one evaluation result to every state currently cached for the
// rule and returns one transition per state. The result is empty when the rule has no
// cached states.
func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.AlertRule, result eval.Result, logger log.Logger) []StateTransition {
	cached := st.cache.getStatesForRuleUID(alertRule.OrgID, alertRule.UID, false)
	transitions := make([]StateTransition, 0, len(cached))
	for i := range cached {
		transitions = append(transitions, st.setNextState(ctx, alertRule, cached[i], result, logger))
	}
	return transitions
}
// Set the current state based on evaluation results
func ( st * Manager ) setNextState ( ctx context . Context , alertRule * ngModels . AlertRule , currentState * State , result eval . Result , logger log . Logger ) StateTransition {
2023-08-16 02:04:18 -05:00
start := st . clock . Now ( )
2021-04-23 14:32:25 -05:00
currentState . LastEvaluationTime = result . EvaluatedAt
currentState . EvaluationDuration = result . EvaluationDuration
currentState . Results = append ( currentState . Results , Evaluation {
2022-02-02 12:18:20 -06:00
EvaluationTime : result . EvaluatedAt ,
EvaluationState : result . State ,
Values : NewEvaluationValues ( result . Values ) ,
2022-04-05 13:36:42 -05:00
Condition : alertRule . Condition ,
2021-04-23 14:32:25 -05:00
} )
2022-02-02 12:18:20 -06:00
currentState . LastEvaluationString = result . EvaluationString
2021-05-18 12:56:14 -05:00
currentState . TrimResults ( alertRule )
2021-07-13 11:50:10 -05:00
oldState := currentState . State
2022-05-23 03:49:49 -05:00
oldReason := currentState . StateReason
2021-04-23 14:32:25 -05:00
2022-12-07 04:45:56 -06:00
// Add the instance to the log context to help correlate log lines for a state
logger = logger . New ( "instance" , result . Instance )
2023-07-26 10:41:46 -05:00
// if the current state is Error but the result is different, then we need o clean up the extra labels
// that were added after the state key was calculated
// https://github.com/grafana/grafana/blob/1df4d332c982dc5e394201bb2ef35b442727ce63/pkg/services/ngalert/state/state.go#L298-L311
// Usually, it happens in the case of classic conditions when the evalResult does not have labels.
//
// This is temporary change to make sure that the labels are not persistent in the state after it was in Error state
2023-08-15 09:27:15 -05:00
// TODO yuri. Remove it when correct Error result with labels is provided
2023-07-26 10:41:46 -05:00
if currentState . State == eval . Error && result . State != eval . Error {
// This is possible because state was updated after the CacheID was calculated.
_ , curOk := currentState . Labels [ "ref_id" ]
_ , resOk := result . Instance [ "ref_id" ]
if curOk && ! resOk {
delete ( currentState . Labels , "ref_id" )
}
_ , curOk = currentState . Labels [ "datasource_uid" ]
_ , resOk = result . Instance [ "datasource_uid" ]
if curOk && ! resOk {
delete ( currentState . Labels , "datasource_uid" )
}
}
2021-04-23 14:32:25 -05:00
switch result . State {
case eval . Normal :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultNormal" )
resultNormal ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . Alerting :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultAlerting" )
resultAlerting ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . Error :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultError" )
resultError ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . NoData :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultNoData" )
resultNoData ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . Pending : // we do not emit results with this state
2022-12-07 04:45:56 -06:00
logger . Debug ( "Ignoring set next state as result is pending" )
2021-04-23 14:32:25 -05:00
}
2023-01-26 11:29:10 -06:00
// Set reason iff: result and state are different, reason is not Alerting or Normal
2022-05-23 03:49:49 -05:00
currentState . StateReason = ""
if currentState . State != result . State &&
result . State != eval . Normal &&
result . State != eval . Alerting {
currentState . StateReason = result . State . String ( )
}
2021-07-29 13:29:17 -05:00
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
currentState . Resolved = oldState == eval . Alerting && currentState . State == eval . Normal
2022-11-02 17:14:22 -05:00
if shouldTakeImage ( currentState . State , oldState , currentState . Image , currentState . Resolved ) {
2022-11-09 15:06:49 -06:00
image , err := takeImage ( ctx , st . images , alertRule )
2022-11-02 17:14:22 -05:00
if err != nil {
logger . Warn ( "Failed to take an image" ,
2022-11-10 03:58:38 -06:00
"dashboard" , alertRule . GetDashboardUID ( ) ,
"panel" , alertRule . GetPanelID ( ) ,
2022-11-02 17:14:22 -05:00
"error" , err )
} else if image != nil {
currentState . Image = image
}
2022-05-22 21:53:41 -05:00
}
2022-10-06 14:30:12 -05:00
st . cache . set ( currentState )
2022-05-23 03:49:49 -05:00
2022-11-04 10:39:26 -05:00
nextState := StateTransition {
State : currentState ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
2021-07-13 11:50:10 -05:00
}
2022-11-04 10:39:26 -05:00
2023-08-16 02:04:18 -05:00
if st . metrics != nil {
st . metrics . StateUpdateDuration . Observe ( st . clock . Now ( ) . Sub ( start ) . Seconds ( ) )
}
2022-11-04 10:39:26 -05:00
return nextState
2021-04-23 14:32:25 -05:00
}
2021-05-04 11:57:50 -05:00
func ( st * Manager ) GetAll ( orgID int64 ) [ ] * State {
2023-01-13 17:29:29 -06:00
allStates := st . cache . getAll ( orgID , st . doNotSaveNormalState )
return allStates
2021-04-23 14:32:25 -05:00
}
2021-05-04 11:57:50 -05:00
func ( st * Manager ) GetStatesForRuleUID ( orgID int64 , alertRuleUID string ) [ ] * State {
2023-01-13 17:29:29 -06:00
return st . cache . getStatesForRuleUID ( orgID , alertRuleUID , st . doNotSaveNormalState )
2021-04-23 14:32:25 -05:00
}
func ( st * Manager ) Put ( states [ ] * State ) {
for _ , s := range states {
2022-10-06 14:30:12 -05:00
st . cache . set ( s )
2021-04-23 14:32:25 -05:00
}
}
2021-07-07 11:18:31 -05:00
2022-10-06 01:22:58 -05:00
// TODO: Is the `State` type necessary? Should it embed the instance?
2022-11-07 08:09:19 -06:00
func ( st * Manager ) saveAlertStates ( ctx context . Context , logger log . Logger , states ... StateTransition ) {
2022-11-14 09:57:51 -06:00
if st . instanceStore == nil || len ( states ) == 0 {
2022-11-07 08:09:19 -06:00
return
2022-10-28 12:10:28 -05:00
}
2023-06-23 05:36:07 -05:00
saveState := func ( ctx context . Context , idx int ) error {
s := states [ idx ]
2023-01-13 17:29:29 -06:00
// Do not save normal state to database and remove transition to Normal state but keep mapped states
if st . doNotSaveNormalState && IsNormalStateWithNoReason ( s . State ) && ! s . Changed ( ) {
2023-06-23 05:36:07 -05:00
return nil
2023-01-13 17:29:29 -06:00
}
2022-11-07 08:35:29 -06:00
key , err := s . GetAlertInstanceKey ( )
2022-10-06 01:22:58 -05:00
if err != nil {
2023-01-13 17:29:29 -06:00
logger . Error ( "Failed to create a key for alert state to save it to database. The state will be ignored " , "cacheID" , s . CacheID , "error" , err , "labels" , s . Labels . String ( ) )
2023-06-23 05:36:07 -05:00
return nil
2022-10-06 01:22:58 -05:00
}
2023-04-06 11:06:25 -05:00
instance := ngModels . AlertInstance {
2022-11-07 08:35:29 -06:00
AlertInstanceKey : key ,
2022-10-06 01:22:58 -05:00
Labels : ngModels . InstanceLabels ( s . Labels ) ,
2022-11-04 10:39:26 -05:00
CurrentState : ngModels . InstanceStateType ( s . State . State . String ( ) ) ,
2022-10-06 01:22:58 -05:00
CurrentReason : s . StateReason ,
LastEvalTime : s . LastEvaluationTime ,
CurrentStateSince : s . StartsAt ,
CurrentStateEnd : s . EndsAt ,
}
2023-04-06 11:06:25 -05:00
err = st . instanceStore . SaveAlertInstance ( ctx , instance )
if err != nil {
logger . Error ( "Failed to save alert state" , "labels" , s . Labels . String ( ) , "state" , s . State , "error" , err )
2023-06-23 05:36:07 -05:00
return nil
2022-10-06 01:22:58 -05:00
}
2023-06-23 05:36:07 -05:00
return nil
2022-10-06 01:22:58 -05:00
}
2023-06-23 05:36:07 -05:00
2023-06-28 09:19:21 -05:00
start := time . Now ( )
2023-06-23 05:36:07 -05:00
logger . Debug ( "Saving alert states" , "count" , len ( states ) , "max_state_save_concurrency" , st . maxStateSaveConcurrency )
_ = concurrency . ForEachJob ( ctx , len ( states ) , st . maxStateSaveConcurrency , saveState )
2023-06-28 09:19:21 -05:00
logger . Debug ( "Saving alert states done" , "count" , len ( states ) , "max_state_save_concurrency" , st . maxStateSaveConcurrency , "duration" , time . Since ( start ) )
2022-08-18 08:40:33 -05:00
}
2022-11-14 09:57:51 -06:00
// deleteAlertStates deletes the alert instances of the given transitions from the instance
// store in a single call. It does nothing when no instance store is configured or the
// list is empty. States whose instance key cannot be built are logged and left out.
func (st *Manager) deleteAlertStates(ctx context.Context, logger log.Logger, states []StateTransition) {
	if len(states) == 0 || st.instanceStore == nil {
		return
	}

	logger.Debug("Deleting alert states", "count", len(states))
	keys := make([]ngModels.AlertInstanceKey, 0, len(states))
	for _, transition := range states {
		key, err := transition.GetAlertInstanceKey()
		if err != nil {
			logger.Error("Failed to delete alert instance with invalid labels", "cacheID", transition.CacheID, "error", err)
			continue
		}
		keys = append(keys, key)
	}

	if err := st.instanceStore.DeleteAlertInstances(ctx, keys...); err != nil {
		logger.Error("Failed to delete stale states", "error", err)
	}
}
2021-07-07 11:18:31 -05:00
func translateInstanceState ( state ngModels . InstanceStateType ) eval . State {
2023-01-04 08:40:04 -06:00
switch state {
case ngModels . InstanceStateFiring :
2021-07-07 11:18:31 -05:00
return eval . Alerting
2023-01-04 08:40:04 -06:00
case ngModels . InstanceStateNormal :
2021-07-07 11:18:31 -05:00
return eval . Normal
2023-01-04 08:40:04 -06:00
case ngModels . InstanceStateError :
return eval . Error
case ngModels . InstanceStateNoData :
return eval . NoData
case ngModels . InstanceStatePending :
return eval . Pending
2021-07-07 11:18:31 -05:00
default :
return eval . Error
}
}
2021-07-13 11:50:10 -05:00
2022-11-14 09:57:51 -06:00
func ( st * Manager ) deleteStaleStatesFromCache ( ctx context . Context , logger log . Logger , evaluatedAt time . Time , alertRule * ngModels . AlertRule ) [ ] StateTransition {
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
staleStates := st . cache . deleteRuleStates ( alertRule . GetKey ( ) , func ( s * State ) bool {
return stateIsStale ( evaluatedAt , s . LastEvaluationTime , alertRule . IntervalSeconds )
2022-11-09 05:08:32 -06:00
} )
2023-01-17 05:50:17 -06:00
resolvedStates := make ( [ ] StateTransition , 0 , len ( staleStates ) )
2022-10-06 01:22:58 -05:00
2022-11-07 10:03:53 -06:00
for _ , s := range staleStates {
logger . Info ( "Detected stale state entry" , "cacheID" , s . CacheID , "state" , s . State , "reason" , s . StateReason )
2022-11-14 09:57:51 -06:00
oldState := s . State
oldReason := s . StateReason
2022-11-07 08:35:29 -06:00
2022-11-14 09:57:51 -06:00
s . State = eval . Normal
s . StateReason = ngModels . StateReasonMissingSeries
s . EndsAt = evaluatedAt
s . LastEvaluationTime = evaluatedAt
2022-11-02 17:14:22 -05:00
2022-11-14 09:57:51 -06:00
if oldState == eval . Alerting {
s . Resolved = true
2023-04-26 11:06:18 -05:00
image , err := takeImage ( ctx , st . images , alertRule )
if err != nil {
logger . Warn ( "Failed to take an image" ,
"dashboard" , alertRule . GetDashboardUID ( ) ,
"panel" , alertRule . GetPanelID ( ) ,
"error" , err )
} else if image != nil {
s . Image = image
2022-02-24 10:25:28 -06:00
}
2021-07-26 11:12:04 -05:00
}
2022-10-06 01:22:58 -05:00
2022-11-14 09:57:51 -06:00
record := StateTransition {
State : s ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
2022-10-28 12:10:28 -05:00
}
2022-11-14 09:57:51 -06:00
resolvedStates = append ( resolvedStates , record )
2022-10-06 01:22:58 -05:00
}
2022-09-21 12:24:47 -05:00
return resolvedStates
2021-07-26 11:12:04 -05:00
}
2022-10-06 01:22:58 -05:00
// stateIsStale reports whether a state last evaluated at lastEval is stale at evaluatedAt,
// i.e. whether at least two evaluation intervals (intervalSeconds each) have passed since
// lastEval. The boundary itself counts as stale.
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
	staleFrom := lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second)
	return !evaluatedAt.Before(staleFrom)
}