2021-04-23 14:32:25 -05:00
package state
import (
2021-09-14 09:08:04 -05:00
"context"
2021-10-04 13:04:37 -05:00
"net/url"
2024-01-04 10:47:13 -06:00
"strconv"
2024-07-30 11:55:59 -05:00
"strings"
2021-04-23 14:32:25 -05:00
"time"
2022-06-22 11:18:42 -05:00
"github.com/benbjohnson/clock"
2022-02-24 04:58:54 -06:00
"github.com/grafana/grafana-plugin-sdk-go/data"
2023-08-16 02:04:18 -05:00
"go.opentelemetry.io/otel/attribute"
2023-10-03 07:54:20 -05:00
"go.opentelemetry.io/otel/trace"
2023-01-27 02:46:21 -06:00
2021-04-23 14:32:25 -05:00
"github.com/grafana/grafana/pkg/infra/log"
2023-08-16 02:04:18 -05:00
"github.com/grafana/grafana/pkg/infra/tracing"
2021-04-23 14:32:25 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/eval"
2021-04-30 11:28:06 -05:00
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
2021-04-23 14:32:25 -05:00
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
2023-01-25 11:29:57 -06:00
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
2021-04-23 14:32:25 -05:00
)
2022-11-04 16:06:47 -05:00
// ResendDelay is the default value NewManager assigns to Manager.ResendDelay,
// which is later handed to StateTransition.NeedsSending when selecting the
// transitions to send.
var ResendDelay = 30 * time.Second
2021-09-02 10:22:59 -05:00
2022-03-09 12:20:29 -06:00
// AlertInstanceManager defines the interface for querying the current alert instances.
type AlertInstanceManager interface {
	// GetAll returns the states held for the organization identified by orgID.
	GetAll(orgID int64) []*State
	// GetStatesForRuleUID returns the states held for a single rule of the organization.
	GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State
}
2024-01-17 06:33:13 -06:00
type StatePersister interface {
2024-01-23 10:03:30 -06:00
Async ( ctx context . Context , cache * cache )
2024-06-25 12:01:26 -05:00
Sync ( ctx context . Context , span trace . Span , states StateTransitions )
2024-01-17 06:33:13 -06:00
}
2024-06-25 12:01:26 -05:00
// Sender is an optional callback intended for sending the states to an alertmanager.
// Manager.ProcessEvalResults skips sending entirely when it is nil.
type Sender func(context.Context, StateTransitions)
2021-04-23 14:32:25 -05:00
type Manager struct {
2021-07-07 11:18:31 -05:00
log log . Logger
2021-09-14 06:55:01 -05:00
metrics * metrics . State
2023-08-16 02:04:18 -05:00
tracer tracing . Tracer
2021-07-07 11:18:31 -05:00
2024-06-20 15:33:03 -05:00
clock clock . Clock
cache * cache
ResendDelay time . Duration
ResolvedRetention time . Duration
2021-07-07 11:18:31 -05:00
2022-10-05 15:32:20 -05:00
instanceStore InstanceStore
2022-11-09 15:06:49 -06:00
images ImageCapturer
2022-10-05 15:32:20 -05:00
historian Historian
2022-10-06 14:30:12 -05:00
externalURL * url . URL
2023-01-13 17:29:29 -06:00
2023-08-15 09:27:15 -05:00
doNotSaveNormalState bool
applyNoDataAndErrorToAllStates bool
2024-02-13 08:29:03 -06:00
rulesPerRuleGroupLimit int64
2024-01-17 06:33:13 -06:00
persister StatePersister
2021-04-23 14:32:25 -05:00
}
2023-01-10 15:26:15 -06:00
type ManagerCfg struct {
Metrics * metrics . State
ExternalURL * url . URL
InstanceStore InstanceStore
Images ImageCapturer
Clock clock . Clock
Historian Historian
2023-01-13 17:29:29 -06:00
// DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods
DoNotSaveNormalState bool
2023-06-23 05:36:07 -05:00
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
MaxStateSaveConcurrency int
2023-08-15 09:27:15 -05:00
// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
ApplyNoDataAndErrorToAllStates bool
2024-02-13 08:29:03 -06:00
RulesPerRuleGroupLimit int64
2023-08-16 02:04:18 -05:00
2024-04-03 10:18:02 -05:00
DisableExecution bool
2024-06-20 15:33:03 -05:00
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
ResolvedRetention time . Duration
2023-08-16 02:04:18 -05:00
Tracer tracing . Tracer
2023-09-20 08:07:02 -05:00
Log log . Logger
2023-01-10 15:26:15 -06:00
}
2024-01-17 06:33:13 -06:00
func NewManager ( cfg ManagerCfg , statePersister StatePersister ) * Manager {
2023-09-25 04:27:30 -05:00
// Metrics for the cache use a collector, so they need access to the register directly.
c := newCache ( )
2024-04-03 10:18:02 -05:00
// Only expose the metrics if this grafana server does execute alerts.
if cfg . Metrics != nil && ! cfg . DisableExecution {
2023-09-25 04:27:30 -05:00
c . RegisterMetrics ( cfg . Metrics . Registerer ( ) )
}
m := & Manager {
cache : c ,
2023-08-15 09:27:15 -05:00
ResendDelay : ResendDelay , // TODO: make this configurable
2024-06-20 15:33:03 -05:00
ResolvedRetention : cfg . ResolvedRetention ,
2023-09-20 08:07:02 -05:00
log : cfg . Log ,
2023-08-15 09:27:15 -05:00
metrics : cfg . Metrics ,
instanceStore : cfg . InstanceStore ,
images : cfg . Images ,
historian : cfg . Historian ,
clock : cfg . Clock ,
externalURL : cfg . ExternalURL ,
doNotSaveNormalState : cfg . DoNotSaveNormalState ,
applyNoDataAndErrorToAllStates : cfg . ApplyNoDataAndErrorToAllStates ,
2024-02-13 08:29:03 -06:00
rulesPerRuleGroupLimit : cfg . RulesPerRuleGroupLimit ,
2024-01-17 06:33:13 -06:00
persister : statePersister ,
2023-08-16 02:04:18 -05:00
tracer : cfg . Tracer ,
2021-04-23 14:32:25 -05:00
}
2023-09-25 04:27:30 -05:00
if m . applyNoDataAndErrorToAllStates {
m . log . Info ( "Running in alternative execution of Error/NoData mode" )
2022-11-04 16:06:47 -05:00
}
2023-09-25 04:27:30 -05:00
return m
2021-04-23 14:32:25 -05:00
}
2024-01-23 10:03:30 -06:00
// Run hands the manager's cache to the persister's Async method.
// It always returns nil.
func (st *Manager) Run(ctx context.Context) error {
	st.persister.Async(ctx, st.cache)
	return nil
}
2022-11-04 13:23:08 -05:00
func ( st * Manager ) Warm ( ctx context . Context , rulesReader RuleReader ) {
2022-10-28 12:10:28 -05:00
if st . instanceStore == nil {
st . log . Info ( "Skip warming the state because instance store is not configured" )
2022-11-04 13:23:08 -05:00
return
2022-10-28 12:10:28 -05:00
}
2022-10-06 14:30:12 -05:00
startTime := time . Now ( )
st . log . Info ( "Warming state cache for startup" )
2021-07-07 11:18:31 -05:00
2022-02-08 07:49:04 -06:00
orgIds , err := st . instanceStore . FetchOrgIds ( ctx )
2021-07-07 11:18:31 -05:00
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch orgIds" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2022-10-06 14:30:12 -05:00
statesCount := 0
states := make ( map [ int64 ] map [ string ] * ruleStates , len ( orgIds ) )
2021-07-07 11:18:31 -05:00
for _ , orgId := range orgIds {
// Get Rules
ruleCmd := ngModels . ListAlertRulesQuery {
OrgID : orgId ,
}
2023-03-28 03:34:35 -05:00
alertRules , err := rulesReader . ListAlertRules ( ctx , & ruleCmd )
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch previous state" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2023-03-28 03:34:35 -05:00
ruleByUID := make ( map [ string ] * ngModels . AlertRule , len ( alertRules ) )
2024-02-13 08:29:03 -06:00
groupSizes := make ( map [ string ] int64 )
2023-03-28 03:34:35 -05:00
for _ , rule := range alertRules {
2021-07-07 11:18:31 -05:00
ruleByUID [ rule . UID ] = rule
2024-02-13 08:29:03 -06:00
groupSizes [ rule . RuleGroup ] += 1
}
// Emit a warning if we detect a large group.
// We will not enforce this here, but it's convenient to emit the warning here as we load up all the rules.
for name , size := range groupSizes {
if st . rulesPerRuleGroupLimit > 0 && size > st . rulesPerRuleGroupLimit {
st . log . Warn (
"Large rule group was loaded. Large groups are discouraged and changes to them may be disallowed in the future." ,
"limit" , st . rulesPerRuleGroupLimit ,
"actual" , size ,
"group" , name ,
)
}
2021-07-07 11:18:31 -05:00
}
2022-10-06 14:30:12 -05:00
orgStates := make ( map [ string ] * ruleStates , len ( ruleByUID ) )
states [ orgId ] = orgStates
2021-07-07 11:18:31 -05:00
// Get Instances
cmd := ngModels . ListAlertInstancesQuery {
RuleOrgID : orgId ,
}
2023-03-28 03:34:35 -05:00
alertInstances , err := st . instanceStore . ListAlertInstances ( ctx , & cmd )
if err != nil {
2022-10-21 16:16:51 -05:00
st . log . Error ( "Unable to fetch previous state" , "error" , err )
2021-07-07 11:18:31 -05:00
}
2023-03-28 03:34:35 -05:00
for _ , entry := range alertInstances {
2021-07-07 11:18:31 -05:00
ruleForEntry , ok := ruleByUID [ entry . RuleUID ]
if ! ok {
2022-10-06 14:30:12 -05:00
// TODO Should we delete the orphaned state from the db?
2021-07-07 11:18:31 -05:00
continue
}
2024-08-02 14:15:57 -05:00
// nil safety.
annotations := ruleForEntry . Annotations
if annotations == nil {
annotations = make ( map [ string ] string )
}
2022-10-06 14:30:12 -05:00
rulesStates , ok := orgStates [ entry . RuleUID ]
if ! ok {
2024-06-11 11:34:58 -05:00
rulesStates = & ruleStates { states : make ( map [ data . Fingerprint ] * State ) }
2022-10-06 14:30:12 -05:00
orgStates [ entry . RuleUID ] = rulesStates
}
2021-07-07 11:18:31 -05:00
lbs := map [ string ] string ( entry . Labels )
2024-06-11 11:34:58 -05:00
cacheID := entry . Labels . Fingerprint ( )
2024-01-04 10:47:13 -06:00
var resultFp data . Fingerprint
if entry . ResultFingerprint != "" {
fp , err := strconv . ParseUint ( entry . ResultFingerprint , 16 , 64 )
if err != nil {
st . log . Error ( "Failed to parse result fingerprint of alert instance" , "error" , err , "ruleUID" , entry . RuleUID )
}
resultFp = data . Fingerprint ( fp )
}
2022-10-11 03:30:33 -05:00
rulesStates . states [ cacheID ] = & State {
2022-02-02 12:18:20 -06:00
AlertRuleUID : entry . RuleUID ,
OrgID : entry . RuleOrgID ,
2022-10-11 03:30:33 -05:00
CacheID : cacheID ,
2022-02-02 12:18:20 -06:00
Labels : lbs ,
State : translateInstanceState ( entry . CurrentState ) ,
2022-05-23 03:49:49 -05:00
StateReason : entry . CurrentReason ,
2022-02-02 12:18:20 -06:00
LastEvaluationString : "" ,
StartsAt : entry . CurrentStateSince ,
EndsAt : entry . CurrentStateEnd ,
LastEvaluationTime : entry . LastEvalTime ,
2024-08-02 14:15:57 -05:00
Annotations : annotations ,
2024-01-04 10:47:13 -06:00
ResultFingerprint : resultFp ,
2024-07-12 11:26:58 -05:00
ResolvedAt : entry . ResolvedAt ,
LastSentAt : entry . LastSentAt ,
2021-07-07 11:18:31 -05:00
}
2022-10-06 14:30:12 -05:00
statesCount ++
2021-07-07 11:18:31 -05:00
}
}
2024-06-11 11:34:58 -05:00
2022-10-06 14:30:12 -05:00
st . cache . setAllStates ( states )
2022-10-21 16:16:51 -05:00
st . log . Info ( "State cache has been initialized" , "states" , statesCount , "duration" , time . Since ( startTime ) )
2021-04-23 14:32:25 -05:00
}
2024-06-11 11:34:58 -05:00
func ( st * Manager ) Get ( orgID int64 , alertRuleUID string , stateId data . Fingerprint ) * State {
2021-05-04 11:57:50 -05:00
return st . cache . get ( orgID , alertRuleUID , stateId )
2021-04-23 14:32:25 -05:00
}
2023-01-26 11:29:10 -06:00
// DeleteStateByRuleUID removes the rule instances from cache and instanceStore. A closed channel is returned to be able
// to gracefully handle the clear state step in scheduler in case we do not need to use the historian to save state
// history.
2023-01-27 02:46:21 -06:00
func ( st * Manager ) DeleteStateByRuleUID ( ctx context . Context , ruleKey ngModels . AlertRuleKey , reason string ) [ ] StateTransition {
logger := st . log . FromContext ( ctx )
2022-10-21 16:16:51 -05:00
logger . Debug ( "Resetting state of the rule" )
2023-01-26 11:29:10 -06:00
2022-08-25 13:12:22 -05:00
states := st . cache . removeByRuleUID ( ruleKey . OrgID , ruleKey . UID )
2023-01-27 02:46:21 -06:00
2023-01-26 11:29:10 -06:00
if len ( states ) == 0 {
2023-01-27 02:46:21 -06:00
return nil
}
now := st . clock . Now ( )
transitions := make ( [ ] StateTransition , 0 , len ( states ) )
for _ , s := range states {
oldState := s . State
oldReason := s . StateReason
startsAt := s . StartsAt
if s . State != eval . Normal {
startsAt = now
}
s . SetNormal ( reason , startsAt , now )
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
2024-06-20 15:33:03 -05:00
if oldState == eval . Alerting || oldState == eval . Error || oldState == eval . NoData {
s . ResolvedAt = & now
} else {
s . ResolvedAt = nil
}
2023-01-27 02:46:21 -06:00
s . LastEvaluationTime = now
s . Values = map [ string ] float64 { }
transitions = append ( transitions , StateTransition {
State : s ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
} )
2023-01-26 11:29:10 -06:00
}
2023-01-27 02:46:21 -06:00
2023-01-26 11:29:10 -06:00
if st . instanceStore != nil {
2022-08-25 13:12:22 -05:00
err := st . instanceStore . DeleteAlertInstancesByRule ( ctx , ruleKey )
if err != nil {
2022-10-21 16:16:51 -05:00
logger . Error ( "Failed to delete states that belong to a rule from database" , "error" , err )
2022-08-25 13:12:22 -05:00
}
}
2022-10-21 16:16:51 -05:00
logger . Info ( "Rules state was reset" , "states" , len ( states ) )
2023-01-26 11:29:10 -06:00
2023-01-27 02:46:21 -06:00
return transitions
2023-01-26 11:29:10 -06:00
}
// ResetStateByRuleUID removes the rule instances from cache and instanceStore and saves state history. If the state
// history has to be saved, rule must not be nil.
2023-01-27 02:46:21 -06:00
func ( st * Manager ) ResetStateByRuleUID ( ctx context . Context , rule * ngModels . AlertRule , reason string ) [ ] StateTransition {
2023-01-26 11:29:10 -06:00
ruleKey := rule . GetKey ( )
2023-01-27 02:46:21 -06:00
transitions := st . DeleteStateByRuleUID ( ctx , ruleKey , reason )
2023-01-26 11:29:10 -06:00
2023-01-27 02:46:21 -06:00
if rule == nil || st . historian == nil || len ( transitions ) == 0 {
return transitions
2023-01-26 11:29:10 -06:00
}
ruleMeta := history_model . NewRuleMeta ( rule , st . log )
2023-03-17 12:41:18 -05:00
errCh := st . historian . Record ( ctx , ruleMeta , transitions )
2023-01-26 11:29:10 -06:00
go func ( ) {
err := <- errCh
if err != nil {
st . log . FromContext ( ctx ) . Error ( "Error updating historian state reset transitions" , append ( ruleKey . LogContext ( ) , "reason" , reason , "error" , err ) ... )
}
} ( )
2023-01-27 02:46:21 -06:00
return transitions
2021-05-03 13:01:33 -05:00
}
2022-07-14 14:59:59 -05:00
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
// if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
2024-06-25 12:01:26 -05:00
// This will update the states in cache/store and return the state transitions that need to be sent to the alertmanager.
func ( st * Manager ) ProcessEvalResults (
ctx context . Context ,
evaluatedAt time . Time ,
alertRule * ngModels . AlertRule ,
results eval . Results ,
extraLabels data . Labels ,
send Sender ,
) StateTransitions {
2023-08-16 02:04:18 -05:00
utcTick := evaluatedAt . UTC ( ) . Format ( time . RFC3339Nano )
2024-06-25 12:01:26 -05:00
ctx , span := st . tracer . Start ( ctx , "alert rule state calculation" , trace . WithAttributes (
2023-10-03 07:54:20 -05:00
attribute . String ( "rule_uid" , alertRule . UID ) ,
attribute . Int64 ( "org_id" , alertRule . OrgID ) ,
attribute . Int64 ( "rule_version" , alertRule . Version ) ,
attribute . String ( "tick" , utcTick ) ,
attribute . Int ( "results" , len ( results ) ) ) )
defer span . End ( )
2023-08-16 02:04:18 -05:00
2024-06-25 12:01:26 -05:00
logger := st . log . FromContext ( ctx )
2022-10-21 16:16:51 -05:00
logger . Debug ( "State manager processing evaluation results" , "resultCount" , len ( results ) )
2024-06-25 12:01:26 -05:00
states := st . setNextStateForRule ( ctx , alertRule , results , extraLabels , logger )
staleStates := st . deleteStaleStatesFromCache ( ctx , logger , evaluatedAt , alertRule )
2023-10-03 07:54:20 -05:00
span . AddEvent ( "results processed" , trace . WithAttributes (
attribute . Int64 ( "state_transitions" , int64 ( len ( states ) ) ) ,
2024-06-25 12:01:26 -05:00
attribute . Int64 ( "stale_states" , int64 ( len ( staleStates ) ) ) ,
2023-10-03 07:54:20 -05:00
) )
2022-11-07 10:03:53 -06:00
2024-06-25 12:01:26 -05:00
allChanges := StateTransitions ( append ( states , staleStates ... ) )
// It's important that this is done *before* we sync the states to the persister. Otherwise, we will not persist
// the LastSentAt field to the store.
var statesToSend StateTransitions
if send != nil {
statesToSend = st . updateLastSentAt ( allChanges , evaluatedAt )
}
2022-11-04 10:39:26 -05:00
2024-06-25 12:01:26 -05:00
st . persister . Sync ( ctx , span , allChanges )
2022-12-06 11:33:15 -06:00
if st . historian != nil {
2024-06-25 12:01:26 -05:00
st . historian . Record ( ctx , history_model . NewRuleMeta ( alertRule , logger ) , allChanges )
2022-12-06 11:33:15 -06:00
}
2024-06-25 12:01:26 -05:00
// Optional callback intended for sending the states to an alertmanager.
// Some uses ,such as backtesting or the testing api, do not send.
if send != nil {
send ( ctx , statesToSend )
}
2022-12-06 12:07:39 -06:00
return allChanges
2021-04-23 14:32:25 -05:00
}
2024-06-25 12:01:26 -05:00
// updateLastSentAt returns the subset of StateTransitions that need sending and sets their LastSentAt
// field to evaluatedAt. The result is nil when nothing needs sending.
// Note: This is not idempotent, running this twice can (and usually will) return different results.
func (st *Manager) updateLastSentAt(states StateTransitions, evaluatedAt time.Time) StateTransitions {
	var toSend StateTransitions
	for _, transition := range states {
		if !transition.NeedsSending(st.ResendDelay, st.ResolvedRetention) {
			continue
		}
		transition.LastSentAt = &evaluatedAt
		toSend = append(toSend, transition)
	}
	return toSend
}
2023-08-15 09:27:15 -05:00
func ( st * Manager ) setNextStateForRule ( ctx context . Context , alertRule * ngModels . AlertRule , results eval . Results , extraLabels data . Labels , logger log . Logger ) [ ] StateTransition {
2024-03-12 09:00:43 -05:00
if st . applyNoDataAndErrorToAllStates && results . IsNoData ( ) && ( alertRule . NoDataState == ngModels . Alerting || alertRule . NoDataState == ngModels . OK || alertRule . NoDataState == ngModels . KeepLast ) { // If it is no data, check the mapping and switch all results to the new state
2024-07-30 11:55:59 -05:00
// aggregate UID of datasources that returned NoData into one and provide as auxiliary info via annotationa. See: https://github.com/grafana/grafana/issues/88184
var refIds strings . Builder
var datasourceUIDs strings . Builder
// for deduplication of datasourceUIDs
dsUIDSet := make ( map [ string ] bool )
for i , result := range results {
if refid , ok := result . Instance [ "ref_id" ] ; ok {
if i > 0 {
refIds . WriteString ( "," )
}
refIds . WriteString ( refid )
}
if dsUID , ok := result . Instance [ "datasource_uid" ] ; ok {
if ! dsUIDSet [ dsUID ] {
if i > 0 {
refIds . WriteString ( "," )
}
datasourceUIDs . WriteString ( dsUID )
dsUIDSet [ dsUID ] = true
}
}
}
2023-08-15 09:27:15 -05:00
transitions := st . setNextStateForAll ( ctx , alertRule , results [ 0 ] , logger )
if len ( transitions ) > 0 {
2024-07-30 11:55:59 -05:00
for _ , t := range transitions {
2024-08-02 14:15:57 -05:00
if t . State . Annotations == nil {
t . State . Annotations = make ( map [ string ] string )
}
2024-07-30 11:55:59 -05:00
t . State . Annotations [ "datasource_uid" ] = datasourceUIDs . String ( )
t . State . Annotations [ "ref_id" ] = refIds . String ( )
}
2023-08-15 09:27:15 -05:00
return transitions // if there are no current states for the rule. Create ones for each result
}
}
2024-03-12 09:00:43 -05:00
if st . applyNoDataAndErrorToAllStates && results . IsError ( ) && ( alertRule . ExecErrState == ngModels . AlertingErrState || alertRule . ExecErrState == ngModels . OkErrState || alertRule . ExecErrState == ngModels . KeepLastErrState ) {
2023-08-15 09:27:15 -05:00
// TODO squash all errors into one, and provide as annotation
transitions := st . setNextStateForAll ( ctx , alertRule , results [ 0 ] , logger )
if len ( transitions ) > 0 {
return transitions // if there are no current states for the rule. Create ones for each result
}
}
transitions := make ( [ ] StateTransition , 0 , len ( results ) )
for _ , result := range results {
currentState := st . cache . getOrCreate ( ctx , logger , alertRule , result , extraLabels , st . externalURL )
s := st . setNextState ( ctx , alertRule , currentState , result , logger )
transitions = append ( transitions , s )
}
return transitions
}
2021-04-23 14:32:25 -05:00
2023-08-15 09:27:15 -05:00
// setNextStateForAll applies one evaluation result to every state the cache
// currently holds for the rule and returns one transition per state.
func (st *Manager) setNextStateForAll(ctx context.Context, alertRule *ngModels.AlertRule, result eval.Result, logger log.Logger) []StateTransition {
	existing := st.cache.getStatesForRuleUID(alertRule.OrgID, alertRule.UID, false)
	transitions := make([]StateTransition, 0, len(existing))
	for i := range existing {
		transitions = append(transitions, st.setNextState(ctx, alertRule, existing[i], result, logger))
	}
	return transitions
}
// Set the current state based on evaluation results
func ( st * Manager ) setNextState ( ctx context . Context , alertRule * ngModels . AlertRule , currentState * State , result eval . Result , logger log . Logger ) StateTransition {
2023-08-16 02:04:18 -05:00
start := st . clock . Now ( )
2024-03-12 09:00:43 -05:00
2021-04-23 14:32:25 -05:00
currentState . LastEvaluationTime = result . EvaluatedAt
currentState . EvaluationDuration = result . EvaluationDuration
2024-07-08 12:30:23 -05:00
currentState . SetNextValues ( result )
2024-05-09 15:51:55 -05:00
currentState . LatestResult = & Evaluation {
2022-02-02 12:18:20 -06:00
EvaluationTime : result . EvaluatedAt ,
EvaluationState : result . State ,
2024-07-08 12:30:23 -05:00
Values : currentState . Values ,
2022-04-05 13:36:42 -05:00
Condition : alertRule . Condition ,
2024-05-09 15:51:55 -05:00
}
2022-02-02 12:18:20 -06:00
currentState . LastEvaluationString = result . EvaluationString
2021-07-13 11:50:10 -05:00
oldState := currentState . State
2022-05-23 03:49:49 -05:00
oldReason := currentState . StateReason
2021-04-23 14:32:25 -05:00
2022-12-07 04:45:56 -06:00
// Add the instance to the log context to help correlate log lines for a state
logger = logger . New ( "instance" , result . Instance )
2023-07-26 10:41:46 -05:00
// if the current state is Error but the result is different, then we need o clean up the extra labels
// that were added after the state key was calculated
// https://github.com/grafana/grafana/blob/1df4d332c982dc5e394201bb2ef35b442727ce63/pkg/services/ngalert/state/state.go#L298-L311
// Usually, it happens in the case of classic conditions when the evalResult does not have labels.
//
// This is temporary change to make sure that the labels are not persistent in the state after it was in Error state
2023-08-15 09:27:15 -05:00
// TODO yuri. Remove it when correct Error result with labels is provided
2023-07-26 10:41:46 -05:00
if currentState . State == eval . Error && result . State != eval . Error {
// This is possible because state was updated after the CacheID was calculated.
_ , curOk := currentState . Labels [ "ref_id" ]
_ , resOk := result . Instance [ "ref_id" ]
if curOk && ! resOk {
delete ( currentState . Labels , "ref_id" )
}
_ , curOk = currentState . Labels [ "datasource_uid" ]
_ , resOk = result . Instance [ "datasource_uid" ]
if curOk && ! resOk {
delete ( currentState . Labels , "datasource_uid" )
}
}
2021-04-23 14:32:25 -05:00
switch result . State {
case eval . Normal :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultNormal" )
2024-03-12 09:00:43 -05:00
resultNormal ( currentState , alertRule , result , logger , "" )
2021-04-23 14:32:25 -05:00
case eval . Alerting :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultAlerting" )
2024-03-12 09:00:43 -05:00
resultAlerting ( currentState , alertRule , result , logger , "" )
2021-04-23 14:32:25 -05:00
case eval . Error :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultError" )
resultError ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . NoData :
2022-12-07 04:45:56 -06:00
logger . Debug ( "Setting next state" , "handler" , "resultNoData" )
resultNoData ( currentState , alertRule , result , logger )
2021-04-23 14:32:25 -05:00
case eval . Pending : // we do not emit results with this state
2022-12-07 04:45:56 -06:00
logger . Debug ( "Ignoring set next state as result is pending" )
2021-04-23 14:32:25 -05:00
}
2023-01-26 11:29:10 -06:00
// Set reason iff: result and state are different, reason is not Alerting or Normal
2022-05-23 03:49:49 -05:00
currentState . StateReason = ""
if currentState . State != result . State &&
result . State != eval . Normal &&
result . State != eval . Alerting {
2024-03-12 09:00:43 -05:00
currentState . StateReason = resultStateReason ( result , alertRule )
2022-05-23 03:49:49 -05:00
}
2021-07-29 13:29:17 -05:00
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
2024-06-20 15:33:03 -05:00
newlyResolved := false
if oldState == eval . Alerting && currentState . State == eval . Normal {
currentState . ResolvedAt = & result . EvaluatedAt
newlyResolved = true
} else if currentState . State != eval . Normal && currentState . State != eval . Pending { // Retain the last resolved time for Normal->Normal and Normal->Pending.
currentState . ResolvedAt = nil
}
2021-07-29 13:29:17 -05:00
2024-06-20 15:33:03 -05:00
if shouldTakeImage ( currentState . State , oldState , currentState . Image , newlyResolved ) {
2022-11-09 15:06:49 -06:00
image , err := takeImage ( ctx , st . images , alertRule )
2022-11-02 17:14:22 -05:00
if err != nil {
logger . Warn ( "Failed to take an image" ,
2022-11-10 03:58:38 -06:00
"dashboard" , alertRule . GetDashboardUID ( ) ,
"panel" , alertRule . GetPanelID ( ) ,
2022-11-02 17:14:22 -05:00
"error" , err )
} else if image != nil {
currentState . Image = image
}
2022-05-22 21:53:41 -05:00
}
2022-10-06 14:30:12 -05:00
st . cache . set ( currentState )
2022-05-23 03:49:49 -05:00
2022-11-04 10:39:26 -05:00
nextState := StateTransition {
State : currentState ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
2021-07-13 11:50:10 -05:00
}
2022-11-04 10:39:26 -05:00
2023-08-16 02:04:18 -05:00
if st . metrics != nil {
st . metrics . StateUpdateDuration . Observe ( st . clock . Now ( ) . Sub ( start ) . Seconds ( ) )
}
2022-11-04 10:39:26 -05:00
return nextState
2021-04-23 14:32:25 -05:00
}
2024-03-12 09:00:43 -05:00
// resultStateReason returns the textual form of the result state. When the
// rule keeps the last state on execution errors or on no data, the
// StateReasonKeepLast reason is concatenated to it.
func resultStateReason(result eval.Result, rule *ngModels.AlertRule) string {
	keepsLast := rule.ExecErrState == ngModels.KeepLastErrState || rule.NoDataState == ngModels.KeepLast
	if !keepsLast {
		return result.State.String()
	}
	return ngModels.ConcatReasons(result.State.String(), ngModels.StateReasonKeepLast)
}
2021-05-04 11:57:50 -05:00
func ( st * Manager ) GetAll ( orgID int64 ) [ ] * State {
2023-01-13 17:29:29 -06:00
allStates := st . cache . getAll ( orgID , st . doNotSaveNormalState )
return allStates
2021-04-23 14:32:25 -05:00
}
2021-05-04 11:57:50 -05:00
func ( st * Manager ) GetStatesForRuleUID ( orgID int64 , alertRuleUID string ) [ ] * State {
2023-01-13 17:29:29 -06:00
return st . cache . getStatesForRuleUID ( orgID , alertRuleUID , st . doNotSaveNormalState )
2021-04-23 14:32:25 -05:00
}
2024-09-30 16:52:49 -05:00
// GetStatusForRuleUID summarizes the states of a rule, as returned by
// GetStatesForRuleUID, into a single rule status.
func (st *Manager) GetStatusForRuleUID(orgID int64, alertRuleUID string) ngModels.RuleStatus {
	return StatesToRuleStatus(st.GetStatesForRuleUID(orgID, alertRuleUID))
}
2021-04-23 14:32:25 -05:00
func ( st * Manager ) Put ( states [ ] * State ) {
for _ , s := range states {
2022-10-06 14:30:12 -05:00
st . cache . set ( s )
2021-04-23 14:32:25 -05:00
}
}
2021-07-07 11:18:31 -05:00
func translateInstanceState ( state ngModels . InstanceStateType ) eval . State {
2023-01-04 08:40:04 -06:00
switch state {
case ngModels . InstanceStateFiring :
2021-07-07 11:18:31 -05:00
return eval . Alerting
2023-01-04 08:40:04 -06:00
case ngModels . InstanceStateNormal :
2021-07-07 11:18:31 -05:00
return eval . Normal
2023-01-04 08:40:04 -06:00
case ngModels . InstanceStateError :
return eval . Error
case ngModels . InstanceStateNoData :
return eval . NoData
case ngModels . InstanceStatePending :
return eval . Pending
2021-07-07 11:18:31 -05:00
default :
return eval . Error
}
}
2021-07-13 11:50:10 -05:00
2022-11-14 09:57:51 -06:00
func ( st * Manager ) deleteStaleStatesFromCache ( ctx context . Context , logger log . Logger , evaluatedAt time . Time , alertRule * ngModels . AlertRule ) [ ] StateTransition {
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
staleStates := st . cache . deleteRuleStates ( alertRule . GetKey ( ) , func ( s * State ) bool {
return stateIsStale ( evaluatedAt , s . LastEvaluationTime , alertRule . IntervalSeconds )
2022-11-09 05:08:32 -06:00
} )
2023-01-17 05:50:17 -06:00
resolvedStates := make ( [ ] StateTransition , 0 , len ( staleStates ) )
2022-10-06 01:22:58 -05:00
2022-11-07 10:03:53 -06:00
for _ , s := range staleStates {
logger . Info ( "Detected stale state entry" , "cacheID" , s . CacheID , "state" , s . State , "reason" , s . StateReason )
2022-11-14 09:57:51 -06:00
oldState := s . State
oldReason := s . StateReason
2022-11-07 08:35:29 -06:00
2022-11-14 09:57:51 -06:00
s . State = eval . Normal
s . StateReason = ngModels . StateReasonMissingSeries
s . EndsAt = evaluatedAt
s . LastEvaluationTime = evaluatedAt
2022-11-02 17:14:22 -05:00
2022-11-14 09:57:51 -06:00
if oldState == eval . Alerting {
2024-06-20 15:33:03 -05:00
s . ResolvedAt = & evaluatedAt
2023-04-26 11:06:18 -05:00
image , err := takeImage ( ctx , st . images , alertRule )
if err != nil {
logger . Warn ( "Failed to take an image" ,
"dashboard" , alertRule . GetDashboardUID ( ) ,
"panel" , alertRule . GetPanelID ( ) ,
"error" , err )
} else if image != nil {
s . Image = image
2022-02-24 10:25:28 -06:00
}
2021-07-26 11:12:04 -05:00
}
2022-10-06 01:22:58 -05:00
2022-11-14 09:57:51 -06:00
record := StateTransition {
State : s ,
PreviousState : oldState ,
PreviousStateReason : oldReason ,
2022-10-28 12:10:28 -05:00
}
2022-11-14 09:57:51 -06:00
resolvedStates = append ( resolvedStates , record )
2022-10-06 01:22:58 -05:00
}
2022-09-21 12:24:47 -05:00
return resolvedStates
2021-07-26 11:12:04 -05:00
}
2022-10-06 01:22:58 -05:00
// stateIsStale reports whether at least two evaluation intervals have passed
// between lastEval and evaluatedAt. The boundary is inclusive: a state last
// evaluated exactly two intervals ago is stale.
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
	staleAfter := 2 * time.Duration(intervalSeconds) * time.Second
	staleAt := lastEval.Add(staleAfter)
	return !evaluatedAt.Before(staleAt)
}
2024-09-30 16:52:49 -05:00
// StatesToRuleStatus folds the states of a rule into a single RuleStatus.
//
// The status starts as healthy ("ok") with a zero timestamp. While iterating,
// the timestamp tracks the latest LastEvaluationTime, the duration is taken
// from the state visited last, an Error state sets health to "error", a NoData
// state sets it to "nodata", and a state carrying an error sets both LastError
// and health "error". Later states overwrite health set by earlier ones.
func StatesToRuleStatus(states []*State) ngModels.RuleStatus {
	status := ngModels.RuleStatus{Health: "ok"}
	for _, s := range states {
		if s.LastEvaluationTime.After(status.EvaluationTimestamp) {
			status.EvaluationTimestamp = s.LastEvaluationTime
		}
		status.EvaluationDuration = s.EvaluationDuration

		// Normal, Pending and Alerting leave the health untouched.
		if s.State == eval.Error {
			status.Health = "error"
		} else if s.State == eval.NoData {
			status.Health = "nodata"
		}

		if s.Error != nil {
			status.LastError = s.Error
			status.Health = "error"
		}
	}
	return status
}