2020-11-12 15:11:30 +02:00
// Package eval executes the condition for an alert definition, evaluates the condition results, and
// returns the alert instance states.
package eval
import (
"context"
"fmt"
2021-06-15 19:30:21 +03:00
"runtime/debug"
2021-03-23 12:11:15 -04:00
"sort"
2020-11-12 15:11:30 +02:00
"time"
2021-05-28 11:04:20 -04:00
"github.com/grafana/grafana/pkg/expr/classic"
2021-06-15 19:30:21 +03:00
"github.com/grafana/grafana/pkg/infra/log"
2021-03-08 22:19:21 +02:00
"github.com/grafana/grafana/pkg/services/ngalert/models"
2021-01-22 19:27:33 +02:00
"github.com/grafana/grafana/pkg/setting"
2021-03-08 07:02:49 +01:00
"github.com/grafana/grafana/pkg/tsdb"
2021-01-22 19:27:33 +02:00
2020-11-19 07:17:00 -05:00
"github.com/grafana/grafana-plugin-sdk-go/backend"
2020-11-12 15:11:30 +02:00
"github.com/grafana/grafana-plugin-sdk-go/data"
2020-11-19 07:17:00 -05:00
"github.com/grafana/grafana/pkg/expr"
2020-11-12 15:11:30 +02:00
)
2021-01-22 19:27:33 +02:00
type Evaluator struct {
Cfg * setting . Cfg
2021-06-15 19:30:21 +03:00
Log log . Logger
2021-01-22 19:27:33 +02:00
}
2020-11-12 15:11:30 +02:00
// invalidEvalResultFormatError reports that the evaluation results of an
// alert definition are not in a format that can be evaluated.
type invalidEvalResultFormatError struct {
	// refID is the RefID of the offending result.
	refID string
	// reason is a human-readable description of what is wrong.
	reason string
	// err is the optional underlying cause; it may be nil.
	err error
}
// Error implements the error interface. The message names the RefID and the
// reason, and appends the underlying error's message when one is set.
func (e *invalidEvalResultFormatError) Error() string {
	msg := fmt.Sprintf("invalid format of evaluation results for the alert definition %s: %s", e.refID, e.reason)
	if e.err == nil {
		return msg
	}
	return fmt.Sprintf("%s: %s", msg, e.err.Error())
}
// Unwrap exposes the underlying cause (nil when there is none) so the error
// can be inspected with errors.Is and errors.As.
func (e *invalidEvalResultFormatError) Unwrap() error {
	cause := e.err
	return cause
}
// ExecutionResults holds the raw, not yet evaluated outcome of executing a
// condition.
type ExecutionResults struct {
	// Error is set when executing the queries and expressions failed.
	Error error
	// Results are the frames returned for the condition.
	Results data.Frames
}
// Results is a slice of evaluated alert instances states.
2021-03-24 15:34:18 -07:00
type Results [ ] Result
2020-11-12 15:11:30 +02:00
2021-03-24 15:34:18 -07:00
// Result contains the evaluated State of an alert instance
2020-11-12 15:11:30 +02:00
// identified by its labels.
2021-03-24 15:34:18 -07:00
type Result struct {
2021-04-29 07:27:32 -04:00
Instance data . Labels
State State // Enum
// Error message for Error state. should be nil if State != Error.
Error error
2021-04-23 12:32:25 -07:00
EvaluatedAt time . Time
EvaluationDuration time . Duration
2021-05-18 09:12:39 -04:00
2021-07-15 13:10:56 +01:00
// EvaluationString is a string representation of evaluation data such
2021-05-18 09:12:39 -04:00
// as EvalMatches (from "classic condition"), and in the future from operations
// like SSE "math".
EvaluationString string
2021-07-15 13:10:56 +01:00
// Values contains the RefID and value of reduce and math expressions.
// It does not contain values for classic conditions as the values
// in classic conditions do not have a RefID.
Values map [ string ] NumberValueCapture
2020-11-12 15:11:30 +02:00
}
2021-03-24 15:34:18 -07:00
// State enumerates the possible evaluation outcomes of an alert instance.
type State int
2020-11-12 15:11:30 +02:00
const (
// Normal is the eval state for an alert instance condition
// that evaluated to false.
2021-03-24 15:34:18 -07:00
Normal State = iota
2020-11-12 15:11:30 +02:00
// Alerting is the eval state for an alert instance condition
2021-03-23 12:11:15 -04:00
// that evaluated to true (Alerting).
2020-11-12 15:11:30 +02:00
Alerting
2021-03-23 12:11:15 -04:00
2021-04-21 09:30:03 -07:00
// Pending is the eval state for an alert instance condition
// that evaluated to true (Alerting) but has not yet met
2021-04-23 11:47:52 -07:00
// the For duration defined in AlertRule.
2021-04-21 09:30:03 -07:00
Pending
2021-03-23 12:11:15 -04:00
// NoData is the eval state for an alert rule condition
// that evaluated to NoData.
NoData
// Error is the eval state for an alert rule condition
// that evaluated to Error.
Error
2020-11-12 15:11:30 +02:00
)
2021-03-24 15:34:18 -07:00
func ( s State ) String ( ) string {
2021-04-21 09:30:03 -07:00
return [ ... ] string { "Normal" , "Alerting" , "Pending" , "NoData" , "Error" } [ s ]
2020-11-12 15:11:30 +02:00
}
// AlertExecCtx is the context provided for executing an alert condition.
type AlertExecCtx struct {
2021-01-22 19:27:33 +02:00
OrgID int64
ExpressionsEnabled bool
2021-06-15 19:30:21 +03:00
Log log . Logger
2020-11-12 15:11:30 +02:00
Ctx context . Context
}
2021-04-23 10:52:32 -04:00
// GetExprRequest validates the condition and creates a expr.Request from it.
func GetExprRequest ( ctx AlertExecCtx , data [ ] models . AlertQuery , now time . Time ) ( * expr . Request , error ) {
req := & expr . Request {
OrgId : ctx . OrgID ,
2021-07-09 13:43:22 +02:00
Headers : map [ string ] string {
// Some data sources check this in query method as sometimes alerting needs special considerations.
2021-09-09 10:16:05 -04:00
"FromAlert" : "true" ,
"X-Cache-Skip" : "true" ,
2021-07-09 13:43:22 +02:00
} ,
2020-11-12 15:11:30 +02:00
}
2021-04-21 22:44:50 +03:00
for i := range data {
q := data [ i ]
2021-03-08 22:19:21 +02:00
model , err := q . GetModel ( )
2020-11-12 15:11:30 +02:00
if err != nil {
return nil , fmt . Errorf ( "failed to get query model: %w" , err )
}
2021-03-08 22:19:21 +02:00
interval , err := q . GetIntervalDuration ( )
2020-11-12 15:11:30 +02:00
if err != nil {
return nil , fmt . Errorf ( "failed to retrieve intervalMs from the model: %w" , err )
}
2021-03-08 22:19:21 +02:00
maxDatapoints , err := q . GetMaxDatapoints ( )
2020-11-12 15:11:30 +02:00
if err != nil {
return nil , fmt . Errorf ( "failed to retrieve maxDatapoints from the model: %w" , err )
}
2021-04-23 10:52:32 -04:00
req . Queries = append ( req . Queries , expr . Query {
TimeRange : expr . TimeRange {
From : q . RelativeTimeRange . ToTimeRange ( now ) . From ,
To : q . RelativeTimeRange . ToTimeRange ( now ) . To ,
} ,
DatasourceUID : q . DatasourceUID ,
2020-11-19 07:17:00 -05:00
JSON : model ,
Interval : interval ,
RefID : q . RefID ,
2020-11-12 15:11:30 +02:00
MaxDataPoints : maxDatapoints ,
QueryType : q . QueryType ,
} )
}
2021-04-23 10:52:32 -04:00
return req , nil
2021-03-23 12:11:15 -04:00
}
2021-05-28 11:04:20 -04:00
// NumberValueCapture is a single scalar value captured from a frame, together
// with the RefID and the labels it was captured with.
type NumberValueCapture struct {
	// Var is the RefID of the frame the value came from.
	Var string
	// Labels are the labels of the field the value came from.
	Labels data.Labels
	// Value is the captured number; it is nil when no value was available.
	Value *float64
}
2021-04-29 07:27:32 -04:00
func executeCondition ( ctx AlertExecCtx , c * models . Condition , now time . Time , dataService * tsdb . Service ) ExecutionResults {
2021-03-23 12:11:15 -04:00
result := ExecutionResults { }
2021-04-21 22:44:50 +03:00
execResp , err := executeQueriesAndExpressions ( ctx , c . Data , now , dataService )
2020-11-12 15:11:30 +02:00
if err != nil {
2021-04-29 07:27:32 -04:00
return ExecutionResults { Error : err }
2020-11-12 15:11:30 +02:00
}
2021-06-24 03:15:49 -04:00
// eval captures for the '__value_string__' annotation and the Value property of the API response.
2021-05-28 11:04:20 -04:00
captures := make ( [ ] NumberValueCapture , 0 , len ( execResp . Responses ) )
captureVal := func ( refID string , labels data . Labels , value * float64 ) {
captures = append ( captures , NumberValueCapture {
Var : refID ,
Value : value ,
Labels : labels . Copy ( ) ,
} )
}
2021-04-21 22:44:50 +03:00
for refID , res := range execResp . Responses {
2021-05-28 11:04:20 -04:00
// for each frame within each response, the response can contain several data types including time-series data.
// For now, we favour simplicity and only care about single scalar values.
for _ , frame := range res . Frames {
if len ( frame . Fields ) != 1 || frame . Fields [ 0 ] . Type ( ) != data . FieldTypeNullableFloat64 {
continue
}
var v * float64
if frame . Fields [ 0 ] . Len ( ) == 1 {
v = frame . At ( 0 , 0 ) . ( * float64 ) // type checked above
}
captureVal ( frame . RefID , frame . Fields [ 0 ] . Labels , v )
}
if refID == c . Condition {
result . Results = res . Frames
}
}
// add capture values as data frame metadata to each result (frame) that has matching labels.
for _ , frame := range result . Results {
// classic conditions already have metadata set and only have one value, there's no need to add anything in this case.
if frame . Meta != nil && frame . Meta . Custom != nil {
if _ , ok := frame . Meta . Custom . ( [ ] classic . EvalMatch ) ; ok {
continue // do not overwrite EvalMatch from classic condition.
}
}
frame . SetMeta ( & data . FrameMeta { } ) // overwrite metadata
if len ( frame . Fields ) == 1 {
theseLabels := frame . Fields [ 0 ] . Labels
for _ , cap := range captures {
// matching labels are equal labels, or when one set of labels includes the labels of the other.
if theseLabels . Equals ( cap . Labels ) || theseLabels . Contains ( cap . Labels ) || cap . Labels . Contains ( theseLabels ) {
if frame . Meta . Custom == nil {
frame . Meta . Custom = [ ] NumberValueCapture { }
}
frame . Meta . Custom = append ( frame . Meta . Custom . ( [ ] NumberValueCapture ) , cap )
}
}
2020-11-12 15:11:30 +02:00
}
}
2021-04-29 07:27:32 -04:00
return result
2020-11-12 15:11:30 +02:00
}
2021-06-15 19:30:21 +03:00
func executeQueriesAndExpressions ( ctx AlertExecCtx , data [ ] models . AlertQuery , now time . Time , dataService * tsdb . Service ) ( resp * backend . QueryDataResponse , err error ) {
defer func ( ) {
if e := recover ( ) ; e != nil {
ctx . Log . Error ( "alert rule panic" , "error" , e , "stack" , string ( debug . Stack ( ) ) )
panicErr := fmt . Errorf ( "alert rule panic; please check the logs for the full stack" )
if err != nil {
err = fmt . Errorf ( "queries and expressions execution failed: %w; %v" , err , panicErr . Error ( ) )
} else {
err = panicErr
}
}
} ( )
2021-04-23 10:52:32 -04:00
queryDataReq , err := GetExprRequest ( ctx , data , now )
2021-04-21 22:44:50 +03:00
if err != nil {
return nil , err
}
exprService := expr . Service {
Cfg : & setting . Cfg { ExpressionsEnabled : ctx . ExpressionsEnabled } ,
DataService : dataService ,
}
return exprService . TransformData ( ctx . Ctx , queryDataReq )
}
2021-04-29 07:27:32 -04:00
// evaluateExecutionResult takes the ExecutionResult which includes data.Frames returned
// from SSE (Server Side Expressions). It will create Results (slice of Result) with a State
// extracted from each Frame.
//
// If the ExecutionResults error property is not nil, a single Error result will be returned.
// If there is no error and no results then a single NoData state Result will be returned.
//
// Each non-empty Frame must be a single Field of type []*float64 and of length 1.
// Also, each Frame must be uniquely identified by its Field.Labels or a single Error result will be returned.
//
// Per Frame, data becomes a State based on the following rules:
// - Empty or zero length Frames result in NoData.
// - If a value:
// - 0 results in Normal.
// - Nonzero (e.g 1.2, NaN) results in Alerting.
// - nil results in noData.
// - unsupported Frame schemas results in Error.
func evaluateExecutionResult ( execResults ExecutionResults , ts time . Time ) Results {
2021-03-24 15:34:18 -07:00
evalResults := make ( [ ] Result , 0 )
2021-04-29 07:27:32 -04:00
appendErrRes := func ( e error ) {
evalResults = append ( evalResults , Result {
State : Error ,
Error : e ,
EvaluatedAt : ts ,
EvaluationDuration : time . Since ( ts ) ,
} )
}
appendNoData := func ( l data . Labels ) {
evalResults = append ( evalResults , Result {
State : NoData ,
Instance : l ,
EvaluatedAt : ts ,
EvaluationDuration : time . Since ( ts ) ,
} )
}
if execResults . Error != nil {
appendErrRes ( execResults . Error )
return evalResults
}
if len ( execResults . Results ) == 0 {
appendNoData ( nil )
return evalResults
}
for _ , f := range execResults . Results {
2020-11-12 15:11:30 +02:00
rowLen , err := f . RowLen ( )
if err != nil {
2021-04-29 07:27:32 -04:00
appendErrRes ( & invalidEvalResultFormatError { refID : f . RefID , reason : "unable to get frame row length" , err : err } )
continue
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
if len ( f . TypeIndices ( data . FieldTypeTime , data . FieldTypeNullableTime ) ) > 0 {
appendErrRes ( & invalidEvalResultFormatError { refID : f . RefID , reason : "looks like time series data, only reduced data can be alerted on." } )
continue
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
if rowLen == 0 {
if len ( f . Fields ) == 0 {
appendNoData ( nil )
continue
}
if len ( f . Fields ) == 1 {
appendNoData ( f . Fields [ 0 ] . Labels )
continue
}
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
if rowLen > 1 {
appendErrRes ( & invalidEvalResultFormatError { refID : f . RefID , reason : fmt . Sprintf ( "unexpected row length: %d instead of 0 or 1" , rowLen ) } )
continue
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
if len ( f . Fields ) > 1 {
appendErrRes ( & invalidEvalResultFormatError { refID : f . RefID , reason : fmt . Sprintf ( "unexpected field length: %d instead of 1" , len ( f . Fields ) ) } )
continue
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
if f . Fields [ 0 ] . Type ( ) != data . FieldTypeNullableFloat64 {
appendErrRes ( & invalidEvalResultFormatError { refID : f . RefID , reason : fmt . Sprintf ( "invalid field type: %s" , f . Fields [ 0 ] . Type ( ) ) } )
continue
2021-03-23 12:11:15 -04:00
}
2021-04-29 07:27:32 -04:00
val := f . Fields [ 0 ] . At ( 0 ) . ( * float64 ) // type checked by data.FieldTypeNullableFloat64 above
2021-03-24 15:34:18 -07:00
r := Result {
2021-04-23 12:32:25 -07:00
Instance : f . Fields [ 0 ] . Labels ,
EvaluatedAt : ts ,
EvaluationDuration : time . Since ( ts ) ,
2021-05-18 09:12:39 -04:00
EvaluationString : extractEvalString ( f ) ,
2021-07-15 13:10:56 +01:00
Values : extractValues ( f ) ,
2021-03-24 20:27:04 +00:00
}
2021-03-23 12:11:15 -04:00
switch {
case val == nil :
2021-03-24 20:27:04 +00:00
r . State = NoData
2021-03-23 12:11:15 -04:00
case * val == 0 :
2021-03-24 20:27:04 +00:00
r . State = Normal
2021-03-23 12:11:15 -04:00
default :
2021-03-24 20:27:04 +00:00
r . State = Alerting
2020-11-12 15:11:30 +02:00
}
2021-03-24 20:27:04 +00:00
evalResults = append ( evalResults , r )
2020-11-12 15:11:30 +02:00
}
2021-04-29 07:27:32 -04:00
seenLabels := make ( map [ string ] bool )
for _ , res := range evalResults {
labelsStr := res . Instance . String ( )
_ , ok := seenLabels [ labelsStr ]
if ok {
return Results {
Result {
State : Error ,
Instance : res . Instance ,
EvaluatedAt : ts ,
EvaluationDuration : time . Since ( ts ) ,
Error : & invalidEvalResultFormatError { reason : fmt . Sprintf ( "frame cannot uniquely be identified by its labels: has duplicate results with labels {%s}" , labelsStr ) } ,
} ,
}
}
seenLabels [ labelsStr ] = true
}
return evalResults
2020-11-12 15:11:30 +02:00
}
// AsDataFrame forms the EvalResults in Frame suitable for displaying in the table panel of the front end.
2021-03-23 12:11:15 -04:00
// It displays one row per alert instance, with a column for each label and one for the alerting state.
2020-11-12 15:11:30 +02:00
func ( evalResults Results ) AsDataFrame ( ) data . Frame {
2021-03-23 12:11:15 -04:00
fieldLen := len ( evalResults )
uniqueLabelKeys := make ( map [ string ] struct { } )
2020-11-12 15:11:30 +02:00
for _ , evalResult := range evalResults {
2021-03-23 12:11:15 -04:00
for k := range evalResult . Instance {
uniqueLabelKeys [ k ] = struct { } { }
}
}
labelColumns := make ( [ ] string , 0 , len ( uniqueLabelKeys ) )
for k := range uniqueLabelKeys {
labelColumns = append ( labelColumns , k )
}
labelColumns = sort . StringSlice ( labelColumns )
frame := data . NewFrame ( "evaluation results" )
for _ , lKey := range labelColumns {
frame . Fields = append ( frame . Fields , data . NewField ( lKey , nil , make ( [ ] string , fieldLen ) ) )
}
frame . Fields = append ( frame . Fields , data . NewField ( "State" , nil , make ( [ ] string , fieldLen ) ) )
2021-05-26 10:06:28 +02:00
frame . Fields = append ( frame . Fields , data . NewField ( "Info" , nil , make ( [ ] string , fieldLen ) ) )
2021-03-23 12:11:15 -04:00
for evalIdx , evalResult := range evalResults {
for lIdx , v := range labelColumns {
frame . Set ( lIdx , evalIdx , evalResult . Instance [ v ] )
}
2021-05-26 10:06:28 +02:00
2021-03-23 12:11:15 -04:00
frame . Set ( len ( labelColumns ) , evalIdx , evalResult . State . String ( ) )
2021-05-26 10:06:28 +02:00
switch {
case evalResult . Error != nil :
frame . Set ( len ( labelColumns ) + 1 , evalIdx , evalResult . Error . Error ( ) )
case evalResult . EvaluationString != "" :
frame . Set ( len ( labelColumns ) + 1 , evalIdx , evalResult . EvaluationString )
}
2020-11-12 15:11:30 +02:00
}
2021-03-23 12:11:15 -04:00
return * frame
2020-11-12 15:11:30 +02:00
}
2020-12-17 16:00:09 +02:00
// ConditionEval executes conditions and evaluates the result.
2021-03-11 18:56:58 +02:00
func ( e * Evaluator ) ConditionEval ( condition * models . Condition , now time . Time , dataService * tsdb . Service ) ( Results , error ) {
2021-09-28 13:00:16 +03:00
alertCtx , cancelFn := context . WithTimeout ( context . Background ( ) , e . Cfg . UnifiedAlerting . EvaluationTimeout )
2020-12-17 16:00:09 +02:00
defer cancelFn ( )
2021-06-15 19:30:21 +03:00
alertExecCtx := AlertExecCtx { OrgID : condition . OrgID , Ctx : alertCtx , ExpressionsEnabled : e . Cfg . ExpressionsEnabled , Log : e . Log }
2020-12-17 16:00:09 +02:00
2021-04-29 07:27:32 -04:00
execResult := executeCondition ( alertExecCtx , condition , now , dataService )
2020-12-17 16:00:09 +02:00
2021-04-29 07:27:32 -04:00
evalResults := evaluateExecutionResult ( execResult , now )
2020-12-17 16:00:09 +02:00
return evalResults , nil
}
2021-04-21 22:44:50 +03:00
// QueriesAndExpressionsEval executes queries and expressions and returns the result.
func ( e * Evaluator ) QueriesAndExpressionsEval ( orgID int64 , data [ ] models . AlertQuery , now time . Time , dataService * tsdb . Service ) ( * backend . QueryDataResponse , error ) {
2021-09-28 13:00:16 +03:00
alertCtx , cancelFn := context . WithTimeout ( context . Background ( ) , e . Cfg . UnifiedAlerting . EvaluationTimeout )
2021-04-21 22:44:50 +03:00
defer cancelFn ( )
2021-06-15 19:30:21 +03:00
alertExecCtx := AlertExecCtx { OrgID : orgID , Ctx : alertCtx , ExpressionsEnabled : e . Cfg . ExpressionsEnabled , Log : e . Log }
2021-04-21 22:44:50 +03:00
execResult , err := executeQueriesAndExpressions ( alertExecCtx , data , now , dataService )
if err != nil {
return nil , fmt . Errorf ( "failed to execute conditions: %w" , err )
}
return execResult , nil
}