mirror of
				https://github.com/grafana/grafana.git
				synced 2025-02-25 18:55:37 -06:00 
			
		
		
		
	Alerting: Add error recovery during rule evaluations (#35450)
* Alerting: Eval recovery after query failure
* Apply suggestions from code review
This commit is contained in:
		
				
					committed by
					
						
						GitHub
					
				
			
			
				
	
			
			
			
						parent
						
							f7ed35336d
						
					
				
				
					commit
					abe35c8c01
				
			@@ -10,6 +10,7 @@ import (
 | 
				
			|||||||
	"github.com/grafana/grafana/pkg/bus"
 | 
						"github.com/grafana/grafana/pkg/bus"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/expr"
 | 
						"github.com/grafana/grafana/pkg/expr"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/expr/classic"
 | 
						"github.com/grafana/grafana/pkg/expr/classic"
 | 
				
			||||||
 | 
						"github.com/grafana/grafana/pkg/infra/log"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/models"
 | 
						"github.com/grafana/grafana/pkg/models"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/services/ngalert/eval"
 | 
						"github.com/grafana/grafana/pkg/services/ngalert/eval"
 | 
				
			||||||
	ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
 | 
						ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
 | 
				
			||||||
@@ -35,7 +36,7 @@ func DashboardAlertConditions(rawDCondJSON []byte, orgID int64) (*ngmodels.Condi
 | 
				
			|||||||
		return nil, err
 | 
							return nil, err
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true}, ngCond.Data, time.Unix(500, 0))
 | 
						backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true, Log: log.New("translate")}, ngCond.Data, time.Unix(500, 0))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return nil, err
 | 
							return nil, err
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -37,7 +37,7 @@ func (srv TestingApiSrv) RouteTestRuleConfig(c *models.ReqContext, body apimodel
 | 
				
			|||||||
		if body.Type() != apimodels.GrafanaBackend || body.GrafanaManagedCondition == nil {
 | 
							if body.Type() != apimodels.GrafanaBackend || body.GrafanaManagedCondition == nil {
 | 
				
			||||||
			return ErrResp(http.StatusBadRequest, errors.New("unexpected payload"), "")
 | 
								return ErrResp(http.StatusBadRequest, errors.New("unexpected payload"), "")
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg)
 | 
							return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg, srv.log)
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if body.Type() != apimodels.LoTexRulerBackend {
 | 
						if body.Type() != apimodels.LoTexRulerBackend {
 | 
				
			||||||
@@ -90,7 +90,7 @@ func (srv TestingApiSrv) RouteEvalQueries(c *models.ReqContext, cmd apimodels.Ev
 | 
				
			|||||||
		return ErrResp(http.StatusBadRequest, err, "invalid queries or expressions")
 | 
							return ErrResp(http.StatusBadRequest, err, "invalid queries or expressions")
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	evaluator := eval.Evaluator{Cfg: srv.Cfg}
 | 
						evaluator := eval.Evaluator{Cfg: srv.Cfg, Log: srv.log}
 | 
				
			||||||
	evalResults, err := evaluator.QueriesAndExpressionsEval(c.SignedInUser.OrgId, cmd.Data, now, srv.DataService)
 | 
						evalResults, err := evaluator.QueriesAndExpressionsEval(c.SignedInUser.OrgId, cmd.Data, now, srv.DataService)
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return ErrResp(http.StatusBadRequest, err, "Failed to evaluate queries and expressions")
 | 
							return ErrResp(http.StatusBadRequest, err, "Failed to evaluate queries and expressions")
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -13,6 +13,7 @@ import (
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	"github.com/grafana/grafana-plugin-sdk-go/data"
 | 
						"github.com/grafana/grafana-plugin-sdk-go/data"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/api/response"
 | 
						"github.com/grafana/grafana/pkg/api/response"
 | 
				
			||||||
 | 
						"github.com/grafana/grafana/pkg/infra/log"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/models"
 | 
						"github.com/grafana/grafana/pkg/models"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/services/datasourceproxy"
 | 
						"github.com/grafana/grafana/pkg/services/datasourceproxy"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/services/datasources"
 | 
						"github.com/grafana/grafana/pkg/services/datasources"
 | 
				
			||||||
@@ -222,7 +223,7 @@ func validateQueriesAndExpressions(data []ngmodels.AlertQuery, user *models.Sign
 | 
				
			|||||||
	return refIDs, nil
 | 
						return refIDs, nil
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg) response.Response {
 | 
					func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg, log log.Logger) response.Response {
 | 
				
			||||||
	evalCond := ngmodels.Condition{
 | 
						evalCond := ngmodels.Condition{
 | 
				
			||||||
		Condition: cmd.Condition,
 | 
							Condition: cmd.Condition,
 | 
				
			||||||
		OrgID:     c.SignedInUser.OrgId,
 | 
							OrgID:     c.SignedInUser.OrgId,
 | 
				
			||||||
@@ -237,7 +238,7 @@ func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand,
 | 
				
			|||||||
		now = timeNow()
 | 
							now = timeNow()
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	evaluator := eval.Evaluator{Cfg: cfg}
 | 
						evaluator := eval.Evaluator{Cfg: cfg, Log: log}
 | 
				
			||||||
	evalResults, err := evaluator.ConditionEval(&evalCond, now, dataService)
 | 
						evalResults, err := evaluator.ConditionEval(&evalCond, now, dataService)
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")
 | 
							return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -5,10 +5,12 @@ package eval
 | 
				
			|||||||
import (
 | 
					import (
 | 
				
			||||||
	"context"
 | 
						"context"
 | 
				
			||||||
	"fmt"
 | 
						"fmt"
 | 
				
			||||||
 | 
						"runtime/debug"
 | 
				
			||||||
	"sort"
 | 
						"sort"
 | 
				
			||||||
	"time"
 | 
						"time"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/expr/classic"
 | 
						"github.com/grafana/grafana/pkg/expr/classic"
 | 
				
			||||||
 | 
						"github.com/grafana/grafana/pkg/infra/log"
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/services/ngalert/models"
 | 
						"github.com/grafana/grafana/pkg/services/ngalert/models"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	"github.com/grafana/grafana/pkg/setting"
 | 
						"github.com/grafana/grafana/pkg/setting"
 | 
				
			||||||
@@ -23,6 +25,7 @@ const alertingEvaluationTimeout = 30 * time.Second
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
type Evaluator struct {
 | 
					type Evaluator struct {
 | 
				
			||||||
	Cfg *setting.Cfg
 | 
						Cfg *setting.Cfg
 | 
				
			||||||
 | 
						Log log.Logger
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
// invalidEvalResultFormatError is an error for invalid format of the alert definition evaluation results.
 | 
					// invalidEvalResultFormatError is an error for invalid format of the alert definition evaluation results.
 | 
				
			||||||
@@ -105,6 +108,7 @@ func (s State) String() string {
 | 
				
			|||||||
type AlertExecCtx struct {
 | 
					type AlertExecCtx struct {
 | 
				
			||||||
	OrgID              int64
 | 
						OrgID              int64
 | 
				
			||||||
	ExpressionsEnabled bool
 | 
						ExpressionsEnabled bool
 | 
				
			||||||
 | 
						Log                log.Logger
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	Ctx context.Context
 | 
						Ctx context.Context
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
@@ -220,7 +224,19 @@ func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, data
 | 
				
			|||||||
	return result
 | 
						return result
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (*backend.QueryDataResponse, error) {
 | 
					func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (resp *backend.QueryDataResponse, err error) {
 | 
				
			||||||
 | 
						defer func() {
 | 
				
			||||||
 | 
							if e := recover(); e != nil {
 | 
				
			||||||
 | 
								ctx.Log.Error("alert rule panic", "error", e, "stack", string(debug.Stack()))
 | 
				
			||||||
 | 
								panicErr := fmt.Errorf("alert rule panic; please check the logs for the full stack")
 | 
				
			||||||
 | 
								if err != nil {
 | 
				
			||||||
 | 
									err = fmt.Errorf("queries and expressions execution failed: %w; %v", err, panicErr.Error())
 | 
				
			||||||
 | 
								} else {
 | 
				
			||||||
 | 
									err = panicErr
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
 | 
						}()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	queryDataReq, err := GetExprRequest(ctx, data, now)
 | 
						queryDataReq, err := GetExprRequest(ctx, data, now)
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
		return nil, err
 | 
							return nil, err
 | 
				
			||||||
@@ -410,7 +426,7 @@ func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, da
 | 
				
			|||||||
	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
 | 
						alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
 | 
				
			||||||
	defer cancelFn()
 | 
						defer cancelFn()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
 | 
						alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	execResult := executeCondition(alertExecCtx, condition, now, dataService)
 | 
						execResult := executeCondition(alertExecCtx, condition, now, dataService)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -423,7 +439,7 @@ func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQu
 | 
				
			|||||||
	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
 | 
						alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
 | 
				
			||||||
	defer cancelFn()
 | 
						defer cancelFn()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
 | 
						alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, dataService)
 | 
						execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, dataService)
 | 
				
			||||||
	if err != nil {
 | 
						if err != nil {
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -83,7 +83,7 @@ func (ng *AlertNG) Init() error {
 | 
				
			|||||||
		BaseInterval:  baseInterval,
 | 
							BaseInterval:  baseInterval,
 | 
				
			||||||
		Logger:        ng.Log,
 | 
							Logger:        ng.Log,
 | 
				
			||||||
		MaxAttempts:   maxAttempts,
 | 
							MaxAttempts:   maxAttempts,
 | 
				
			||||||
		Evaluator:     eval.Evaluator{Cfg: ng.Cfg},
 | 
							Evaluator:     eval.Evaluator{Cfg: ng.Cfg, Log: ng.Log},
 | 
				
			||||||
		InstanceStore: store,
 | 
							InstanceStore: store,
 | 
				
			||||||
		RuleStore:     store,
 | 
							RuleStore:     store,
 | 
				
			||||||
		Notifier:      ng.Alertmanager,
 | 
							Notifier:      ng.Alertmanager,
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user