Alerting: Add error recovery during rule evaluations (#35450)

* Alerting: Eval recovery after query failure

* Apply suggestions from code review
This commit is contained in:
Sofia Papagiannaki 2021-06-15 19:30:21 +03:00 committed by GitHub
parent f7ed35336d
commit abe35c8c01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 27 additions and 9 deletions

View File

@ -10,6 +10,7 @@ import (
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/expr/classic"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@ -35,7 +36,7 @@ func DashboardAlertConditions(rawDCondJSON []byte, orgID int64) (*ngmodels.Condi
return nil, err
}
backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true}, ngCond.Data, time.Unix(500, 0))
backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true, Log: log.New("translate")}, ngCond.Data, time.Unix(500, 0))
if err != nil {
return nil, err

View File

@ -37,7 +37,7 @@ func (srv TestingApiSrv) RouteTestRuleConfig(c *models.ReqContext, body apimodel
if body.Type() != apimodels.GrafanaBackend || body.GrafanaManagedCondition == nil {
return ErrResp(http.StatusBadRequest, errors.New("unexpected payload"), "")
}
return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg)
return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg, srv.log)
}
if body.Type() != apimodels.LoTexRulerBackend {
@ -90,7 +90,7 @@ func (srv TestingApiSrv) RouteEvalQueries(c *models.ReqContext, cmd apimodels.Ev
return ErrResp(http.StatusBadRequest, err, "invalid queries or expressions")
}
evaluator := eval.Evaluator{Cfg: srv.Cfg}
evaluator := eval.Evaluator{Cfg: srv.Cfg, Log: srv.log}
evalResults, err := evaluator.QueriesAndExpressionsEval(c.SignedInUser.OrgId, cmd.Data, now, srv.DataService)
if err != nil {
return ErrResp(http.StatusBadRequest, err, "Failed to evaluate queries and expressions")

View File

@ -13,6 +13,7 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/api/response"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/services/datasourceproxy"
"github.com/grafana/grafana/pkg/services/datasources"
@ -222,7 +223,7 @@ func validateQueriesAndExpressions(data []ngmodels.AlertQuery, user *models.Sign
return refIDs, nil
}
func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg) response.Response {
func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg, log log.Logger) response.Response {
evalCond := ngmodels.Condition{
Condition: cmd.Condition,
OrgID: c.SignedInUser.OrgId,
@ -237,7 +238,7 @@ func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand,
now = timeNow()
}
evaluator := eval.Evaluator{Cfg: cfg}
evaluator := eval.Evaluator{Cfg: cfg, Log: log}
evalResults, err := evaluator.ConditionEval(&evalCond, now, dataService)
if err != nil {
return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")

View File

@ -5,10 +5,12 @@ package eval
import (
"context"
"fmt"
"runtime/debug"
"sort"
"time"
"github.com/grafana/grafana/pkg/expr/classic"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/setting"
@ -23,6 +25,7 @@ const alertingEvaluationTimeout = 30 * time.Second
type Evaluator struct {
Cfg *setting.Cfg
Log log.Logger
}
// invalidEvalResultFormatError is an error for invalid format of the alert definition evaluation results.
@ -105,6 +108,7 @@ func (s State) String() string {
type AlertExecCtx struct {
OrgID int64
ExpressionsEnabled bool
Log log.Logger
Ctx context.Context
}
@ -220,7 +224,19 @@ func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, data
return result
}
func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (*backend.QueryDataResponse, error) {
func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (resp *backend.QueryDataResponse, err error) {
defer func() {
if e := recover(); e != nil {
ctx.Log.Error("alert rule panic", "error", e, "stack", string(debug.Stack()))
panicErr := fmt.Errorf("alert rule panic; please check the logs for the full stack")
if err != nil {
err = fmt.Errorf("queries and expressions execution failed: %w; %v", err, panicErr.Error())
} else {
err = panicErr
}
}
}()
queryDataReq, err := GetExprRequest(ctx, data, now)
if err != nil {
return nil, err
@ -410,7 +426,7 @@ func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, da
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
defer cancelFn()
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
execResult := executeCondition(alertExecCtx, condition, now, dataService)
@ -423,7 +439,7 @@ func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQu
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
defer cancelFn()
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, dataService)
if err != nil {

View File

@ -83,7 +83,7 @@ func (ng *AlertNG) Init() error {
BaseInterval: baseInterval,
Logger: ng.Log,
MaxAttempts: maxAttempts,
Evaluator: eval.Evaluator{Cfg: ng.Cfg},
Evaluator: eval.Evaluator{Cfg: ng.Cfg, Log: ng.Log},
InstanceStore: store,
RuleStore: store,
Notifier: ng.Alertmanager,