mirror of
https://github.com/grafana/grafana.git
synced 2024-12-28 18:01:40 -06:00
Alerting: Add error recovery during rule evaluations (#35450)
* Alerting: Eval recovery after query failure
* Apply suggestions from code review
This commit is contained in:
parent
f7ed35336d
commit
abe35c8c01
@ -10,6 +10,7 @@ import (
|
||||
"github.com/grafana/grafana/pkg/bus"
|
||||
"github.com/grafana/grafana/pkg/expr"
|
||||
"github.com/grafana/grafana/pkg/expr/classic"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
@ -35,7 +36,7 @@ func DashboardAlertConditions(rawDCondJSON []byte, orgID int64) (*ngmodels.Condi
|
||||
return nil, err
|
||||
}
|
||||
|
||||
backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true}, ngCond.Data, time.Unix(500, 0))
|
||||
backendReq, err := eval.GetExprRequest(eval.AlertExecCtx{ExpressionsEnabled: true, Log: log.New("translate")}, ngCond.Data, time.Unix(500, 0))
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -37,7 +37,7 @@ func (srv TestingApiSrv) RouteTestRuleConfig(c *models.ReqContext, body apimodel
|
||||
if body.Type() != apimodels.GrafanaBackend || body.GrafanaManagedCondition == nil {
|
||||
return ErrResp(http.StatusBadRequest, errors.New("unexpected payload"), "")
|
||||
}
|
||||
return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg)
|
||||
return conditionEval(c, *body.GrafanaManagedCondition, srv.DatasourceCache, srv.DataService, srv.Cfg, srv.log)
|
||||
}
|
||||
|
||||
if body.Type() != apimodels.LoTexRulerBackend {
|
||||
@ -90,7 +90,7 @@ func (srv TestingApiSrv) RouteEvalQueries(c *models.ReqContext, cmd apimodels.Ev
|
||||
return ErrResp(http.StatusBadRequest, err, "invalid queries or expressions")
|
||||
}
|
||||
|
||||
evaluator := eval.Evaluator{Cfg: srv.Cfg}
|
||||
evaluator := eval.Evaluator{Cfg: srv.Cfg, Log: srv.log}
|
||||
evalResults, err := evaluator.QueriesAndExpressionsEval(c.SignedInUser.OrgId, cmd.Data, now, srv.DataService)
|
||||
if err != nil {
|
||||
return ErrResp(http.StatusBadRequest, err, "Failed to evaluate queries and expressions")
|
||||
|
@ -13,6 +13,7 @@ import (
|
||||
|
||||
"github.com/grafana/grafana-plugin-sdk-go/data"
|
||||
"github.com/grafana/grafana/pkg/api/response"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/models"
|
||||
"github.com/grafana/grafana/pkg/services/datasourceproxy"
|
||||
"github.com/grafana/grafana/pkg/services/datasources"
|
||||
@ -222,7 +223,7 @@ func validateQueriesAndExpressions(data []ngmodels.AlertQuery, user *models.Sign
|
||||
return refIDs, nil
|
||||
}
|
||||
|
||||
func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg) response.Response {
|
||||
func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand, datasourceCache datasources.CacheService, dataService *tsdb.Service, cfg *setting.Cfg, log log.Logger) response.Response {
|
||||
evalCond := ngmodels.Condition{
|
||||
Condition: cmd.Condition,
|
||||
OrgID: c.SignedInUser.OrgId,
|
||||
@ -237,7 +238,7 @@ func conditionEval(c *models.ReqContext, cmd ngmodels.EvalAlertConditionCommand,
|
||||
now = timeNow()
|
||||
}
|
||||
|
||||
evaluator := eval.Evaluator{Cfg: cfg}
|
||||
evaluator := eval.Evaluator{Cfg: cfg, Log: log}
|
||||
evalResults, err := evaluator.ConditionEval(&evalCond, now, dataService)
|
||||
if err != nil {
|
||||
return ErrResp(http.StatusBadRequest, err, "Failed to evaluate conditions")
|
||||
|
@ -5,10 +5,12 @@ package eval
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"runtime/debug"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/expr/classic"
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
@ -23,6 +25,7 @@ const alertingEvaluationTimeout = 30 * time.Second
|
||||
|
||||
// Evaluator carries the dependencies shared by the alert evaluation entry
// points (ConditionEval and QueriesAndExpressionsEval).
type Evaluator struct {
|
||||
Cfg *setting.Cfg // read for Cfg.ExpressionsEnabled when building the AlertExecCtx
|
||||
Log log.Logger // forwarded into AlertExecCtx.Log for each evaluation
|
||||
}
|
||||
|
||||
// invalidEvalResultFormatError is an error for invalid format of the alert definition evaluation results.
|
||||
@ -105,6 +108,7 @@ func (s State) String() string {
|
||||
// AlertExecCtx is the per-evaluation context handed to executeCondition,
// executeQueriesAndExpressions and GetExprRequest.
type AlertExecCtx struct {
|
||||
OrgID int64 // organisation the evaluation runs for (condition.OrgID or the caller's orgID)
|
||||
ExpressionsEnabled bool // copied from Cfg.ExpressionsEnabled by the Evaluator methods
|
||||
Log log.Logger // used by executeQueriesAndExpressions to log a recovered panic; that path calls Log.Error, so callers should set it
|
||||
|
||||
Ctx context.Context // the Evaluator methods set this to a context bounded by alertingEvaluationTimeout
|
||||
}
|
||||
@ -220,7 +224,19 @@ func executeCondition(ctx AlertExecCtx, c *models.Condition, now time.Time, data
|
||||
return result
|
||||
}
|
||||
|
||||
func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (*backend.QueryDataResponse, error) {
|
||||
func executeQueriesAndExpressions(ctx AlertExecCtx, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (resp *backend.QueryDataResponse, err error) {
|
||||
defer func() {
|
||||
if e := recover(); e != nil {
|
||||
ctx.Log.Error("alert rule panic", "error", e, "stack", string(debug.Stack()))
|
||||
panicErr := fmt.Errorf("alert rule panic; please check the logs for the full stack")
|
||||
if err != nil {
|
||||
err = fmt.Errorf("queries and expressions execution failed: %w; %v", err, panicErr.Error())
|
||||
} else {
|
||||
err = panicErr
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
queryDataReq, err := GetExprRequest(ctx, data, now)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -410,7 +426,7 @@ func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, da
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
|
||||
defer cancelFn()
|
||||
|
||||
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
|
||||
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
|
||||
|
||||
execResult := executeCondition(alertExecCtx, condition, now, dataService)
|
||||
|
||||
@ -423,7 +439,7 @@ func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQu
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
|
||||
defer cancelFn()
|
||||
|
||||
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled}
|
||||
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
|
||||
|
||||
execResult, err := executeQueriesAndExpressions(alertExecCtx, data, now, dataService)
|
||||
if err != nil {
|
||||
|
@ -83,7 +83,7 @@ func (ng *AlertNG) Init() error {
|
||||
BaseInterval: baseInterval,
|
||||
Logger: ng.Log,
|
||||
MaxAttempts: maxAttempts,
|
||||
Evaluator: eval.Evaluator{Cfg: ng.Cfg},
|
||||
Evaluator: eval.Evaluator{Cfg: ng.Cfg, Log: ng.Log},
|
||||
InstanceStore: store,
|
||||
RuleStore: store,
|
||||
Notifier: ng.Alertmanager,
|
||||
|
Loading…
Reference in New Issue
Block a user