mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Makes timeouts and retries configurable (#16259)
Adds new alert settings for configuring timeouts and retries named evaluation_timeout_seconds, notification_timeout_seconds and max_attempts. Closes #16240
This commit is contained in:
@@ -521,6 +521,16 @@ nodata_or_nullvalues = no_data
|
||||
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
|
||||
concurrent_render_limit = 5
|
||||
|
||||
# Default timeout for alert evaluation (calculation), in seconds. Default value is 30
|
||||
evaluation_timeout_seconds = 30
|
||||
|
||||
# Default timeout for sending alert notifications, in seconds. Default value is 30
|
||||
notification_timeout_seconds = 30
|
||||
|
||||
# Default maximum number of attempts to send alert notifications. Default value is 3
|
||||
max_attempts = 3
|
||||
|
||||
|
||||
#################################### Explore #############################
|
||||
[explore]
|
||||
# Enable the Explore section
|
||||
|
||||
@@ -446,6 +446,16 @@ log_queries =
|
||||
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
|
||||
;concurrent_render_limit = 5
|
||||
|
||||
|
||||
# Default timeout for alert evaluation (calculation), in seconds. Default value is 30
|
||||
;evaluation_timeout_seconds = 30
|
||||
|
||||
# Default timeout for sending alert notifications, in seconds. Default value is 30
|
||||
;notification_timeout_seconds = 30
|
||||
|
||||
# Default maximum number of attempts to send alert notifications. Default value is 3
|
||||
;max_attempts = 3
|
||||
|
||||
#################################### Explore #############################
|
||||
[explore]
|
||||
# Enable the Explore section
|
||||
|
||||
@@ -650,6 +650,20 @@ Alert notifications can include images, but rendering many images at the same ti
|
||||
This limit will protect the server from render overloading and make sure notifications are sent out quickly. Default
|
||||
value is `5`.
|
||||
|
||||
|
||||
### evaluation_timeout_seconds
|
||||
|
||||
Default timeout for alert evaluation (calculation), in seconds. Default value is `30`.
|
||||
|
||||
### notification_timeout_seconds
|
||||
|
||||
Default timeout for sending alert notifications, in seconds. Default value is `30`.
|
||||
|
||||
### max_attempts
|
||||
|
||||
Default maximum number of attempts to send alert notifications. Default value is `3`.
|
||||
|
||||
|
||||
## [panels]
|
||||
|
||||
### enable_alpha
|
||||
|
||||
@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
|
||||
|
||||
var (
|
||||
unfinishedWorkTimeout = time.Second * 5
|
||||
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
|
||||
alertTimeout = time.Second * 30
|
||||
resultHandleTimeout = time.Second * 30
|
||||
alertMaxAttempts = 3
|
||||
)
|
||||
|
||||
func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
|
||||
@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
|
||||
}
|
||||
}()
|
||||
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
|
||||
attemptChan := make(chan int, 1)
|
||||
|
||||
// Initialize with first attemptID=1
|
||||
@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
}
|
||||
}()
|
||||
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
|
||||
cancelChan <- cancelFn
|
||||
span := opentracing.StartSpan("alert execution")
|
||||
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
|
||||
@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
tlog.Error(evalContext.Error),
|
||||
tlog.String("message", "alerting execution attempt failed"),
|
||||
)
|
||||
if attemptID < alertMaxAttempts {
|
||||
if attemptID < setting.AlertingMaxAttempts {
|
||||
span.Finish()
|
||||
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
|
||||
attemptChan <- (attemptID + 1)
|
||||
@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
}
|
||||
|
||||
// create new context with timeout for notifications
|
||||
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
|
||||
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
|
||||
cancelChan <- resultHandleCancelFn
|
||||
|
||||
// override the context used for evaluation with a new context for notifications.
|
||||
|
||||
@@ -11,20 +11,22 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
. "github.com/smartystreets/goconvey/convey"
|
||||
)
|
||||
|
||||
func TestEngineTimeouts(t *testing.T) {
|
||||
Convey("Alerting engine timeout tests", t, func() {
|
||||
engine := NewEngine()
|
||||
setting.AlertingNotificationTimeout = 30 * time.Second
|
||||
setting.AlertingMaxAttempts = 3
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
job := &Job{Running: true, Rule: &Rule{}}
|
||||
|
||||
Convey("Should trigger as many retries as needed", func() {
|
||||
Convey("pended alert for datasource -> result handler should be worked", func() {
|
||||
// reduce alert timeout to test quickly
|
||||
originAlertTimeout := alertTimeout
|
||||
alertTimeout = 2 * time.Second
|
||||
setting.AlertingEvaluationTimeout = 30 * time.Second
|
||||
transportTimeoutInterval := 2 * time.Second
|
||||
serverBusySleepDuration := 1 * time.Second
|
||||
|
||||
@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
|
||||
So(resultHandler.ResultHandleSucceed, ShouldEqual, true)
|
||||
|
||||
// initialize for other tests.
|
||||
alertTimeout = originAlertTimeout
|
||||
setting.AlertingEvaluationTimeout = 2 * time.Second
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
})
|
||||
})
|
||||
|
||||
@@ -6,7 +6,9 @@ import (
|
||||
"math"
|
||||
"testing"
|
||||
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
. "github.com/smartystreets/goconvey/convey"
|
||||
"time"
|
||||
)
|
||||
|
||||
type FakeEvalHandler struct {
|
||||
@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
|
||||
func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("Alerting engine job processing", t, func() {
|
||||
engine := NewEngine()
|
||||
setting.AlertingEvaluationTimeout = 30 * time.Second
|
||||
setting.AlertingNotificationTimeout = 30 * time.Second
|
||||
setting.AlertingMaxAttempts = 3
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
job := &Job{Running: true, Rule: &Rule{}}
|
||||
|
||||
@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("error + not last attempt -> retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(0)
|
||||
|
||||
for i := 1; i < alertMaxAttempts; i++ {
|
||||
for i := 1; i < setting.AlertingMaxAttempts; i++ {
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(i, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("error + last attempt -> no retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(0)
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
|
||||
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
|
||||
So(nextAttemptID, ShouldEqual, 0)
|
||||
@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("no error -> no retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(1)
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(1, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("Should trigger as many retries as needed", func() {
|
||||
|
||||
Convey("never success -> max retries number", func() {
|
||||
expectedAttempts := alertMaxAttempts
|
||||
expectedAttempts := setting.AlertingMaxAttempts
|
||||
evalHandler := NewFakeEvalHandler(0)
|
||||
engine.evalHandler = evalHandler
|
||||
|
||||
@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
})
|
||||
|
||||
Convey("some errors before success -> some retries", func() {
|
||||
expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
|
||||
expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
|
||||
evalHandler := NewFakeEvalHandler(expectedAttempts)
|
||||
engine.evalHandler = evalHandler
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
|
||||
renderOpts := rendering.Opts{
|
||||
Width: 1000,
|
||||
Height: 500,
|
||||
Timeout: time.Duration(float64(alertTimeout) * 0.9),
|
||||
Timeout: time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
|
||||
OrgId: context.Rule.OrgId,
|
||||
OrgRole: m.ROLE_ADMIN,
|
||||
ConcurrentLimit: setting.AlertingRenderLimit,
|
||||
|
||||
@@ -179,6 +179,10 @@ var (
|
||||
AlertingErrorOrTimeout string
|
||||
AlertingNoDataOrNullValues string
|
||||
|
||||
AlertingEvaluationTimeout time.Duration
|
||||
AlertingNotificationTimeout time.Duration
|
||||
AlertingMaxAttempts int
|
||||
|
||||
// Explore UI
|
||||
ExploreEnabled bool
|
||||
|
||||
@@ -760,6 +764,10 @@ func (cfg *Cfg) Load(args *CommandLineArgs) error {
|
||||
AlertingErrorOrTimeout = alerting.Key("error_or_timeout").MustString("alerting")
|
||||
AlertingNoDataOrNullValues = alerting.Key("nodata_or_nullvalues").MustString("no_data")
|
||||
|
||||
AlertingEvaluationTimeout = alerting.Key("evaluation_timeout_seconds").MustDuration(time.Second * 30)
|
||||
AlertingNotificationTimeout = alerting.Key("notification_timeout_seconds").MustDuration(time.Second * 30)
|
||||
AlertingMaxAttempts = alerting.Key("max_attempts").MustInt(3)
|
||||
|
||||
explore := iniFile.Section("explore")
|
||||
ExploreEnabled = explore.Key("enabled").MustBool(true)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user