Alerting: Makes timeouts and retries configurable (#16259)

Adds new alerting settings for configuring timeouts and retries:
evaluation_timeout_seconds, notification_timeout_seconds, and
max_attempts.

Closes #16240
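
For reference, a sketch of how the new keys would be set in the [alerting] section of grafana.ini. The key names come from the commit message; the defaults shown are the previously hardcoded values this commit removes (30s, 30s, 3 attempts). The section layout and the per-key comments are assumptions, not taken from the commit:

[alerting]
# maximum time a rule evaluation may take (assumed meaning)
evaluation_timeout_seconds = 30
# maximum time sending notifications may take (assumed meaning)
notification_timeout_seconds = 30
# maximum evaluation attempts per scheduled job (assumed meaning)
max_attempts = 3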
Author: Zzy
Date: 2019-03-29 13:58:37 +08:00
Committed by: Marcus Efraimsson
Parent: e6d9a524b4
Commit: 1b84a924a3

8 changed files with 64 additions and 19 deletions

pkg/services/alerting/engine.go

@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
 var (
 	unfinishedWorkTimeout = time.Second * 5
-	// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
-	alertTimeout          = time.Second * 30
-	resultHandleTimeout   = time.Second * 30
-	alertMaxAttempts      = 3
 )

 func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
 	}()
-	cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
+	cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
 	attemptChan := make(chan int, 1)
 	// Initialize with first attemptID=1
@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 	}()
-	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
+	alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
 	cancelChan <- cancelFn
 	span := opentracing.StartSpan("alert execution")
 	alertCtx = opentracing.ContextWithSpan(alertCtx, span)
@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 		tlog.Error(evalContext.Error),
 		tlog.String("message", "alerting execution attempt failed"),
 	)
-	if attemptID < alertMaxAttempts {
+	if attemptID < setting.AlertingMaxAttempts {
 		span.Finish()
 		e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
 		attemptChan <- (attemptID + 1)
@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 	}
 	// create new context with timeout for notifications
-	resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
+	resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
 	cancelChan <- resultHandleCancelFn
 	// override the context used for evaluation with a new context for notifications.

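To make the control flow in the hunks above easier to follow outside the diff, here is a minimal, self-contained Go sketch of the same pattern: every attempt gets its own context.WithTimeout, every cancel func is parked on a buffered channel (up to two per attempt, evaluation plus notification, hence the *2 buffer), and a failed attempt retries until the maximum is reached. The names evaluate, evaluationTimeout, and maxAttempts are illustrative stand-ins, not Grafana's, and a plain loop replaces the attemptChan-driven goroutine of the real engine to keep the sketch short.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// Illustrative stand-ins for the configurable settings.
var (
	evaluationTimeout = 30 * time.Second
	maxAttempts       = 3
)

// evaluate is a placeholder for rule evaluation; it fails on the first attempt.
func evaluate(ctx context.Context, attempt int) error {
	if attempt < 2 {
		return errors.New("transient failure")
	}
	return nil
}

func processWithRetry(parent context.Context) error {
	// Two cancel funcs can be registered per attempt
	// (evaluation + notification), hence the *2 buffer.
	cancelChan := make(chan context.CancelFunc, maxAttempts*2)
	defer func() {
		close(cancelChan)
		for cancel := range cancelChan {
			cancel() // release every attempt's context
		}
	}()

	for attempt := 1; attempt <= maxAttempts; attempt++ {
		ctx, cancel := context.WithTimeout(parent, evaluationTimeout)
		cancelChan <- cancel
		if err := evaluate(ctx, attempt); err != nil {
			fmt.Printf("attempt %d failed: %v\n", attempt, err)
			continue // retry until maxAttempts is exhausted
		}
		return nil
	}
	return errors.New("all attempts failed")
}

func main() {
	fmt.Println(processWithRetry(context.Background()))
}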
pkg/services/alerting/engine_integration_test.go

@@ -11,20 +11,22 @@ import (
 	"testing"
 	"time"
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
 )

 func TestEngineTimeouts(t *testing.T) {
 	Convey("Alerting engine timeout tests", t, func() {
 		engine := NewEngine()
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}

 		Convey("Should trigger as many retries as needed", func() {
 			Convey("pended alert for datasource -> result handler should be worked", func() {
 				// reduce alert timeout to test quickly
-				originAlertTimeout := alertTimeout
-				alertTimeout = 2 * time.Second
+				setting.AlertingEvaluationTimeout = 2 * time.Second
 				transportTimeoutInterval := 2 * time.Second
 				serverBusySleepDuration := 1 * time.Second

@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
 				So(resultHandler.ResultHandleSucceed, ShouldEqual, true)

 				// initialize for other tests.
-				alertTimeout = originAlertTimeout
+				setting.AlertingEvaluationTimeout = 30 * time.Second
 				engine.resultHandler = &FakeResultHandler{}
 			})
 		})

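Note that these tests mutate package-level settings, so later tests depend on the values being put back by hand (the "initialize for other tests." line above). A common way to keep such tests hermetic, shown here as a sketch rather than what this commit does, is to snapshot and restore the global with defer, assuming the same imports as the test above:

// Sketch (not from this commit): restore the mutated global when the test ends.
origTimeout := setting.AlertingEvaluationTimeout
setting.AlertingEvaluationTimeout = 2 * time.Second
defer func() { setting.AlertingEvaluationTimeout = origTimeout }()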
pkg/services/alerting/engine_test.go

@@ -6,7 +6,9 @@ import (
 	"math"
 	"testing"
+	"time"
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
 )

 type FakeEvalHandler struct {
@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
 func TestEngineProcessJob(t *testing.T) {
 	Convey("Alerting engine job processing", t, func() {
 		engine := NewEngine()
+		setting.AlertingEvaluationTimeout = 30 * time.Second
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}
@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("error + not last attempt -> retry", func() {
 			engine.evalHandler = NewFakeEvalHandler(0)
-			for i := 1; i < alertMaxAttempts; i++ {
+			for i := 1; i < setting.AlertingMaxAttempts; i++ {
 				attemptChan := make(chan int, 1)
-				cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+				cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 				engine.processJob(i, attemptChan, cancelChan, job)
 				nextAttemptID, more := <-attemptChan
@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + last attempt -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
So(nextAttemptID, ShouldEqual, 0)
@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("no error -> no retry", func() {
 			engine.evalHandler = NewFakeEvalHandler(1)
 			attemptChan := make(chan int, 1)
-			cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+			cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 			engine.processJob(1, attemptChan, cancelChan, job)
 			nextAttemptID, more := <-attemptChan
@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("Should trigger as many retries as needed", func() {
 			Convey("never success -> max retries number", func() {
-				expectedAttempts := alertMaxAttempts
+				expectedAttempts := setting.AlertingMaxAttempts
 				evalHandler := NewFakeEvalHandler(0)
 				engine.evalHandler = evalHandler
@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
 			})
 			Convey("some errors before success -> some retries", func() {
-				expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
+				expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
 				evalHandler := NewFakeEvalHandler(expectedAttempts)
 				engine.evalHandler = evalHandler

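For concreteness: with max_attempts left at 3, expectedAttempts works out to int(math.Ceil(3.0/2)) = 2, so NewFakeEvalHandler is primed to succeed on attempt 2 of the 3 allowed, exercising the "some errors before success" path.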
pkg/services/alerting/notifier.go

@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
 	renderOpts := rendering.Opts{
 		Width:           1000,
 		Height:          500,
-		Timeout:         time.Duration(float64(alertTimeout) * 0.9),
+		Timeout:         time.Duration(float64(setting.AlertingEvaluationTimeout) * 0.9),
 		OrgId:           context.Rule.OrgId,
 		OrgRole:         m.ROLE_ADMIN,
 		ConcurrentLimit: setting.AlertingRenderLimit,
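
One subtlety in the hunk above: the render timeout is 90% of the evaluation timeout, and the Duration must be scaled directly via float64(d), because time.Duration(x) interprets a bare number as nanoseconds. Scaling the .Seconds() value instead would silently produce a nanosecond-scale timeout. A small runnable sketch of the difference:

package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 30 * time.Second

	// Correct: Duration is an int64 nanosecond count, so scale it directly.
	fmt.Println(time.Duration(float64(timeout) * 0.9)) // prints 27s

	// Wrong: .Seconds() yields 30.0, and time.Duration(27.0) is 27 nanoseconds.
	fmt.Println(time.Duration(timeout.Seconds() * 0.9)) // prints 27ns
}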