mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Makes timeouts and retries configurable (#16259)
Adds three new alerting settings, evaluation_timeout_seconds, notification_timeout_seconds and max_attempts, which make the evaluation timeout, the notification timeout and the maximum number of evaluation attempts configurable. Closes #16240
This commit is contained in:
@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
|
||||
|
||||
var (
|
||||
unfinishedWorkTimeout = time.Second * 5
|
||||
// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
|
||||
alertTimeout = time.Second * 30
|
||||
resultHandleTimeout = time.Second * 30
|
||||
alertMaxAttempts = 3
|
||||
)
|
||||
|
||||
func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
|
||||
@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *J
|
||||
}
|
||||
}()
|
||||
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
|
||||
attemptChan := make(chan int, 1)
|
||||
|
||||
// Initialize with first attemptID=1
|
||||
@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
}
|
||||
}()
|
||||
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
|
||||
cancelChan <- cancelFn
|
||||
span := opentracing.StartSpan("alert execution")
|
||||
alertCtx = opentracing.ContextWithSpan(alertCtx, span)
|
||||
@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
tlog.Error(evalContext.Error),
|
||||
tlog.String("message", "alerting execution attempt failed"),
|
||||
)
|
||||
if attemptID < alertMaxAttempts {
|
||||
if attemptID < setting.AlertingMaxAttempts {
|
||||
span.Finish()
|
||||
e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
|
||||
attemptChan <- (attemptID + 1)
|
||||
@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancel
|
||||
}
|
||||
|
||||
// create new context with timeout for notifications
|
||||
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
|
||||
resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
|
||||
cancelChan <- resultHandleCancelFn
|
||||
|
||||
// override the context used for evaluation with a new context for notifications.
|
||||
|
||||
@@ -11,20 +11,22 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
. "github.com/smartystreets/goconvey/convey"
|
||||
)
|
||||
|
||||
func TestEngineTimeouts(t *testing.T) {
|
||||
Convey("Alerting engine timeout tests", t, func() {
|
||||
engine := NewEngine()
|
||||
setting.AlertingNotificationTimeout = 30 * time.Second
|
||||
setting.AlertingMaxAttempts = 3
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
job := &Job{Running: true, Rule: &Rule{}}
|
||||
|
||||
Convey("Should trigger as many retries as needed", func() {
|
||||
Convey("pended alert for datasource -> result handler should be worked", func() {
|
||||
// reduce alert timeout to test quickly
|
||||
originAlertTimeout := alertTimeout
|
||||
alertTimeout = 2 * time.Second
|
||||
setting.AlertingEvaluationTimeout = 30 * time.Second
|
||||
transportTimeoutInterval := 2 * time.Second
|
||||
serverBusySleepDuration := 1 * time.Second
|
||||
|
||||
@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
|
||||
So(resultHandler.ResultHandleSucceed, ShouldEqual, true)
|
||||
|
||||
// initialize for other tests.
|
||||
alertTimeout = originAlertTimeout
|
||||
setting.AlertingEvaluationTimeout = 2 * time.Second
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
})
|
||||
})
|
||||
|
||||
@@ -6,7 +6,9 @@ import (
|
||||
"math"
|
||||
"testing"
|
||||
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
. "github.com/smartystreets/goconvey/convey"
|
||||
"time"
|
||||
)
|
||||
|
||||
type FakeEvalHandler struct {
|
||||
@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
|
||||
func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("Alerting engine job processing", t, func() {
|
||||
engine := NewEngine()
|
||||
setting.AlertingEvaluationTimeout = 30 * time.Second
|
||||
setting.AlertingNotificationTimeout = 30 * time.Second
|
||||
setting.AlertingMaxAttempts = 3
|
||||
engine.resultHandler = &FakeResultHandler{}
|
||||
job := &Job{Running: true, Rule: &Rule{}}
|
||||
|
||||
@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("error + not last attempt -> retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(0)
|
||||
|
||||
for i := 1; i < alertMaxAttempts; i++ {
|
||||
for i := 1; i < setting.AlertingMaxAttempts; i++ {
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(i, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("error + last attempt -> no retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(0)
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
|
||||
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
|
||||
So(nextAttemptID, ShouldEqual, 0)
|
||||
@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("no error -> no retry", func() {
|
||||
engine.evalHandler = NewFakeEvalHandler(1)
|
||||
attemptChan := make(chan int, 1)
|
||||
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
|
||||
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
|
||||
|
||||
engine.processJob(1, attemptChan, cancelChan, job)
|
||||
nextAttemptID, more := <-attemptChan
|
||||
@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
Convey("Should trigger as many retries as needed", func() {
|
||||
|
||||
Convey("never success -> max retries number", func() {
|
||||
expectedAttempts := alertMaxAttempts
|
||||
expectedAttempts := setting.AlertingMaxAttempts
|
||||
evalHandler := NewFakeEvalHandler(0)
|
||||
engine.evalHandler = evalHandler
|
||||
|
||||
@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
|
||||
})
|
||||
|
||||
Convey("some errors before success -> some retries", func() {
|
||||
expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
|
||||
expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
|
||||
evalHandler := NewFakeEvalHandler(expectedAttempts)
|
||||
engine.evalHandler = evalHandler
|
||||
|
||||
|
||||
@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
|
||||
renderOpts := rendering.Opts{
|
||||
Width: 1000,
|
||||
Height: 500,
|
||||
Timeout: time.Duration(float64(alertTimeout) * 0.9),
|
||||
Timeout: time.Duration(setting.AlertingEvaluationTimeout.Seconds() * 0.9),
|
||||
OrgId: context.Rule.OrgId,
|
||||
OrgRole: m.ROLE_ADMIN,
|
||||
ConcurrentLimit: setting.AlertingRenderLimit,
|
||||
|
||||
Reference in New Issue
Block a user