Alerting: Makes timeouts and retries configurable (#16259)

Adds new alerting settings for configuring timeouts and retries:
evaluation_timeout_seconds, notification_timeout_seconds, and
max_attempts.

Closes #16240
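
For reference, a sketch of how the new keys would be set in the [alerting] section of grafana.ini. The key names come from the commit message; the defaults shown are the previously hardcoded values this commit removes (30s, 30s, 3 attempts). The section layout and the per-key comments are assumptions, not taken from the commit:

[alerting]
# maximum time a rule evaluation may take (assumed meaning)
evaluation_timeout_seconds = 30
# maximum time sending notifications may take (assumed meaning)
notification_timeout_seconds = 30
# maximum evaluation attempts per scheduled job (assumed meaning)
max_attempts = 3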
Author: Zzy
Date: 2019-03-29 13:58:37 +08:00
Committed by: Marcus Efraimsson
Parent: e6d9a524b4
Commit: 1b84a924a3

8 changed files with 64 additions and 19 deletions

pkg/services/alerting/engine.go

@@ -104,10 +104,6 @@ func (e *AlertingService) runJobDispatcher(grafanaCtx context.Context) error {
 var (
 	unfinishedWorkTimeout = time.Second * 5
-	// TODO: Make alertTimeout and alertMaxAttempts configurable in the config file.
-	alertTimeout          = time.Second * 30
-	resultHandleTimeout   = time.Second * 30
-	alertMaxAttempts      = 3
 )

 func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
@@ -117,7 +113,7 @@ func (e *AlertingService) processJobWithRetry(grafanaCtx context.Context, job *Job) error {
 	}()
-	cancelChan := make(chan context.CancelFunc, alertMaxAttempts*2)
+	cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts*2)
 	attemptChan := make(chan int, 1)
 	// Initialize with first attemptID=1
@@ -161,7 +157,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 	}()
-	alertCtx, cancelFn := context.WithTimeout(context.Background(), alertTimeout)
+	alertCtx, cancelFn := context.WithTimeout(context.Background(), setting.AlertingEvaluationTimeout)
 	cancelChan <- cancelFn
 	span := opentracing.StartSpan("alert execution")
 	alertCtx = opentracing.ContextWithSpan(alertCtx, span)
@@ -197,7 +193,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 		tlog.Error(evalContext.Error),
 		tlog.String("message", "alerting execution attempt failed"),
 	)
-	if attemptID < alertMaxAttempts {
+	if attemptID < setting.AlertingMaxAttempts {
 		span.Finish()
 		e.log.Debug("Job Execution attempt triggered retry", "timeMs", evalContext.GetDurationMs(), "alertId", evalContext.Rule.Id, "name", evalContext.Rule.Name, "firing", evalContext.Firing, "attemptID", attemptID)
 		attemptChan <- (attemptID + 1)
@@ -206,7 +202,7 @@ func (e *AlertingService) processJob(attemptID int, attemptChan chan int, cancelChan chan context.CancelFunc, job *Job) {
 	}
 	// create new context with timeout for notifications
-	resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), resultHandleTimeout)
+	resultHandleCtx, resultHandleCancelFn := context.WithTimeout(context.Background(), setting.AlertingNotificationTimeout)
 	cancelChan <- resultHandleCancelFn
 	// override the context used for evaluation with a new context for notifications.

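To make the control flow in the hunks above easier to follow outside the diff, here is a minimal, self-contained Go sketch of the same pattern: every attempt gets its own context.WithTimeout, every cancel func is parked on a buffered channel (up to two per attempt, evaluation plus notification, hence the *2 buffer), and a failed attempt retries until the maximum is reached. The names evaluate, evaluationTimeout, and maxAttempts are illustrative stand-ins, not Grafana's, and a plain loop replaces the attemptChan-driven goroutine of the real engine to keep the sketch short.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// Illustrative stand-ins for the configurable settings.
var (
	evaluationTimeout = 30 * time.Second
	maxAttempts       = 3
)

// evaluate is a placeholder for rule evaluation; it fails on the first attempt.
func evaluate(ctx context.Context, attempt int) error {
	if attempt < 2 {
		return errors.New("transient failure")
	}
	return nil
}

func processWithRetry(parent context.Context) error {
	// Two cancel funcs can be registered per attempt
	// (evaluation + notification), hence the *2 buffer.
	cancelChan := make(chan context.CancelFunc, maxAttempts*2)
	defer func() {
		close(cancelChan)
		for cancel := range cancelChan {
			cancel() // release every attempt's context
		}
	}()

	for attempt := 1; attempt <= maxAttempts; attempt++ {
		ctx, cancel := context.WithTimeout(parent, evaluationTimeout)
		cancelChan <- cancel
		if err := evaluate(ctx, attempt); err != nil {
			fmt.Printf("attempt %d failed: %v\n", attempt, err)
			continue // retry until maxAttempts is exhausted
		}
		return nil
	}
	return errors.New("all attempts failed")
}

func main() {
	fmt.Println(processWithRetry(context.Background()))
}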
pkg/services/alerting/engine_integration_test.go

@@ -11,20 +11,22 @@ import (
 	"testing"
 	"time"
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
 )

 func TestEngineTimeouts(t *testing.T) {
 	Convey("Alerting engine timeout tests", t, func() {
 		engine := NewEngine()
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}

 		Convey("Should trigger as many retries as needed", func() {
 			Convey("pended alert for datasource -> result handler should be worked", func() {
 				// reduce alert timeout to test quickly
-				originAlertTimeout := alertTimeout
-				alertTimeout = 2 * time.Second
+				setting.AlertingEvaluationTimeout = 2 * time.Second
 				transportTimeoutInterval := 2 * time.Second
 				serverBusySleepDuration := 1 * time.Second

@@ -39,7 +41,7 @@ func TestEngineTimeouts(t *testing.T) {
 				So(resultHandler.ResultHandleSucceed, ShouldEqual, true)

 				// initialize for other tests.
-				alertTimeout = originAlertTimeout
+				setting.AlertingEvaluationTimeout = 30 * time.Second
 				engine.resultHandler = &FakeResultHandler{}
 			})
 		})

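Note that these tests mutate package-level settings, so later tests depend on the values being put back by hand (the "initialize for other tests." line above). A common way to keep such tests hermetic, shown here as a sketch rather than what this commit does, is to snapshot and restore the global with defer, assuming the same imports as the test above:

// Sketch (not from this commit): restore the mutated global when the test ends.
origTimeout := setting.AlertingEvaluationTimeout
setting.AlertingEvaluationTimeout = 2 * time.Second
defer func() { setting.AlertingEvaluationTimeout = origTimeout }()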
pkg/services/alerting/engine_test.go

@@ -6,7 +6,9 @@ import (
 	"math"
 	"testing"
+	"time"
+	"github.com/grafana/grafana/pkg/setting"
 	. "github.com/smartystreets/goconvey/convey"
 )

 type FakeEvalHandler struct {
@@ -37,6 +39,9 @@ func (handler *FakeResultHandler) Handle(evalContext *EvalContext) error {
 func TestEngineProcessJob(t *testing.T) {
 	Convey("Alerting engine job processing", t, func() {
 		engine := NewEngine()
+		setting.AlertingEvaluationTimeout = 30 * time.Second
+		setting.AlertingNotificationTimeout = 30 * time.Second
+		setting.AlertingMaxAttempts = 3
 		engine.resultHandler = &FakeResultHandler{}
 		job := &Job{Running: true, Rule: &Rule{}}
@@ -45,9 +50,9 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("error + not last attempt -> retry", func() {
 			engine.evalHandler = NewFakeEvalHandler(0)
-			for i := 1; i < alertMaxAttempts; i++ {
+			for i := 1; i < setting.AlertingMaxAttempts; i++ {
 				attemptChan := make(chan int, 1)
-				cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+				cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 				engine.processJob(i, attemptChan, cancelChan, job)
 				nextAttemptID, more := <-attemptChan
@@ -61,9 +66,9 @@ func TestEngineProcessJob(t *testing.T) {
Convey("error + last attempt -> no retry", func() {
engine.evalHandler = NewFakeEvalHandler(0)
attemptChan := make(chan int, 1)
cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
engine.processJob(alertMaxAttempts, attemptChan, cancelChan, job)
engine.processJob(setting.AlertingMaxAttempts, attemptChan, cancelChan, job)
nextAttemptID, more := <-attemptChan
So(nextAttemptID, ShouldEqual, 0)
@@ -74,7 +79,7 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("no error -> no retry", func() {
 			engine.evalHandler = NewFakeEvalHandler(1)
 			attemptChan := make(chan int, 1)
-			cancelChan := make(chan context.CancelFunc, alertMaxAttempts)
+			cancelChan := make(chan context.CancelFunc, setting.AlertingMaxAttempts)
 			engine.processJob(1, attemptChan, cancelChan, job)
 			nextAttemptID, more := <-attemptChan
@@ -88,7 +93,7 @@ func TestEngineProcessJob(t *testing.T) {
 		Convey("Should trigger as many retries as needed", func() {
 			Convey("never success -> max retries number", func() {
-				expectedAttempts := alertMaxAttempts
+				expectedAttempts := setting.AlertingMaxAttempts
 				evalHandler := NewFakeEvalHandler(0)
 				engine.evalHandler = evalHandler
@@ -106,7 +111,7 @@ func TestEngineProcessJob(t *testing.T) {
 			})
 			Convey("some errors before success -> some retries", func() {
-				expectedAttempts := int(math.Ceil(float64(alertMaxAttempts) / 2))
+				expectedAttempts := int(math.Ceil(float64(setting.AlertingMaxAttempts) / 2))
 				evalHandler := NewFakeEvalHandler(expectedAttempts)
 				engine.evalHandler = evalHandler

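For concreteness: with max_attempts left at 3, expectedAttempts works out to int(math.Ceil(3.0/2)) = 2, so NewFakeEvalHandler is primed to succeed on attempt 2 of the 3 allowed, exercising the "some errors before success" path.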
pkg/services/alerting/notifier.go

@@ -127,7 +127,7 @@ func (n *notificationService) uploadImage(context *EvalContext) (err error) {
 	renderOpts := rendering.Opts{
 		Width:           1000,
 		Height:          500,
-		Timeout:         time.Duration(float64(alertTimeout) * 0.9),
+		Timeout:         time.Duration(float64(setting.AlertingEvaluationTimeout) * 0.9),
 		OrgId:           context.Rule.OrgId,
 		OrgRole:         m.ROLE_ADMIN,
 		ConcurrentLimit: setting.AlertingRenderLimit,
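
One subtlety in the hunk above: the render timeout is 90% of the evaluation timeout, and the Duration must be scaled directly via float64(d), because time.Duration(x) interprets a bare number as nanoseconds. Scaling the .Seconds() value instead would silently produce a nanosecond-scale timeout. A small runnable sketch of the difference:

package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 30 * time.Second

	// Correct: Duration is an int64 nanosecond count, so scale it directly.
	fmt.Println(time.Duration(float64(timeout) * 0.9)) // prints 27s

	// Wrong: .Seconds() yields 30.0, and time.Duration(27.0) is 27 nanoseconds.
	fmt.Println(time.Duration(timeout.Seconds() * 0.9)) // prints 27ns
}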