diff --git a/pkg/models/alerts.go b/pkg/models/alerts.go
index a986fb0037f..12c5794eeb5 100644
--- a/pkg/models/alerts.go
+++ b/pkg/models/alerts.go
@@ -113,6 +113,7 @@ type AlertJob struct {
 	Offset  int64
 	Delay   bool
 	Running bool
+	Retry   int
 	Rule    AlertRule
 }
 
diff --git a/pkg/models/alerts_state.go b/pkg/models/alerts_state.go
index 4fb60f2c11f..68012e41503 100644
--- a/pkg/models/alerts_state.go
+++ b/pkg/models/alerts_state.go
@@ -27,6 +27,7 @@ var (
 	AlertStateCritical     = "CRITICAL"
 	AlertStateAcknowledged = "ACKNOWLEDGED"
 	AlertStateMaintenance  = "MAINTENANCE"
+	AlertStatePending      = "PENDING"
 )
 
 func (this *UpdateAlertStateCommand) IsValidState() bool {
diff --git a/pkg/services/alerting/alerting.go b/pkg/services/alerting/alerting.go
index 1ebc36550d8..6a818d3b731 100644
--- a/pkg/services/alerting/alerting.go
+++ b/pkg/services/alerting/alerting.go
@@ -1,6 +1,7 @@
 package alerting
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/grafana/grafana/pkg/bus"
@@ -9,6 +10,10 @@ import (
 	"github.com/grafana/grafana/pkg/setting"
 )
 
+var (
+	MaxRetries = 3
+)
+
 func Init() {
 	if !setting.AlertingEnabled {
 		return
@@ -70,6 +75,7 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
 		} else {
 			job = &m.AlertJob{
 				Running: false,
+				Retry:   0,
 			}
 		}
 
@@ -104,18 +110,32 @@ func (scheduler *Scheduler) executor(executor Executor) {
 
 func (scheduler *Scheduler) handleResponses() {
 	for response := range scheduler.responseQueue {
-		log.Info("Response: alert(%d) status(%s) actual(%v) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Running)
+		log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Retry, response.AlertJob.Running)
 		response.AlertJob.Running = false
 
-		cmd := &m.UpdateAlertStateCommand{
-			AlertId:  response.Id,
-			NewState: response.State,
-			Info:     response.Description,
+		if response.State == m.AlertStatePending {
+			response.AlertJob.Retry++
+			if response.AlertJob.Retry > MaxRetries {
+				response.State = m.AlertStateCritical
+				response.Description = fmt.Sprintf("Failed to run check after %d retries", MaxRetries)
+				scheduler.saveState(response)
+			}
+		} else {
+			response.AlertJob.Retry = 0
+			scheduler.saveState(response)
 		}
+	}
+}
 
-		if err := bus.Dispatch(cmd); err != nil {
-			log.Error(2, "failed to save state %v", err)
-		}
+func (scheduler *Scheduler) saveState(response *m.AlertResult) {
+	cmd := &m.UpdateAlertStateCommand{
+		AlertId:  response.Id,
+		NewState: response.State,
+		Info:     response.Description,
+	}
+
+	if err := bus.Dispatch(cmd); err != nil {
+		log.Error(2, "failed to save state %v", err)
 	}
 }
 
@@ -129,7 +149,7 @@ func (scheduler *Scheduler) measureAndExecute(exec Executor, job *m.AlertJob) {
 	case <-time.After(time.Second * 5):
 		scheduler.responseQueue <- &m.AlertResult{
 			Id:       job.Rule.Id,
-			State:    "timed out",
+			State:    m.AlertStatePending,
 			Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
 			AlertJob: job,
 		}
diff --git a/pkg/services/alerting/executor.go b/pkg/services/alerting/executor.go
index 67fbdd38d07..1bf86a27bd1 100644
--- a/pkg/services/alerting/executor.go
+++ b/pkg/services/alerting/executor.go
@@ -81,7 +81,7 @@ func (this *ExecutorImpl) Execute(job *m.AlertJob, responseQueue chan *m.AlertResult) {
 	response, err := b.GetSeries(job)
 
 	if err != nil {
-		responseQueue <- &m.AlertResult{State: "PENDING", Id: job.Rule.Id, AlertJob: job}
+		responseQueue <- &m.AlertResult{State: m.AlertStatePending, Id: job.Rule.Id, AlertJob: job}
 	}
 
 	result := this.validateRule(job.Rule, response)
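
The following self-contained sketch illustrates the retry policy that `handleResponses` implements after this patch; it is not part of the diff. The `job`, `result`, and `handle` names are stand-ins for `m.AlertJob`, `m.AlertResult`, and the scheduler's response handling, and the constants mirror `MaxRetries` and the alert states added above.

```go
package main

import "fmt"

// States and retry limit mirroring the values used in the patch.
const (
	statePending  = "PENDING"
	stateOK       = "OK"
	stateCritical = "CRITICAL"
	maxRetries    = 3
)

// job carries the per-rule retry counter, like m.AlertJob.Retry.
type job struct {
	retry int
}

// result stands in for m.AlertResult: the outcome of one evaluation.
type result struct {
	state       string
	description string
}

// handle applies the retry policy: pending results (timeouts or datasource
// errors) bump the counter and are not persisted until maxRetries is
// exceeded, at which point the alert escalates to CRITICAL; any conclusive
// result resets the counter and is persisted immediately. It returns the
// state that would be saved, or "" when nothing is saved yet.
func handle(j *job, r *result) string {
	if r.state == statePending {
		j.retry++
		if j.retry > maxRetries {
			r.state = stateCritical
			r.description = fmt.Sprintf("Failed to run check after %d retries", maxRetries)
			return r.state // persisted via saveState in the patch
		}
		return "" // still pending; re-evaluated on the next tick
	}
	j.retry = 0
	return r.state // conclusive result is persisted right away
}

func main() {
	j := &job{}
	// Three timeouts in a row stay unsaved; the fourth escalates to CRITICAL.
	for i := 0; i < 4; i++ {
		saved := handle(j, &result{state: statePending})
		fmt.Printf("attempt %d: retry=%d saved=%q\n", i+1, j.retry, saved)
	}
	// A successful evaluation afterwards resets the counter.
	fmt.Println(handle(j, &result{state: stateOK}), j.retry)
}
```

As in the patch, intermediate pending results never reach `UpdateAlertStateCommand`; only an escalation past `MaxRetries` or a conclusive state is saved.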