feat(alerting): adds support for retries

bergquist 2016-06-03 09:04:20 +02:00
parent 68f01d57d3
commit 50d98b161c
4 changed files with 32 additions and 10 deletions
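In short: a check result that comes back as PENDING (a failed series fetch or a timed-out run) no longer terminates the evaluation; the scheduler bumps a per-job retry counter and the job is tried again, and only after more than MaxRetries (3) consecutive PENDING results is the alert forced to CRITICAL and that state persisted. The following is a minimal, self-contained sketch of that escalation logic using simplified stand-in types rather than the actual Grafana models:

package main

import "fmt"

// Simplified stand-ins for the models touched by this commit.
const (
	statePending  = "PENDING"
	stateCritical = "CRITICAL"
)

const maxRetries = 3

type alertJob struct {
	Retry int
}

type alertResult struct {
	State       string
	Description string
	Job         *alertJob
}

// handleResult mirrors the retry bookkeeping added to the scheduler:
// PENDING results bump the retry counter until maxRetries is exceeded,
// anything else resets the counter and is persisted immediately.
func handleResult(r *alertResult) (save bool) {
	if r.State == statePending {
		r.Job.Retry++
		if r.Job.Retry > maxRetries {
			r.State = stateCritical
			r.Description = fmt.Sprintf("Failed to run check after %d retries", maxRetries)
			return true // escalate and persist
		}
		return false // swallow the result; the job will be retried on the next tick
	}
	r.Job.Retry = 0
	return true // healthy (or genuinely critical) result, persist as-is
}

func main() {
	job := &alertJob{}
	for attempt := 1; attempt <= 5; attempt++ {
		r := &alertResult{State: statePending, Job: job}
		fmt.Printf("attempt %d: save=%v state=%s retry=%d\n", attempt, handleResult(r), r.State, job.Retry)
	}
}

With five consecutive PENDING results, the first three are swallowed and retried; the fourth pushes the counter past maxRetries, flips the state to CRITICAL, and is persisted.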


@@ -113,6 +113,7 @@ type AlertJob struct {
 	Offset  int64
 	Delay   bool
 	Running bool
+	Retry   int
 	Rule    AlertRule
 }


@@ -27,6 +27,7 @@ var (
 	AlertStateCritical     = "CRITICAL"
 	AlertStateAcknowledged = "ACKNOWLEDGED"
 	AlertStateMaintenance  = "MAINTENANCE"
+	AlertStatePending      = "PENDING"
 )

 func (this *UpdateAlertStateCommand) IsValidState() bool {


@@ -1,6 +1,7 @@
 package alerting

 import (
+	"fmt"
 	"time"

 	"github.com/grafana/grafana/pkg/bus"
@@ -9,6 +10,10 @@ import (
 	"github.com/grafana/grafana/pkg/setting"
 )

+var (
+	MaxRetries = 3
+)
+
 func Init() {
 	if !setting.AlertingEnabled {
 		return
@@ -70,6 +75,7 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
 		} else {
 			job = &m.AlertJob{
 				Running: false,
+				Retry:   0,
 			}
 		}
@@ -104,18 +110,32 @@ func (scheduler *Scheduler) executor(executor Executor) {
 func (scheduler *Scheduler) handleResponses() {
 	for response := range scheduler.responseQueue {
-		log.Info("Response: alert(%d) status(%s) actual(%v) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Running)
+		log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Retry, response.AlertJob.Running)
 		response.AlertJob.Running = false

-		cmd := &m.UpdateAlertStateCommand{
-			AlertId:  response.Id,
-			NewState: response.State,
-			Info:     response.Description,
+		if response.State == m.AlertStatePending {
+			response.AlertJob.Retry++
+			if response.AlertJob.Retry > MaxRetries {
+				response.State = m.AlertStateCritical
+				response.Description = fmt.Sprintf("Failed to run check after %d retries", MaxRetries)
+				scheduler.saveState(response)
+			}
+		} else {
+			response.AlertJob.Retry = 0
+			scheduler.saveState(response)
 		}
+	}
+}

-		if err := bus.Dispatch(cmd); err != nil {
-			log.Error(2, "failed to save state %v", err)
-		}
+func (scheduler *Scheduler) saveState(response *m.AlertResult) {
+	cmd := &m.UpdateAlertStateCommand{
+		AlertId:  response.Id,
+		NewState: response.State,
+		Info:     response.Description,
+	}
+
+	if err := bus.Dispatch(cmd); err != nil {
+		log.Error(2, "failed to save state %v", err)
 	}
 }
@@ -129,7 +149,7 @@ func (scheduler *Scheduler) measureAndExecute(exec Executor, job *m.AlertJob) {
 	case <-time.After(time.Second * 5):
 		scheduler.responseQueue <- &m.AlertResult{
 			Id:       job.Rule.Id,
-			State:    "timed out",
+			State:    m.AlertStatePending,
 			Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
 			AlertJob: job,
 		}
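The last scheduler hunk reroutes timeouts into the same path: a check that has not answered within five seconds now yields a PENDING result instead of the ad-hoc "timed out" state, so it is retried like any other transient failure. A rough sketch of that select/timeout pattern, with placeholder types instead of the real AlertResult and response queue:

package main

import (
	"fmt"
	"time"
)

type result struct {
	State    string
	Duration float64 // milliseconds
}

// runWithTimeout mimics the measureAndExecute pattern above: run the check in
// a goroutine and, if it has not finished before the deadline, emit a PENDING
// result so the scheduler's retry logic picks the job up again.
func runWithTimeout(check func() string, deadline time.Duration) result {
	now := time.Now()
	done := make(chan string, 1)

	go func() { done <- check() }()

	select {
	case state := <-done:
		return result{State: state, Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000)}
	case <-time.After(deadline):
		// Timed out: report PENDING rather than a terminal state.
		return result{State: "PENDING", Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000)}
	}
}

func main() {
	slow := func() string { time.Sleep(50 * time.Millisecond); return "OK" }
	fmt.Printf("%+v\n", runWithTimeout(slow, 10*time.Millisecond))  // deadline hit -> PENDING
	fmt.Printf("%+v\n", runWithTimeout(slow, 200*time.Millisecond)) // finishes -> OK
}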


@@ -81,7 +81,7 @@ func (this *ExecutorImpl) Execute(job *m.AlertJob, responseQueue chan *m.AlertResult) {
 	response, err := b.GetSeries(job)

 	if err != nil {
-		responseQueue <- &m.AlertResult{State: "PENDING", Id: job.Rule.Id, AlertJob: job}
+		responseQueue <- &m.AlertResult{State: m.AlertStatePending, Id: job.Rule.Id, AlertJob: job}
 	}

 	result := this.validateRule(job.Rule, response)
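The executor change closes the loop from the other side: when the series fetch fails, the result pushed onto the response queue now carries the shared m.AlertStatePending constant instead of a bare string, so a flaky datasource read is handled as a retryable condition by handleResponses. A hypothetical, heavily simplified version of that error path (stand-in types and fetch function, not the real executor):

package main

import (
	"errors"
	"fmt"
)

type alertResult struct {
	State string
	Id    int64
}

// execute mirrors the shape of the hunk above: a failed series fetch is
// reported as PENDING so the scheduler retries the check instead of treating
// the failure as final. Unlike the committed code, this sketch returns early;
// the real Execute continues on to rule validation.
func execute(id int64, fetch func() ([]float64, error), queue chan<- *alertResult) {
	series, err := fetch()
	if err != nil {
		queue <- &alertResult{State: "PENDING", Id: id}
		return
	}
	_ = series // the real executor validates the rule against the fetched series here
	queue <- &alertResult{State: "OK", Id: id}
}

func main() {
	queue := make(chan *alertResult, 1)
	execute(7, func() ([]float64, error) { return nil, errors.New("datasource unreachable") }, queue)
	fmt.Printf("%+v\n", <-queue)
}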