mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
feat(alerting): adds support for retries
This commit is contained in:
parent
68f01d57d3
commit
50d98b161c
@ -113,6 +113,7 @@ type AlertJob struct {
|
||||
Offset int64
|
||||
Delay bool
|
||||
Running bool
|
||||
Retry int
|
||||
Rule AlertRule
|
||||
}
|
||||
|
||||
|
@ -27,6 +27,7 @@ var (
|
||||
AlertStateCritical = "CRITICAL"
|
||||
AlertStateAcknowledged = "ACKNOWLEDGED"
|
||||
AlertStateMaintenance = "MAINTENANCE"
|
||||
AlertStatePending = "PENDING"
|
||||
)
|
||||
|
||||
func (this *UpdateAlertStateCommand) IsValidState() bool {
|
||||
|
@ -1,6 +1,7 @@
|
||||
package alerting
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/bus"
|
||||
@ -9,6 +10,10 @@ import (
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
)
|
||||
|
||||
var (
|
||||
MaxRetries = 3
|
||||
)
|
||||
|
||||
func Init() {
|
||||
if !setting.AlertingEnabled {
|
||||
return
|
||||
@ -70,6 +75,7 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []m.AlertRule) {
|
||||
} else {
|
||||
job = &m.AlertJob{
|
||||
Running: false,
|
||||
Retry: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@ -104,18 +110,32 @@ func (scheduler *Scheduler) executor(executor Executor) {
|
||||
|
||||
func (scheduler *Scheduler) handleResponses() {
|
||||
for response := range scheduler.responseQueue {
|
||||
log.Info("Response: alert(%d) status(%s) actual(%v) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Running)
|
||||
log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d) running(%v)", response.Id, response.State, response.ActualValue, response.AlertJob.Retry, response.AlertJob.Running)
|
||||
response.AlertJob.Running = false
|
||||
|
||||
cmd := &m.UpdateAlertStateCommand{
|
||||
AlertId: response.Id,
|
||||
NewState: response.State,
|
||||
Info: response.Description,
|
||||
if response.State == m.AlertStatePending {
|
||||
response.AlertJob.Retry++
|
||||
if response.AlertJob.Retry > MaxRetries {
|
||||
response.State = m.AlertStateCritical
|
||||
response.Description = fmt.Sprintf("Failed to run check after %d retires", MaxRetries)
|
||||
scheduler.saveState(response)
|
||||
}
|
||||
} else {
|
||||
response.AlertJob.Retry = 0
|
||||
scheduler.saveState(response)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := bus.Dispatch(cmd); err != nil {
|
||||
log.Error(2, "failed to save state %v", err)
|
||||
}
|
||||
func (scheduler *Scheduler) saveState(response *m.AlertResult) {
|
||||
cmd := &m.UpdateAlertStateCommand{
|
||||
AlertId: response.Id,
|
||||
NewState: response.State,
|
||||
Info: response.Description,
|
||||
}
|
||||
|
||||
if err := bus.Dispatch(cmd); err != nil {
|
||||
log.Error(2, "failed to save state %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +149,7 @@ func (scheduler *Scheduler) measureAndExecute(exec Executor, job *m.AlertJob) {
|
||||
case <-time.After(time.Second * 5):
|
||||
scheduler.responseQueue <- &m.AlertResult{
|
||||
Id: job.Rule.Id,
|
||||
State: "timed out",
|
||||
State: m.AlertStatePending,
|
||||
Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
|
||||
AlertJob: job,
|
||||
}
|
||||
|
@ -81,7 +81,7 @@ func (this *ExecutorImpl) Execute(job *m.AlertJob, responseQueue chan *m.AlertRe
|
||||
response, err := b.GetSeries(job)
|
||||
|
||||
if err != nil {
|
||||
responseQueue <- &m.AlertResult{State: "PENDING", Id: job.Rule.Id, AlertJob: job}
|
||||
responseQueue <- &m.AlertResult{State: m.AlertStatePending, Id: job.Rule.Id, AlertJob: job}
|
||||
}
|
||||
|
||||
result := this.validateRule(job.Rule, response)
|
||||
|
Loading…
Reference in New Issue
Block a user