feat(alerting): began work on splitting scheduler into scheduler and engine

This commit is contained in:
Torkel Ödegaard 2016-06-06 11:56:58 +02:00
parent a191b9b1cf
commit 0cbf4ae773
8 changed files with 314 additions and 99 deletions

View File

@ -11,19 +11,24 @@ var (
maxRetries = 3
)
var engine *Engine
func Init() {
if !setting.AlertingEnabled {
return
}
log.Info("Alerting: Initializing scheduler...")
log.Info("Alerting: Initializing alerting engine...")
scheduler := NewScheduler()
reader := NewRuleReader()
engine = NewEngine()
engine.Start()
go scheduler.dispatch(reader)
go scheduler.executor(&ExecutorImpl{})
go scheduler.handleResponses()
// scheduler := NewScheduler()
// reader := NewRuleReader()
//
// go scheduler.dispatch(reader)
// go scheduler.executor(&ExecutorImpl{})
// go scheduler.handleResponses()
}
func saveState(response *AlertResult) {

View File

@ -1 +1,53 @@
package alerting
import (
"time"
"github.com/Unknwon/log"
"github.com/benbjohnson/clock"
)
type Engine struct {
execQueue chan *AlertJob
resultQueue chan *AlertResult
clock clock.Clock
ticker *Ticker
scheduler Scheduler
}
func NewEngine() *Engine {
e := &Engine{
ticker: NewTicker(time.Now(), time.Second*0, clock.New()),
execQueue: make(chan *AlertJob, 1000),
resultQueue: make(chan *AlertResult, 1000),
scheduler: NewScheduler(),
}
return e
}
func (e *Engine) Start() {
go e.schedulerTick()
go e.execDispatch()
}
func (e *Engine) Stop() {
close(e.execQueue)
}
func (e *Engine) schedulerTick() {
for {
select {
case tick := <-e.ticker.C:
e.scheduler.Tick(tick, e.execQueue)
}
}
}
func (e *Engine) execDispatch() {
for job := range e.execQueue {
log.Info("AlertEngine: Dispatching alert job %s", job.Rule.Title)
job.Running = true
//scheduler.measureAndExecute(executor, job)
}
}

View File

@ -12,10 +12,6 @@ import (
"github.com/grafana/grafana/pkg/tsdb"
)
type Executor interface {
Execute(rule *AlertJob, responseQueue chan *AlertResult)
}
var (
resultLogFmt = "%s executor: %s %1.2f %s %1.2f : %v"
descriptionFmt = "Actual value: %1.2f for %s"
@ -111,7 +107,7 @@ func (executor *ExecutorImpl) GetSeries(job *AlertJob) (tsdb.TimeSeriesSlice, er
return nil, fmt.Errorf("Grafana does not support alerts for %s", query.Result.Type)
}
func (executor *ExecutorImpl) validateRule(rule AlertRule, series tsdb.TimeSeriesSlice) *AlertResult {
func (executor *ExecutorImpl) validateRule(rule *AlertRule, series tsdb.TimeSeriesSlice) *AlertResult {
for _, serie := range series {
if aggregator[rule.Aggregator] == nil {
continue

View File

@ -0,0 +1,12 @@
package alerting
import "time"
type Executor interface {
Execute(rule *AlertJob, resultChan chan *AlertResult)
}
type Scheduler interface {
Tick(time time.Time, execQueue chan *AlertJob)
Update(rules []*AlertRule)
}

View File

@ -7,7 +7,7 @@ type AlertJob struct {
Delay bool
Running bool
RetryCount int
Rule AlertRule
Rule *AlertRule
}
type AlertResult struct {

View File

@ -1,48 +1,25 @@
package alerting
import (
"fmt"
"time"
"github.com/Unknwon/log"
"github.com/grafana/grafana/pkg/services/alerting/alertstates"
"github.com/grafana/grafana/pkg/log"
)
type Scheduler struct {
jobs map[int64]*AlertJob
runQueue chan *AlertJob
responseQueue chan *AlertResult
type SchedulerImpl struct {
jobs map[int64]*AlertJob
}
func NewScheduler() *Scheduler {
return &Scheduler{
jobs: make(map[int64]*AlertJob, 0),
runQueue: make(chan *AlertJob, 1000),
responseQueue: make(chan *AlertResult, 1000),
func NewScheduler() Scheduler {
return &SchedulerImpl{
jobs: make(map[int64]*AlertJob, 0),
}
}
func (scheduler *Scheduler) dispatch(reader RuleReader) {
reschedule := time.NewTicker(time.Second * 10)
secondTicker := time.NewTicker(time.Second)
scheduler.updateJobs(reader.Fetch)
for {
select {
case <-secondTicker.C:
scheduler.queueJobs()
case <-reschedule.C:
scheduler.updateJobs(reader.Fetch)
}
}
}
func (scheduler *Scheduler) updateJobs(alertRuleFn func() []AlertRule) {
log.Debug("Scheduler: UpdateJobs()")
func (scheduler *SchedulerImpl) Update(rules []*AlertRule) {
log.Debug("Scheduler: Update()")
jobs := make(map[int64]*AlertJob, 0)
rules := alertRuleFn()
for i, rule := range rules {
var job *AlertJob
@ -65,65 +42,57 @@ func (scheduler *Scheduler) updateJobs(alertRuleFn func() []AlertRule) {
scheduler.jobs = jobs
}
func (scheduler *Scheduler) queueJobs() {
now := time.Now().Unix()
func (scheduler *SchedulerImpl) Tick(tickTime time.Time, execQueue chan *AlertJob) {
now := tickTime.Unix()
for _, job := range scheduler.jobs {
if now%job.Rule.Frequency == 0 && job.Running == false {
log.Info("Scheduler: Putting job on to run queue: %s", job.Rule.Title)
scheduler.runQueue <- job
log.Trace("Scheduler: Putting job on to exec queue: %s", job.Rule.Title)
execQueue <- job
}
}
}
func (scheduler *Scheduler) executor(executor Executor) {
for job := range scheduler.runQueue {
//log.Info("Executor: queue length %d", len(this.runQueue))
log.Info("Executor: executing %s", job.Rule.Title)
job.Running = true
scheduler.measureAndExecute(executor, job)
}
}
func (scheduler *Scheduler) handleResponses() {
for response := range scheduler.responseQueue {
log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d)", response.Id, response.State, response.ActualValue, response.AlertJob.RetryCount)
response.AlertJob.Running = false
if response.IsResultIncomplete() {
response.AlertJob.RetryCount++
if response.AlertJob.RetryCount < maxRetries {
scheduler.runQueue <- response.AlertJob
} else {
saveState(&AlertResult{
Id: response.Id,
State: alertstates.Critical,
Description: fmt.Sprintf("Failed to run check after %d retires", maxRetries),
})
}
} else {
response.AlertJob.RetryCount = 0
saveState(response)
}
}
}
func (scheduler *Scheduler) measureAndExecute(exec Executor, job *AlertJob) {
now := time.Now()
responseChan := make(chan *AlertResult, 1)
go exec.Execute(job, responseChan)
select {
case <-time.After(time.Second * 5):
scheduler.responseQueue <- &AlertResult{
Id: job.Rule.Id,
State: alertstates.Pending,
Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
AlertJob: job,
}
case result := <-responseChan:
result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
log.Info("Schedular: exeuction took %vms", result.Duration)
scheduler.responseQueue <- result
}
}
// func (scheduler *Scheduler) handleResponses() {
// for response := range scheduler.responseQueue {
// log.Info("Response: alert(%d) status(%s) actual(%v) retry(%d)", response.Id, response.State, response.ActualValue, response.AlertJob.RetryCount)
// response.AlertJob.Running = false
//
// if response.IsResultIncomplete() {
// response.AlertJob.RetryCount++
// if response.AlertJob.RetryCount < maxRetries {
// scheduler.runQueue <- response.AlertJob
// } else {
// saveState(&AlertResult{
// Id: response.Id,
// State: alertstates.Critical,
// Description: fmt.Sprintf("Failed to run check after %d retires", maxRetries),
// })
// }
// } else {
// response.AlertJob.RetryCount = 0
// saveState(response)
// }
// }
// }
//
// func (scheduler *Scheduler) measureAndExecute(exec Executor, job *AlertJob) {
// now := time.Now()
//
// responseChan := make(chan *AlertResult, 1)
// go exec.Execute(job, responseChan)
//
// select {
// case <-time.After(time.Second * 5):
// scheduler.responseQueue <- &AlertResult{
// Id: job.Rule.Id,
// State: alertstates.Pending,
// Duration: float64(time.Since(now).Nanoseconds()) / float64(1000000),
// AlertJob: job,
// }
// case result := <-responseChan:
// result.Duration = float64(time.Since(now).Nanoseconds()) / float64(1000000)
// log.Info("Schedular: exeuction took %vms", result.Duration)
// scheduler.responseQueue <- result
// }
// }

View File

@ -0,0 +1,60 @@
package alerting
import (
"time"
"github.com/benbjohnson/clock"
)
// ticker is a ticker to power the alerting scheduler. it's like a time.Ticker, except:
// * it doesn't drop ticks for slow receivers, rather, it queues up. so that callers are in control to instrument what's going on.
// * it automatically ticks every second, which is the right thing in our current design
// * it ticks on second marks or very shortly after. this provides a predictable load pattern
// (this shouldn't cause too much load contention issues because the next steps in the pipeline just process at their own pace)
// * the timestamps are used to mark "last datapoint to query for" and as such, are a configurable amount of seconds in the past
// * because we want to allow:
// - a clean "resume where we left off" and "don't yield ticks we already did"
// - adjusting offset over time to compensate for storage backing up or getting fast and providing lower latency
// you specify a lastProcessed timestamp as well as an offset at creation, or runtime
type Ticker struct {
C chan time.Time
clock clock.Clock
last time.Time
offset time.Duration
newOffset chan time.Duration
}
// NewTicker returns a ticker that ticks on second marks or very shortly after, and never drops ticks
func NewTicker(last time.Time, initialOffset time.Duration, c clock.Clock) *Ticker {
t := &Ticker{
C: make(chan time.Time),
clock: c,
last: last,
offset: initialOffset,
newOffset: make(chan time.Duration),
}
go t.run()
return t
}
func (t *Ticker) updateOffset(offset time.Duration) {
t.newOffset <- offset
}
func (t *Ticker) run() {
for {
next := t.last.Add(time.Duration(1) * time.Second)
diff := t.clock.Now().Add(-t.offset).Sub(next)
if diff >= 0 {
t.C <- next
t.last = next
continue
}
// tick is too young. try again when ...
select {
case <-t.clock.After(-diff): // ...it'll definitely be old enough
case offset := <-t.newOffset: // ...it might be old enough
t.offset = offset
}
}
}

View File

@ -0,0 +1,121 @@
package alerting
import (
"testing"
"time"
"github.com/benbjohnson/clock"
)
func inspectTick(tick time.Time, last time.Time, offset time.Duration, t *testing.T) {
if !tick.Equal(last.Add(time.Duration(1) * time.Second)) {
t.Fatalf("expected a tick 1 second more than prev, %s. got: %s", last, tick)
}
}
// returns the new last tick seen
func assertAdvanceUntil(ticker *Ticker, last, desiredLast time.Time, offset, wait time.Duration, t *testing.T) time.Time {
for {
select {
case tick := <-ticker.C:
inspectTick(tick, last, offset, t)
last = tick
case <-time.NewTimer(wait).C:
if last.Before(desiredLast) {
t.Fatalf("waited %s for ticker to advance to %s, but only went up to %s", wait, desiredLast, last)
}
if last.After(desiredLast) {
t.Fatalf("timer advanced too far. should only have gone up to %s, but it went up to %s", desiredLast, last)
}
return last
}
}
}
func assertNoAdvance(ticker *Ticker, desiredLast time.Time, wait time.Duration, t *testing.T) {
for {
select {
case tick := <-ticker.C:
t.Fatalf("timer should have stayed at %s, instead it advanced to %s", desiredLast, tick)
case <-time.NewTimer(wait).C:
return
}
}
}
func TestTickerRetro1Hour(t *testing.T) {
offset := time.Duration(10) * time.Second
last := time.Unix(0, 0)
mock := clock.NewMock()
mock.Add(time.Duration(1) * time.Hour)
desiredLast := mock.Now().Add(-offset)
ticker := NewTicker(last, offset, mock)
last = assertAdvanceUntil(ticker, last, desiredLast, offset, time.Duration(10)*time.Millisecond, t)
assertNoAdvance(ticker, last, time.Duration(500)*time.Millisecond, t)
}
func TestAdvanceWithUpdateOffset(t *testing.T) {
offset := time.Duration(10) * time.Second
last := time.Unix(0, 0)
mock := clock.NewMock()
mock.Add(time.Duration(1) * time.Hour)
desiredLast := mock.Now().Add(-offset)
ticker := NewTicker(last, offset, mock)
last = assertAdvanceUntil(ticker, last, desiredLast, offset, time.Duration(10)*time.Millisecond, t)
assertNoAdvance(ticker, last, time.Duration(500)*time.Millisecond, t)
// lowering offset should see a few more ticks
offset = time.Duration(5) * time.Second
ticker.updateOffset(offset)
desiredLast = mock.Now().Add(-offset)
last = assertAdvanceUntil(ticker, last, desiredLast, offset, time.Duration(9)*time.Millisecond, t)
assertNoAdvance(ticker, last, time.Duration(500)*time.Millisecond, t)
// advancing clock should see even more ticks
mock.Add(time.Duration(1) * time.Hour)
desiredLast = mock.Now().Add(-offset)
last = assertAdvanceUntil(ticker, last, desiredLast, offset, time.Duration(8)*time.Millisecond, t)
assertNoAdvance(ticker, last, time.Duration(500)*time.Millisecond, t)
}
func getCase(lastSeconds, offsetSeconds int) (time.Time, time.Duration) {
last := time.Unix(int64(lastSeconds), 0)
offset := time.Duration(offsetSeconds) * time.Second
return last, offset
}
func TestTickerNoAdvance(t *testing.T) {
// it's 00:01:00 now. what are some cases where we don't want the ticker to advance?
mock := clock.NewMock()
mock.Add(time.Duration(60) * time.Second)
type Case struct {
last int
offset int
}
// note that some cases add up to now, others go into the future
cases := []Case{
{50, 10},
{50, 30},
{59, 1},
{59, 10},
{59, 30},
{60, 1},
{60, 10},
{60, 30},
{90, 1},
{90, 10},
{90, 30},
}
for _, c := range cases {
last, offset := getCase(c.last, c.offset)
ticker := NewTicker(last, offset, mock)
assertNoAdvance(ticker, last, time.Duration(500)*time.Millisecond, t)
}
}