[Alerting]: Update scheduler to evaluate rules created by the unified API (#32589)

* Update scheduler

* Fix tests

* Fixes after code review feedback

* lint - add uncommitted modifications

Co-authored-by: kyle <kyle@grafana.com>
Sofia Papagiannaki
2021-04-03 20:13:29 +03:00
committed by GitHub
parent 455fbce020
commit daabf64aa1
11 changed files with 275 additions and 212 deletions

View File

@@ -184,12 +184,9 @@ func (srv RulerSrv) RoutePostNameRulesConfig(c *models.ReqContext, ruleGroupConf
// TODO check quota
// TODO validate UID uniqueness in the payload
ruleGroup := ruleGroupConfig.Name
if err := srv.store.UpdateRuleGroup(store.UpdateRuleGroupCmd{
OrgID: c.SignedInUser.OrgId,
NamespaceUID: namespaceUID,
RuleGroup: ruleGroup,
RuleGroupConfig: ruleGroupConfig,
}); err != nil {
return response.Error(http.StatusInternalServerError, "failed to update rule group", err)
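
Because the +/- markers are stripped in this view, the net effect of the hunk above is easier to read spelled out. The sketch below is a reconstruction of the call as it reads after the change, not new code: RuleGroup is gone from UpdateRuleGroupCmd (see the store hunk further down), so the handler passes only the config and the store derives the group name from RuleGroupConfig.Name.

if err := srv.store.UpdateRuleGroup(store.UpdateRuleGroupCmd{
	OrgID:           c.SignedInUser.OrgId,
	NamespaceUID:    namespaceUID,
	RuleGroupConfig: ruleGroupConfig, // group name is now read from ruleGroupConfig.Name inside the store
}); err != nil {
	return response.Error(http.StatusInternalServerError, "failed to update rule group", err)
}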

View File

@@ -74,7 +74,7 @@
"condition": "B",
"data": [
{
"refId": "query",
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 18000,
@@ -82,7 +82,7 @@
},
"model": {
"alias": "just-testing",
"datasource": "000000004",
"datasource": "gdev-testdata",
"datasourceUid": "000000004",
"intervalMs": 1000,
"maxDataPoints": 100,

View File

@@ -74,7 +74,7 @@
"condition": "B",
"data": [
{
"refId": "query",
"refId": "A",
"queryType": "",
"relativeTimeRange": {
"from": 18000,
@@ -82,7 +82,7 @@
},
"model": {
"alias": "just-testing",
"datasource": "000000004",
"datasource": "gdev-testdata",
"datasourceUid": "000000004",
"intervalMs": 1000,
"maxDataPoints": 100,

View File

@@ -25,13 +25,19 @@ const (
InstanceStateFiring InstanceStateType = "Alerting"
// InstanceStateNormal is for a normal alert.
InstanceStateNormal InstanceStateType = "Normal"
// InstanceStateNoData is for an alert with no data.
InstanceStateNoData InstanceStateType = "NoData"
// InstanceStateError is for an erroring alert.
InstanceStateError InstanceStateType = "Error"
)
// IsValid checks that the value of InstanceStateType is a valid
// string.
func (i InstanceStateType) IsValid() bool {
return i == InstanceStateFiring ||
i == InstanceStateNormal
i == InstanceStateNormal ||
i == InstanceStateNoData ||
i == InstanceStateError
}
// SaveAlertInstanceCommand is the query for saving a new alert instance.

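The new IsValid method gives callers a cheap guard before an instance state is persisted. The sketch below is illustrative only and not part of this commit; it assumes SaveAlertInstanceCommand carries a State field of type InstanceStateType and that the store exposes a SaveAlertInstance method, neither of which is shown in this hunk.

// validateAndSaveInstance is an illustrative helper: reject unknown states up front.
func validateAndSaveInstance(s store.Store, cmd *models.SaveAlertInstanceCommand) error {
	if !cmd.State.IsValid() {
		return fmt.Errorf("invalid instance state %q", cmd.State)
	}
	return s.SaveAlertInstance(cmd)
}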
View File

@@ -69,6 +69,7 @@ func (ng *AlertNG) Init() error {
MaxAttempts: maxAttempts,
Evaluator: eval.Evaluator{Cfg: ng.Cfg},
Store: store,
RuleStore: store,
Notifier: ng.Alertmanager,
}
ng.schedule = schedule.NewScheduler(schedCfg, ng.DataService)

View File

@@ -1,16 +1,14 @@
package schedule
import (
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func (sch *schedule) fetchAllDetails(now time.Time) []*models.AlertDefinition {
q := models.ListAlertDefinitionsQuery{}
err := sch.store.GetAlertDefinitions(&q)
func (sch *schedule) fetchAllDetails() []*models.AlertRule {
q := models.ListAlertRulesQuery{}
err := sch.ruleStore.GetAlertRulesForScheduling(&q)
if err != nil {
sch.log.Error("failed to fetch alert definitions", "now", now, "err", err)
sch.log.Error("failed to fetch alert definitions", "err", err)
return nil
}
return q.Result

View File

@@ -34,18 +34,18 @@ type ScheduleService interface {
WarmStateCache(*state.StateTracker)
// the following are used by tests only
evalApplied(models.AlertDefinitionKey, time.Time)
stopApplied(models.AlertDefinitionKey)
evalApplied(models.AlertRuleKey, time.Time)
stopApplied(models.AlertRuleKey)
overrideCfg(cfg SchedulerCfg)
}
func (sch *schedule) definitionRoutine(grafanaCtx context.Context, key models.AlertDefinitionKey, evalCh <-chan *evalContext, stopCh <-chan struct{}, stateTracker *state.StateTracker) error {
sch.log.Debug("alert definition routine started", "key", key)
func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRuleKey, evalCh <-chan *evalContext, stopCh <-chan struct{}, stateTracker *state.StateTracker) error {
sch.log.Debug("alert rule routine started", "key", key)
evalRunning := false
var start, end time.Time
var attempt int64
var alertDefinition *models.AlertDefinition
var alertRule *models.AlertRule
for {
select {
case ctx := <-evalCh:
@@ -56,33 +56,33 @@ func (sch *schedule) definitionRoutine(grafanaCtx context.Context, key models.Al
evaluate := func(attempt int64) error {
start = timeNow()
// fetch latest alert definition version
if alertDefinition == nil || alertDefinition.Version < ctx.version {
q := models.GetAlertDefinitionByUIDQuery{OrgID: key.OrgID, UID: key.DefinitionUID}
err := sch.store.GetAlertDefinitionByUID(&q)
// fetch latest alert rule version
if alertRule == nil || alertRule.Version < ctx.version {
q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
err := sch.ruleStore.GetAlertRuleByUID(&q)
if err != nil {
sch.log.Error("failed to fetch alert definition", "key", key)
sch.log.Error("failed to fetch alert rule", "key", key)
return err
}
alertDefinition = q.Result
sch.log.Debug("new alert definition version fetched", "title", alertDefinition.Title, "key", key, "version", alertDefinition.Version)
alertRule = q.Result
sch.log.Debug("new alert rule version fetched", "title", alertRule.Title, "key", key, "version", alertRule.Version)
}
condition := models.Condition{
Condition: alertDefinition.Condition,
OrgID: alertDefinition.OrgID,
Data: alertDefinition.Data,
Condition: alertRule.Condition,
OrgID: alertRule.OrgID,
Data: alertRule.Data,
}
results, err := sch.evaluator.ConditionEval(&condition, ctx.now, sch.dataService)
end = timeNow()
if err != nil {
// consider saving alert instance on error
sch.log.Error("failed to evaluate alert definition", "title", alertDefinition.Title,
sch.log.Error("failed to evaluate alert rule", "title", alertRule.Title,
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
processedStates := stateTracker.ProcessEvalResults(key.DefinitionUID, results, condition)
processedStates := stateTracker.ProcessEvalResults(key.UID, results, condition)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(processedStates)
sch.log.Debug("sending alerts to notifier", "count", len(alerts))
@@ -109,7 +109,7 @@ func (sch *schedule) definitionRoutine(grafanaCtx context.Context, key models.Al
}()
case <-stopCh:
sch.stopApplied(key)
sch.log.Debug("stopping alert definition routine", "key", key)
sch.log.Debug("stopping alert rule routine", "key", key)
// interrupt evaluation if it's running
return nil
case <-grafanaCtx.Done():
@@ -127,8 +127,8 @@ type schedule struct {
// base tick rate (fastest possible configured check)
baseInterval time.Duration
// each alert definition gets its own channel and routine
registry alertDefinitionRegistry
// each alert rule gets its own channel and routine
registry alertRuleRegistry
maxAttempts int64
@@ -139,12 +139,12 @@ type schedule struct {
// evalApplied is only used for tests: test code can set it to non-nil
// function, and then it'll be called from the event loop whenever the
// message from evalApplied is handled.
evalAppliedFunc func(models.AlertDefinitionKey, time.Time)
evalAppliedFunc func(models.AlertRuleKey, time.Time)
// stopApplied is only used for tests: test code can set it to non-nil
// function, and then it'll be called from the event loop whenever the
// message from stopApplied is handled.
stopAppliedFunc func(models.AlertDefinitionKey)
stopAppliedFunc func(models.AlertRuleKey)
log log.Logger
@@ -152,6 +152,8 @@ type schedule struct {
store store.Store
ruleStore store.RuleStore
dataService *tsdb.Service
notifier Notifier
@@ -162,11 +164,12 @@ type SchedulerCfg struct {
C clock.Clock
BaseInterval time.Duration
Logger log.Logger
EvalAppliedFunc func(models.AlertDefinitionKey, time.Time)
EvalAppliedFunc func(models.AlertRuleKey, time.Time)
MaxAttempts int64
StopAppliedFunc func(models.AlertDefinitionKey)
StopAppliedFunc func(models.AlertRuleKey)
Evaluator eval.Evaluator
Store store.Store
RuleStore store.RuleStore
Notifier Notifier
}
@@ -174,7 +177,7 @@ type SchedulerCfg struct {
func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
ticker := alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch := schedule{
registry: alertDefinitionRegistry{alertDefinitionInfo: make(map[models.AlertDefinitionKey]alertDefinitionInfo)},
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
baseInterval: cfg.BaseInterval,
@@ -184,6 +187,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service) *schedule {
stopAppliedFunc: cfg.StopAppliedFunc,
evaluator: cfg.Evaluator,
store: cfg.Store,
ruleStore: cfg.RuleStore,
dataService: dataService,
notifier: cfg.Notifier,
}
@@ -198,7 +202,7 @@ func (sch *schedule) overrideCfg(cfg SchedulerCfg) {
sch.stopAppliedFunc = cfg.StopAppliedFunc
}
func (sch *schedule) evalApplied(alertDefKey models.AlertDefinitionKey, now time.Time) {
func (sch *schedule) evalApplied(alertDefKey models.AlertRuleKey, now time.Time) {
if sch.evalAppliedFunc == nil {
return
}
@@ -206,7 +210,7 @@ func (sch *schedule) evalApplied(alertDefKey models.AlertDefinitionKey, now time
sch.evalAppliedFunc(alertDefKey, now)
}
func (sch *schedule) stopApplied(alertDefKey models.AlertDefinitionKey) {
func (sch *schedule) stopApplied(alertDefKey models.AlertRuleKey) {
if sch.stopAppliedFunc == nil {
return
}
@@ -219,7 +223,7 @@ func (sch *schedule) Pause() error {
return fmt.Errorf("scheduler is not initialised")
}
sch.heartbeat.Pause()
sch.log.Info("alert definition scheduler paused", "now", sch.clock.Now())
sch.log.Info("alert rule scheduler paused", "now", sch.clock.Now())
return nil
}
@@ -228,7 +232,7 @@ func (sch *schedule) Unpause() error {
return fmt.Errorf("scheduler is not initialised")
}
sch.heartbeat.Unpause()
sch.log.Info("alert definition scheduler unpaused", "now", sch.clock.Now())
sch.log.Info("alert rule scheduler unpaused", "now", sch.clock.Now())
return nil
}
@@ -238,50 +242,46 @@ func (sch *schedule) Ticker(grafanaCtx context.Context, stateTracker *state.Stat
select {
case tick := <-sch.heartbeat.C:
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
alertDefinitions := sch.fetchAllDetails(tick)
sch.log.Debug("alert definitions fetched", "count", len(alertDefinitions))
alertRules := sch.fetchAllDetails()
sch.log.Debug("alert rules fetched", "count", len(alertRules))
// registeredDefinitions is a map used for finding deleted alert definitions
// initially it is assigned to all known alert definitions from the previous cycle
// each alert definition found also in this cycle is removed
// so, at the end, the remaining registered alert definitions are the deleted ones
// registeredDefinitions is a map used for finding deleted alert rules
// initially it is assigned to all known alert rules from the previous cycle
// each alert rule found also in this cycle is removed
// so, at the end, the remaining registered alert rules are the deleted ones
registeredDefinitions := sch.registry.keyMap()
type readyToRunItem struct {
key models.AlertDefinitionKey
definitionInfo alertDefinitionInfo
key models.AlertRuleKey
ruleInfo alertRuleInfo
}
readyToRun := make([]readyToRunItem, 0)
for _, item := range alertDefinitions {
if item.Paused {
continue
}
for _, item := range alertRules {
key := item.GetKey()
itemVersion := item.Version
newRoutine := !sch.registry.exists(key)
definitionInfo := sch.registry.getOrCreateInfo(key, itemVersion)
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
if newRoutine && !invalidInterval {
dispatcherGroup.Go(func() error {
return sch.definitionRoutine(ctx, key, definitionInfo.evalCh, definitionInfo.stopCh, stateTracker)
return sch.ruleRoutine(ctx, key, ruleInfo.evalCh, ruleInfo.stopCh, stateTracker)
})
}
if invalidInterval {
// this is expected to be always false
// given that we validate interval during alert definition updates
sch.log.Debug("alert definition with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
// given that we validate interval during alert rule updates
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
continue
}
itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
if item.IntervalSeconds != 0 && tickNum%itemFrequency == 0 {
readyToRun = append(readyToRun, readyToRunItem{key: key, definitionInfo: definitionInfo})
readyToRun = append(readyToRun, readyToRunItem{key: key, ruleInfo: ruleInfo})
}
// remove the alert definition from the registered alert definitions
// remove the alert rule from the registered alert rules
delete(registeredDefinitions, key)
}
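
The interval arithmetic in this hunk is what decides which rules run on a given tick: a rule's interval must be an exact multiple of the scheduler's base interval (otherwise it is skipped), and the rule is then eligible whenever the tick number is a multiple of IntervalSeconds / baseInterval. A standalone, illustrative walk-through of that check follows; the concrete values are assumptions for the example, the logic mirrors the loop above.

package main

import "fmt"

func main() {
	baseIntervalSeconds := int64(1) // sch.baseInterval in the scheduler
	ruleIntervalSeconds := int64(3) // item.IntervalSeconds on the rule
	if ruleIntervalSeconds%baseIntervalSeconds != 0 {
		fmt.Println("invalid interval: rule would be ignored")
		return
	}
	itemFrequency := ruleIntervalSeconds / baseIntervalSeconds
	for tickNum := int64(1); tickNum <= 6; tickNum++ {
		eligible := ruleIntervalSeconds != 0 && tickNum%itemFrequency == 0
		fmt.Printf("tick %d: eligible=%v\n", tickNum, eligible) // true on ticks 3 and 6
	}
}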
@@ -294,18 +294,18 @@ func (sch *schedule) Ticker(grafanaCtx context.Context, stateTracker *state.Stat
item := readyToRun[i]
time.AfterFunc(time.Duration(int64(i)*step), func() {
item.definitionInfo.evalCh <- &evalContext{now: tick, version: item.definitionInfo.version}
item.ruleInfo.evalCh <- &evalContext{now: tick, version: item.ruleInfo.version}
})
}
// unregister and stop routines of the deleted alert definitions
// unregister and stop routines of the deleted alert rules
for key := range registeredDefinitions {
definitionInfo, err := sch.registry.get(key)
ruleInfo, err := sch.registry.get(key)
if err != nil {
sch.log.Error("failed to get alert definition routine information", "err", err)
sch.log.Error("failed to get alert rule routine information", "err", err)
continue
}
definitionInfo.stopCh <- struct{}{}
ruleInfo.stopCh <- struct{}{}
sch.registry.del(key)
}
case <-grafanaCtx.Done():
@@ -394,63 +394,63 @@ func translateInstanceState(state models.InstanceStateType) eval.State {
}
}
type alertDefinitionRegistry struct {
mu sync.Mutex
alertDefinitionInfo map[models.AlertDefinitionKey]alertDefinitionInfo
type alertRuleRegistry struct {
mu sync.Mutex
alertRuleInfo map[models.AlertRuleKey]alertRuleInfo
}
// getOrCreateInfo returns the channel for the specific alert definition
// getOrCreateInfo returns the channel for the specific alert rule
// if it does not exist, creates one and returns it
func (r *alertDefinitionRegistry) getOrCreateInfo(key models.AlertDefinitionKey, definitionVersion int64) alertDefinitionInfo {
func (r *alertRuleRegistry) getOrCreateInfo(key models.AlertRuleKey, ruleVersion int64) alertRuleInfo {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertDefinitionInfo[key]
info, ok := r.alertRuleInfo[key]
if !ok {
r.alertDefinitionInfo[key] = alertDefinitionInfo{evalCh: make(chan *evalContext), stopCh: make(chan struct{}), version: definitionVersion}
return r.alertDefinitionInfo[key]
r.alertRuleInfo[key] = alertRuleInfo{evalCh: make(chan *evalContext), stopCh: make(chan struct{}), version: ruleVersion}
return r.alertRuleInfo[key]
}
info.version = definitionVersion
r.alertDefinitionInfo[key] = info
info.version = ruleVersion
r.alertRuleInfo[key] = info
return info
}
// get returns the channel for the specific alert definition
// get returns the channel for the specific alert rule
// if the key does not exist returns an error
func (r *alertDefinitionRegistry) get(key models.AlertDefinitionKey) (*alertDefinitionInfo, error) {
func (r *alertRuleRegistry) get(key models.AlertRuleKey) (*alertRuleInfo, error) {
r.mu.Lock()
defer r.mu.Unlock()
info, ok := r.alertDefinitionInfo[key]
info, ok := r.alertRuleInfo[key]
if !ok {
return nil, fmt.Errorf("%v key not found", key)
}
return &info, nil
}
func (r *alertDefinitionRegistry) exists(key models.AlertDefinitionKey) bool {
func (r *alertRuleRegistry) exists(key models.AlertRuleKey) bool {
r.mu.Lock()
defer r.mu.Unlock()
_, ok := r.alertDefinitionInfo[key]
_, ok := r.alertRuleInfo[key]
return ok
}
func (r *alertDefinitionRegistry) del(key models.AlertDefinitionKey) {
func (r *alertRuleRegistry) del(key models.AlertRuleKey) {
r.mu.Lock()
defer r.mu.Unlock()
delete(r.alertDefinitionInfo, key)
delete(r.alertRuleInfo, key)
}
func (r *alertDefinitionRegistry) iter() <-chan models.AlertDefinitionKey {
c := make(chan models.AlertDefinitionKey)
func (r *alertRuleRegistry) iter() <-chan models.AlertRuleKey {
c := make(chan models.AlertRuleKey)
f := func() {
r.mu.Lock()
defer r.mu.Unlock()
for k := range r.alertDefinitionInfo {
for k := range r.alertRuleInfo {
c <- k
}
close(c)
@@ -460,15 +460,15 @@ func (r *alertDefinitionRegistry) iter() <-chan models.AlertDefinitionKey {
return c
}
func (r *alertDefinitionRegistry) keyMap() map[models.AlertDefinitionKey]struct{} {
definitionsIDs := make(map[models.AlertDefinitionKey]struct{})
func (r *alertRuleRegistry) keyMap() map[models.AlertRuleKey]struct{} {
definitionsIDs := make(map[models.AlertRuleKey]struct{})
for k := range r.iter() {
definitionsIDs[k] = struct{}{}
}
return definitionsIDs
}
type alertDefinitionInfo struct {
type alertRuleInfo struct {
evalCh chan *evalContext
stopCh chan struct{}
version int64

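Stepping back from the registry: carrying the version on both alertRuleInfo and evalContext is what lets ruleRoutine (earlier in this file) keep evaluating without re-reading the rule each time; it only queries the store when the ticker advertises a newer version. A condensed, illustrative helper showing just that decision, built from identifiers in this file (it is not part of the commit):

// maybeRefetch is illustrative only: it mirrors the version check inside ruleRoutine.
func maybeRefetch(sch *schedule, key models.AlertRuleKey, cached *models.AlertRule, ctx *evalContext) (*models.AlertRule, error) {
	if cached != nil && cached.Version >= ctx.version {
		return cached, nil // cached copy is still current
	}
	q := models.GetAlertRuleByUIDQuery{OrgID: key.OrgID, UID: key.UID}
	if err := sch.ruleStore.GetAlertRuleByUID(&q); err != nil {
		return nil, err
	}
	return q.Result, nil
}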
View File

@@ -25,7 +25,6 @@ const AlertRuleMaxRuleGroupNameLength = 190
type UpdateRuleGroupCmd struct {
OrgID int64
NamespaceUID string
RuleGroup string
RuleGroupConfig apimodels.PostableRuleGroupConfig
}
@@ -40,7 +39,7 @@ type RuleStore interface {
DeleteNamespaceAlertRules(orgID int64, namespaceUID string) error
DeleteRuleGroupAlertRules(orgID int64, namespaceUID string, ruleGroup string) error
GetAlertRuleByUID(*ngmodels.GetAlertRuleByUIDQuery) error
GetAlertRules(query *ngmodels.ListAlertRulesQuery) error
GetAlertRulesForScheduling(query *ngmodels.ListAlertRulesQuery) error
GetOrgAlertRules(query *ngmodels.ListAlertRulesQuery) error
GetNamespaceAlertRules(query *ngmodels.ListNamespaceAlertRulesQuery) error
GetRuleGroupAlertRules(query *ngmodels.ListRuleGroupAlertRulesQuery) error
@@ -324,12 +323,12 @@ func (st DBstore) GetNamespaceByUID(UID string, orgID int64, user *models.Signed
return folder.Title, nil
}
// GetAlertRules returns alert rule identifier, interval, version and pause state
// that are useful for its scheduling.
func (st DBstore) GetAlertRules(query *ngmodels.ListAlertRulesQuery) error {
// GetAlertRulesForScheduling returns alert rule info (identifier, interval, version state)
// that is useful for its scheduling.
func (st DBstore) GetAlertRulesForScheduling(query *ngmodels.ListAlertRulesQuery) error {
return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
alerts := make([]*ngmodels.AlertRule, 0)
q := "SELECT uid, org_id, interval_seconds, version, paused FROM alert_rule"
q := "SELECT uid, org_id, interval_seconds, version FROM alert_rule"
if err := sess.SQL(q).Find(&alerts); err != nil {
return err
}
@@ -391,10 +390,11 @@ func (st DBstore) ValidateAlertRule(alertRule ngmodels.AlertRule, requireData bo
// UpdateRuleGroup creates new rules and updates and/or deletes existing rules
func (st DBstore) UpdateRuleGroup(cmd UpdateRuleGroupCmd) error {
return st.SQLStore.WithTransactionalDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
ruleGroup := cmd.RuleGroupConfig.Name
q := &ngmodels.ListRuleGroupAlertRulesQuery{
OrgID: cmd.OrgID,
NamespaceUID: cmd.NamespaceUID,
RuleGroup: cmd.RuleGroup,
RuleGroup: ruleGroup,
}
if err := st.GetRuleGroupAlertRules(q); err != nil {
return err
@@ -421,7 +421,7 @@ func (st DBstore) UpdateRuleGroup(cmd UpdateRuleGroupCmd) error {
UID: r.GrafanaManagedAlert.UID,
IntervalSeconds: int64(time.Duration(cmd.RuleGroupConfig.Interval).Seconds()),
NamespaceUID: cmd.NamespaceUID,
RuleGroup: cmd.RuleGroup,
RuleGroup: ruleGroup,
NoDataState: ngmodels.NoDataState(r.GrafanaManagedAlert.NoDataState),
ExecErrState: ngmodels.ExecutionErrorState(r.GrafanaManagedAlert.ExecErrState),
},

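The scheduling query is intentionally narrow: the SELECT above returns only uid, org_id, interval_seconds and version, which is all the ticker needs to decide when a rule is due and whether a routine must refetch. An illustrative consumer of the query (a sketch, not code from this commit):

// listForScheduling is illustrative only: it shows what the lightweight
// scheduling query returns and what the scheduler actually uses from it.
func listForScheduling(st store.RuleStore) ([]*ngmodels.AlertRule, error) {
	q := ngmodels.ListAlertRulesQuery{}
	if err := st.GetAlertRulesForScheduling(&q); err != nil {
		return nil, err
	}
	for _, rule := range q.Result {
		_ = rule.GetKey()        // org_id + uid, the scheduler's registry key
		_ = rule.IntervalSeconds // drives per-tick eligibility
		_ = rule.Version         // lets ruleRoutine refetch lazily when it changes
	}
	return q.Result, nil
}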
View File

@@ -5,6 +5,8 @@ package tests
import (
"encoding/json"
"errors"
"fmt"
"math/rand"
"testing"
"time"
@@ -418,3 +420,31 @@ func getLongString(n int) string {
}
return string(b)
}
// createTestAlertDefinition creates a dummy alert definition to be used by the tests.
func createTestAlertDefinition(t *testing.T, dbstore *store.DBstore, intervalSeconds int64) *models.AlertDefinition {
cmd := models.SaveAlertDefinitionCommand{
OrgID: 1,
Title: fmt.Sprintf("an alert definition %d", rand.Intn(1000)),
Condition: "A",
Data: []models.AlertQuery{
{
Model: json.RawMessage(`{
"datasource": "__expr__",
"type":"math",
"expression":"2 + 2 > 1"
}`),
RelativeTimeRange: models.RelativeTimeRange{
From: models.Duration(5 * time.Hour),
To: models.Duration(3 * time.Hour),
},
RefID: "A",
},
},
IntervalSeconds: &intervalSeconds,
}
err := dbstore.SaveAlertDefinition(&cmd)
require.NoError(t, err)
t.Logf("alert definition: %v with interval: %d created", cmd.Result.GetKey(), intervalSeconds)
return cmd.Result
}

View File

@@ -26,7 +26,7 @@ import (
)
type evalAppliedInfo struct {
alertDefKey models.AlertDefinitionKey
alertDefKey models.AlertRuleKey
now time.Time
}
@@ -109,15 +109,15 @@ func TestAlertingTicker(t *testing.T) {
dbstore := setupTestEnv(t, 1)
t.Cleanup(registry.ClearOverrides)
alerts := make([]*models.AlertDefinition, 0)
// create alert definition with zero interval (should never run)
alerts = append(alerts, createTestAlertDefinition(t, dbstore, 0))
alerts := make([]*models.AlertRule, 0)
// create alert rule with zero interval (should never run)
alerts = append(alerts, createTestAlertRule(t, dbstore, 0))
// create alert definition with one second interval
alerts = append(alerts, createTestAlertDefinition(t, dbstore, 1))
// create alert rule with one second interval
alerts = append(alerts, createTestAlertRule(t, dbstore, 1))
evalAppliedCh := make(chan evalAppliedInfo, len(alerts))
stopAppliedCh := make(chan models.AlertDefinitionKey, len(alerts))
stopAppliedCh := make(chan models.AlertRuleKey, len(alerts))
mockedClock := clock.NewMock()
baseInterval := time.Second
@@ -125,14 +125,15 @@ func TestAlertingTicker(t *testing.T) {
schedCfg := schedule.SchedulerCfg{
C: mockedClock,
BaseInterval: baseInterval,
EvalAppliedFunc: func(alertDefKey models.AlertDefinitionKey, now time.Time) {
EvalAppliedFunc: func(alertDefKey models.AlertRuleKey, now time.Time) {
evalAppliedCh <- evalAppliedInfo{alertDefKey: alertDefKey, now: now}
},
StopAppliedFunc: func(alertDefKey models.AlertDefinitionKey) {
StopAppliedFunc: func(alertDefKey models.AlertRuleKey) {
stopAppliedCh <- alertDefKey
},
Store: dbstore,
Logger: log.New("ngalert schedule test"),
Store: dbstore,
RuleStore: dbstore,
Logger: log.New("ngalert schedule test"),
}
sched := schedule.NewScheduler(schedCfg, nil)
@@ -145,101 +146,69 @@ func TestAlertingTicker(t *testing.T) {
}()
runtime.Gosched()
expectedAlertDefinitionsEvaluated := []models.AlertDefinitionKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 1st tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
expectedAlertRulesEvaluated := []models.AlertRuleKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 1st tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
// change alert definition interval to three seconds
// change alert rule interval to three seconds
var threeSecInterval int64 = 3
err := dbstore.UpdateAlertDefinition(&models.UpdateAlertDefinitionCommand{
UID: alerts[0].UID,
IntervalSeconds: &threeSecInterval,
OrgID: alerts[0].OrgID,
alerts[0] = updateTestAlertRuleIntervalSeconds(t, dbstore, alerts[0], threeSecInterval)
t.Logf("alert rule: %v interval reset to: %d", alerts[0].GetKey(), threeSecInterval)
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 2nd tick alert rule: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[1].GetKey(), alerts[0].GetKey()}
t.Run(fmt.Sprintf("on 3rd tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 4th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
err := dbstore.DeleteAlertRuleByUID(alerts[1].OrgID, alerts[1].UID)
require.NoError(t, err)
t.Logf("alert definition: %v interval reset to: %d", alerts[0].GetKey(), threeSecInterval)
t.Logf("alert rule: %v deleted", alerts[1].GetKey())
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 2nd tick alert definition: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
expectedAlertRulesEvaluated = []models.AlertRuleKey{}
t.Run(fmt.Sprintf("on 5th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
expectedAlertRulesStopped := []models.AlertRuleKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 5th tick alert rules: %s should be stopped", concatenate(expectedAlertRulesStopped)), func(t *testing.T) {
assertStopRun(t, stopAppliedCh, expectedAlertRulesStopped...)
})
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[1].GetKey(), alerts[0].GetKey()}
t.Run(fmt.Sprintf("on 3rd tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[0].GetKey()}
t.Run(fmt.Sprintf("on 6th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 4th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
// create alert rule with one second interval
alerts = append(alerts, createTestAlertRule(t, dbstore, 1))
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 7th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
})
err = dbstore.DeleteAlertDefinitionByUID(&models.DeleteAlertDefinitionByUIDCommand{UID: alerts[1].UID, OrgID: alerts[1].OrgID})
require.NoError(t, err)
t.Logf("alert definition: %v deleted", alerts[1].GetKey())
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{}
t.Run(fmt.Sprintf("on 5th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
})
expectedAlertDefinitionsStopped := []models.AlertDefinitionKey{alerts[1].GetKey()}
t.Run(fmt.Sprintf("on 5th tick alert definitions: %s should be stopped", concatenate(expectedAlertDefinitionsStopped)), func(t *testing.T) {
assertStopRun(t, stopAppliedCh, expectedAlertDefinitionsStopped...)
})
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[0].GetKey()}
t.Run(fmt.Sprintf("on 6th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
})
// create alert definition with one second interval
alerts = append(alerts, createTestAlertDefinition(t, dbstore, 1))
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 7th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
})
// pause alert definition
err = dbstore.UpdateAlertDefinitionPaused(&models.UpdateAlertDefinitionPausedCommand{UIDs: []string{alerts[2].UID}, OrgID: alerts[2].OrgID, Paused: true})
require.NoError(t, err)
t.Logf("alert definition: %v paused", alerts[2].GetKey())
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{}
t.Run(fmt.Sprintf("on 8th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
})
expectedAlertDefinitionsStopped = []models.AlertDefinitionKey{alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 8th tick alert definitions: %s should be stopped", concatenate(expectedAlertDefinitionsStopped)), func(t *testing.T) {
assertStopRun(t, stopAppliedCh, expectedAlertDefinitionsStopped...)
})
// unpause alert definition
err = dbstore.UpdateAlertDefinitionPaused(&models.UpdateAlertDefinitionPausedCommand{UIDs: []string{alerts[2].UID}, OrgID: alerts[2].OrgID, Paused: false})
require.NoError(t, err)
t.Logf("alert definition: %v unpaused", alerts[2].GetKey())
expectedAlertDefinitionsEvaluated = []models.AlertDefinitionKey{alerts[0].GetKey(), alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 9th tick alert definitions: %s should be evaluated", concatenate(expectedAlertDefinitionsEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertDefinitionsEvaluated...)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
}
func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys ...models.AlertDefinitionKey) {
func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys ...models.AlertRuleKey) {
timeout := time.After(time.Second)
expected := make(map[models.AlertDefinitionKey]struct{}, len(keys))
expected := make(map[models.AlertRuleKey]struct{}, len(keys))
for _, k := range keys {
expected[k] = struct{}{}
}
@@ -248,7 +217,7 @@ func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys
select {
case info := <-ch:
_, ok := expected[info.alertDefKey]
t.Logf("alert definition: %v evaluated at: %v", info.alertDefKey, info.now)
t.Logf("alert rule: %v evaluated at: %v", info.alertDefKey, info.now)
assert.True(t, ok)
assert.Equal(t, tick, info.now)
delete(expected, info.alertDefKey)
@@ -264,10 +233,10 @@ func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys
}
}
func assertStopRun(t *testing.T, ch <-chan models.AlertDefinitionKey, keys ...models.AlertDefinitionKey) {
func assertStopRun(t *testing.T, ch <-chan models.AlertRuleKey, keys ...models.AlertRuleKey) {
timeout := time.After(time.Second)
expected := make(map[models.AlertDefinitionKey]struct{}, len(keys))
expected := make(map[models.AlertRuleKey]struct{}, len(keys))
for _, k := range keys {
expected[k] = struct{}{}
}
@@ -276,7 +245,7 @@ func assertStopRun(t *testing.T, ch <-chan models.AlertDefinitionKey, keys ...mo
select {
case alertDefKey := <-ch:
_, ok := expected[alertDefKey]
t.Logf("alert definition: %v stopped", alertDefKey)
t.Logf("alert rule: %v stopped", alertDefKey)
assert.True(t, ok)
delete(expected, alertDefKey)
if len(expected) == 0 {
@@ -297,7 +266,7 @@ func advanceClock(t *testing.T, mockedClock *clock.Mock) time.Time {
// t.Logf("Tick: %v", mockedClock.Now())
}
func concatenate(keys []models.AlertDefinitionKey) string {
func concatenate(keys []models.AlertRuleKey) string {
s := make([]string, len(keys))
for _, k := range keys {
s = append(s, k.String())

View File

@@ -7,6 +7,9 @@ import (
"testing"
"time"
apimodels "github.com/grafana/alerting-api/pkg/api"
"github.com/prometheus/common/model"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert"
@@ -61,30 +64,89 @@ func overrideAlertNGInRegistry(t *testing.T, cfg *setting.Cfg) ngalert.AlertNG {
return ng
}
// createTestAlertDefinition creates a dummy alert definition to be used by the tests.
func createTestAlertDefinition(t *testing.T, store *store.DBstore, intervalSeconds int64) *models.AlertDefinition {
cmd := models.SaveAlertDefinitionCommand{
OrgID: 1,
Title: fmt.Sprintf("an alert definition %d", rand.Intn(1000)),
Condition: "A",
Data: []models.AlertQuery{
{
Model: json.RawMessage(`{
"datasource": "__expr__",
"type":"math",
"expression":"2 + 2 > 1"
}`),
RelativeTimeRange: models.RelativeTimeRange{
From: models.Duration(5 * time.Hour),
To: models.Duration(3 * time.Hour),
// createTestAlertRule creates a dummy alert rule to be used by the tests.
func createTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds int64) *models.AlertRule {
d := rand.Intn(1000)
ruleGroup := fmt.Sprintf("ruleGroup-%d", d)
err := dbstore.UpdateRuleGroup(store.UpdateRuleGroupCmd{
OrgID: 1,
NamespaceUID: "namespace",
RuleGroupConfig: apimodels.PostableRuleGroupConfig{
Name: ruleGroup,
Interval: model.Duration(time.Duration(intervalSeconds) * time.Second),
Rules: []apimodels.PostableExtendedRuleNode{
{
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
OrgID: 1,
Title: fmt.Sprintf("an alert definition %d", d),
Condition: "A",
Data: []models.AlertQuery{
{
Model: json.RawMessage(`{
"datasource": "__expr__",
"type":"math",
"expression":"2 + 2 > 1"
}`),
RelativeTimeRange: models.RelativeTimeRange{
From: models.Duration(5 * time.Hour),
To: models.Duration(3 * time.Hour),
},
RefID: "A",
},
},
},
},
RefID: "A",
},
},
IntervalSeconds: &intervalSeconds,
}
err := store.SaveAlertDefinition(&cmd)
})
require.NoError(t, err)
t.Logf("alert definition: %v with interval: %d created", cmd.Result.GetKey(), intervalSeconds)
return cmd.Result
q := models.ListRuleGroupAlertRulesQuery{
OrgID: 1,
NamespaceUID: "namespace",
RuleGroup: ruleGroup,
}
err = dbstore.GetRuleGroupAlertRules(&q)
require.NoError(t, err)
require.NotEmpty(t, q.Result)
rule := q.Result[0]
t.Logf("alert definition: %v with interval: %d created", rule.GetKey(), rule.IntervalSeconds)
return rule
}
// updateTestAlertRuleIntervalSeconds updates the interval of a dummy alert rule used by the tests.
func updateTestAlertRuleIntervalSeconds(t *testing.T, dbstore *store.DBstore, existingRule *models.AlertRule, intervalSeconds int64) *models.AlertRule {
cmd := store.UpdateRuleGroupCmd{
OrgID: 1,
NamespaceUID: "namespace",
RuleGroupConfig: apimodels.PostableRuleGroupConfig{
Name: existingRule.RuleGroup,
Interval: model.Duration(time.Duration(intervalSeconds) * time.Second),
Rules: []apimodels.PostableExtendedRuleNode{
{
GrafanaManagedAlert: &apimodels.PostableGrafanaRule{
OrgID: 1,
UID: existingRule.UID,
},
},
},
},
}
err := dbstore.UpdateRuleGroup(cmd)
require.NoError(t, err)
q := models.ListRuleGroupAlertRulesQuery{
OrgID: 1,
NamespaceUID: "namespace",
RuleGroup: existingRule.RuleGroup,
}
err = dbstore.GetRuleGroupAlertRules(&q)
require.NoError(t, err)
require.NotEmpty(t, q.Result)
rule := q.Result[0]
t.Logf("alert definition: %v with interval: %d created", rule.GetKey(), rule.IntervalSeconds)
return rule
}