Alerting: Persist alerts on evaluation and shutdown. Warm cache from DB on startup (#32576)

* Initial commit for state tracking

* basic state transition logic and tests

* constructor. test and interface fixup

* use new sig for sch.definitionRoutine()

* test fixup

* make the linter happy

* more minor linting cleanup

* Alerting: Send alerts from state tracker to notifier

* Add evaluation time and test

* Add cleanup routine and logging

* Pull in compact.go and reconcile differences

* Save alert transitions and save all state on shutdown

* pr feedback

* WIP

* WIP

* Persist alerts on evaluation and shutdown. Warm cache on startup

* Filter non-firing alerts before sending to notifier

Co-authored-by: Josue Abreu <josue@grafana.com>
David Parrott 2021-04-02 08:11:33 -07:00 committed by GitHub
parent 698a1ee003
commit 2a8446e435
10 changed files with 468 additions and 150 deletions
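
In outline, the commit establishes this lifecycle: warm the state cache from the database at startup, and on every evaluation persist all processed states before forwarding only the firing ones to the notifier (the cache is also flushed once more on shutdown). A condensed, self-contained sketch with stand-in types, not the real scheduler:

package main

import (
	"fmt"
	"time"
)

type State int

const (
	Normal State = iota
	Alerting
)

type AlertState struct {
	UID              string
	State            State
	StartsAt, EndsAt time.Time
}

func loadFromDB() []AlertState { return []AlertState{{UID: "cpu-high", State: Alerting}} }
func saveToDB(s AlertState)    { fmt.Println("persist", s.UID) }
func notify(s AlertState)      { fmt.Println("notify", s.UID) }

func main() {
	cache := map[string]AlertState{}

	// 1. Startup: warm the cache from previously persisted instances.
	for _, s := range loadFromDB() {
		cache[s.UID] = s
	}

	// 2. Per evaluation: persist every processed state, then forward
	//    only the firing ones to the notifier.
	for _, s := range cache {
		saveToDB(s)
		if s.State == Alerting {
			notify(s)
		}
	}
}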


@ -13,6 +13,7 @@ type AlertInstance struct {
LabelsHash string
CurrentState InstanceStateType
CurrentStateSince time.Time
CurrentStateEnd time.Time
LastEvalTime time.Time
}
@ -35,11 +36,13 @@ func (i InstanceStateType) IsValid() bool {
// SaveAlertInstanceCommand is the command for saving a new alert instance.
type SaveAlertInstanceCommand struct {
DefinitionOrgID int64
DefinitionUID string
Labels InstanceLabels
State InstanceStateType
LastEvalTime time.Time
DefinitionOrgID int64
DefinitionUID string
Labels InstanceLabels
State InstanceStateType
LastEvalTime time.Time
CurrentStateSince time.Time
CurrentStateEnd time.Time
}
// GetAlertInstanceQuery is the query for retrieving/deleting an alert instance by ID.
@ -61,6 +64,10 @@ type ListAlertInstancesQuery struct {
Result []*ListAlertInstancesQueryResult
}
type FetchUniqueOrgIdsQuery struct {
Result []*FetchUniqueOrgIdsQueryResult
}
// ListAlertInstancesQueryResult represents the result of listAlertInstancesQuery.
type ListAlertInstancesQueryResult struct {
DefinitionOrgID int64 `xorm:"def_org_id" json:"definitionOrgId"`
@ -70,9 +77,14 @@ type ListAlertInstancesQueryResult struct {
LabelsHash string `json:"labelsHash"`
CurrentState InstanceStateType `json:"currentState"`
CurrentStateSince time.Time `json:"currentStateSince"`
CurrentStateEnd time.Time `json:"currentStateEnd"`
LastEvalTime time.Time `json:"lastEvalTime"`
}
type FetchUniqueOrgIdsQueryResult struct {
DefinitionOrgID int64 `xorm:"def_org_id" json:"definitionOrgId"`
}
// ValidateAlertInstance validates that the alert instance contains an alert definition id,
// and state.
func ValidateAlertInstance(alertInstance *AlertInstance) error {


@ -93,6 +93,7 @@ func (ng *AlertNG) Init() error {
// Run starts the scheduler
func (ng *AlertNG) Run(ctx context.Context) error {
ng.Log.Debug("ngalert starting")
ng.schedule.WarmStateCache(ng.stateTracker)
return ng.schedule.Ticker(ctx, ng.stateTracker)
}
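
Run blocks on WarmStateCache before starting the ticker, so the first evaluation cycle sees pre-restart state rather than an empty cache. A minimal, self-contained sketch of that ordering (hypothetical types, not the real scheduler):

package main

import (
	"context"
	"fmt"
	"time"
)

type svc struct{ cache map[string]string }

// warm runs synchronously: the cache is fully populated before any tick fires.
func (s *svc) warm() { s.cache["instance-1"] = "Alerting" }

func (s *svc) run(ctx context.Context) error {
	s.warm() // blocking, as in ng.schedule.WarmStateCache
	t := time.NewTicker(10 * time.Millisecond)
	defer t.Stop()
	for {
		select {
		case <-t.C:
			fmt.Println("evaluating with restored state:", s.cache["instance-1"])
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Millisecond)
	defer cancel()
	_ = (&svc{cache: map[string]string{}}).run(ctx)
}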


@ -1,6 +1,8 @@
package schedule
import (
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/prometheus/alertmanager/api/v2/models"
@ -8,17 +10,19 @@ import (
func FromAlertStateToPostableAlerts(firingStates []state.AlertState) []*notifier.PostableAlert {
alerts := make([]*notifier.PostableAlert, 0, len(firingStates))
for _, state := range firingStates {
alerts = append(alerts, &notifier.PostableAlert{
PostableAlert: models.PostableAlert{
Annotations: models.LabelSet{}, //TODO: add annotations to evaluation results, add them to the state struct, and then set them before sending to the notifier
StartsAt: state.StartsAt,
EndsAt: state.EndsAt,
Alert: models.Alert{
Labels: models.LabelSet(state.Labels),
for _, alertState := range firingStates {
if alertState.State == eval.Alerting {
alerts = append(alerts, &notifier.PostableAlert{
PostableAlert: models.PostableAlert{
Annotations: models.LabelSet{}, //TODO: add annotations to evaluation results, add them to the alertState struct, and then set them before sending to the notifier
StartsAt: strfmt.DateTime(alertState.StartsAt),
EndsAt: strfmt.DateTime(alertState.EndsAt),
Alert: models.Alert{
Labels: models.LabelSet(alertState.Labels),
},
},
},
})
})
}
}
return alerts
}
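
Despite its variable name, ProcessEvalResults (further down) now returns every processed state; the Alerting-only filter happens here instead. A self-contained sketch of that filter, with stand-in types:

package main

import "fmt"

type evalState int

const (
	normal evalState = iota
	alerting
)

type alertState struct {
	name  string
	state evalState
}

// firingOnly mirrors the new loop above: every state is inspected,
// but only those currently Alerting become postable alerts.
func firingOnly(states []alertState) []alertState {
	out := make([]alertState, 0, len(states))
	for _, s := range states {
		if s.state == alerting {
			out = append(out, s)
		}
	}
	return out
}

func main() {
	states := []alertState{{"cpu", normal}, {"disk", alerting}}
	fmt.Println(firingOnly(states)) // [{disk 1}]
}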


@ -6,6 +6,8 @@ import (
"sync"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"golang.org/x/sync/errgroup"
"github.com/benbjohnson/clock"
@ -29,6 +31,7 @@ type ScheduleService interface {
Ticker(context.Context, *state.StateTracker) error
Pause() error
Unpause() error
WarmStateCache(*state.StateTracker)
// the following are used by tests only
evalApplied(models.AlertDefinitionKey, time.Time)
@ -78,17 +81,12 @@ func (sch *schedule) definitionRoutine(grafanaCtx context.Context, key models.Al
"key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "error", err)
return err
}
for _, r := range results {
sch.log.Info("alert definition result", "title", alertDefinition.Title, "key", key, "attempt", attempt, "now", ctx.now, "duration", end.Sub(start), "instance", r.Instance, "state", r.State.String())
cmd := models.SaveAlertInstanceCommand{DefinitionOrgID: key.OrgID, DefinitionUID: key.DefinitionUID, State: models.InstanceStateType(r.State.String()), Labels: models.InstanceLabels(r.Instance), LastEvalTime: ctx.now}
err := sch.store.SaveAlertInstance(&cmd)
if err != nil {
sch.log.Error("failed saving alert instance", "title", alertDefinition.Title, "key", key, "attempt", attempt, "now", ctx.now, "instance", r.Instance, "state", r.State.String(), "error", err)
}
}
transitionedStates := stateTracker.ProcessEvalResults(key.DefinitionUID, results, condition)
alerts := FromAlertStateToPostableAlerts(transitionedStates)
err = sch.SendAlerts(alerts)
processedStates := stateTracker.ProcessEvalResults(key.DefinitionUID, results, condition)
sch.saveAlertStates(processedStates)
alerts := FromAlertStateToPostableAlerts(processedStates)
sch.log.Debug("sending alerts to notifier", "count", len(alerts))
err = sch.sendAlerts(alerts)
if err != nil {
sch.log.Error("failed to put alerts in the notifier", "count", len(alerts), "err", err)
}
@ -312,15 +310,90 @@ func (sch *schedule) Ticker(grafanaCtx context.Context, stateTracker *state.Stat
}
case <-grafanaCtx.Done():
err := dispatcherGroup.Wait()
sch.saveAlertStates(stateTracker.GetAll())
return err
}
}
}
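
The shutdown path waits on the dispatcher group before flushing, so no evaluation routine can write to the cache after the final save. The pattern in miniature (sync.WaitGroup standing in for the errgroup):

package main

import (
	"fmt"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			fmt.Println("evaluation routine", id, "finished")
		}(i)
	}
	wg.Wait()                          // dispatcherGroup.Wait()
	fmt.Println("flushing all states") // saveAlertStates(stateTracker.GetAll())
}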
func (sch *schedule) SendAlerts(alerts []*notifier.PostableAlert) error {
func (sch *schedule) sendAlerts(alerts []*notifier.PostableAlert) error {
return sch.notifier.PutAlerts(alerts...)
}
func (sch *schedule) saveAlertStates(states []state.AlertState) {
sch.log.Debug("saving alert states", "count", len(states))
for _, s := range states {
cmd := models.SaveAlertInstanceCommand{
DefinitionOrgID: s.OrgID,
DefinitionUID: s.UID,
Labels: models.InstanceLabels(s.Labels),
State: models.InstanceStateType(s.State.String()),
LastEvalTime: s.LastEvaluationTime,
CurrentStateSince: s.StartsAt,
CurrentStateEnd: s.EndsAt,
}
err := sch.store.SaveAlertInstance(&cmd)
if err != nil {
sch.log.Error("failed to save alert state", "uid", s.UID, "orgId", s.OrgID, "labels", s.Labels.String(), "state", s.State.String(), "msg", err.Error())
}
}
}
func dataLabelsFromInstanceLabels(il models.InstanceLabels) data.Labels {
lbs := data.Labels{}
for k, v := range il {
lbs[k] = v
}
return lbs
}
func (sch *schedule) WarmStateCache(st *state.StateTracker) {
sch.log.Info("warming cache for startup")
st.ResetCache()
orgIdsCmd := models.FetchUniqueOrgIdsQuery{}
if err := sch.store.FetchOrgIds(&orgIdsCmd); err != nil {
sch.log.Error("unable to fetch orgIds", "msg", err.Error())
}
var states []state.AlertState
for _, orgIdResult := range orgIdsCmd.Result {
cmd := models.ListAlertInstancesQuery{
DefinitionOrgID: orgIdResult.DefinitionOrgID,
}
if err := sch.store.ListAlertInstances(&cmd); err != nil {
sch.log.Error("unable to fetch previous state", "msg", err.Error())
}
for _, entry := range cmd.Result {
lbs := dataLabelsFromInstanceLabels(entry.Labels)
stateForEntry := state.AlertState{
UID: entry.DefinitionUID,
OrgID: entry.DefinitionOrgID,
CacheId: fmt.Sprintf("%s %s", entry.DefinitionUID, lbs),
Labels: lbs,
State: translateInstanceState(entry.CurrentState),
Results: []state.StateEvaluation{},
StartsAt: entry.CurrentStateSince,
EndsAt: entry.CurrentStateEnd,
LastEvaluationTime: entry.LastEvalTime,
}
states = append(states, stateForEntry)
}
}
st.Put(states)
}
func translateInstanceState(state models.InstanceStateType) eval.State {
switch {
case state == models.InstanceStateFiring:
return eval.Alerting
case state == models.InstanceStateNormal:
return eval.Normal
default:
return eval.Error
}
}
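
Persistence stores eval states as strings and warm-up maps them back through translateInstanceState, so only Normal and Alerting survive a restart intact; any other persisted value is restored as eval.Error. A self-contained sketch of that round trip (state strings illustrative):

package main

import "fmt"

type evalState string

const (
	stateNormal   evalState = "Normal"
	stateAlerting evalState = "Alerting"
	stateError    evalState = "Error"
)

// translate mirrors translateInstanceState: unrecognized persisted
// values collapse to Error on warm-up.
func translate(persisted string) evalState {
	switch persisted {
	case "Alerting":
		return stateAlerting
	case "Normal":
		return stateNormal
	default:
		return stateError
	}
}

func main() {
	for _, s := range []string{"Normal", "Alerting", "NoData"} {
		fmt.Println(s, "->", translate(s))
	}
}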
type alertDefinitionRegistry struct {
mu sync.Mutex
alertDefinitionInfo map[models.AlertDefinitionKey]alertDefinitionInfo


@ -7,21 +7,26 @@ import (
"github.com/grafana/grafana/pkg/infra/log"
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
type AlertState struct {
UID string
CacheId string
Labels data.Labels
State eval.State
Results []eval.State
StartsAt strfmt.DateTime
EndsAt strfmt.DateTime
EvaluatedAt strfmt.DateTime
UID string
OrgID int64
CacheId string
Labels data.Labels
State eval.State
Results []StateEvaluation
StartsAt time.Time
EndsAt time.Time
LastEvaluationTime time.Time
}
type StateEvaluation struct {
EvaluationTime time.Time
EvaluationState eval.State
}
type cache struct {
@ -48,7 +53,7 @@ func NewStateTracker(logger log.Logger) *StateTracker {
return tracker
}
func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
func (st *StateTracker) getOrCreate(uid string, orgId int64, result eval.Result) AlertState {
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
@ -58,12 +63,12 @@ func (st *StateTracker) getOrCreate(uid string, result eval.Result) AlertState {
}
st.Log.Debug("adding new alert state cache entry", "cacheId", idString, "state", result.State.String(), "evaluatedAt", result.EvaluatedAt.String())
newState := AlertState{
UID: uid,
CacheId: idString,
Labels: result.Instance,
State: result.State,
Results: []eval.State{},
EvaluatedAt: strfmt.DateTime(result.EvaluatedAt),
UID: uid,
OrgID: orgId,
CacheId: idString,
Labels: result.Instance,
State: result.State,
Results: []StateEvaluation{},
}
st.stateCache.cacheMap[idString] = newState
return newState
@ -75,19 +80,25 @@ func (st *StateTracker) set(stateEntry AlertState) {
st.stateCache.cacheMap[stateEntry.CacheId] = stateEntry
}
func (st *StateTracker) get(stateId string) AlertState {
func (st *StateTracker) Get(stateId string) AlertState {
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
return st.stateCache.cacheMap[stateId]
}
//Used to ensure a clean cache on startup
func (st *StateTracker) ResetCache() {
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
st.stateCache.cacheMap = make(map[string]AlertState)
}
func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, condition ngModels.Condition) []AlertState {
st.Log.Info("state tracker processing evaluation results", "uid", uid, "resultCount", len(results))
var changedStates []AlertState
for _, result := range results {
if s, ok := st.setNextState(uid, result); ok {
changedStates = append(changedStates, s)
}
s, _ := st.setNextState(uid, condition.OrgID, result)
changedStates = append(changedStates, s)
}
st.Log.Debug("returning changed states to scheduler", "count", len(changedStates))
return changedStates
@ -99,34 +110,43 @@ func (st *StateTracker) ProcessEvalResults(uid string, results eval.Results, con
// 3. The base interval defined by the scheduler - in the case where #2 is not yet an option we can use the base interval at which every alert runs.
//Set the current state based on evaluation results
//return the state and a bool indicating whether a state transition occurred
func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState, bool) {
currentState := st.getOrCreate(uid, result)
func (st *StateTracker) setNextState(uid string, orgId int64, result eval.Result) (AlertState, bool) {
currentState := st.getOrCreate(uid, orgId, result)
st.Log.Debug("setting alert state", "uid", uid)
switch {
case currentState.State == result.State:
st.Log.Debug("no state transition", "cacheId", currentState.CacheId, "state", currentState.State.String())
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
if currentState.State == eval.Alerting {
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
}
st.set(currentState)
return currentState, false
case currentState.State == eval.Normal && result.State == eval.Alerting:
st.Log.Debug("state transition from normal to alerting", "cacheId", currentState.CacheId)
currentState.State = eval.Alerting
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.StartsAt = strfmt.DateTime(result.EvaluatedAt)
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt.Add(40 * time.Second))
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.StartsAt = result.EvaluatedAt
currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
st.set(currentState)
return currentState, true
case currentState.State == eval.Alerting && result.State == eval.Normal:
st.Log.Debug("state transition from alerting to normal", "cacheId", currentState.CacheId)
currentState.State = eval.Normal
currentState.EvaluatedAt = strfmt.DateTime(result.EvaluatedAt)
currentState.EndsAt = strfmt.DateTime(result.EvaluatedAt)
currentState.Results = append(currentState.Results, result.State)
currentState.LastEvaluationTime = result.EvaluatedAt
currentState.EndsAt = result.EvaluatedAt
currentState.Results = append(currentState.Results, StateEvaluation{
EvaluationTime: result.EvaluatedAt,
EvaluationState: result.State,
})
st.set(currentState)
return currentState, true
default:
@ -134,6 +154,16 @@ func (st *StateTracker) setNextState(uid string, result eval.Result) (AlertState
}
}
func (st *StateTracker) GetAll() []AlertState {
var states []AlertState
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
for _, v := range st.stateCache.cacheMap {
states = append(states, v)
}
return states
}
func (st *StateTracker) cleanUp() {
ticker := time.NewTicker(time.Duration(60) * time.Minute)
st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
@ -150,16 +180,33 @@ func (st *StateTracker) cleanUp() {
}
func (st *StateTracker) trim() {
st.Log.Info("trimming alert state cache")
st.Log.Info("trimming alert state cache", "now", time.Now())
st.stateCache.mu.Lock()
defer st.stateCache.mu.Unlock()
for _, v := range st.stateCache.cacheMap {
if len(v.Results) > 100 {
st.Log.Debug("trimming result set", "cacheId", v.CacheId, "count", len(v.Results)-100)
newResults := make([]eval.State, 100)
newResults := make([]StateEvaluation, 100)
copy(newResults, v.Results[100:])
v.Results = newResults
st.set(v)
}
}
}
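
One detail worth flagging in trim: v.Results[100:] selects everything after index 100, so for a 150-entry history the copy keeps 50 entries, not the most recent 100. If the intent is keep-the-last-N, the slice would be v.Results[len(v.Results)-100:]; a self-contained sketch of that variant:

package main

import "fmt"

// keepLastN retains at most the n most recent entries.
func keepLastN(results []int, n int) []int {
	if len(results) <= n {
		return results
	}
	out := make([]int, n)
	copy(out, results[len(results)-n:])
	return out
}

func main() {
	history := make([]int, 150)
	for i := range history {
		history[i] = i
	}
	trimmed := keepLastN(history, 100)
	fmt.Println(len(trimmed), trimmed[0]) // 100 50: entries 50..149 survive
}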
func (a AlertState) Equals(b AlertState) bool {
return a.UID == b.UID &&
a.OrgID == b.OrgID &&
a.CacheId == b.CacheId &&
a.Labels.String() == b.Labels.String() &&
a.State.String() == b.State.String() &&
a.StartsAt == b.StartsAt &&
a.EndsAt == b.EndsAt &&
a.LastEvaluationTime == b.LastEvaluationTime
}
func (st *StateTracker) Put(states []AlertState) {
for _, s := range states {
st.set(s)
}
}
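
Equals compares identity, labels, state, and the three timestamps but skips the Results history. That is what lets TestWarmStateCache (further down) pass: WarmStateCache restores entries with an empty Results slice while the expected fixtures each carry one StateEvaluation. In miniature:

package main

import "fmt"

type alertState struct {
	uid     string
	results []int // evaluation history, excluded from equality
}

func (a alertState) equals(b alertState) bool {
	return a.uid == b.uid // histories are intentionally not compared
}

func main() {
	restored := alertState{uid: "test_uid"}                       // warmed from DB: empty history
	expected := alertState{uid: "test_uid", results: []int{1, 2}} // fixture with history
	fmt.Println(restored.equals(expected))                        // true
}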


@ -1,13 +1,12 @@
package state
import (
"fmt"
"testing"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/models"
@ -34,22 +33,31 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 0,
expectedResultCount: 1,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
LastEvaluationTime: evaluationTime,
},
},
},
@ -58,27 +66,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Alerting,
expectedReturnedStateCount: 1,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []eval.State{eval.Normal, eval.Alerting},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
},
StartsAt: evaluationTime.Add(1 * time.Minute),
EndsAt: evaluationTime.Add(100 * time.Second),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@ -87,27 +105,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 1,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Alerting, eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: evaluationTime.Add(1 * time.Minute),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@ -116,27 +144,37 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Alerting,
expectedReturnedStateCount: 0,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []eval.State{eval.Alerting, eval.Alerting},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Alerting,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
},
StartsAt: time.Time{},
EndsAt: evaluationTime.Add(100 * time.Second),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
@ -145,64 +183,103 @@ func TestProcessEvalResults(t *testing.T) {
uid: "test_uid",
evalResults: eval.Results{
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime,
},
eval.Result{
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Instance: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
EvaluatedAt: evaluationTime.Add(1 * time.Minute),
},
},
condition: models.Condition{
Condition: "A",
OrgID: 123,
},
expectedState: eval.Normal,
expectedReturnedStateCount: 0,
expectedResultCount: 2,
expectedCacheEntries: []AlertState{
{
UID: "test_uid",
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []eval.State{eval.Normal, eval.Normal},
StartsAt: strfmt.DateTime{},
EndsAt: strfmt.DateTime{},
EvaluatedAt: strfmt.DateTime(evaluationTime),
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid label1=value1, label2=value2",
Labels: data.Labels{"label1": "value1", "label2": "value2"},
State: eval.Normal,
Results: []StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
{EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
},
StartsAt: time.Time{},
EndsAt: time.Time{},
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
},
},
},
}
for _, tc := range testCases {
t.Run("the correct number of entries are added to the cache", func(t *testing.T) {
t.Run("all fields for a cache entry are set correctly", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
if !entry.Equals(st.Get(entry.CacheId)) {
t.Log(tc.desc)
printEntryDiff(entry, st.Get(entry.CacheId), t)
}
assert.True(t, entry.Equals(st.Get(entry.CacheId)))
}
})
t.Run("the expected number of entries are added to the cache", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
assert.Equal(t, len(tc.expectedCacheEntries), len(st.stateCache.cacheMap))
})
t.Run("the correct state is set for each evaluation result", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
testState := st.get(entry.CacheId)
assert.Equal(t, tc.expectedState, testState.State)
}
})
t.Run("the correct number of states are returned to the caller", func(t *testing.T) {
//This test, as configured, does not quite represent the behavior of the system.
//It is expected that each batch of evaluation results will have only one result
//for a unique set of labels.
t.Run("the expected number of states are returned to the caller", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
results := st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
assert.Equal(t, tc.expectedReturnedStateCount, len(results))
})
t.Run("the correct results are set for each cache entry", func(t *testing.T) {
st := NewStateTracker(log.New("test_state_tracker"))
_ = st.ProcessEvalResults(tc.uid, tc.evalResults, tc.condition)
for _, entry := range tc.expectedCacheEntries {
testState := st.get(entry.CacheId)
assert.Equal(t, len(entry.Results), len(testState.Results))
for i, res := range entry.Results {
assert.Equal(t, res, testState.Results[i])
}
}
assert.Equal(t, len(tc.evalResults), len(results))
})
}
}
func printEntryDiff(a, b AlertState, t *testing.T) {
if a.UID != b.UID {
t.Log(fmt.Sprintf("%v \t %v\n", a.UID, b.UID))
}
if a.OrgID != b.OrgID {
t.Log(fmt.Sprintf("%v \t %v\n", a.OrgID, b.OrgID))
}
if a.CacheId != b.CacheId {
t.Log(fmt.Sprintf("%v \t %v\n", a.CacheId, b.CacheId))
}
if !a.Labels.Equals(b.Labels) {
t.Log(fmt.Sprintf("%v \t %v\n", a.Labels, b.Labels))
}
if a.StartsAt != b.StartsAt {
t.Log(fmt.Sprintf("%v \t %v\n", a.StartsAt, b.StartsAt))
}
if a.EndsAt != b.EndsAt {
t.Log(fmt.Sprintf("%v \t %v\n", a.EndsAt, b.EndsAt))
}
if a.LastEvaluationTime != b.LastEvaluationTime {
t.Log(fmt.Sprintf("%v \t %v\n", a.LastEvaluationTime, b.LastEvaluationTime))
}
if len(a.Results) != len(b.Results) {
t.Log(fmt.Sprintf("a: %d b: %d", len(a.Results), len(b.Results)))
t.Log("a")
for i := 0; i < len(a.Results); i++ {
t.Log(fmt.Sprintf("%v\n", a.Results[i]))
}
t.Log("b")
for i := 0; i < len(b.Results); i++ {
t.Log(fmt.Sprintf("%v\n", b.Results[i]))
}
}
}


@ -35,6 +35,7 @@ type Store interface {
SaveAlertInstance(*models.SaveAlertInstanceCommand) error
ValidateAlertDefinition(*models.AlertDefinition, bool) error
UpdateAlertDefinitionPaused(*models.UpdateAlertDefinitionPausedCommand) error
FetchOrgIds(cmd *models.FetchUniqueOrgIdsQuery) error
}
// AlertingStore is the database interface used by the Alertmanager service.


@ -94,6 +94,7 @@ func AlertInstanceMigration(mg *migrator.Migrator) {
{Name: "labels_hash", Type: migrator.DB_NVarchar, Length: 190, Nullable: false},
{Name: "current_state", Type: migrator.DB_NVarchar, Length: 190, Nullable: false},
{Name: "current_state_since", Type: migrator.DB_BigInt, Nullable: false},
{Name: "current_state_end", Type: migrator.DB_BigInt, Nullable: false},
{Name: "last_eval_time", Type: migrator.DB_BigInt, Nullable: false},
},
PrimaryKeys: []string{"def_org_id", "def_uid", "labels_hash"},
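
For reference, the table shape after this migration, rendered as approximate DDL (the migrator emits dialect-specific SQL, and the types of columns not shown in this hunk are inferred from the surrounding code, so treat this as a sketch):

package main

import "fmt"

const approxDDL = `
CREATE TABLE alert_instance (
    def_org_id          BIGINT        NOT NULL,
    def_uid             NVARCHAR(40)  NOT NULL, -- length inferred
    labels              TEXT          NOT NULL,
    labels_hash         NVARCHAR(190) NOT NULL,
    current_state       NVARCHAR(190) NOT NULL,
    current_state_since BIGINT        NOT NULL, -- unix seconds
    current_state_end   BIGINT        NOT NULL, -- new in this commit
    last_eval_time      BIGINT        NOT NULL, -- unix seconds
    PRIMARY KEY (def_org_id, def_uid, labels_hash)
);`

func main() { fmt.Print(approxDDL) }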


@ -91,7 +91,8 @@ func (st DBstore) SaveAlertInstance(cmd *models.SaveAlertInstanceCommand) error
Labels: cmd.Labels,
LabelsHash: labelsHash,
CurrentState: cmd.State,
CurrentStateSince: TimeNow(),
CurrentStateSince: cmd.CurrentStateSince,
CurrentStateEnd: cmd.CurrentStateEnd,
LastEvalTime: cmd.LastEvalTime,
}
@ -99,12 +100,12 @@ func (st DBstore) SaveAlertInstance(cmd *models.SaveAlertInstanceCommand) error
return err
}
params := append(make([]interface{}, 0), alertInstance.DefinitionOrgID, alertInstance.DefinitionUID, labelTupleJSON, alertInstance.LabelsHash, alertInstance.CurrentState, alertInstance.CurrentStateSince.Unix(), alertInstance.LastEvalTime.Unix())
params := append(make([]interface{}, 0), alertInstance.DefinitionOrgID, alertInstance.DefinitionUID, labelTupleJSON, alertInstance.LabelsHash, alertInstance.CurrentState, alertInstance.CurrentStateSince.Unix(), alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix())
upsertSQL := st.SQLStore.Dialect.UpsertSQL(
"alert_instance",
[]string{"def_org_id", "def_uid", "labels_hash"},
[]string{"def_org_id", "def_uid", "labels", "labels_hash", "current_state", "current_state_since", "last_eval_time"})
[]string{"def_org_id", "def_uid", "labels", "labels_hash", "current_state", "current_state_since", "current_state_end", "last_eval_time"})
_, err = sess.SQL(upsertSQL, params...).Query()
if err != nil {
return err
@ -113,3 +114,26 @@ func (st DBstore) SaveAlertInstance(cmd *models.SaveAlertInstanceCommand) error
return nil
})
}
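
The upsert's placeholders are positional, so the params slice and the column list must stay in lockstep; note that current_state_end occupies the same slot in both (between current_state_since and last_eval_time). A self-contained illustration with made-up values:

package main

import "fmt"

func main() {
	columns := []string{
		"def_org_id", "def_uid", "labels", "labels_hash",
		"current_state", "current_state_since", "current_state_end",
		"last_eval_time",
	}
	params := []interface{}{
		int64(123), "test_uid", `{"label1":"value1"}`, "hash",
		"Alerting", int64(1616630400), int64(1616630440), int64(1616630400),
	}
	// Each column pairs with the parameter at the same index.
	for i, c := range columns {
		fmt.Printf("%-20s <- %v\n", c, params[i])
	}
}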
func (st DBstore) FetchOrgIds(cmd *models.FetchUniqueOrgIdsQuery) error {
return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
orgIds := make([]*models.FetchUniqueOrgIdsQueryResult, 0)
s := strings.Builder{}
params := make([]interface{}, 0)
addToQuery := func(stmt string, p ...interface{}) {
s.WriteString(stmt)
params = append(params, p...)
}
addToQuery("SELECT DISTINCT def_org_id FROM alert_instance")
if err := sess.SQL(s.String(), params...).Find(&orgIds); err != nil {
return err
}
cmd.Result = orgIds
return nil
})
}
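
Since the statement has no parameters, the strings.Builder and params scaffolding above is not strictly needed; a functionally equivalent sketch using only calls already present in this file:

func (st DBstore) FetchOrgIds(cmd *models.FetchUniqueOrgIdsQuery) error {
	return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
		orgIds := make([]*models.FetchUniqueOrgIdsQueryResult, 0)
		// Constant query, no placeholders needed.
		if err := sess.SQL("SELECT DISTINCT def_org_id FROM alert_instance").Find(&orgIds); err != nil {
			return err
		}
		cmd.Result = orgIds
		return nil
	})
}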


@ -8,6 +8,9 @@ import (
"testing"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/infra/log"
@ -27,6 +30,81 @@ type evalAppliedInfo struct {
now time.Time
}
func TestWarmStateCache(t *testing.T) {
evaluationTime, _ := time.Parse("2006-01-02", "2021-03-25")
expectedEntries := []state.AlertState{
{
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid test1=testValue1",
Labels: data.Labels{"test1": "testValue1"},
State: eval.Normal,
Results: []state.StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
},
StartsAt: evaluationTime.Add(-1 * time.Minute),
EndsAt: evaluationTime.Add(1 * time.Minute),
LastEvaluationTime: evaluationTime,
}, {
UID: "test_uid",
OrgID: 123,
CacheId: "test_uid test2=testValue2",
Labels: data.Labels{"test2": "testValue2"},
State: eval.Alerting,
Results: []state.StateEvaluation{
{EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
},
StartsAt: evaluationTime.Add(-1 * time.Minute),
EndsAt: evaluationTime.Add(1 * time.Minute),
LastEvaluationTime: evaluationTime,
},
}
dbstore := setupTestEnv(t, 1)
saveCmd1 := &models.SaveAlertInstanceCommand{
DefinitionOrgID: 123,
DefinitionUID: "test_uid",
Labels: models.InstanceLabels{"test1": "testValue1"},
State: models.InstanceStateNormal,
LastEvalTime: evaluationTime,
CurrentStateSince: evaluationTime.Add(-1 * time.Minute),
CurrentStateEnd: evaluationTime.Add(1 * time.Minute),
}
_ = dbstore.SaveAlertInstance(saveCmd1)
saveCmd2 := &models.SaveAlertInstanceCommand{
DefinitionOrgID: 123,
DefinitionUID: "test_uid",
Labels: models.InstanceLabels{"test2": "testValue2"},
State: models.InstanceStateFiring,
LastEvalTime: evaluationTime,
CurrentStateSince: evaluationTime.Add(-1 * time.Minute),
CurrentStateEnd: evaluationTime.Add(1 * time.Minute),
}
_ = dbstore.SaveAlertInstance(saveCmd2)
t.Cleanup(registry.ClearOverrides)
schedCfg := schedule.SchedulerCfg{
C: clock.NewMock(),
BaseInterval: time.Second,
Logger: log.New("ngalert cache warming test"),
Store: dbstore,
}
sched := schedule.NewScheduler(schedCfg, nil)
st := state.NewStateTracker(schedCfg.Logger)
sched.WarmStateCache(st)
t.Run("instance cache has expected entries", func(t *testing.T) {
for _, entry := range expectedEntries {
cacheEntry := st.Get(entry.CacheId)
assert.True(t, entry.Equals(cacheEntry))
}
})
}
func TestAlertingTicker(t *testing.T) {
dbstore := setupTestEnv(t, 1)
t.Cleanup(registry.ClearOverrides)
@ -44,7 +122,7 @@ func TestAlertingTicker(t *testing.T) {
mockedClock := clock.NewMock()
baseInterval := time.Second
schefCfg := schedule.SchedulerCfg{
schedCfg := schedule.SchedulerCfg{
C: mockedClock,
BaseInterval: baseInterval,
EvalAppliedFunc: func(alertDefKey models.AlertDefinitionKey, now time.Time) {
@ -56,11 +134,11 @@ func TestAlertingTicker(t *testing.T) {
Store: dbstore,
Logger: log.New("ngalert schedule test"),
}
sched := schedule.NewScheduler(schefCfg, nil)
sched := schedule.NewScheduler(schedCfg, nil)
ctx := context.Background()
st := state.NewStateTracker(schefCfg.Logger)
st := state.NewStateTracker(schedCfg.Logger)
go func() {
err := sched.Ticker(ctx, st)
require.NoError(t, err)