mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Write and Delete multiple alert instances. (#55350)
Prior to this change, all alert instance writes and deletes happened
individually, in their own database transaction. This change batches up
writes or deletes for a given rule's evaluation loop into a single
transaction before applying it.
These new transactions are off by default, guarded by the feature toggle "alertingBigTransactions"
Before:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8 398 2991381 ns/op 1133537 B/op 27703 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
util.go:127: alert definition: {orgID: 1, UID: FovKXiRVzm} with title: "an alert definition FTvFXmRVkz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: foDFXmRVkm} with title: "an alert definition fovFXmRVkz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: VQvFuigVkm} with title: "an alert definition VwDKXmR4kz" interval: 60 created
PASS
ok github.com/grafana/grafana/pkg/services/ngalert/store 1.619s
```
After:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8 1440 816484 ns/op 352297 B/op 6529 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
util.go:127: alert definition: {orgID: 1, UID: 302r_igVzm} with title: "an alert definition q0h9lmR4zz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: 71hrlmR4km} with title: "an alert definition nJ29_mR4zz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: Cahr_mR4zm} with title: "an alert definition ja2rlmg4zz" interval: 60 created
PASS
ok github.com/grafana/grafana/pkg/services/ngalert/store 1.383s
```
So we cut time by about 75% and memory allocations by about 60% when
storing and deleting 100 instances.
This commit is contained in:
@@ -177,11 +177,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
|
||||
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
|
||||
if len(states) > 0 {
|
||||
logger.Debug("saving new states to the database", "count", len(states))
|
||||
for _, state := range states {
|
||||
if err := st.saveState(ctx, state); err != nil {
|
||||
logger.Error("failed to save alert state", "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error())
|
||||
}
|
||||
}
|
||||
_, _ = st.saveAlertStates(ctx, states...)
|
||||
}
|
||||
return append(states, resolvedStates...)
|
||||
}
|
||||
@@ -310,18 +306,52 @@ func (st *Manager) Put(states []*State) {
|
||||
}
|
||||
}
|
||||
|
||||
func (st *Manager) saveState(ctx context.Context, s *State) error {
|
||||
cmd := ngModels.SaveAlertInstanceCommand{
|
||||
RuleOrgID: s.OrgID,
|
||||
RuleUID: s.AlertRuleUID,
|
||||
Labels: ngModels.InstanceLabels(s.Labels),
|
||||
State: ngModels.InstanceStateType(s.State.String()),
|
||||
StateReason: s.StateReason,
|
||||
LastEvalTime: s.LastEvaluationTime,
|
||||
CurrentStateSince: s.StartsAt,
|
||||
CurrentStateEnd: s.EndsAt,
|
||||
// TODO: Is the `State` type necessary? Should it embed the instance?
|
||||
func (st *Manager) saveAlertStates(ctx context.Context, states ...*State) (saved, failed int) {
|
||||
st.log.Debug("saving alert states", "count", len(states))
|
||||
instances := make([]ngModels.AlertInstance, 0, len(states))
|
||||
|
||||
type debugInfo struct {
|
||||
OrgID int64
|
||||
Uid string
|
||||
State string
|
||||
Labels string
|
||||
}
|
||||
return st.instanceStore.SaveAlertInstance(ctx, &cmd)
|
||||
debug := make([]debugInfo, 0)
|
||||
|
||||
for _, s := range states {
|
||||
labels := ngModels.InstanceLabels(s.Labels)
|
||||
_, hash, err := labels.StringAndHash()
|
||||
if err != nil {
|
||||
debug = append(debug, debugInfo{s.OrgID, s.AlertRuleUID, s.State.String(), s.Labels.String()})
|
||||
st.log.Error("failed to save alert instance with invalid labels", "orgID", s.OrgID, "ruleUID", s.AlertRuleUID, "err", err)
|
||||
continue
|
||||
}
|
||||
fields := ngModels.AlertInstance{
|
||||
AlertInstanceKey: ngModels.AlertInstanceKey{
|
||||
RuleOrgID: s.OrgID,
|
||||
RuleUID: s.AlertRuleUID,
|
||||
LabelsHash: hash,
|
||||
},
|
||||
Labels: ngModels.InstanceLabels(s.Labels),
|
||||
CurrentState: ngModels.InstanceStateType(s.State.String()),
|
||||
CurrentReason: s.StateReason,
|
||||
LastEvalTime: s.LastEvaluationTime,
|
||||
CurrentStateSince: s.StartsAt,
|
||||
CurrentStateEnd: s.EndsAt,
|
||||
}
|
||||
instances = append(instances, fields)
|
||||
}
|
||||
|
||||
if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
|
||||
for _, inst := range instances {
|
||||
debug = append(debug, debugInfo{inst.RuleOrgID, inst.RuleUID, string(inst.CurrentState), data.Labels(inst.Labels).String()})
|
||||
}
|
||||
st.log.Error("failed to save alert states", "states", debug, "err", err)
|
||||
return 0, len(debug)
|
||||
}
|
||||
|
||||
return len(instances), len(debug)
|
||||
}
|
||||
|
||||
// TODO: why wouldn't you allow other types like NoData or Error?
|
||||
@@ -353,9 +383,11 @@ func (i InstanceStateAndReason) String() string {
|
||||
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State {
|
||||
var resolvedStates []*State
|
||||
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
|
||||
toDelete := make([]ngModels.AlertInstanceKey, 0)
|
||||
|
||||
for _, s := range allStates {
|
||||
_, ok := states[s.CacheId]
|
||||
if !ok && isItStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
|
||||
// Is the cached state in our recently processed results? If not, is it stale?
|
||||
if _, ok := states[s.CacheId]; !ok && stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
|
||||
st.log.Debug("removing stale state entry", "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID, "cacheID", s.CacheId)
|
||||
st.cache.deleteEntry(s.OrgID, s.AlertRuleUID, s.CacheId)
|
||||
ilbs := ngModels.InstanceLabels(s.Labels)
|
||||
@@ -364,9 +396,7 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
|
||||
st.log.Error("unable to get labelsHash", "err", err.Error(), "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID)
|
||||
}
|
||||
|
||||
if err = st.instanceStore.DeleteAlertInstance(ctx, s.OrgID, s.AlertRuleUID, labelsHash); err != nil {
|
||||
st.log.Error("unable to delete stale instance from database", "err", err.Error(), "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID, "cacheID", s.CacheId)
|
||||
}
|
||||
toDelete = append(toDelete, ngModels.AlertInstanceKey{RuleOrgID: s.OrgID, RuleUID: s.AlertRuleUID, LabelsHash: labelsHash})
|
||||
|
||||
if s.State == eval.Alerting {
|
||||
previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
|
||||
@@ -382,9 +412,14 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...); err != nil {
|
||||
st.log.Error("unable to delete stale instances from database", "err", err.Error(),
|
||||
"orgID", alertRule.OrgID, "alertRuleUID", alertRule.UID, "count", len(toDelete))
|
||||
}
|
||||
return resolvedStates
|
||||
}
|
||||
|
||||
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
|
||||
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
|
||||
return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt)
|
||||
}
|
||||
|
||||
@@ -106,7 +106,7 @@ func Test_maybeNewImage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsItStale(t *testing.T) {
|
||||
func TestStateIsStale(t *testing.T) {
|
||||
now := time.Now()
|
||||
intervalSeconds := rand.Int63n(10) + 5
|
||||
|
||||
@@ -143,7 +143,7 @@ func TestIsItStale(t *testing.T) {
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
require.Equal(t, tc.expectedResult, isItStale(now, tc.lastEvaluation, intervalSeconds))
|
||||
require.Equal(t, tc.expectedResult, stateIsStale(now, tc.lastEvaluation, intervalSeconds))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2019,10 +2019,10 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
|
||||
require.NotEmpty(t, states)
|
||||
|
||||
savedStates := make(map[string]models.SaveAlertInstanceCommand)
|
||||
savedStates := make(map[string]models.AlertInstance)
|
||||
for _, op := range instanceStore.RecordedOps {
|
||||
switch q := op.(type) {
|
||||
case models.SaveAlertInstanceCommand:
|
||||
case models.AlertInstance:
|
||||
cacheId, err := q.Labels.StringKey()
|
||||
require.NoError(t, err)
|
||||
savedStates[cacheId] = q
|
||||
@@ -2055,28 +2055,39 @@ func TestStaleResultsHandler(t *testing.T) {
|
||||
const mainOrgID int64 = 1
|
||||
rule := tests.CreateTestAlertRule(t, ctx, dbstore, int64(interval.Seconds()), mainOrgID)
|
||||
lastEval := evaluationTime.Add(-2 * interval)
|
||||
saveCmd1 := &models.SaveAlertInstanceCommand{
|
||||
RuleOrgID: rule.OrgID,
|
||||
RuleUID: rule.UID,
|
||||
Labels: models.InstanceLabels{"test1": "testValue1"},
|
||||
State: models.InstanceStateNormal,
|
||||
LastEvalTime: lastEval,
|
||||
CurrentStateSince: lastEval,
|
||||
CurrentStateEnd: lastEval.Add(3 * interval),
|
||||
|
||||
labels1 := models.InstanceLabels{"test1": "testValue1"}
|
||||
_, hash1, _ := labels1.StringAndHash()
|
||||
labels2 := models.InstanceLabels{"test2": "testValue2"}
|
||||
_, hash2, _ := labels2.StringAndHash()
|
||||
instances := []models.AlertInstance{
|
||||
{
|
||||
AlertInstanceKey: models.AlertInstanceKey{
|
||||
RuleOrgID: rule.OrgID,
|
||||
RuleUID: rule.UID,
|
||||
LabelsHash: hash1,
|
||||
},
|
||||
CurrentState: models.InstanceStateNormal,
|
||||
Labels: labels1,
|
||||
LastEvalTime: lastEval,
|
||||
CurrentStateSince: lastEval,
|
||||
CurrentStateEnd: lastEval.Add(3 * interval),
|
||||
},
|
||||
{
|
||||
AlertInstanceKey: models.AlertInstanceKey{
|
||||
RuleOrgID: rule.OrgID,
|
||||
RuleUID: rule.UID,
|
||||
LabelsHash: hash2,
|
||||
},
|
||||
CurrentState: models.InstanceStateFiring,
|
||||
Labels: labels2,
|
||||
LastEvalTime: lastEval,
|
||||
CurrentStateSince: lastEval,
|
||||
CurrentStateEnd: lastEval.Add(3 * interval),
|
||||
},
|
||||
}
|
||||
|
||||
_ = dbstore.SaveAlertInstance(ctx, saveCmd1)
|
||||
|
||||
saveCmd2 := &models.SaveAlertInstanceCommand{
|
||||
RuleOrgID: rule.OrgID,
|
||||
RuleUID: rule.UID,
|
||||
Labels: models.InstanceLabels{"test2": "testValue2"},
|
||||
State: models.InstanceStateFiring,
|
||||
LastEvalTime: lastEval,
|
||||
CurrentStateSince: lastEval,
|
||||
CurrentStateEnd: lastEval.Add(3 * interval),
|
||||
}
|
||||
_ = dbstore.SaveAlertInstance(ctx, saveCmd2)
|
||||
_ = dbstore.SaveAlertInstances(ctx, instances...)
|
||||
|
||||
testCases := []struct {
|
||||
desc string
|
||||
|
||||
@@ -12,8 +12,8 @@ import (
|
||||
type InstanceStore interface {
|
||||
FetchOrgIds(ctx context.Context) ([]int64, error)
|
||||
ListAlertInstances(ctx context.Context, cmd *models.ListAlertInstancesQuery) error
|
||||
SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error
|
||||
DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error
|
||||
SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error
|
||||
DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error
|
||||
DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,8 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
)
|
||||
|
||||
var _ InstanceStore = &FakeInstanceStore{}
|
||||
|
||||
type FakeInstanceStore struct {
|
||||
mtx sync.Mutex
|
||||
RecordedOps []interface{}
|
||||
@@ -21,16 +23,18 @@ func (f *FakeInstanceStore) ListAlertInstances(_ context.Context, q *models.List
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, q *models.SaveAlertInstanceCommand) error {
|
||||
func (f *FakeInstanceStore) SaveAlertInstances(_ context.Context, q ...models.AlertInstance) error {
|
||||
f.mtx.Lock()
|
||||
defer f.mtx.Unlock()
|
||||
f.RecordedOps = append(f.RecordedOps, *q)
|
||||
for _, inst := range q {
|
||||
f.RecordedOps = append(f.RecordedOps, inst)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *FakeInstanceStore) FetchOrgIds(_ context.Context) ([]int64, error) { return []int64{}, nil }
|
||||
|
||||
func (f *FakeInstanceStore) DeleteAlertInstance(_ context.Context, _ int64, _, _ string) error {
|
||||
func (f *FakeInstanceStore) DeleteAlertInstances(_ context.Context, _ ...models.AlertInstanceKey) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user