Alerting: Write and Delete multiple alert instances. (#55350)

Prior to this change, every alert instance write and delete happened
individually, each in its own database transaction. This change batches
the writes or deletes produced by a given rule's evaluation loop into a
single transaction before applying it.

These batched transactions are off by default, guarded by the feature
toggle "alertingBigTransactions".
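For anyone who wants to try the batched path, the toggle should be enabled
the same way as any other Grafana feature toggle. A minimal sketch of the
relevant custom.ini/grafana.ini section (assuming file-based configuration;
adjust for env-var or provisioned setups) looks like:

```
[feature_toggles]
# Assumption: enabled like any other toggle; "enable" takes a
# comma-separated list of toggle names.
enable = alertingBigTransactions
```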

Before:

```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8           398           2991381 ns/op         1133537 B/op      27703 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
    util.go:127: alert definition: {orgID: 1, UID: FovKXiRVzm} with title: "an alert definition FTvFXmRVkz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: foDFXmRVkm} with title: "an alert definition fovFXmRVkz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: VQvFuigVkm} with title: "an alert definition VwDKXmR4kz" interval: 60 created
PASS
ok      github.com/grafana/grafana/pkg/services/ngalert/store   1.619s
```

After:

```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8          1440            816484 ns/op          352297 B/op       6529 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
    util.go:127: alert definition: {orgID: 1, UID: 302r_igVzm} with title: "an alert definition q0h9lmR4zz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: 71hrlmR4km} with title: "an alert definition nJ29_mR4zz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: Cahr_mR4zm} with title: "an alert definition ja2rlmg4zz" interval: 60 created
PASS
ok      github.com/grafana/grafana/pkg/services/ngalert/store   1.383s
```

So for storing and deleting 100 instances, this cuts time per operation
by roughly 73% (2,991,381 ns/op to 816,484 ns/op) and allocations by
roughly 76% (27,703 to 6,529 allocs/op), with allocated bytes down by
about 69%.
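The numbers above should be reproducible with something like the command
below (assuming the benchmark still lives in that package; it may also
need the repo's test-database setup for the store tests):

```
go test -run=^$ -bench=BenchmarkAlertInstanceOperations -benchmem \
  ./pkg/services/ngalert/store/...
```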
Author: Joe Blubaugh
Date:   2022-10-06 14:22:58 +08:00 (committed via GitHub)
Commit: b476ae62fb (parent b4e23e5d32)
21 changed files with 606 additions and 193 deletions

@@ -177,11 +177,7 @@ func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time
resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults)
if len(states) > 0 {
logger.Debug("saving new states to the database", "count", len(states))
for _, state := range states {
if err := st.saveState(ctx, state); err != nil {
logger.Error("failed to save alert state", "labels", state.Labels.String(), "state", state.State.String(), "err", err.Error())
}
}
_, _ = st.saveAlertStates(ctx, states...)
}
return append(states, resolvedStates...)
}
@@ -310,18 +306,52 @@ func (st *Manager) Put(states []*State) {
}
}
func (st *Manager) saveState(ctx context.Context, s *State) error {
cmd := ngModels.SaveAlertInstanceCommand{
RuleOrgID: s.OrgID,
RuleUID: s.AlertRuleUID,
Labels: ngModels.InstanceLabels(s.Labels),
State: ngModels.InstanceStateType(s.State.String()),
StateReason: s.StateReason,
LastEvalTime: s.LastEvaluationTime,
CurrentStateSince: s.StartsAt,
CurrentStateEnd: s.EndsAt,
}
return st.instanceStore.SaveAlertInstance(ctx, &cmd)
}

// TODO: Is the `State` type necessary? Should it embed the instance?
func (st *Manager) saveAlertStates(ctx context.Context, states ...*State) (saved, failed int) {
st.log.Debug("saving alert states", "count", len(states))
instances := make([]ngModels.AlertInstance, 0, len(states))
type debugInfo struct {
OrgID int64
Uid string
State string
Labels string
}
debug := make([]debugInfo, 0)
for _, s := range states {
labels := ngModels.InstanceLabels(s.Labels)
_, hash, err := labels.StringAndHash()
if err != nil {
debug = append(debug, debugInfo{s.OrgID, s.AlertRuleUID, s.State.String(), s.Labels.String()})
st.log.Error("failed to save alert instance with invalid labels", "orgID", s.OrgID, "ruleUID", s.AlertRuleUID, "err", err)
continue
}
fields := ngModels.AlertInstance{
AlertInstanceKey: ngModels.AlertInstanceKey{
RuleOrgID: s.OrgID,
RuleUID: s.AlertRuleUID,
LabelsHash: hash,
},
Labels: ngModels.InstanceLabels(s.Labels),
CurrentState: ngModels.InstanceStateType(s.State.String()),
CurrentReason: s.StateReason,
LastEvalTime: s.LastEvaluationTime,
CurrentStateSince: s.StartsAt,
CurrentStateEnd: s.EndsAt,
}
instances = append(instances, fields)
}
if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
for _, inst := range instances {
debug = append(debug, debugInfo{inst.RuleOrgID, inst.RuleUID, string(inst.CurrentState), data.Labels(inst.Labels).String()})
}
st.log.Error("failed to save alert states", "states", debug, "err", err)
return 0, len(debug)
}
return len(instances), len(debug)
}
// TODO: why wouldn't you allow other types like NoData or Error?
@@ -353,9 +383,11 @@ func (i InstanceStateAndReason) String() string {
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State) []*State {
var resolvedStates []*State
allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
toDelete := make([]ngModels.AlertInstanceKey, 0)
for _, s := range allStates {
_, ok := states[s.CacheId]
if !ok && isItStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
// Is the cached state in our recently processed results? If not, is it stale?
if _, ok := states[s.CacheId]; !ok && stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
st.log.Debug("removing stale state entry", "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID, "cacheID", s.CacheId)
st.cache.deleteEntry(s.OrgID, s.AlertRuleUID, s.CacheId)
ilbs := ngModels.InstanceLabels(s.Labels)
@@ -364,9 +396,7 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
st.log.Error("unable to get labelsHash", "err", err.Error(), "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID)
}
if err = st.instanceStore.DeleteAlertInstance(ctx, s.OrgID, s.AlertRuleUID, labelsHash); err != nil {
st.log.Error("unable to delete stale instance from database", "err", err.Error(), "orgID", s.OrgID, "alertRuleUID", s.AlertRuleUID, "cacheID", s.CacheId)
}
toDelete = append(toDelete, ngModels.AlertInstanceKey{RuleOrgID: s.OrgID, RuleUID: s.AlertRuleUID, LabelsHash: labelsHash})
if s.State == eval.Alerting {
previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
@@ -382,9 +412,14 @@ func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Tim
}
}
}
if err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...); err != nil {
st.log.Error("unable to delete stale instances from database", "err", err.Error(),
"orgID", alertRule.OrgID, "alertRuleUID", alertRule.UID, "count", len(toDelete))
}
return resolvedStates
}
func isItStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt)
}

@@ -106,7 +106,7 @@ func Test_maybeNewImage(t *testing.T) {
}
}
func TestIsItStale(t *testing.T) {
func TestStateIsStale(t *testing.T) {
now := time.Now()
intervalSeconds := rand.Int63n(10) + 5
@@ -143,7 +143,7 @@ func TestIsItStale(t *testing.T) {
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
require.Equal(t, tc.expectedResult, isItStale(now, tc.lastEvaluation, intervalSeconds))
require.Equal(t, tc.expectedResult, stateIsStale(now, tc.lastEvaluation, intervalSeconds))
})
}
}

@@ -2019,10 +2019,10 @@ func TestProcessEvalResults(t *testing.T) {
require.NotEmpty(t, states)
savedStates := make(map[string]models.SaveAlertInstanceCommand)
savedStates := make(map[string]models.AlertInstance)
for _, op := range instanceStore.RecordedOps {
switch q := op.(type) {
case models.SaveAlertInstanceCommand:
case models.AlertInstance:
cacheId, err := q.Labels.StringKey()
require.NoError(t, err)
savedStates[cacheId] = q
@@ -2055,28 +2055,39 @@ func TestStaleResultsHandler(t *testing.T) {
const mainOrgID int64 = 1
rule := tests.CreateTestAlertRule(t, ctx, dbstore, int64(interval.Seconds()), mainOrgID)
lastEval := evaluationTime.Add(-2 * interval)
saveCmd1 := &models.SaveAlertInstanceCommand{
RuleOrgID: rule.OrgID,
RuleUID: rule.UID,
Labels: models.InstanceLabels{"test1": "testValue1"},
State: models.InstanceStateNormal,
LastEvalTime: lastEval,
CurrentStateSince: lastEval,
CurrentStateEnd: lastEval.Add(3 * interval),
}
_ = dbstore.SaveAlertInstance(ctx, saveCmd1)
saveCmd2 := &models.SaveAlertInstanceCommand{
RuleOrgID: rule.OrgID,
RuleUID: rule.UID,
Labels: models.InstanceLabels{"test2": "testValue2"},
State: models.InstanceStateFiring,
LastEvalTime: lastEval,
CurrentStateSince: lastEval,
CurrentStateEnd: lastEval.Add(3 * interval),
}
_ = dbstore.SaveAlertInstance(ctx, saveCmd2)

labels1 := models.InstanceLabels{"test1": "testValue1"}
_, hash1, _ := labels1.StringAndHash()
labels2 := models.InstanceLabels{"test2": "testValue2"}
_, hash2, _ := labels2.StringAndHash()
instances := []models.AlertInstance{
{
AlertInstanceKey: models.AlertInstanceKey{
RuleOrgID: rule.OrgID,
RuleUID: rule.UID,
LabelsHash: hash1,
},
CurrentState: models.InstanceStateNormal,
Labels: labels1,
LastEvalTime: lastEval,
CurrentStateSince: lastEval,
CurrentStateEnd: lastEval.Add(3 * interval),
},
{
AlertInstanceKey: models.AlertInstanceKey{
RuleOrgID: rule.OrgID,
RuleUID: rule.UID,
LabelsHash: hash2,
},
CurrentState: models.InstanceStateFiring,
Labels: labels2,
LastEvalTime: lastEval,
CurrentStateSince: lastEval,
CurrentStateEnd: lastEval.Add(3 * interval),
},
}
_ = dbstore.SaveAlertInstances(ctx, instances...)
testCases := []struct {
desc string

@@ -12,8 +12,8 @@ import (
type InstanceStore interface {
FetchOrgIds(ctx context.Context) ([]int64, error)
ListAlertInstances(ctx context.Context, cmd *models.ListAlertInstancesQuery) error
SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error
DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error
SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error
DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error
DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error
}
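The SQL-store side of these two new interface methods is not part of this
excerpt. As a rough illustration of the idea in the commit message, a
batched SaveAlertInstances built directly on database/sql might look like
the sketch below. The sketchStore type, the alert_instance table, and the
column names are assumptions for illustration only; Grafana's real store
goes through its own session/dialect layer and may build multi-row
statements, but the point is the single BeginTx/Commit around all rows.
DeleteAlertInstances would follow the same pattern with batched DELETEs.

```
package store

import (
	"context"
	"database/sql"

	"github.com/grafana/grafana/pkg/services/ngalert/models"
)

// sketchStore is a hypothetical store used only for illustration;
// it is not Grafana's actual instance store.
type sketchStore struct {
	db *sql.DB
}

// SaveAlertInstances writes all instances inside one transaction, so a
// rule's evaluation loop issues a single commit instead of one per instance.
func (s *sketchStore) SaveAlertInstances(ctx context.Context, instances ...models.AlertInstance) error {
	if len(instances) == 0 {
		return nil
	}
	tx, err := s.db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	// Hypothetical table and columns; a real implementation would upsert.
	const insert = `INSERT INTO alert_instance
		(rule_org_id, rule_uid, labels, labels_hash, current_state, current_reason,
		 current_state_since, current_state_end, last_eval_time)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`
	for _, inst := range instances {
		labelsJSON, labelsHash, err := inst.Labels.StringAndHash()
		if err != nil {
			_ = tx.Rollback()
			return err
		}
		if _, err := tx.ExecContext(ctx, insert,
			inst.RuleOrgID, inst.RuleUID, labelsJSON, labelsHash,
			inst.CurrentState, inst.CurrentReason,
			inst.CurrentStateSince, inst.CurrentStateEnd, inst.LastEvalTime,
		); err != nil {
			_ = tx.Rollback()
			return err
		}
	}
	return tx.Commit()
}
```

Committing once per rule evaluation, rather than once per instance, is
where the time and allocation savings in the benchmark above come from.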

@@ -9,6 +9,8 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
var _ InstanceStore = &FakeInstanceStore{}
type FakeInstanceStore struct {
mtx sync.Mutex
RecordedOps []interface{}
@@ -21,16 +23,18 @@ func (f *FakeInstanceStore) ListAlertInstances(_ context.Context, q *models.List
return nil
}
func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, q *models.SaveAlertInstanceCommand) error {
func (f *FakeInstanceStore) SaveAlertInstances(_ context.Context, q ...models.AlertInstance) error {
f.mtx.Lock()
defer f.mtx.Unlock()
f.RecordedOps = append(f.RecordedOps, *q)
for _, inst := range q {
f.RecordedOps = append(f.RecordedOps, inst)
}
return nil
}
func (f *FakeInstanceStore) FetchOrgIds(_ context.Context) ([]int64, error) { return []int64{}, nil }
func (f *FakeInstanceStore) DeleteAlertInstance(_ context.Context, _ int64, _, _ string) error {
func (f *FakeInstanceStore) DeleteAlertInstances(_ context.Context, _ ...models.AlertInstanceKey) error {
return nil
}