Alerting: Add sync state persister to save entire state of the rule (#96628)

This commit is contained in:
Alexander Akhmetov
2025-01-20 12:12:27 +01:00
committed by GitHub
parent 63a68f3e99
commit 651430e34a
3 changed files with 217 additions and 0 deletions

View File

@@ -0,0 +1,80 @@
package state
import (
"context"
"time"
"go.opentelemetry.io/otel/trace"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
// SyncRuleStatePersister saves the state of a rule synchronously: its Sync
// method converts the rule's state transitions into alert instances and hands
// them to the store in one SaveAlertInstancesForRule call. Its Async method
// does no persistence work.
type SyncRuleStatePersister struct {
// log is the base logger; Sync derives a context-aware logger from it.
log log.Logger
// store receives the alert instances. When it is nil, Sync returns without
// doing anything.
store InstanceStore
// doNotSaveNormalState, when true, makes Sync skip states for which
// IsNormalStateWithNoReason reports true and that did not change.
doNotSaveNormalState bool
}
// NewSyncRuleStatePersisiter builds a SyncRuleStatePersister from the given
// logger and manager configuration and returns it as a StatePersister.
//
// The instance store and the doNotSaveNormalState flag are taken from cfg.
// The spelling of the function name is kept as-is because callers refer to it.
func NewSyncRuleStatePersisiter(log log.Logger, cfg ManagerCfg) StatePersister {
	persister := new(SyncRuleStatePersister)
	persister.log = log
	persister.store = cfg.InstanceStore
	persister.doNotSaveNormalState = cfg.DoNotSaveNormalState
	return persister
}
// Async does no persistence work for this persister: both arguments are
// ignored and the only effect is a debug log line.
func (a *SyncRuleStatePersister) Async(_ context.Context, _ AlertInstancesProvider) {
a.log.Debug("Async: No-Op")
}
// Sync writes the state of one rule to the instance store with a single
// SaveAlertInstancesForRule call.
//
// It returns immediately when no store is configured or when states is empty.
// A transition is left out of the saved set when:
//   - it is stale;
//   - doNotSaveNormalState is enabled, IsNormalStateWithNoReason reports true
//     for its state, and the transition did not change the state;
//   - no alert instance key can be built for it (this is logged as an error).
//
// A failed save is logged and not reported to the caller. On success an event
// is added to span.
func (a *SyncRuleStatePersister) Sync(ctx context.Context, span trace.Span, ruleKey models.AlertRuleKeyWithGroup, states StateTransitions) {
	if len(states) == 0 || a.store == nil {
		return
	}

	logger := a.log.FromContext(ctx)
	pending := make([]models.AlertInstance, 0, len(states))

	for _, transition := range states {
		// Both cases skip the transition; they are kept in this order so the
		// stale check always runs first.
		switch {
		case transition.IsStale():
			continue
		case a.doNotSaveNormalState && IsNormalStateWithNoReason(transition.State) && !transition.Changed():
			continue
		}

		instanceKey, keyErr := transition.GetAlertInstanceKey()
		if keyErr != nil {
			logger.Error("Failed to create a key for alert state to save it. The state will be ignored ", "cacheID", transition.CacheID, "error", keyErr, "labels", transition.Labels.String(), "rule_uid", ruleKey.UID, "rule_group", ruleKey.RuleGroup)
			continue
		}

		pending = append(pending, models.AlertInstance{
			AlertInstanceKey:  instanceKey,
			Labels:            models.InstanceLabels(transition.Labels),
			CurrentState:      models.InstanceStateType(transition.State.State.String()),
			CurrentReason:     transition.StateReason,
			LastEvalTime:      transition.LastEvaluationTime,
			CurrentStateSince: transition.StartsAt,
			CurrentStateEnd:   transition.EndsAt,
			ResolvedAt:        transition.ResolvedAt,
			LastSentAt:        transition.LastSentAt,
			ResultFingerprint: transition.ResultFingerprint.String(),
		})
	}

	// The timer starts before the first log line so the reported duration
	// covers the whole save step.
	begin := time.Now()
	logger.Debug("Saving alert states", "count", len(pending))
	if saveErr := a.store.SaveAlertInstancesForRule(ctx, ruleKey, pending); saveErr != nil {
		logger.Error("Failed to save alert rule state", "error", saveErr, "duration", time.Since(begin))
		return
	}

	logger.Debug("Saving alert states done", "count", len(pending), "duration", time.Since(begin))
	span.AddEvent("saved alert rule state")
}

View File

@@ -0,0 +1,126 @@
package state
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
"go.opentelemetry.io/otel"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
// TestSyncRuleStatePersister_Sync checks that Sync forwards a rule's states to
// the store as exactly one SaveAlertInstancesForRule call carrying the rule
// key and the converted alert instances, and that Sync does nothing (and does
// not panic) when the persister has no store.
func TestSyncRuleStatePersister_Sync(t *testing.T) {
	const orgID = int64(1)
	const ruleUID = "rule-id"
	const ruleGroup = "test-group"

	testCases := []struct {
		name    string
		states  StateTransitions
		ruleKey models.AlertRuleKeyWithGroup
		// doNotSaveNormalState is copied onto the persister under test.
		// NOTE: the expected instances below are built from every entry in
		// states, so a case that enables this flag must only contain states
		// that Sync does not filter out.
		doNotSaveNormalState bool
	}{
		{
			name: "success case",
			states: StateTransitions{
				{
					State: &State{
						Labels: data.Labels{
							"label-1": "value-1",
						},
						LastEvaluationTime: time.Now(),
						StartsAt:           time.Now(),
						EndsAt:             time.Now(),
					},
				},
			},
			ruleKey: models.AlertRuleKeyWithGroup{
				AlertRuleKey: models.AlertRuleKey{
					OrgID: orgID,
					UID:   ruleUID,
				},
				RuleGroup: ruleGroup,
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := context.Background()
			mockStore := new(FakeInstanceStore)
			persister := &SyncRuleStatePersister{
				log:                  log.New("test"),
				store:                mockStore,
				doNotSaveNormalState: tc.doNotSaveNormalState,
			}

			tracer := otel.Tracer("test")
			ctx, span := tracer.Start(ctx, "test-span")
			// Every started span must be ended.
			defer span.End()

			// Build the instances the store is expected to receive, using
			// the same field mapping Sync applies.
			instances := make([]models.AlertInstance, 0, len(tc.states))
			for _, s := range tc.states {
				key, err := s.GetAlertInstanceKey()
				require.NoError(t, err)

				instance := models.AlertInstance{
					AlertInstanceKey:  key,
					Labels:            models.InstanceLabels(s.Labels),
					CurrentState:      models.InstanceStateType(s.State.State.String()),
					CurrentReason:     s.StateReason,
					LastEvalTime:      s.LastEvaluationTime,
					CurrentStateSince: s.StartsAt,
					CurrentStateEnd:   s.EndsAt,
					ResolvedAt:        s.ResolvedAt,
					LastSentAt:        s.LastSentAt,
					ResultFingerprint: s.ResultFingerprint.String(),
				}
				instances = append(instances, instance)
			}

			persister.Sync(ctx, span, tc.ruleKey, tc.states)

			recordedCalls := mockStore.RecordedOps()
			require.Len(t, recordedCalls, 1)

			for _, op := range recordedCalls {
				switch q := op.(type) {
				case FakeInstanceStoreOp:
					require.Equal(t, "SaveAlertInstancesForRule", q.Name)
					// Args[0] is the context; the key and instances follow.
					require.Equal(t, tc.ruleKey, q.Args[1])
					require.Equal(t, instances, q.Args[2])
				default:
					require.Fail(t, "unexpected call", "op: %v", op)
				}
			}
		})
	}

	t.Run("no-op when store is nil", func(t *testing.T) {
		persister := &SyncRuleStatePersister{
			log: log.New("test"),
		}

		tracer := otel.Tracer("test")
		ctx := context.Background()
		ctx, span := tracer.Start(ctx, "test-span")
		defer span.End()

		ruleKey := models.AlertRuleKeyWithGroup{
			AlertRuleKey: models.AlertRuleKey{
				OrgID: orgID,
				UID:   ruleUID,
			},
			RuleGroup: ruleGroup,
		}
		states := StateTransitions{
			{
				State: &State{
					Labels: data.Labels{
						"label-1": "value-1",
					},
				},
			},
		}

		// There is no store, so no call to SaveAlertInstancesForRule or panic
		persister.Sync(ctx, span, ruleKey, states)
	})
}

View File

@@ -59,6 +59,17 @@ func (f *FakeInstanceStore) DeleteAlertInstances(ctx context.Context, q ...model
}
// SaveAlertInstancesForRule records the call instead of persisting anything.
// It appends an op named "SaveAlertInstancesForRule" whose Args are, in order,
// ctx, key and instances, and always returns nil. The append is done while
// holding f.mtx.
func (f *FakeInstanceStore) SaveAlertInstancesForRule(ctx context.Context, key models.AlertRuleKeyWithGroup, instances []models.AlertInstance) error {
	recorded := FakeInstanceStoreOp{
		Name: "SaveAlertInstancesForRule",
		Args: []any{ctx, key, instances},
	}

	f.mtx.Lock()
	defer f.mtx.Unlock()

	f.recordedOps = append(f.recordedOps, recorded)
	return nil
}