Alerting: Do not persist noop transition from Normal state. (#61201)

* add feature flag `alertingNoNormalState`
* update instance database to support exclusion of state in list operation
* do not save normal state and delete transitions to normal
* update get methods to filter out normal state
This commit is contained in:
Yuri Tseretyan 2023-01-13 18:29:29 -05:00 committed by GitHub
parent 12d27edb52
commit 9d57b1c72e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 209 additions and 57 deletions

View File

@ -45,6 +45,7 @@ Some stable features are enabled by default. You can disable a stable feature by
| `autoMigrateGraphPanels` | Replace the angular graph panel with timeseries |
| `datasourceLogger` | Logs all datasource requests |
| `accessControlOnCall` | Access control primitives for OnCall |
| `alertingNoNormalState` | Stop maintaining state of alerts that are not firing |
## Alpha feature toggles

View File

@ -88,4 +88,5 @@ export interface FeatureToggles {
sessionRemoteCache?: boolean;
disablePrometheusExemplarSampling?: boolean;
alertingBacktesting?: boolean;
alertingNoNormalState?: boolean;
}

View File

@ -403,5 +403,11 @@ var (
Description: "Rule backtesting API for alerting",
State: FeatureStateAlpha,
},
{
Name: "alertingNoNormalState",
Description: "Stop maintaining state of alerts that are not firing",
State: FeatureStateBeta,
RequiresRestart: false,
},
}
)

View File

@ -294,4 +294,8 @@ const (
// FlagAlertingBacktesting
// Rule backtesting API for alerting
FlagAlertingBacktesting = "alertingBacktesting"
// FlagAlertingNoNormalState
// Stop maintaining state of alerts that are not firing
FlagAlertingNoNormalState = "alertingNoNormalState"
)

View File

@ -50,10 +50,8 @@ func (i InstanceStateType) IsValid() bool {
// ListAlertInstancesQuery is the query list alert Instances.
type ListAlertInstancesQuery struct {
RuleOrgID int64 `json:"-"`
RuleUID string
State InstanceStateType
StateReason string
RuleUID string
RuleOrgID int64 `json:"-"`
Result []*AlertInstance
}

View File

@ -210,12 +210,13 @@ func (ng *AlertNG) init() error {
return err
}
cfg := state.ManagerCfg{
Metrics: ng.Metrics.GetStateMetrics(),
ExternalURL: appUrl,
InstanceStore: store,
Images: ng.imageService,
Clock: clk,
Historian: history,
Metrics: ng.Metrics.GetStateMetrics(),
ExternalURL: appUrl,
InstanceStore: store,
Images: ng.imageService,
Clock: clk,
Historian: history,
DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState),
}
stateManager := state.NewManager(cfg)
scheduler := schedule.NewScheduler(schedCfg, stateManager)

View File

@ -209,19 +209,22 @@ func (c *cache) get(orgID int64, alertRuleUID, stateId string) *State {
return nil
}
func (c *cache) getAll(orgID int64) []*State {
func (c *cache) getAll(orgID int64, skipNormalState bool) []*State {
var states []*State
c.mtxStates.RLock()
defer c.mtxStates.RUnlock()
for _, v1 := range c.states[orgID] {
for _, v2 := range v1.states {
if skipNormalState && IsNormalStateWithNoReason(v2) {
continue
}
states = append(states, v2)
}
}
return states
}
func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string, skipNormalState bool) []*State {
c.mtxStates.RLock()
defer c.mtxStates.RUnlock()
orgRules, ok := c.states[orgID]
@ -234,6 +237,9 @@ func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
}
result := make([]*State, 0, len(rs.states))
for _, state := range rs.states {
if skipNormalState && IsNormalStateWithNoReason(state) {
continue
}
result = append(result, state)
}
return result

View File

@ -37,6 +37,8 @@ type Manager struct {
images ImageCapturer
historian Historian
externalURL *url.URL
doNotSaveNormalState bool
}
type ManagerCfg struct {
@ -46,19 +48,22 @@ type ManagerCfg struct {
Images ImageCapturer
Clock clock.Clock
Historian Historian
// DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods
DoNotSaveNormalState bool
}
func NewManager(cfg ManagerCfg) *Manager {
return &Manager{
cache: newCache(),
ResendDelay: ResendDelay, // TODO: make this configurable
log: log.New("ngalert.state.manager"),
metrics: cfg.Metrics,
instanceStore: cfg.InstanceStore,
images: cfg.Images,
historian: cfg.Historian,
clock: cfg.Clock,
externalURL: cfg.ExternalURL,
cache: newCache(),
ResendDelay: ResendDelay, // TODO: make this configurable
log: log.New("ngalert.state.manager"),
metrics: cfg.Metrics,
instanceStore: cfg.InstanceStore,
images: cfg.Images,
historian: cfg.Historian,
clock: cfg.Clock,
externalURL: cfg.ExternalURL,
doNotSaveNormalState: cfg.DoNotSaveNormalState,
}
}
@ -271,11 +276,11 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
}
func (st *Manager) GetAll(orgID int64) []*State {
return st.cache.getAll(orgID)
allStates := st.cache.getAll(orgID, st.doNotSaveNormalState)
return allStates
}
func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
return st.cache.getStatesForRuleUID(orgID, alertRuleUID)
return st.cache.getStatesForRuleUID(orgID, alertRuleUID, st.doNotSaveNormalState)
}
func (st *Manager) Put(states []*State) {
@ -294,9 +299,14 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state
instances := make([]ngModels.AlertInstance, 0, len(states))
for _, s := range states {
// Do not save normal state to database and remove transition to Normal state but keep mapped states
if st.doNotSaveNormalState && IsNormalStateWithNoReason(s.State) && !s.Changed() {
continue
}
key, err := s.GetAlertInstanceKey()
if err != nil {
logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err)
logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err, "labels", s.Labels.String())
continue
}
fields := ngModels.AlertInstance{
@ -311,6 +321,10 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state
instances = append(instances, fields)
}
if len(instances) == 0 {
return
}
if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
type debugInfo struct {
State string

View File

@ -7,9 +7,13 @@ import (
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/infra/log/logtest"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/util"
)
// Not for parallel tests.
@ -65,3 +69,81 @@ func TestStateIsStale(t *testing.T) {
})
}
}
func TestManager_saveAlertStates(t *testing.T) {
type stateWithReason struct {
State eval.State
Reason string
}
create := func(s eval.State, r string) stateWithReason {
return stateWithReason{
State: s,
Reason: r,
}
}
allStates := [...]stateWithReason{
create(eval.Normal, ""),
create(eval.Normal, eval.NoData.String()),
create(eval.Normal, eval.Error.String()),
create(eval.Normal, util.GenerateShortUID()),
create(eval.Alerting, ""),
create(eval.Pending, ""),
create(eval.NoData, ""),
create(eval.Error, ""),
}
transitionToKey := map[ngmodels.AlertInstanceKey]StateTransition{}
transitions := make([]StateTransition, 0)
for _, fromState := range allStates {
for i, toState := range allStates {
tr := StateTransition{
State: &State{
State: toState.State,
StateReason: toState.Reason,
Labels: ngmodels.GenerateAlertLabels(5, fmt.Sprintf("%d--", i)),
},
PreviousState: fromState.State,
PreviousStateReason: fromState.Reason,
}
key, err := tr.GetAlertInstanceKey()
require.NoError(t, err)
transitionToKey[key] = tr
transitions = append(transitions, tr)
}
}
t.Run("should save all transitions if doNotSaveNormalState is false", func(t *testing.T) {
st := &FakeInstanceStore{}
m := Manager{instanceStore: st, doNotSaveNormalState: false}
m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...)
savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{}
for _, op := range st.RecordedOps {
saved := op.(ngmodels.AlertInstance)
savedKeys[saved.AlertInstanceKey] = saved
}
assert.Len(t, transitionToKey, len(savedKeys))
for key, tr := range transitionToKey {
assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason)
}
})
t.Run("should not save Normal->Normal if doNotSaveNormalState is true", func(t *testing.T) {
st := &FakeInstanceStore{}
m := Manager{instanceStore: st, doNotSaveNormalState: true}
m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...)
savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{}
for _, op := range st.RecordedOps {
saved := op.(ngmodels.AlertInstance)
savedKeys[saved.AlertInstanceKey] = saved
}
for key, tr := range transitionToKey {
if tr.State.State == eval.Normal && tr.StateReason == "" && tr.PreviousState == eval.Normal && tr.PreviousStateReason == "" {
continue
}
assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason)
}
})
}

View File

@ -140,6 +140,11 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
a.EndsAt = nextEndsTime(interval, evaluatedAt)
}
// IsNormalStateWithNoReason returns true if the state is Normal and reason is empty
func IsNormalStateWithNoReason(s *State) bool {
return s.State == eval.Normal && s.StateReason == ""
}
// StateTransition describes the transition from one state to another.
type StateTransition struct {
*State

View File

@ -30,15 +30,9 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
if cmd.RuleUID != "" {
addToQuery(` AND rule_uid = ?`, cmd.RuleUID)
}
if cmd.State != "" {
addToQuery(` AND current_state = ?`, cmd.State)
if st.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState) {
s.WriteString(fmt.Sprintf(" AND NOT (current_state = '%s' AND current_reason = '')", models.InstanceStateNormal))
}
if cmd.StateReason != "" {
addToQuery(` AND current_reason = ?`, cmd.StateReason)
}
if err := sess.SQL(s.String(), params...).Find(&alertInstances); err != nil {
return err
}

View File

@ -7,8 +7,10 @@ import (
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/tests"
"github.com/grafana/grafana/pkg/util"
)
const baseIntervalSeconds = 10
@ -17,7 +19,7 @@ func BenchmarkAlertInstanceOperations(b *testing.B) {
b.StopTimer()
ctx := context.Background()
_, dbstore := tests.SetupTestEnv(b, baseIntervalSeconds)
dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = false
dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingBigTransactions)
const mainOrgID int64 = 1
@ -86,7 +88,8 @@ func TestIntegrationAlertInstanceBulkWrite(t *testing.T) {
}
for _, bigStmts := range []bool{false, true} {
dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = bigStmts
dbstore.FeatureToggles = featuremgmt.WithFeatures([]interface{}{featuremgmt.FlagAlertingBigTransactions, bigStmts})
t.Log("Saving")
err := dbstore.SaveAlertInstances(ctx, instances...)
require.NoError(t, err)
t.Log("Finished database write")
@ -126,6 +129,16 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) {
const mainOrgID int64 = 1
containsHash := func(t *testing.T, instances []*models.AlertInstance, hash string) {
t.Helper()
for _, i := range instances {
if i.LabelsHash == hash {
return
}
}
require.Fail(t, "%v does not contain an instance with hash %s", instances, hash)
}
alertRule1 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
orgID := alertRule1.OrgID
@ -249,16 +262,56 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) {
require.Len(t, listQuery.Result, 4)
})
t.Run("can list all added instances in org filtered by current state", func(t *testing.T) {
listQuery := &models.ListAlertInstancesQuery{
RuleOrgID: orgID,
State: models.InstanceStateNormal,
t.Run("should ignore Normal state with no reason if feature flag is enabled", func(t *testing.T) {
labels := models.InstanceLabels{"test": util.GenerateShortUID()}
instance1 := models.AlertInstance{
AlertInstanceKey: models.AlertInstanceKey{
RuleOrgID: orgID,
RuleUID: util.GenerateShortUID(),
LabelsHash: util.GenerateShortUID(),
},
CurrentState: models.InstanceStateNormal,
CurrentReason: "",
Labels: labels,
}
err := dbstore.ListAlertInstances(ctx, listQuery)
instance2 := models.AlertInstance{
AlertInstanceKey: models.AlertInstanceKey{
RuleOrgID: orgID,
RuleUID: util.GenerateShortUID(),
LabelsHash: util.GenerateShortUID(),
},
CurrentState: models.InstanceStateNormal,
CurrentReason: models.StateReasonError,
Labels: labels,
}
err := dbstore.SaveAlertInstances(ctx, instance1, instance2)
require.NoError(t, err)
require.Len(t, listQuery.Result, 1)
listQuery := &models.ListAlertInstancesQuery{
RuleOrgID: orgID,
}
err = dbstore.ListAlertInstances(ctx, listQuery)
require.NoError(t, err)
containsHash(t, listQuery.Result, instance1.LabelsHash)
f := dbstore.FeatureToggles
dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingNoNormalState)
t.Cleanup(func() {
dbstore.FeatureToggles = f
})
err = dbstore.ListAlertInstances(ctx, listQuery)
require.NoError(t, err)
containsHash(t, listQuery.Result, instance2.LabelsHash)
for _, instance := range listQuery.Result {
if instance.CurrentState == models.InstanceStateNormal && instance.CurrentReason == "" {
require.Fail(t, "List operation expected to return all states except Normal but the result contains Normal states")
}
}
})
t.Run("update instance with same org_id, uid and different state", func(t *testing.T) {

View File

@ -43,18 +43,6 @@ import (
"github.com/grafana/grafana/pkg/util"
)
type FakeFeatures struct {
BigTransactions bool
}
func (f *FakeFeatures) IsEnabled(feature string) bool {
if feature == featuremgmt.FlagAlertingBigTransactions {
return f.BigTransactions
}
return false
}
// SetupTestEnv initializes a store to used by the tests.
func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG, *store.DBstore) {
tb.Helper()
@ -100,7 +88,7 @@ func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG,
folderService := folderimpl.ProvideService(ac, bus, cfg, dashboardService, dashboardStore, nil, features, folderPermissions, nil)
ng, err := ngalert.ProvideService(
cfg, &FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil),
cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil),
secretsService, nil, m, folderService, ac, &dashboards.FakeDashboardService{}, nil, bus, ac, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer,
)
require.NoError(tb, err)

View File

@ -29,7 +29,6 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngalertmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
ngalerttests "github.com/grafana/grafana/pkg/services/ngalert/tests"
"github.com/grafana/grafana/pkg/services/org"
"github.com/grafana/grafana/pkg/services/org/orgimpl"
"github.com/grafana/grafana/pkg/services/quota"
@ -478,7 +477,7 @@ func setupEnv(t *testing.T, sqlStore *sqlstore.SQLStore, b bus.Bus, quotaService
require.NoError(t, err)
m := metrics.NewNGAlert(prometheus.NewRegistry())
_, err = ngalert.ProvideService(
sqlStore.Cfg, &ngalerttests.FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService,
sqlStore.Cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService,
secretsService, nil, m, &foldertest.FakeService{}, &acmock.Mock{}, &dashboards.FakeDashboardService{}, nil, b, &acmock.Mock{}, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer,
)
require.NoError(t, err)