mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Do not persist noop transition from Normal state. (#61201)
* add feature flag `alertingNoNormalState` * update instance database to support exclusion of state in list operation * do not save normal state and delete transitions to normal * update get methods to filter out normal state
This commit is contained in:
parent
12d27edb52
commit
9d57b1c72e
@ -45,6 +45,7 @@ Some stable features are enabled by default. You can disable a stable feature by
|
||||
| `autoMigrateGraphPanels` | Replace the angular graph panel with timeseries |
|
||||
| `datasourceLogger` | Logs all datasource requests |
|
||||
| `accessControlOnCall` | Access control primitives for OnCall |
|
||||
| `alertingNoNormalState` | Stop maintaining state of alerts that are not firing |
|
||||
|
||||
## Alpha feature toggles
|
||||
|
||||
|
@ -88,4 +88,5 @@ export interface FeatureToggles {
|
||||
sessionRemoteCache?: boolean;
|
||||
disablePrometheusExemplarSampling?: boolean;
|
||||
alertingBacktesting?: boolean;
|
||||
alertingNoNormalState?: boolean;
|
||||
}
|
||||
|
@ -403,5 +403,11 @@ var (
|
||||
Description: "Rule backtesting API for alerting",
|
||||
State: FeatureStateAlpha,
|
||||
},
|
||||
{
|
||||
Name: "alertingNoNormalState",
|
||||
Description: "Stop maintaining state of alerts that are not firing",
|
||||
State: FeatureStateBeta,
|
||||
RequiresRestart: false,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
@ -294,4 +294,8 @@ const (
|
||||
// FlagAlertingBacktesting
|
||||
// Rule backtesting API for alerting
|
||||
FlagAlertingBacktesting = "alertingBacktesting"
|
||||
|
||||
// FlagAlertingNoNormalState
|
||||
// Stop maintaining state of alerts that are not firing
|
||||
FlagAlertingNoNormalState = "alertingNoNormalState"
|
||||
)
|
||||
|
@ -50,10 +50,8 @@ func (i InstanceStateType) IsValid() bool {
|
||||
|
||||
// ListAlertInstancesQuery is the query list alert Instances.
|
||||
type ListAlertInstancesQuery struct {
|
||||
RuleOrgID int64 `json:"-"`
|
||||
RuleUID string
|
||||
State InstanceStateType
|
||||
StateReason string
|
||||
RuleUID string
|
||||
RuleOrgID int64 `json:"-"`
|
||||
|
||||
Result []*AlertInstance
|
||||
}
|
||||
|
@ -210,12 +210,13 @@ func (ng *AlertNG) init() error {
|
||||
return err
|
||||
}
|
||||
cfg := state.ManagerCfg{
|
||||
Metrics: ng.Metrics.GetStateMetrics(),
|
||||
ExternalURL: appUrl,
|
||||
InstanceStore: store,
|
||||
Images: ng.imageService,
|
||||
Clock: clk,
|
||||
Historian: history,
|
||||
Metrics: ng.Metrics.GetStateMetrics(),
|
||||
ExternalURL: appUrl,
|
||||
InstanceStore: store,
|
||||
Images: ng.imageService,
|
||||
Clock: clk,
|
||||
Historian: history,
|
||||
DoNotSaveNormalState: ng.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState),
|
||||
}
|
||||
stateManager := state.NewManager(cfg)
|
||||
scheduler := schedule.NewScheduler(schedCfg, stateManager)
|
||||
|
@ -209,19 +209,22 @@ func (c *cache) get(orgID int64, alertRuleUID, stateId string) *State {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *cache) getAll(orgID int64) []*State {
|
||||
func (c *cache) getAll(orgID int64, skipNormalState bool) []*State {
|
||||
var states []*State
|
||||
c.mtxStates.RLock()
|
||||
defer c.mtxStates.RUnlock()
|
||||
for _, v1 := range c.states[orgID] {
|
||||
for _, v2 := range v1.states {
|
||||
if skipNormalState && IsNormalStateWithNoReason(v2) {
|
||||
continue
|
||||
}
|
||||
states = append(states, v2)
|
||||
}
|
||||
}
|
||||
return states
|
||||
}
|
||||
|
||||
func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
|
||||
func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string, skipNormalState bool) []*State {
|
||||
c.mtxStates.RLock()
|
||||
defer c.mtxStates.RUnlock()
|
||||
orgRules, ok := c.states[orgID]
|
||||
@ -234,6 +237,9 @@ func (c *cache) getStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
|
||||
}
|
||||
result := make([]*State, 0, len(rs.states))
|
||||
for _, state := range rs.states {
|
||||
if skipNormalState && IsNormalStateWithNoReason(state) {
|
||||
continue
|
||||
}
|
||||
result = append(result, state)
|
||||
}
|
||||
return result
|
||||
|
@ -37,6 +37,8 @@ type Manager struct {
|
||||
images ImageCapturer
|
||||
historian Historian
|
||||
externalURL *url.URL
|
||||
|
||||
doNotSaveNormalState bool
|
||||
}
|
||||
|
||||
type ManagerCfg struct {
|
||||
@ -46,19 +48,22 @@ type ManagerCfg struct {
|
||||
Images ImageCapturer
|
||||
Clock clock.Clock
|
||||
Historian Historian
|
||||
// DoNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods
|
||||
DoNotSaveNormalState bool
|
||||
}
|
||||
|
||||
func NewManager(cfg ManagerCfg) *Manager {
|
||||
return &Manager{
|
||||
cache: newCache(),
|
||||
ResendDelay: ResendDelay, // TODO: make this configurable
|
||||
log: log.New("ngalert.state.manager"),
|
||||
metrics: cfg.Metrics,
|
||||
instanceStore: cfg.InstanceStore,
|
||||
images: cfg.Images,
|
||||
historian: cfg.Historian,
|
||||
clock: cfg.Clock,
|
||||
externalURL: cfg.ExternalURL,
|
||||
cache: newCache(),
|
||||
ResendDelay: ResendDelay, // TODO: make this configurable
|
||||
log: log.New("ngalert.state.manager"),
|
||||
metrics: cfg.Metrics,
|
||||
instanceStore: cfg.InstanceStore,
|
||||
images: cfg.Images,
|
||||
historian: cfg.Historian,
|
||||
clock: cfg.Clock,
|
||||
externalURL: cfg.ExternalURL,
|
||||
doNotSaveNormalState: cfg.DoNotSaveNormalState,
|
||||
}
|
||||
}
|
||||
|
||||
@ -271,11 +276,11 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
|
||||
}
|
||||
|
||||
func (st *Manager) GetAll(orgID int64) []*State {
|
||||
return st.cache.getAll(orgID)
|
||||
allStates := st.cache.getAll(orgID, st.doNotSaveNormalState)
|
||||
return allStates
|
||||
}
|
||||
|
||||
func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
|
||||
return st.cache.getStatesForRuleUID(orgID, alertRuleUID)
|
||||
return st.cache.getStatesForRuleUID(orgID, alertRuleUID, st.doNotSaveNormalState)
|
||||
}
|
||||
|
||||
func (st *Manager) Put(states []*State) {
|
||||
@ -294,9 +299,14 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state
|
||||
instances := make([]ngModels.AlertInstance, 0, len(states))
|
||||
|
||||
for _, s := range states {
|
||||
// Do not save normal state to database and remove transition to Normal state but keep mapped states
|
||||
if st.doNotSaveNormalState && IsNormalStateWithNoReason(s.State) && !s.Changed() {
|
||||
continue
|
||||
}
|
||||
|
||||
key, err := s.GetAlertInstanceKey()
|
||||
if err != nil {
|
||||
logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err)
|
||||
logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err, "labels", s.Labels.String())
|
||||
continue
|
||||
}
|
||||
fields := ngModels.AlertInstance{
|
||||
@ -311,6 +321,10 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state
|
||||
instances = append(instances, fields)
|
||||
}
|
||||
|
||||
if len(instances) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
|
||||
type debugInfo struct {
|
||||
State string
|
||||
|
@ -7,9 +7,13 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log/logtest"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
)
|
||||
|
||||
// Not for parallel tests.
|
||||
@ -65,3 +69,81 @@ func TestStateIsStale(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestManager_saveAlertStates(t *testing.T) {
|
||||
type stateWithReason struct {
|
||||
State eval.State
|
||||
Reason string
|
||||
}
|
||||
create := func(s eval.State, r string) stateWithReason {
|
||||
return stateWithReason{
|
||||
State: s,
|
||||
Reason: r,
|
||||
}
|
||||
}
|
||||
allStates := [...]stateWithReason{
|
||||
create(eval.Normal, ""),
|
||||
create(eval.Normal, eval.NoData.String()),
|
||||
create(eval.Normal, eval.Error.String()),
|
||||
create(eval.Normal, util.GenerateShortUID()),
|
||||
create(eval.Alerting, ""),
|
||||
create(eval.Pending, ""),
|
||||
create(eval.NoData, ""),
|
||||
create(eval.Error, ""),
|
||||
}
|
||||
|
||||
transitionToKey := map[ngmodels.AlertInstanceKey]StateTransition{}
|
||||
transitions := make([]StateTransition, 0)
|
||||
for _, fromState := range allStates {
|
||||
for i, toState := range allStates {
|
||||
tr := StateTransition{
|
||||
State: &State{
|
||||
State: toState.State,
|
||||
StateReason: toState.Reason,
|
||||
Labels: ngmodels.GenerateAlertLabels(5, fmt.Sprintf("%d--", i)),
|
||||
},
|
||||
PreviousState: fromState.State,
|
||||
PreviousStateReason: fromState.Reason,
|
||||
}
|
||||
key, err := tr.GetAlertInstanceKey()
|
||||
require.NoError(t, err)
|
||||
transitionToKey[key] = tr
|
||||
transitions = append(transitions, tr)
|
||||
}
|
||||
}
|
||||
|
||||
t.Run("should save all transitions if doNotSaveNormalState is false", func(t *testing.T) {
|
||||
st := &FakeInstanceStore{}
|
||||
m := Manager{instanceStore: st, doNotSaveNormalState: false}
|
||||
m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...)
|
||||
|
||||
savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{}
|
||||
for _, op := range st.RecordedOps {
|
||||
saved := op.(ngmodels.AlertInstance)
|
||||
savedKeys[saved.AlertInstanceKey] = saved
|
||||
}
|
||||
assert.Len(t, transitionToKey, len(savedKeys))
|
||||
|
||||
for key, tr := range transitionToKey {
|
||||
assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("should not save Normal->Normal if doNotSaveNormalState is true", func(t *testing.T) {
|
||||
st := &FakeInstanceStore{}
|
||||
m := Manager{instanceStore: st, doNotSaveNormalState: true}
|
||||
m.saveAlertStates(context.Background(), &logtest.Fake{}, transitions...)
|
||||
|
||||
savedKeys := map[ngmodels.AlertInstanceKey]ngmodels.AlertInstance{}
|
||||
for _, op := range st.RecordedOps {
|
||||
saved := op.(ngmodels.AlertInstance)
|
||||
savedKeys[saved.AlertInstanceKey] = saved
|
||||
}
|
||||
for key, tr := range transitionToKey {
|
||||
if tr.State.State == eval.Normal && tr.StateReason == "" && tr.PreviousState == eval.Normal && tr.PreviousStateReason == "" {
|
||||
continue
|
||||
}
|
||||
assert.Containsf(t, savedKeys, key, "state %s (%s) was not saved but should be", tr.State.State, tr.StateReason)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
@ -140,6 +140,11 @@ func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
|
||||
a.EndsAt = nextEndsTime(interval, evaluatedAt)
|
||||
}
|
||||
|
||||
// IsNormalStateWithNoReason returns true if the state is Normal and reason is empty
|
||||
func IsNormalStateWithNoReason(s *State) bool {
|
||||
return s.State == eval.Normal && s.StateReason == ""
|
||||
}
|
||||
|
||||
// StateTransition describes the transition from one state to another.
|
||||
type StateTransition struct {
|
||||
*State
|
||||
|
@ -30,15 +30,9 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
|
||||
if cmd.RuleUID != "" {
|
||||
addToQuery(` AND rule_uid = ?`, cmd.RuleUID)
|
||||
}
|
||||
|
||||
if cmd.State != "" {
|
||||
addToQuery(` AND current_state = ?`, cmd.State)
|
||||
if st.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingNoNormalState) {
|
||||
s.WriteString(fmt.Sprintf(" AND NOT (current_state = '%s' AND current_reason = '')", models.InstanceStateNormal))
|
||||
}
|
||||
|
||||
if cmd.StateReason != "" {
|
||||
addToQuery(` AND current_reason = ?`, cmd.StateReason)
|
||||
}
|
||||
|
||||
if err := sess.SQL(s.String(), params...).Find(&alertInstances); err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -7,8 +7,10 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/featuremgmt"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/tests"
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
)
|
||||
|
||||
const baseIntervalSeconds = 10
|
||||
@ -17,7 +19,7 @@ func BenchmarkAlertInstanceOperations(b *testing.B) {
|
||||
b.StopTimer()
|
||||
ctx := context.Background()
|
||||
_, dbstore := tests.SetupTestEnv(b, baseIntervalSeconds)
|
||||
dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = false
|
||||
dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingBigTransactions)
|
||||
|
||||
const mainOrgID int64 = 1
|
||||
|
||||
@ -86,7 +88,8 @@ func TestIntegrationAlertInstanceBulkWrite(t *testing.T) {
|
||||
}
|
||||
|
||||
for _, bigStmts := range []bool{false, true} {
|
||||
dbstore.FeatureToggles.(*tests.FakeFeatures).BigTransactions = bigStmts
|
||||
dbstore.FeatureToggles = featuremgmt.WithFeatures([]interface{}{featuremgmt.FlagAlertingBigTransactions, bigStmts})
|
||||
t.Log("Saving")
|
||||
err := dbstore.SaveAlertInstances(ctx, instances...)
|
||||
require.NoError(t, err)
|
||||
t.Log("Finished database write")
|
||||
@ -126,6 +129,16 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) {
|
||||
|
||||
const mainOrgID int64 = 1
|
||||
|
||||
containsHash := func(t *testing.T, instances []*models.AlertInstance, hash string) {
|
||||
t.Helper()
|
||||
for _, i := range instances {
|
||||
if i.LabelsHash == hash {
|
||||
return
|
||||
}
|
||||
}
|
||||
require.Fail(t, "%v does not contain an instance with hash %s", instances, hash)
|
||||
}
|
||||
|
||||
alertRule1 := tests.CreateTestAlertRule(t, ctx, dbstore, 60, mainOrgID)
|
||||
orgID := alertRule1.OrgID
|
||||
|
||||
@ -249,16 +262,56 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) {
|
||||
require.Len(t, listQuery.Result, 4)
|
||||
})
|
||||
|
||||
t.Run("can list all added instances in org filtered by current state", func(t *testing.T) {
|
||||
listQuery := &models.ListAlertInstancesQuery{
|
||||
RuleOrgID: orgID,
|
||||
State: models.InstanceStateNormal,
|
||||
t.Run("should ignore Normal state with no reason if feature flag is enabled", func(t *testing.T) {
|
||||
labels := models.InstanceLabels{"test": util.GenerateShortUID()}
|
||||
instance1 := models.AlertInstance{
|
||||
AlertInstanceKey: models.AlertInstanceKey{
|
||||
RuleOrgID: orgID,
|
||||
RuleUID: util.GenerateShortUID(),
|
||||
LabelsHash: util.GenerateShortUID(),
|
||||
},
|
||||
CurrentState: models.InstanceStateNormal,
|
||||
CurrentReason: "",
|
||||
Labels: labels,
|
||||
}
|
||||
|
||||
err := dbstore.ListAlertInstances(ctx, listQuery)
|
||||
instance2 := models.AlertInstance{
|
||||
AlertInstanceKey: models.AlertInstanceKey{
|
||||
RuleOrgID: orgID,
|
||||
RuleUID: util.GenerateShortUID(),
|
||||
LabelsHash: util.GenerateShortUID(),
|
||||
},
|
||||
CurrentState: models.InstanceStateNormal,
|
||||
CurrentReason: models.StateReasonError,
|
||||
Labels: labels,
|
||||
}
|
||||
err := dbstore.SaveAlertInstances(ctx, instance1, instance2)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Len(t, listQuery.Result, 1)
|
||||
listQuery := &models.ListAlertInstancesQuery{
|
||||
RuleOrgID: orgID,
|
||||
}
|
||||
|
||||
err = dbstore.ListAlertInstances(ctx, listQuery)
|
||||
require.NoError(t, err)
|
||||
|
||||
containsHash(t, listQuery.Result, instance1.LabelsHash)
|
||||
|
||||
f := dbstore.FeatureToggles
|
||||
dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingNoNormalState)
|
||||
t.Cleanup(func() {
|
||||
dbstore.FeatureToggles = f
|
||||
})
|
||||
|
||||
err = dbstore.ListAlertInstances(ctx, listQuery)
|
||||
require.NoError(t, err)
|
||||
|
||||
containsHash(t, listQuery.Result, instance2.LabelsHash)
|
||||
|
||||
for _, instance := range listQuery.Result {
|
||||
if instance.CurrentState == models.InstanceStateNormal && instance.CurrentReason == "" {
|
||||
require.Fail(t, "List operation expected to return all states except Normal but the result contains Normal states")
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("update instance with same org_id, uid and different state", func(t *testing.T) {
|
||||
|
@ -43,18 +43,6 @@ import (
|
||||
"github.com/grafana/grafana/pkg/util"
|
||||
)
|
||||
|
||||
type FakeFeatures struct {
|
||||
BigTransactions bool
|
||||
}
|
||||
|
||||
func (f *FakeFeatures) IsEnabled(feature string) bool {
|
||||
if feature == featuremgmt.FlagAlertingBigTransactions {
|
||||
return f.BigTransactions
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// SetupTestEnv initializes a store to used by the tests.
|
||||
func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG, *store.DBstore) {
|
||||
tb.Helper()
|
||||
@ -100,7 +88,7 @@ func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG,
|
||||
folderService := folderimpl.ProvideService(ac, bus, cfg, dashboardService, dashboardStore, nil, features, folderPermissions, nil)
|
||||
|
||||
ng, err := ngalert.ProvideService(
|
||||
cfg, &FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil),
|
||||
cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotatest.New(false, nil),
|
||||
secretsService, nil, m, folderService, ac, &dashboards.FakeDashboardService{}, nil, bus, ac, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer,
|
||||
)
|
||||
require.NoError(tb, err)
|
||||
|
@ -29,7 +29,6 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert"
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||
ngalertmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
ngalerttests "github.com/grafana/grafana/pkg/services/ngalert/tests"
|
||||
"github.com/grafana/grafana/pkg/services/org"
|
||||
"github.com/grafana/grafana/pkg/services/org/orgimpl"
|
||||
"github.com/grafana/grafana/pkg/services/quota"
|
||||
@ -478,7 +477,7 @@ func setupEnv(t *testing.T, sqlStore *sqlstore.SQLStore, b bus.Bus, quotaService
|
||||
require.NoError(t, err)
|
||||
m := metrics.NewNGAlert(prometheus.NewRegistry())
|
||||
_, err = ngalert.ProvideService(
|
||||
sqlStore.Cfg, &ngalerttests.FakeFeatures{}, nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService,
|
||||
sqlStore.Cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, nil, nil, nil, quotaService,
|
||||
secretsService, nil, m, &foldertest.FakeService{}, &acmock.Mock{}, &dashboards.FakeDashboardService{}, nil, b, &acmock.Mock{}, annotationstest.NewFakeAnnotationsRepo(), &plugins.FakePluginStore{}, tracer,
|
||||
)
|
||||
require.NoError(t, err)
|
||||
|
Loading…
Reference in New Issue
Block a user