Alerting: Add a feature flag to periodically save states (#80987)

This commit is contained in:
Jean-Philippe Quéméner 2024-01-23 17:03:30 +01:00 committed by GitHub
parent f7fd8e6cd1
commit aa25776f81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 81 additions and 23 deletions

View File

@@ -1180,6 +1180,10 @@ min_interval = 10s
 # (concurrent queries per rule disabled).
 max_state_save_concurrency = 1
+# If the feature flag 'alertingSaveStatePeriodic' is enabled, this is the interval that is used to persist the alerting instances to the database.
+# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
+state_periodic_save_interval = 5m
 [unified_alerting.screenshots]
 # Enable screenshots in notifications. You must have either installed the Grafana image rendering
 # plugin, or set up Grafana to use a remote rendering service.

View File

@@ -1112,6 +1112,15 @@
 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
 ;min_interval = 10s
+# This is an experimental option to add parallelization to saving alert states in the database.
+# It configures the maximum number of concurrent queries per rule evaluated. The default value is 1
+# (concurrent queries per rule disabled).
+;max_state_save_concurrency = 1
+# If the feature flag 'alertingSaveStatePeriodic' is enabled, this is the interval that is used to persist the alerting instances to the database.
+# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
+;state_periodic_save_interval = 5m
 [unified_alerting.reserved_labels]
 # Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled.
 # For example: `disabled_labels=grafana_folder`

View File

@@ -177,4 +177,5 @@ export interface FeatureToggles {
   jitterAlertRules?: boolean;
   jitterAlertRulesWithinGroups?: boolean;
   onPremToCloudMigrations?: boolean;
+  alertingSaveStatePeriodic?: boolean;
 }

View File

@@ -1354,5 +1354,13 @@ var (
 			Owner:   grafanaOperatorExperienceSquad,
 			Created: time.Date(2024, time.January, 22, 3, 30, 00, 00, time.UTC),
 		},
+		{
+			Name:         "alertingSaveStatePeriodic",
+			Description:  "Writes the state periodically to the database, asynchronous to rule evaluation",
+			Stage:        FeatureStagePrivatePreview,
+			FrontendOnly: false,
+			Owner:        grafanaAlertingSquad,
+			Created:      time.Date(2024, time.January, 22, 12, 0, 0, 0, time.UTC),
+		},
 	}
 )

View File

@@ -158,3 +158,4 @@ newFolderPicker,experimental,@grafana/grafana-frontend-platform,2024-01-12,false
 jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
 jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
 onPremToCloudMigrations,experimental,@grafana/grafana-operator-experience-squad,2024-01-22,false,false,false,false
+alertingSaveStatePeriodic,privatePreview,@grafana/alerting-squad,2024-01-22,false,false,false,false

1 Name Stage Owner Created requiresDevMode RequiresLicense RequiresRestart FrontendOnly
158 jitterAlertRules experimental @grafana/alerting-squad 2024-01-17 false false true false
159 jitterAlertRulesWithinGroups experimental @grafana/alerting-squad 2024-01-17 false false true false
160 onPremToCloudMigrations experimental @grafana/grafana-operator-experience-squad 2024-01-22 false false false false
161 alertingSaveStatePeriodic privatePreview @grafana/alerting-squad 2024-01-22 false false false false

View File

@@ -642,4 +642,8 @@ const (
 	// FlagOnPremToCloudMigrations
 	// In-development feature that will allow users to easily migrate their on-prem Grafana instances to Grafana Cloud.
 	FlagOnPremToCloudMigrations = "onPremToCloudMigrations"
+	// FlagAlertingSaveStatePeriodic
+	// Writes the state periodically to the database, asynchronous to rule evaluation
+	FlagAlertingSaveStatePeriodic = "alertingSaveStatePeriodic"
 )

View File

@@ -7,6 +7,7 @@ import (
 type State struct {
 	StateUpdateDuration   prometheus.Histogram
+	StateFullSyncDuration prometheus.Histogram
 	r prometheus.Registerer
 }
@@ -27,5 +28,14 @@ func NewStateMetrics(r prometheus.Registerer) *State {
 				Buckets: []float64{0.01, 0.1, 1, 2, 5, 10},
 			},
 		),
+		StateFullSyncDuration: promauto.With(r).NewHistogram(
+			prometheus.HistogramOpts{
+				Namespace: Namespace,
+				Subsystem: Subsystem,
+				Name:      "state_full_sync_duration_seconds",
+				Help:      "The duration of fully synchronizing the state with the database.",
+				Buckets:   []float64{0.01, 0.1, 1, 2, 5, 10, 60},
+			},
+		),
 	}
 }

View File

@@ -296,7 +296,12 @@ func (ng *AlertNG) init() error {
 		Tracer: ng.tracer,
 		Log:    log.New("ngalert.state.manager"),
 	}
-	statePersister := state.NewSyncStatePersisiter(log.New("ngalert.state.manager.persist"), cfg)
+	logger := log.New("ngalert.state.manager.persist")
+	statePersister := state.NewSyncStatePersisiter(logger, cfg)
+	if ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingSaveStatePeriodic) {
+		ticker := clock.New().Ticker(ng.Cfg.UnifiedAlerting.StatePeriodicSaveInterval)
+		statePersister = state.NewAsyncStatePersister(logger, ticker, cfg)
+	}
 	stateManager := state.NewManager(cfg, statePersister)
 	scheduler := schedule.NewScheduler(schedCfg, stateManager)
@@ -423,6 +428,9 @@ func (ng *AlertNG) Run(ctx context.Context) error {
 		children.Go(func() error {
 			return ng.schedule.Run(subCtx)
 		})
+		children.Go(func() error {
+			return ng.stateManager.Run(subCtx)
+		})
 	}
 	return children.Wait()
 }

View File

@@ -30,7 +30,7 @@ type AlertInstanceManager interface {
 }
 type StatePersister interface {
-	Async(ctx context.Context, ticker *clock.Ticker, cache *cache)
+	Async(ctx context.Context, cache *cache)
 	Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition)
 }
@@ -103,6 +103,11 @@ func NewManager(cfg ManagerCfg, statePersister StatePersister) *Manager {
 	return m
 }
+func (st *Manager) Run(ctx context.Context) error {
+	st.persister.Async(ctx, st.cache)
+	return nil
+}
 func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
 	if st.instanceStore == nil {
 		st.log.Info("Skip warming the state because instance store is not configured")

View File

@@ -8,6 +8,7 @@ import (
 	"go.opentelemetry.io/otel/trace"
 	"github.com/grafana/grafana/pkg/infra/log"
+	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
 )
 type AsyncStatePersister struct {
@@ -15,20 +16,24 @@ type AsyncStatePersister struct {
 	// doNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods.
 	doNotSaveNormalState bool
 	store InstanceStore
+	ticker *clock.Ticker
+	metrics *metrics.State
 }
-func NewAsyncStatePersister(log log.Logger, cfg ManagerCfg) StatePersister {
+func NewAsyncStatePersister(log log.Logger, ticker *clock.Ticker, cfg ManagerCfg) StatePersister {
 	return &AsyncStatePersister{
 		log:   log,
 		store: cfg.InstanceStore,
+		ticker: ticker,
 		doNotSaveNormalState: cfg.DoNotSaveNormalState,
+		metrics: cfg.Metrics,
 	}
 }
-func (a *AsyncStatePersister) Async(ctx context.Context, ticker *clock.Ticker, cache *cache) {
+func (a *AsyncStatePersister) Async(ctx context.Context, cache *cache) {
 	for {
 		select {
-		case <-ticker.C:
+		case <-a.ticker.C:
 			if err := a.fullSync(ctx, cache); err != nil {
 				a.log.Error("Failed to do a full state sync to database", "err", err)
 			}
@@ -37,7 +42,7 @@ func (a *AsyncStatePersister) Async(ctx context.Context, cache *cache) {
 			if err := a.fullSync(context.Background(), cache); err != nil {
 				a.log.Error("Failed to do a full state sync to database", "err", err)
 			}
-			ticker.Stop()
+			a.ticker.Stop()
 			a.log.Info("State async worker is shut down.")
 			return
 		}
@@ -46,13 +51,16 @@ func (a *AsyncStatePersister) Async(ctx context.Context, cache *cache) {
 func (a *AsyncStatePersister) fullSync(ctx context.Context, cache *cache) error {
 	startTime := time.Now()
-	a.log.Info("Full state sync start")
+	a.log.Debug("Full state sync start")
 	instances := cache.asInstances(a.doNotSaveNormalState)
 	if err := a.store.FullSync(ctx, instances); err != nil {
 		a.log.Error("Full state sync failed", "duration", time.Since(startTime), "instances", len(instances))
 		return err
 	}
-	a.log.Info("Full state sync done", "duration", time.Since(startTime), "instances", len(instances))
+	a.log.Debug("Full state sync done", "duration", time.Since(startTime), "instances", len(instances))
+	if a.metrics != nil {
+		a.metrics.StateFullSyncDuration.Observe(time.Since(startTime).Seconds())
+	}
 	return nil
 }

View File

@@ -18,7 +18,7 @@ func TestAsyncStatePersister_Async(t *testing.T) {
 		store := &FakeInstanceStore{}
 		logger := log.New("async.test")
-		persister := NewAsyncStatePersister(logger, ManagerCfg{
+		persister := NewAsyncStatePersister(logger, mockClock.Ticker(1*time.Second), ManagerCfg{
 			InstanceStore: store,
 		})
@@ -28,11 +28,9 @@ func TestAsyncStatePersister_Async(t *testing.T) {
 			cancel()
 		}()
-		ticker := mockClock.Ticker(1 * time.Second)
 		cache := newCache()
-		go persister.Async(ctx, ticker, cache)
+		go persister.Async(ctx, cache)
 		cache.set(&State{
 			OrgID: 1,
@@ -52,17 +50,15 @@ func TestAsyncStatePersister_Async(t *testing.T) {
 		store := &FakeInstanceStore{}
 		logger := log.New("async.test")
-		persister := NewAsyncStatePersister(logger, ManagerCfg{
+		persister := NewAsyncStatePersister(logger, mockClock.Ticker(1*time.Second), ManagerCfg{
 			InstanceStore: store,
 		})
 		ctx, cancel := context.WithCancel(context.Background())
-		ticker := mockClock.Ticker(1 * time.Second)
 		cache := newCache()
-		go persister.Async(ctx, ticker, cache)
+		go persister.Async(ctx, cache)
 		cache.set(&State{
 			OrgID: 1,

View File

@@ -3,13 +3,12 @@ package state
 import (
 	"context"
-	"github.com/benbjohnson/clock"
 	"go.opentelemetry.io/otel/trace"
 )
 type NoopPersister struct{}
-func (n *NoopPersister) Async(_ context.Context, _ *clock.Ticker, _ *cache) {}
+func (n *NoopPersister) Async(_ context.Context, _ *cache) {}
 func (n *NoopPersister) Sync(_ context.Context, _ trace.Span, _, _ []StateTransition) {}
 func NewNoopPersister() StatePersister {

View File

@@ -4,7 +4,6 @@ import (
 	"context"
 	"time"
-	"github.com/benbjohnson/clock"
 	"github.com/grafana/dskit/concurrency"
 	"go.opentelemetry.io/otel/attribute"
 	"go.opentelemetry.io/otel/trace"
@@ -31,7 +30,7 @@ func NewSyncStatePersisiter(log log.Logger, cfg ManagerCfg) StatePersister {
 	}
 }
-func (a *SyncStatePersister) Async(_ context.Context, _ *clock.Ticker, _ *cache) {
+func (a *SyncStatePersister) Async(_ context.Context, _ *cache) {
 	a.log.Debug("Async: No-Op")
 }
 func (a *SyncStatePersister) Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition) {

View File

@@ -99,6 +99,7 @@ type UnifiedAlertingSettings struct {
 	Upgrade UnifiedAlertingUpgradeSettings
 	// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
 	MaxStateSaveConcurrency int
+	StatePeriodicSaveInterval time.Duration
 }
 // RemoteAlertmanagerSettings contains the configuration needed
@@ -403,6 +404,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
 	uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1)
+	uaCfg.StatePeriodicSaveInterval, err = gtime.ParseDuration(valueAsString(ua, "state_periodic_save_interval", (time.Minute * 5).String()))
+	if err != nil {
+		return err
+	}
 	upgrade := iniFile.Section("unified_alerting.upgrade")
 	uaCfgUpgrade := UnifiedAlertingUpgradeSettings{
 		CleanUpgrade: upgrade.Key("clean_upgrade").MustBool(false),