mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Add a feature flag to periodically save states (#80987)
This commit is contained in:
parent
f7fd8e6cd1
commit
aa25776f81
@ -1180,6 +1180,10 @@ min_interval = 10s
|
|||||||
# (concurrent queries per rule disabled).
|
# (concurrent queries per rule disabled).
|
||||||
max_state_save_concurrency = 1
|
max_state_save_concurrency = 1
|
||||||
|
|
||||||
|
# If the feature flag 'alertingSaveStatePeriodic' is enabled, this is the interval that is used to persist the alerting instances to the database.
|
||||||
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
state_periodic_save_interval = 5m
|
||||||
|
|
||||||
[unified_alerting.screenshots]
|
[unified_alerting.screenshots]
|
||||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||||
# plugin, or set up Grafana to use a remote rendering service.
|
# plugin, or set up Grafana to use a remote rendering service.
|
||||||
|
@ -1112,6 +1112,15 @@
|
|||||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
;min_interval = 10s
|
;min_interval = 10s
|
||||||
|
|
||||||
|
# This is an experimental option to add parallelization to saving alert states in the database.
|
||||||
|
# It configures the maximum number of concurrent queries per rule evaluated. The default value is 1
|
||||||
|
# (concurrent queries per rule disabled).
|
||||||
|
;max_state_save_concurrency = 1
|
||||||
|
|
||||||
|
# If the feature flag 'alertingSaveStatePeriodic' is enabled, this is the interval that is used to persist the alerting instances to the database.
|
||||||
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
;state_periodic_save_interval = 5m
|
||||||
|
|
||||||
[unified_alerting.reserved_labels]
|
[unified_alerting.reserved_labels]
|
||||||
# Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled.
|
# Comma-separated list of reserved labels added by the Grafana Alerting engine that should be disabled.
|
||||||
# For example: `disabled_labels=grafana_folder`
|
# For example: `disabled_labels=grafana_folder`
|
||||||
|
@ -177,4 +177,5 @@ export interface FeatureToggles {
|
|||||||
jitterAlertRules?: boolean;
|
jitterAlertRules?: boolean;
|
||||||
jitterAlertRulesWithinGroups?: boolean;
|
jitterAlertRulesWithinGroups?: boolean;
|
||||||
onPremToCloudMigrations?: boolean;
|
onPremToCloudMigrations?: boolean;
|
||||||
|
alertingSaveStatePeriodic?: boolean;
|
||||||
}
|
}
|
||||||
|
@ -1354,5 +1354,13 @@ var (
|
|||||||
Owner: grafanaOperatorExperienceSquad,
|
Owner: grafanaOperatorExperienceSquad,
|
||||||
Created: time.Date(2024, time.January, 22, 3, 30, 00, 00, time.UTC),
|
Created: time.Date(2024, time.January, 22, 3, 30, 00, 00, time.UTC),
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Name: "alertingSaveStatePeriodic",
|
||||||
|
Description: "Writes the state periodically to the database, asynchronous to rule evaluation",
|
||||||
|
Stage: FeatureStagePrivatePreview,
|
||||||
|
FrontendOnly: false,
|
||||||
|
Owner: grafanaAlertingSquad,
|
||||||
|
Created: time.Date(2024, time.January, 22, 12, 0, 0, 0, time.UTC),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -158,3 +158,4 @@ newFolderPicker,experimental,@grafana/grafana-frontend-platform,2024-01-12,false
|
|||||||
jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
|
jitterAlertRules,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
|
||||||
jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
|
jitterAlertRulesWithinGroups,experimental,@grafana/alerting-squad,2024-01-17,false,false,true,false
|
||||||
onPremToCloudMigrations,experimental,@grafana/grafana-operator-experience-squad,2024-01-22,false,false,false,false
|
onPremToCloudMigrations,experimental,@grafana/grafana-operator-experience-squad,2024-01-22,false,false,false,false
|
||||||
|
alertingSaveStatePeriodic,privatePreview,@grafana/alerting-squad,2024-01-22,false,false,false,false
|
||||||
|
|
@ -642,4 +642,8 @@ const (
|
|||||||
// FlagOnPremToCloudMigrations
|
// FlagOnPremToCloudMigrations
|
||||||
// In-development feature that will allow users to easily migrate their on-prem Grafana instances to Grafana Cloud.
|
// In-development feature that will allow users to easily migrate their on-prem Grafana instances to Grafana Cloud.
|
||||||
FlagOnPremToCloudMigrations = "onPremToCloudMigrations"
|
FlagOnPremToCloudMigrations = "onPremToCloudMigrations"
|
||||||
|
|
||||||
|
// FlagAlertingSaveStatePeriodic
|
||||||
|
// Writes the state periodically to the database, asynchronous to rule evaluation
|
||||||
|
FlagAlertingSaveStatePeriodic = "alertingSaveStatePeriodic"
|
||||||
)
|
)
|
||||||
|
@ -6,8 +6,9 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type State struct {
|
type State struct {
|
||||||
StateUpdateDuration prometheus.Histogram
|
StateUpdateDuration prometheus.Histogram
|
||||||
r prometheus.Registerer
|
StateFullSyncDuration prometheus.Histogram
|
||||||
|
r prometheus.Registerer
|
||||||
}
|
}
|
||||||
|
|
||||||
// Registerer exposes the Prometheus register directly. The state package needs this, as it uses a collector to fetch the current alerts by state in the system.
|
// Registerer exposes the Prometheus register directly. The state package needs this, as it uses a collector to fetch the current alerts by state in the system.
|
||||||
@ -27,5 +28,14 @@ func NewStateMetrics(r prometheus.Registerer) *State {
|
|||||||
Buckets: []float64{0.01, 0.1, 1, 2, 5, 10},
|
Buckets: []float64{0.01, 0.1, 1, 2, 5, 10},
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
StateFullSyncDuration: promauto.With(r).NewHistogram(
|
||||||
|
prometheus.HistogramOpts{
|
||||||
|
Namespace: Namespace,
|
||||||
|
Subsystem: Subsystem,
|
||||||
|
Name: "state_full_sync_duration_seconds",
|
||||||
|
Help: "The duration of fully synchronizing the state with the database.",
|
||||||
|
Buckets: []float64{0.01, 0.1, 1, 2, 5, 10, 60},
|
||||||
|
},
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -296,7 +296,12 @@ func (ng *AlertNG) init() error {
|
|||||||
Tracer: ng.tracer,
|
Tracer: ng.tracer,
|
||||||
Log: log.New("ngalert.state.manager"),
|
Log: log.New("ngalert.state.manager"),
|
||||||
}
|
}
|
||||||
statePersister := state.NewSyncStatePersisiter(log.New("ngalert.state.manager.persist"), cfg)
|
logger := log.New("ngalert.state.manager.persist")
|
||||||
|
statePersister := state.NewSyncStatePersisiter(logger, cfg)
|
||||||
|
if ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingSaveStatePeriodic) {
|
||||||
|
ticker := clock.New().Ticker(ng.Cfg.UnifiedAlerting.StatePeriodicSaveInterval)
|
||||||
|
statePersister = state.NewAsyncStatePersister(logger, ticker, cfg)
|
||||||
|
}
|
||||||
stateManager := state.NewManager(cfg, statePersister)
|
stateManager := state.NewManager(cfg, statePersister)
|
||||||
scheduler := schedule.NewScheduler(schedCfg, stateManager)
|
scheduler := schedule.NewScheduler(schedCfg, stateManager)
|
||||||
|
|
||||||
@ -423,6 +428,9 @@ func (ng *AlertNG) Run(ctx context.Context) error {
|
|||||||
children.Go(func() error {
|
children.Go(func() error {
|
||||||
return ng.schedule.Run(subCtx)
|
return ng.schedule.Run(subCtx)
|
||||||
})
|
})
|
||||||
|
children.Go(func() error {
|
||||||
|
return ng.stateManager.Run(subCtx)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
return children.Wait()
|
return children.Wait()
|
||||||
}
|
}
|
||||||
|
@ -30,7 +30,7 @@ type AlertInstanceManager interface {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type StatePersister interface {
|
type StatePersister interface {
|
||||||
Async(ctx context.Context, ticker *clock.Ticker, cache *cache)
|
Async(ctx context.Context, cache *cache)
|
||||||
Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition)
|
Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,6 +103,11 @@ func NewManager(cfg ManagerCfg, statePersister StatePersister) *Manager {
|
|||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (st *Manager) Run(ctx context.Context) error {
|
||||||
|
st.persister.Async(ctx, st.cache)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
|
func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
|
||||||
if st.instanceStore == nil {
|
if st.instanceStore == nil {
|
||||||
st.log.Info("Skip warming the state because instance store is not configured")
|
st.log.Info("Skip warming the state because instance store is not configured")
|
||||||
|
@ -8,6 +8,7 @@ import (
|
|||||||
"go.opentelemetry.io/otel/trace"
|
"go.opentelemetry.io/otel/trace"
|
||||||
|
|
||||||
"github.com/grafana/grafana/pkg/infra/log"
|
"github.com/grafana/grafana/pkg/infra/log"
|
||||||
|
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
||||||
)
|
)
|
||||||
|
|
||||||
type AsyncStatePersister struct {
|
type AsyncStatePersister struct {
|
||||||
@ -15,20 +16,24 @@ type AsyncStatePersister struct {
|
|||||||
// doNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods.
|
// doNotSaveNormalState controls whether eval.Normal state is persisted to the database and returned by get methods.
|
||||||
doNotSaveNormalState bool
|
doNotSaveNormalState bool
|
||||||
store InstanceStore
|
store InstanceStore
|
||||||
|
ticker *clock.Ticker
|
||||||
|
metrics *metrics.State
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewAsyncStatePersister(log log.Logger, cfg ManagerCfg) StatePersister {
|
func NewAsyncStatePersister(log log.Logger, ticker *clock.Ticker, cfg ManagerCfg) StatePersister {
|
||||||
return &AsyncStatePersister{
|
return &AsyncStatePersister{
|
||||||
log: log,
|
log: log,
|
||||||
store: cfg.InstanceStore,
|
store: cfg.InstanceStore,
|
||||||
|
ticker: ticker,
|
||||||
doNotSaveNormalState: cfg.DoNotSaveNormalState,
|
doNotSaveNormalState: cfg.DoNotSaveNormalState,
|
||||||
|
metrics: cfg.Metrics,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *AsyncStatePersister) Async(ctx context.Context, ticker *clock.Ticker, cache *cache) {
|
func (a *AsyncStatePersister) Async(ctx context.Context, cache *cache) {
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ticker.C:
|
case <-a.ticker.C:
|
||||||
if err := a.fullSync(ctx, cache); err != nil {
|
if err := a.fullSync(ctx, cache); err != nil {
|
||||||
a.log.Error("Failed to do a full state sync to database", "err", err)
|
a.log.Error("Failed to do a full state sync to database", "err", err)
|
||||||
}
|
}
|
||||||
@ -37,7 +42,7 @@ func (a *AsyncStatePersister) Async(ctx context.Context, ticker *clock.Ticker, c
|
|||||||
if err := a.fullSync(context.Background(), cache); err != nil {
|
if err := a.fullSync(context.Background(), cache); err != nil {
|
||||||
a.log.Error("Failed to do a full state sync to database", "err", err)
|
a.log.Error("Failed to do a full state sync to database", "err", err)
|
||||||
}
|
}
|
||||||
ticker.Stop()
|
a.ticker.Stop()
|
||||||
a.log.Info("State async worker is shut down.")
|
a.log.Info("State async worker is shut down.")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@ -46,13 +51,16 @@ func (a *AsyncStatePersister) Async(ctx context.Context, ticker *clock.Ticker, c
|
|||||||
|
|
||||||
func (a *AsyncStatePersister) fullSync(ctx context.Context, cache *cache) error {
|
func (a *AsyncStatePersister) fullSync(ctx context.Context, cache *cache) error {
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
a.log.Info("Full state sync start")
|
a.log.Debug("Full state sync start")
|
||||||
instances := cache.asInstances(a.doNotSaveNormalState)
|
instances := cache.asInstances(a.doNotSaveNormalState)
|
||||||
if err := a.store.FullSync(ctx, instances); err != nil {
|
if err := a.store.FullSync(ctx, instances); err != nil {
|
||||||
a.log.Error("Full state sync failed", "duration", time.Since(startTime), "instances", len(instances))
|
a.log.Error("Full state sync failed", "duration", time.Since(startTime), "instances", len(instances))
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
a.log.Info("Full state sync done", "duration", time.Since(startTime), "instances", len(instances))
|
a.log.Debug("Full state sync done", "duration", time.Since(startTime), "instances", len(instances))
|
||||||
|
if a.metrics != nil {
|
||||||
|
a.metrics.StateFullSyncDuration.Observe(time.Since(startTime).Seconds())
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ func TestAsyncStatePersister_Async(t *testing.T) {
|
|||||||
store := &FakeInstanceStore{}
|
store := &FakeInstanceStore{}
|
||||||
logger := log.New("async.test")
|
logger := log.New("async.test")
|
||||||
|
|
||||||
persister := NewAsyncStatePersister(logger, ManagerCfg{
|
persister := NewAsyncStatePersister(logger, mockClock.Ticker(1*time.Second), ManagerCfg{
|
||||||
InstanceStore: store,
|
InstanceStore: store,
|
||||||
})
|
})
|
||||||
|
|
||||||
@ -28,11 +28,9 @@ func TestAsyncStatePersister_Async(t *testing.T) {
|
|||||||
cancel()
|
cancel()
|
||||||
}()
|
}()
|
||||||
|
|
||||||
ticker := mockClock.Ticker(1 * time.Second)
|
|
||||||
|
|
||||||
cache := newCache()
|
cache := newCache()
|
||||||
|
|
||||||
go persister.Async(ctx, ticker, cache)
|
go persister.Async(ctx, cache)
|
||||||
|
|
||||||
cache.set(&State{
|
cache.set(&State{
|
||||||
OrgID: 1,
|
OrgID: 1,
|
||||||
@ -52,17 +50,15 @@ func TestAsyncStatePersister_Async(t *testing.T) {
|
|||||||
store := &FakeInstanceStore{}
|
store := &FakeInstanceStore{}
|
||||||
logger := log.New("async.test")
|
logger := log.New("async.test")
|
||||||
|
|
||||||
persister := NewAsyncStatePersister(logger, ManagerCfg{
|
persister := NewAsyncStatePersister(logger, mockClock.Ticker(1*time.Second), ManagerCfg{
|
||||||
InstanceStore: store,
|
InstanceStore: store,
|
||||||
})
|
})
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
|
||||||
ticker := mockClock.Ticker(1 * time.Second)
|
|
||||||
|
|
||||||
cache := newCache()
|
cache := newCache()
|
||||||
|
|
||||||
go persister.Async(ctx, ticker, cache)
|
go persister.Async(ctx, cache)
|
||||||
|
|
||||||
cache.set(&State{
|
cache.set(&State{
|
||||||
OrgID: 1,
|
OrgID: 1,
|
||||||
|
@ -3,13 +3,12 @@ package state
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
|
||||||
"github.com/benbjohnson/clock"
|
|
||||||
"go.opentelemetry.io/otel/trace"
|
"go.opentelemetry.io/otel/trace"
|
||||||
)
|
)
|
||||||
|
|
||||||
type NoopPersister struct{}
|
type NoopPersister struct{}
|
||||||
|
|
||||||
func (n *NoopPersister) Async(_ context.Context, _ *clock.Ticker, _ *cache) {}
|
func (n *NoopPersister) Async(_ context.Context, _ *cache) {}
|
||||||
func (n *NoopPersister) Sync(_ context.Context, _ trace.Span, _, _ []StateTransition) {}
|
func (n *NoopPersister) Sync(_ context.Context, _ trace.Span, _, _ []StateTransition) {}
|
||||||
|
|
||||||
func NewNoopPersister() StatePersister {
|
func NewNoopPersister() StatePersister {
|
||||||
|
@ -4,7 +4,6 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/benbjohnson/clock"
|
|
||||||
"github.com/grafana/dskit/concurrency"
|
"github.com/grafana/dskit/concurrency"
|
||||||
"go.opentelemetry.io/otel/attribute"
|
"go.opentelemetry.io/otel/attribute"
|
||||||
"go.opentelemetry.io/otel/trace"
|
"go.opentelemetry.io/otel/trace"
|
||||||
@ -31,7 +30,7 @@ func NewSyncStatePersisiter(log log.Logger, cfg ManagerCfg) StatePersister {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *SyncStatePersister) Async(_ context.Context, _ *clock.Ticker, _ *cache) {
|
func (a *SyncStatePersister) Async(_ context.Context, _ *cache) {
|
||||||
a.log.Debug("Async: No-Op")
|
a.log.Debug("Async: No-Op")
|
||||||
}
|
}
|
||||||
func (a *SyncStatePersister) Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition) {
|
func (a *SyncStatePersister) Sync(ctx context.Context, span trace.Span, states, staleStates []StateTransition) {
|
||||||
|
@ -98,7 +98,8 @@ type UnifiedAlertingSettings struct {
|
|||||||
RemoteAlertmanager RemoteAlertmanagerSettings
|
RemoteAlertmanager RemoteAlertmanagerSettings
|
||||||
Upgrade UnifiedAlertingUpgradeSettings
|
Upgrade UnifiedAlertingUpgradeSettings
|
||||||
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
|
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
|
||||||
MaxStateSaveConcurrency int
|
MaxStateSaveConcurrency int
|
||||||
|
StatePeriodicSaveInterval time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// RemoteAlertmanagerSettings contains the configuration needed
|
// RemoteAlertmanagerSettings contains the configuration needed
|
||||||
@ -403,6 +404,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
|||||||
|
|
||||||
uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1)
|
uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1)
|
||||||
|
|
||||||
|
uaCfg.StatePeriodicSaveInterval, err = gtime.ParseDuration(valueAsString(ua, "state_periodic_save_interval", (time.Minute * 5).String()))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
upgrade := iniFile.Section("unified_alerting.upgrade")
|
upgrade := iniFile.Section("unified_alerting.upgrade")
|
||||||
uaCfgUpgrade := UnifiedAlertingUpgradeSettings{
|
uaCfgUpgrade := UnifiedAlertingUpgradeSettings{
|
||||||
CleanUpgrade: upgrade.Key("clean_upgrade").MustBool(false),
|
CleanUpgrade: upgrade.Key("clean_upgrade").MustBool(false),
|
||||||
|
Loading…
Reference in New Issue
Block a user