Alerting: Emit warning when creating or updating unusually large groups (#82279)

* Add config for limit of rules per rule group

* Warn when editing big groups through normal API

* Warn on prov api writes for groups

* Wire up comp root, tests

* Also add warning to state manager warm

* Drop unnecessary conversion
This commit is contained in:
Alexander Weaver 2024-02-13 08:29:03 -06:00 committed by GitHub
parent 556d531c8d
commit 99fa064576
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 70 additions and 2 deletions

View File

@ -1111,6 +1111,10 @@ global_file = 1000
# global limit of correlations
global_correlations = -1
# Limit of the number of alert rules per rule group.
# This is not strictly enforced yet, but will be enforced over time.
alerting_rule_group_rules = 100
#################################### Unified Alerting ####################
[unified_alerting]
# Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed when switching. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details.

View File

@ -1041,6 +1041,10 @@
# global limit of correlations
; global_correlations = -1
# Limit of the number of alert rules per rule group.
# This is not strictly enforced yet, but will be enforced over time.
;alerting_rule_group_rules = 100
#################################### Unified Alerting ####################
[unified_alerting]
#Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed.```

View File

@ -1632,7 +1632,7 @@ func createProvisioningSrvSutFromEnv(t *testing.T, env *testEnvironment) Provisi
contactPointService: provisioning.NewContactPointService(env.configs, env.secrets, env.prov, env.xact, receiverSvc, env.log),
templates: provisioning.NewTemplateService(env.configs, env.prov, env.xact, env.log),
muteTimings: provisioning.NewMuteTimingService(env.configs, env.prov, env.xact, env.log),
alertRules: provisioning.NewAlertRuleService(env.store, env.prov, env.dashboardService, env.quotas, env.xact, 60, 10, env.log),
alertRules: provisioning.NewAlertRuleService(env.store, env.prov, env.dashboardService, env.quotas, env.xact, 60, 10, 100, env.log),
}
}

View File

@ -251,6 +251,10 @@ func (srv RulerSrv) RoutePostNameRulesConfig(c *contextmodel.ReqContext, ruleGro
return toNamespaceErrorResponse(err)
}
if err := srv.checkGroupLimits(ruleGroupConfig); err != nil {
return ErrResp(http.StatusBadRequest, err, "")
}
rules, err := validateRuleGroup(&ruleGroupConfig, c.SignedInUser.GetOrgID(), namespace, srv.cfg)
if err != nil {
return ErrResp(http.StatusBadRequest, err, "")
@ -265,6 +269,18 @@ func (srv RulerSrv) RoutePostNameRulesConfig(c *contextmodel.ReqContext, ruleGro
return srv.updateAlertRulesInGroup(c, groupKey, rules)
}
func (srv RulerSrv) checkGroupLimits(group apimodels.PostableRuleGroupConfig) error {
if srv.cfg.RulesPerRuleGroupLimit > 0 && int64(len(group.Rules)) > srv.cfg.RulesPerRuleGroupLimit {
srv.log.Warn("Large rule group was edited. Large groups are discouraged and may be rejected in the future.",
"limit", srv.cfg.RulesPerRuleGroupLimit,
"actual", len(group.Rules),
"group", group.Name,
)
}
return nil
}
// updateAlertRulesInGroup calculates changes (rules to add,update,delete), verifies that the user is authorized to do the calculated changes and updates database.
// All operations are performed in a single transaction
func (srv RulerSrv) updateAlertRulesInGroup(c *contextmodel.ReqContext, groupKey ngmodels.AlertRuleGroupKey, rules []*ngmodels.AlertRuleWithOptionals) response.Response {

View File

@ -301,6 +301,7 @@ func (ng *AlertNG) init() error {
DoNotSaveNormalState: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoNormalState),
ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoDataErrorExecution),
MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
RulesPerRuleGroupLimit: ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
Tracer: ng.tracer,
Log: log.New("ngalert.state.manager"),
}
@ -330,7 +331,8 @@ func (ng *AlertNG) init() error {
muteTimingService := provisioning.NewMuteTimingService(ng.store, ng.store, ng.store, ng.Log)
alertRuleService := provisioning.NewAlertRuleService(ng.store, ng.store, ng.dashboardService, ng.QuotaService, ng.store,
int64(ng.Cfg.UnifiedAlerting.DefaultRuleEvaluationInterval.Seconds()),
int64(ng.Cfg.UnifiedAlerting.BaseInterval.Seconds()), ng.Log)
int64(ng.Cfg.UnifiedAlerting.BaseInterval.Seconds()),
ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit, ng.Log)
ng.api = &api.API{
Cfg: ng.Cfg,

View File

@ -17,6 +17,7 @@ import (
type AlertRuleService struct {
defaultIntervalSeconds int64
baseIntervalSeconds int64
rulesPerRuleGroupLimit int64
ruleStore RuleStore
provenanceStore ProvisioningStore
dashboardService dashboards.DashboardService
@ -32,10 +33,12 @@ func NewAlertRuleService(ruleStore RuleStore,
xact TransactionManager,
defaultIntervalSeconds int64,
baseIntervalSeconds int64,
rulesPerRuleGroupLimit int64,
log log.Logger) *AlertRuleService {
return &AlertRuleService{
defaultIntervalSeconds: defaultIntervalSeconds,
baseIntervalSeconds: baseIntervalSeconds,
rulesPerRuleGroupLimit: rulesPerRuleGroupLimit,
ruleStore: ruleStore,
provenanceStore: provenanceStore,
dashboardService: dashboardService,
@ -248,6 +251,10 @@ func (service *AlertRuleService) ReplaceRuleGroup(ctx context.Context, orgID int
}
}
if err := service.checkGroupLimits(group); err != nil {
return fmt.Errorf("write rejected due to exceeded limits: %w", err)
}
key := models.AlertRuleGroupKey{
OrgID: orgID,
NamespaceUID: group.FolderUID,
@ -533,3 +540,15 @@ func withoutNilAlertRules(ptrs []*models.AlertRule) []models.AlertRule {
}
return result
}
func (service *AlertRuleService) checkGroupLimits(group models.AlertRuleGroup) error {
if service.rulesPerRuleGroupLimit > 0 && int64(len(group.Rules)) > service.rulesPerRuleGroupLimit {
service.log.Warn("Large rule group was edited. Large groups are discouraged and may be rejected in the future.",
"limit", service.rulesPerRuleGroupLimit,
"actual", len(group.Rules),
"group", group.Title,
)
}
return nil
}

View File

@ -50,6 +50,7 @@ type Manager struct {
doNotSaveNormalState bool
applyNoDataAndErrorToAllStates bool
rulesPerRuleGroupLimit int64
persister StatePersister
}
@ -68,6 +69,7 @@ type ManagerCfg struct {
// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
ApplyNoDataAndErrorToAllStates bool
RulesPerRuleGroupLimit int64
Tracer tracing.Tracer
Log log.Logger
@ -92,6 +94,7 @@ func NewManager(cfg ManagerCfg, statePersister StatePersister) *Manager {
externalURL: cfg.ExternalURL,
doNotSaveNormalState: cfg.DoNotSaveNormalState,
applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
rulesPerRuleGroupLimit: cfg.RulesPerRuleGroupLimit,
persister: statePersister,
tracer: cfg.Tracer,
}
@ -134,8 +137,23 @@ func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
}
ruleByUID := make(map[string]*ngModels.AlertRule, len(alertRules))
groupSizes := make(map[string]int64)
for _, rule := range alertRules {
ruleByUID[rule.UID] = rule
groupSizes[rule.RuleGroup] += 1
}
// Emit a warning if we detect a large group.
// We will not enforce this here, but it's convenient to emit the warning here as we load up all the rules.
for name, size := range groupSizes {
if st.rulesPerRuleGroupLimit > 0 && size > st.rulesPerRuleGroupLimit {
st.log.Warn(
"Large rule group was loaded. Large groups are discouraged and changes to them may be disallowed in the future.",
"limit", st.rulesPerRuleGroupLimit,
"actual", size,
"group", name,
)
}
}
orgStates := make(map[string]*ruleStates, len(ruleByUID))

View File

@ -280,6 +280,7 @@ func (ps *ProvisioningServiceImpl) ProvisionAlerting(ctx context.Context) error
ps.SQLStore,
int64(ps.Cfg.UnifiedAlerting.DefaultRuleEvaluationInterval.Seconds()),
int64(ps.Cfg.UnifiedAlerting.BaseInterval.Seconds()),
ps.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
ps.log)
receiverSvc := alertingNotifier.NewReceiverService(ps.ac, &st, st, ps.secretService, ps.SQLStore, ps.log)
contactPointService := provisioning.NewContactPointService(&st, ps.secretService,

View File

@ -101,6 +101,7 @@ type UnifiedAlertingSettings struct {
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
MaxStateSaveConcurrency int
StatePeriodicSaveInterval time.Duration
RulesPerRuleGroupLimit int64
}
// RemoteAlertmanagerSettings contains the configuration needed
@ -352,6 +353,9 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
uaCfg.DefaultRuleEvaluationInterval = uaMinInterval
}
quotas := iniFile.Section("quota")
uaCfg.RulesPerRuleGroupLimit = quotas.Key("alerting_rule_group_rules").MustInt64(100)
remoteAlertmanager := iniFile.Section("remote.alertmanager")
uaCfgRemoteAM := RemoteAlertmanagerSettings{
Enable: remoteAlertmanager.Key("enabled").MustBool(false),