mirror of
https://github.com/grafana/grafana.git
synced 2024-11-21 16:38:03 -06:00
Alerting: Emit warning when creating or updating unusually large groups (#82279)
* Add config for limit of rules per rule group * Warn when editing big groups through normal API * Warn on prov api writes for groups * Wire up comp root, tests * Also add warning to state manager warm * Drop unnecessary conversion
This commit is contained in:
parent
556d531c8d
commit
99fa064576
@ -1111,6 +1111,10 @@ global_file = 1000
|
||||
# global limit of correlations
|
||||
global_correlations = -1
|
||||
|
||||
# Limit of the number of alert rules per rule group.
|
||||
# This is not strictly enforced yet, but will be enforced over time.
|
||||
alerting_rule_group_rules = 100
|
||||
|
||||
#################################### Unified Alerting ####################
|
||||
[unified_alerting]
|
||||
# Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed when switching. When this configuration section and flag are not defined, the state is defined at runtime. See the documentation for more details.
|
||||
|
@ -1041,6 +1041,10 @@
|
||||
# global limit of correlations
|
||||
; global_correlations = -1
|
||||
|
||||
# Limit of the number of alert rules per rule group.
|
||||
# This is not strictly enforced yet, but will be enforced over time.
|
||||
;alerting_rule_group_rules = 100
|
||||
|
||||
#################################### Unified Alerting ####################
|
||||
[unified_alerting]
|
||||
#Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed.```
|
||||
|
@ -1632,7 +1632,7 @@ func createProvisioningSrvSutFromEnv(t *testing.T, env *testEnvironment) Provisi
|
||||
contactPointService: provisioning.NewContactPointService(env.configs, env.secrets, env.prov, env.xact, receiverSvc, env.log),
|
||||
templates: provisioning.NewTemplateService(env.configs, env.prov, env.xact, env.log),
|
||||
muteTimings: provisioning.NewMuteTimingService(env.configs, env.prov, env.xact, env.log),
|
||||
alertRules: provisioning.NewAlertRuleService(env.store, env.prov, env.dashboardService, env.quotas, env.xact, 60, 10, env.log),
|
||||
alertRules: provisioning.NewAlertRuleService(env.store, env.prov, env.dashboardService, env.quotas, env.xact, 60, 10, 100, env.log),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -251,6 +251,10 @@ func (srv RulerSrv) RoutePostNameRulesConfig(c *contextmodel.ReqContext, ruleGro
|
||||
return toNamespaceErrorResponse(err)
|
||||
}
|
||||
|
||||
if err := srv.checkGroupLimits(ruleGroupConfig); err != nil {
|
||||
return ErrResp(http.StatusBadRequest, err, "")
|
||||
}
|
||||
|
||||
rules, err := validateRuleGroup(&ruleGroupConfig, c.SignedInUser.GetOrgID(), namespace, srv.cfg)
|
||||
if err != nil {
|
||||
return ErrResp(http.StatusBadRequest, err, "")
|
||||
@ -265,6 +269,18 @@ func (srv RulerSrv) RoutePostNameRulesConfig(c *contextmodel.ReqContext, ruleGro
|
||||
return srv.updateAlertRulesInGroup(c, groupKey, rules)
|
||||
}
|
||||
|
||||
func (srv RulerSrv) checkGroupLimits(group apimodels.PostableRuleGroupConfig) error {
|
||||
if srv.cfg.RulesPerRuleGroupLimit > 0 && int64(len(group.Rules)) > srv.cfg.RulesPerRuleGroupLimit {
|
||||
srv.log.Warn("Large rule group was edited. Large groups are discouraged and may be rejected in the future.",
|
||||
"limit", srv.cfg.RulesPerRuleGroupLimit,
|
||||
"actual", len(group.Rules),
|
||||
"group", group.Name,
|
||||
)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// updateAlertRulesInGroup calculates changes (rules to add,update,delete), verifies that the user is authorized to do the calculated changes and updates database.
|
||||
// All operations are performed in a single transaction
|
||||
func (srv RulerSrv) updateAlertRulesInGroup(c *contextmodel.ReqContext, groupKey ngmodels.AlertRuleGroupKey, rules []*ngmodels.AlertRuleWithOptionals) response.Response {
|
||||
|
@ -301,6 +301,7 @@ func (ng *AlertNG) init() error {
|
||||
DoNotSaveNormalState: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoNormalState),
|
||||
ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoDataErrorExecution),
|
||||
MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
|
||||
RulesPerRuleGroupLimit: ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
|
||||
Tracer: ng.tracer,
|
||||
Log: log.New("ngalert.state.manager"),
|
||||
}
|
||||
@ -330,7 +331,8 @@ func (ng *AlertNG) init() error {
|
||||
muteTimingService := provisioning.NewMuteTimingService(ng.store, ng.store, ng.store, ng.Log)
|
||||
alertRuleService := provisioning.NewAlertRuleService(ng.store, ng.store, ng.dashboardService, ng.QuotaService, ng.store,
|
||||
int64(ng.Cfg.UnifiedAlerting.DefaultRuleEvaluationInterval.Seconds()),
|
||||
int64(ng.Cfg.UnifiedAlerting.BaseInterval.Seconds()), ng.Log)
|
||||
int64(ng.Cfg.UnifiedAlerting.BaseInterval.Seconds()),
|
||||
ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit, ng.Log)
|
||||
|
||||
ng.api = &api.API{
|
||||
Cfg: ng.Cfg,
|
||||
|
@ -17,6 +17,7 @@ import (
|
||||
type AlertRuleService struct {
|
||||
defaultIntervalSeconds int64
|
||||
baseIntervalSeconds int64
|
||||
rulesPerRuleGroupLimit int64
|
||||
ruleStore RuleStore
|
||||
provenanceStore ProvisioningStore
|
||||
dashboardService dashboards.DashboardService
|
||||
@ -32,10 +33,12 @@ func NewAlertRuleService(ruleStore RuleStore,
|
||||
xact TransactionManager,
|
||||
defaultIntervalSeconds int64,
|
||||
baseIntervalSeconds int64,
|
||||
rulesPerRuleGroupLimit int64,
|
||||
log log.Logger) *AlertRuleService {
|
||||
return &AlertRuleService{
|
||||
defaultIntervalSeconds: defaultIntervalSeconds,
|
||||
baseIntervalSeconds: baseIntervalSeconds,
|
||||
rulesPerRuleGroupLimit: rulesPerRuleGroupLimit,
|
||||
ruleStore: ruleStore,
|
||||
provenanceStore: provenanceStore,
|
||||
dashboardService: dashboardService,
|
||||
@ -248,6 +251,10 @@ func (service *AlertRuleService) ReplaceRuleGroup(ctx context.Context, orgID int
|
||||
}
|
||||
}
|
||||
|
||||
if err := service.checkGroupLimits(group); err != nil {
|
||||
return fmt.Errorf("write rejected due to exceeded limits: %w", err)
|
||||
}
|
||||
|
||||
key := models.AlertRuleGroupKey{
|
||||
OrgID: orgID,
|
||||
NamespaceUID: group.FolderUID,
|
||||
@ -533,3 +540,15 @@ func withoutNilAlertRules(ptrs []*models.AlertRule) []models.AlertRule {
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (service *AlertRuleService) checkGroupLimits(group models.AlertRuleGroup) error {
|
||||
if service.rulesPerRuleGroupLimit > 0 && int64(len(group.Rules)) > service.rulesPerRuleGroupLimit {
|
||||
service.log.Warn("Large rule group was edited. Large groups are discouraged and may be rejected in the future.",
|
||||
"limit", service.rulesPerRuleGroupLimit,
|
||||
"actual", len(group.Rules),
|
||||
"group", group.Title,
|
||||
)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
@ -50,6 +50,7 @@ type Manager struct {
|
||||
|
||||
doNotSaveNormalState bool
|
||||
applyNoDataAndErrorToAllStates bool
|
||||
rulesPerRuleGroupLimit int64
|
||||
|
||||
persister StatePersister
|
||||
}
|
||||
@ -68,6 +69,7 @@ type ManagerCfg struct {
|
||||
// ApplyNoDataAndErrorToAllStates makes state manager to apply exceptional results (NoData and Error)
|
||||
// to all states when corresponding execution in the rule definition is set to either `Alerting` or `OK`
|
||||
ApplyNoDataAndErrorToAllStates bool
|
||||
RulesPerRuleGroupLimit int64
|
||||
|
||||
Tracer tracing.Tracer
|
||||
Log log.Logger
|
||||
@ -92,6 +94,7 @@ func NewManager(cfg ManagerCfg, statePersister StatePersister) *Manager {
|
||||
externalURL: cfg.ExternalURL,
|
||||
doNotSaveNormalState: cfg.DoNotSaveNormalState,
|
||||
applyNoDataAndErrorToAllStates: cfg.ApplyNoDataAndErrorToAllStates,
|
||||
rulesPerRuleGroupLimit: cfg.RulesPerRuleGroupLimit,
|
||||
persister: statePersister,
|
||||
tracer: cfg.Tracer,
|
||||
}
|
||||
@ -134,8 +137,23 @@ func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
|
||||
}
|
||||
|
||||
ruleByUID := make(map[string]*ngModels.AlertRule, len(alertRules))
|
||||
groupSizes := make(map[string]int64)
|
||||
for _, rule := range alertRules {
|
||||
ruleByUID[rule.UID] = rule
|
||||
groupSizes[rule.RuleGroup] += 1
|
||||
}
|
||||
|
||||
// Emit a warning if we detect a large group.
|
||||
// We will not enforce this here, but it's convenient to emit the warning here as we load up all the rules.
|
||||
for name, size := range groupSizes {
|
||||
if st.rulesPerRuleGroupLimit > 0 && size > st.rulesPerRuleGroupLimit {
|
||||
st.log.Warn(
|
||||
"Large rule group was loaded. Large groups are discouraged and changes to them may be disallowed in the future.",
|
||||
"limit", st.rulesPerRuleGroupLimit,
|
||||
"actual", size,
|
||||
"group", name,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
orgStates := make(map[string]*ruleStates, len(ruleByUID))
|
||||
|
@ -280,6 +280,7 @@ func (ps *ProvisioningServiceImpl) ProvisionAlerting(ctx context.Context) error
|
||||
ps.SQLStore,
|
||||
int64(ps.Cfg.UnifiedAlerting.DefaultRuleEvaluationInterval.Seconds()),
|
||||
int64(ps.Cfg.UnifiedAlerting.BaseInterval.Seconds()),
|
||||
ps.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
|
||||
ps.log)
|
||||
receiverSvc := alertingNotifier.NewReceiverService(ps.ac, &st, st, ps.secretService, ps.SQLStore, ps.log)
|
||||
contactPointService := provisioning.NewContactPointService(&st, ps.secretService,
|
||||
|
@ -101,6 +101,7 @@ type UnifiedAlertingSettings struct {
|
||||
// MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel.
|
||||
MaxStateSaveConcurrency int
|
||||
StatePeriodicSaveInterval time.Duration
|
||||
RulesPerRuleGroupLimit int64
|
||||
}
|
||||
|
||||
// RemoteAlertmanagerSettings contains the configuration needed
|
||||
@ -352,6 +353,9 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
uaCfg.DefaultRuleEvaluationInterval = uaMinInterval
|
||||
}
|
||||
|
||||
quotas := iniFile.Section("quota")
|
||||
uaCfg.RulesPerRuleGroupLimit = quotas.Key("alerting_rule_group_rules").MustInt64(100)
|
||||
|
||||
remoteAlertmanager := iniFile.Section("remote.alertmanager")
|
||||
uaCfgRemoteAM := RemoteAlertmanagerSettings{
|
||||
Enable: remoteAlertmanager.Key("enabled").MustBool(false),
|
||||
|
Loading…
Reference in New Issue
Block a user