From fbad76007d88490d04b1bcf087ffaa0cd0c04a23 Mon Sep 17 00:00:00 2001
From: Brandon <2858207+igloo12@users.noreply.github.com>
Date: Fri, 4 Oct 2024 14:31:21 -0700
Subject: [PATCH] Alerting: Limit and clean up old alert rules versions (#89754)

---
 conf/defaults.ini                             |   9 +-
 conf/sample.ini                               |   5 +
 pkg/services/ngalert/store/alert_rule.go      |  57 +++++++-
 pkg/services/ngalert/store/alert_rule_test.go | 129 ++++++++++++++++++
 pkg/setting/setting_unified_alerting.go       |  10 ++
 5 files changed, 207 insertions(+), 3 deletions(-)

diff --git a/conf/defaults.ini b/conf/defaults.ini
index cb3965cc554..662e300834f 100644
--- a/conf/defaults.ini
+++ b/conf/defaults.ini
@@ -1346,6 +1346,11 @@ notification_log_retention = 5d
 # Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
 resolved_alert_retention = 15m
 
+# Defines how many alert rule versions are kept in the database
+# for each alert rule in an organization, including the current version.
+# A value of 0 means no limit.
+rule_version_record_limit = 0
+
 [unified_alerting.screenshots]
 # Enable screenshots in notifications. You must have either installed the Grafana image rendering
 # plugin, or set up Grafana to use a remote rendering service.
@@ -1560,8 +1565,8 @@ expire_time = 7
 #################################### Internal Grafana Metrics ############
 # Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
 [metrics]
-enabled = true
-interval_seconds = 10
+enabled = true
+interval_seconds = 10
 # Disable total stats (stat_totals_*) metrics to be generated
 disable_total_stats = false
 # The interval at which the total stats collector will update the stats. Default is 1800 seconds.
diff --git a/conf/sample.ini b/conf/sample.ini
index 099de883121..366b04d884a 100644
--- a/conf/sample.ini
+++ b/conf/sample.ini
@@ -1335,6 +1335,11 @@
 # Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
 ;resolved_alert_retention = 15m
 
+# Defines how many alert rule versions are kept in the database
+# for each alert rule in an organization, including the current version.
+# A value of 0 means no limit.
+;rule_version_record_limit = 0
+
 [unified_alerting.screenshots]
 # Enable screenshots in notifications. You must have either installed the Grafana image rendering
 # plugin, or set up Grafana to use a remote rendering service.
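The new option sits alongside the other retention settings that ReadUnifiedAlertingSettings (further below) reads from the unified alerting section. As a rough illustration only — the value 100 and the uncommented form are examples, not something this patch sets — an operator who wants to cap the stored history could configure:

    [unified_alerting]
    # Keep at most the 100 newest versions of each alert rule (including the
    # current one); older versions are removed whenever the rule is updated.
    # 0, the default, keeps everything.
    rule_version_record_limit = 100

The store changes below are what actually enforce this value on every rule update.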
diff --git a/pkg/services/ngalert/store/alert_rule.go b/pkg/services/ngalert/store/alert_rule.go
index 93c19037b15..a09f1dcc144 100644
--- a/pkg/services/ngalert/store/alert_rule.go
+++ b/pkg/services/ngalert/store/alert_rule.go
@@ -274,9 +274,64 @@ func (st DBstore) UpdateAlertRules(ctx context.Context, rules []ngmodels.UpdateR
             if _, err := sess.Insert(&ruleVersions); err != nil {
                 return fmt.Errorf("failed to create new rule versions: %w", err)
             }
+
+            for _, rule := range ruleVersions {
+                // delete old versions of alert rule
+                _, err = st.deleteOldAlertRuleVersions(ctx, rule.RuleUID, rule.RuleOrgID, st.Cfg.RuleVersionRecordLimit)
+                if err != nil {
+                    st.Logger.Warn("Failed to delete old alert rule versions", "org", rule.RuleOrgID, "rule", rule.RuleUID, "error", err)
+                }
+            }
+        }
+
+        return nil
+    })
+}
+
+func (st DBstore) deleteOldAlertRuleVersions(ctx context.Context, ruleUID string, orgID int64, limit int) (int64, error) {
+    if limit < 0 {
+        return 0, fmt.Errorf("failed to delete old alert rule versions: limit is set to '%d' but needs to be >= 0", limit)
+    }
+
+    if limit < 1 {
+        return 0, nil
+    }
+
+    var affectedRows int64
+    err := st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
+        highest := &alertRuleVersion{}
+        ok, err := sess.Table("alert_rule_version").Desc("id").Where("rule_org_id = ?", orgID).Where("rule_uid = ?", ruleUID).Limit(1, limit).Get(highest)
+        if err != nil {
+            return err
+        }
+        if !ok {
+            // No alert rule versions past the limit exist. Nothing to clean up.
+            affectedRows = 0
+            return nil
+        }
+
+        res, err := sess.Exec(`
+            DELETE FROM
+                alert_rule_version
+            WHERE
+                rule_org_id = ? AND rule_uid = ?
+            AND
+                id <= ?
+        `, orgID, ruleUID, highest.ID)
+        if err != nil {
+            return err
+        }
+        rows, err := res.RowsAffected()
+        if err != nil {
+            return err
+        }
+        affectedRows = rows
+        if affectedRows > 0 {
+            st.Logger.Info("Deleted old alert_rule_version(s)", "org", orgID, "limit", limit, "delete_count", affectedRows)
         }
         return nil
     })
+    return affectedRows, err
+}
 
 // preventIntermediateUniqueConstraintViolations prevents unique constraint violations caused by an intermediate update.
@@ -352,7 +407,7 @@ func newTitlesOverlapExisting(rules []ngmodels.UpdateRule) bool {
 
 // CountInFolder is a handler for retrieving the number of alert rules of
 // specific organisation associated with a given namespace (parent folder).
-func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, u identity.Requester) (int64, error) {
+func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, _ identity.Requester) (int64, error) {
     if len(folderUIDs) == 0 {
         return 0, nil
     }
diff --git a/pkg/services/ngalert/store/alert_rule_test.go b/pkg/services/ngalert/store/alert_rule_test.go
index 2969f24d56b..df2cb83ddbc 100644
--- a/pkg/services/ngalert/store/alert_rule_test.go
+++ b/pkg/services/ngalert/store/alert_rule_test.go
@@ -1472,3 +1472,132 @@ func setupFolderService(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, features
 	return testutil.SetupFolderService(t, cfg, sqlStore, dashboardStore, folderStore, inProcBus, features, &actest.FakeAccessControl{ExpectedEvaluate: true})
 }
+
+func TestIntegration_AlertRuleVersionsCleanup(t *testing.T) {
+    if testing.Short() {
+        t.Skip("skipping integration test")
+    }
+    cfg := setting.NewCfg()
+    cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
+        BaseInterval: time.Duration(rand.Int63n(100)+1) * time.Second,
+    }
+    sqlStore := db.InitTestDB(t)
+    store := &DBstore{
+        SQLStore:      sqlStore,
+        Cfg:           cfg.UnifiedAlerting,
+        FolderService: setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()),
+        Logger:        &logtest.Fake{},
+    }
+    generator := models.RuleGen
+    generator = generator.With(generator.WithIntervalMatching(store.Cfg.BaseInterval), generator.WithUniqueOrgID())
+
+    t.Run("when calling the cleanup with fewer records than the limit all records should stay", func(t *testing.T) {
+        alertingCfgSnapshot := cfg.UnifiedAlerting
+        defer func() {
+            cfg.UnifiedAlerting = alertingCfgSnapshot
+        }()
+        cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 10}
+        rule := createRule(t, store, generator)
+        firstNewRule := models.CopyRule(rule)
+        firstNewRule.Title = util.GenerateShortUID()
+        err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
+            Existing: rule,
+            New:      *firstNewRule,
+        },
+        })
+        require.NoError(t, err)
+        firstNewRule.Version = firstNewRule.Version + 1
+        secondNewRule := models.CopyRule(firstNewRule)
+        secondNewRule.Title = util.GenerateShortUID()
+        err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
+            Existing: firstNewRule,
+            New:      *secondNewRule,
+        },
+        })
+        require.NoError(t, err)
+        titleMap := map[string]bool{
+            secondNewRule.Title: false,
+            rule.Title:          false,
+        }
+
+        err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
+            alertRuleVersions := make([]*alertRuleVersion, 0)
+            err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
+            if err != nil {
+                return err
+            }
+            require.NoError(t, err)
+            assert.Len(t, alertRuleVersions, 2)
+            for _, value := range alertRuleVersions {
+                assert.False(t, titleMap[value.Title])
+                titleMap[value.Title] = true
+            }
+            assert.Equal(t, true, titleMap[firstNewRule.Title])
+            assert.Equal(t, true, titleMap[secondNewRule.Title])
+            return err
+        })
+        require.NoError(t, err)
+    })
+
+    t.Run("only oldest records surpassing the limit should be deleted", func(t *testing.T) {
+        alertingCfgSnapshot := cfg.UnifiedAlerting
+        defer func() {
+            cfg.UnifiedAlerting = alertingCfgSnapshot
+        }()
+        cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 1}
+        rule := createRule(t, store, generator)
+        oldRule := models.CopyRule(rule)
+        oldRule.Title = "old-record"
+        err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
+            Existing: rule,
+            New:      *oldRule,
+        }}) // first entry in the `alert_rule_version` table happens here
+        require.NoError(t, err)
+
+        rule.Version = rule.Version + 1
+        middleRule := models.CopyRule(rule)
+        middleRule.Title = "middle-record"
+        err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
+            Existing: rule,
+            New:      *middleRule,
+        }}) // second entry in the `alert_rule_version` table happens here
+        require.NoError(t, err)
+
+        rule.Version = rule.Version + 1
+        newerRule := models.CopyRule(rule)
+        newerRule.Title = "newer-record"
+        err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
+            Existing: rule,
+            New:      *newerRule,
+        }}) // third entry in the `alert_rule_version` table happens here
+        require.NoError(t, err)
+
+        // with the limit set to 1, the two oldest records (`old-record` and `middle-record`) should be deleted and only `newer-record` kept
+        rowsAffected, err := store.deleteOldAlertRuleVersions(context.Background(), rule.UID, rule.OrgID, 1)
+        require.NoError(t, err)
+        require.Equal(t, int64(2), rowsAffected)
+
+        err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
+            var alertRuleVersions []*alertRuleVersion
+            err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
+            if err != nil {
+                return err
+            }
+            require.NoError(t, err)
+            assert.Len(t, alertRuleVersions, 1)
+            assert.Equal(t, "newer-record", alertRuleVersions[0].Title)
+            return err
+        })
+        require.NoError(t, err)
+    })
+
+    t.Run("limit set to 0 should not fail", func(t *testing.T) {
+        count, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, 0)
+        require.NoError(t, err)
+        require.Equal(t, int64(0), count)
+    })
+    t.Run("limit set to negative should fail", func(t *testing.T) {
+        _, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, -1)
+        require.Error(t, err)
+    })
+}
diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go
index 3de83db8ce5..8fe863247b7 100644
--- a/pkg/setting/setting_unified_alerting.go
+++ b/pkg/setting/setting_unified_alerting.go
@@ -121,6 +121,11 @@ type UnifiedAlertingSettings struct {
 
 	// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
 	ResolvedAlertRetention time.Duration
+
+	// RuleVersionRecordLimit defines how many alert rule versions are kept in the database
+	// for each alert rule in an organization, including the current version.
+	// A value of 0 means no limit.
+	RuleVersionRecordLimit int
 }
 
 type RecordingRuleSettings struct {
@@ -455,6 +460,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
 		return err
 	}
 
+	uaCfg.RuleVersionRecordLimit = ua.Key("rule_version_record_limit").MustInt(0)
+	if uaCfg.RuleVersionRecordLimit < 0 {
+		return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer is allowed")
+	}
+
 	cfg.UnifiedAlerting = uaCfg
 	return nil
 }
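
For readers who want the cleanup logic separated from the xorm session plumbing above, here is a minimal standalone sketch of the same keep-newest-N technique using database/sql. It is not part of the patch: the package name, function name, bare *sql.DB handle, and "?" placeholder style are illustrative assumptions; only the table and column names mirror the alert_rule_version schema used above. Note that xorm's Limit(1, limit) in deleteOldAlertRuleVersions corresponds to LIMIT 1 OFFSET limit here: it selects the newest version that already falls outside the keep window, and everything with an id at or below it is deleted.

package versioncleanup

import (
	"context"
	"database/sql"
	"errors"
	"fmt"
)

// deleteOldVersions keeps the `limit` newest version rows for one rule and
// deletes the rest. A limit of 0 means "no limit" and is a no-op, mirroring
// the behaviour of deleteOldAlertRuleVersions in the patch.
func deleteOldVersions(ctx context.Context, db *sql.DB, orgID int64, ruleUID string, limit int) (int64, error) {
	if limit < 0 {
		return 0, fmt.Errorf("limit must be >= 0, got %d", limit)
	}
	if limit == 0 {
		return 0, nil
	}

	// Newest id that already falls outside the keep window:
	// LIMIT 1 OFFSET limit is what xorm's Limit(1, limit) expands to.
	var cutoff int64
	err := db.QueryRowContext(ctx, `
		SELECT id FROM alert_rule_version
		WHERE rule_org_id = ? AND rule_uid = ?
		ORDER BY id DESC
		LIMIT 1 OFFSET ?`, orgID, ruleUID, limit).Scan(&cutoff)
	if errors.Is(err, sql.ErrNoRows) {
		return 0, nil // fewer than limit+1 versions stored: nothing to clean up
	}
	if err != nil {
		return 0, err
	}

	// Everything at or below the cutoff is an old version; delete it.
	res, err := db.ExecContext(ctx, `
		DELETE FROM alert_rule_version
		WHERE rule_org_id = ? AND rule_uid = ? AND id <= ?`, orgID, ruleUID, cutoff)
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}

The real implementation additionally runs inside the store's database session, is invoked per rule from UpdateAlertRules, and logs the delete count; the sketch omits that plumbing to keep the two queries visible.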