mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Limit and clean up old alert rules versions (#89754)
This commit is contained in:
parent
4ec75bcc60
commit
fbad76007d
@ -1346,6 +1346,11 @@ notification_log_retention = 5d
|
||||
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||
resolved_alert_retention = 15m
|
||||
|
||||
# Defines the limit of how many alert rule versions
|
||||
# should be stored in the database for each alert rule in an organization including the current one.
|
||||
# 0 value means no limit
|
||||
rule_version_record_limit = 0
|
||||
|
||||
[unified_alerting.screenshots]
|
||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||
# plugin, or set up Grafana to use a remote rendering service.
|
||||
@ -1560,8 +1565,8 @@ expire_time = 7
|
||||
#################################### Internal Grafana Metrics ############
|
||||
# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
|
||||
[metrics]
|
||||
enabled = true
|
||||
interval_seconds = 10
|
||||
enabled = true
|
||||
interval_seconds = 10
|
||||
# Disable total stats (stat_totals_*) metrics to be generated
|
||||
disable_total_stats = false
|
||||
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.
|
||||
|
@ -1335,6 +1335,11 @@
|
||||
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||
;resolved_alert_retention = 15m
|
||||
|
||||
# Defines the limit of how many alert rule versions
|
||||
# should be stored in the database for each alert rule in an organization including the current one.
|
||||
# 0 value means no limit
|
||||
;rule_version_record_limit = 0
|
||||
|
||||
[unified_alerting.screenshots]
|
||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||
# plugin, or set up Grafana to use a remote rendering service.
|
||||
|
@ -274,9 +274,64 @@ func (st DBstore) UpdateAlertRules(ctx context.Context, rules []ngmodels.UpdateR
|
||||
if _, err := sess.Insert(&ruleVersions); err != nil {
|
||||
return fmt.Errorf("failed to create new rule versions: %w", err)
|
||||
}
|
||||
|
||||
for _, rule := range ruleVersions {
|
||||
// delete old versions of alert rule
|
||||
_, err = st.deleteOldAlertRuleVersions(ctx, rule.RuleUID, rule.RuleOrgID, st.Cfg.RuleVersionRecordLimit)
|
||||
if err != nil {
|
||||
st.Logger.Warn("Failed to delete old alert rule versions", "org", rule.RuleOrgID, "rule", rule.RuleUID, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func (st DBstore) deleteOldAlertRuleVersions(ctx context.Context, ruleUID string, orgID int64, limit int) (int64, error) {
|
||||
if limit < 0 {
|
||||
return 0, fmt.Errorf("failed to delete old alert rule versions: limit is set to '%d' but needs to be > 0", limit)
|
||||
}
|
||||
|
||||
if limit < 1 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
var affectedRows int64
|
||||
err := st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
|
||||
highest := &alertRuleVersion{}
|
||||
ok, err := sess.Table("alert_rule_version").Desc("id").Where("rule_org_id = ?", orgID).Where("rule_uid = ?", ruleUID).Limit(1, limit).Get(highest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !ok {
|
||||
// No alert rule versions past the limit exist. Nothing to clean up.
|
||||
affectedRows = 0
|
||||
return nil
|
||||
}
|
||||
|
||||
res, err := sess.Exec(`
|
||||
DELETE FROM
|
||||
alert_rule_version
|
||||
WHERE
|
||||
rule_org_id = ? AND rule_uid = ?
|
||||
AND
|
||||
id <= ?
|
||||
`, orgID, ruleUID, highest.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rows, err := res.RowsAffected()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
affectedRows = rows
|
||||
if affectedRows > 0 {
|
||||
st.Logger.Info("Deleted old alert_rule_version(s)", "org", orgID, "limit", limit, "delete_count", affectedRows)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return affectedRows, err
|
||||
}
|
||||
|
||||
// preventIntermediateUniqueConstraintViolations prevents unique constraint violations caused by an intermediate update.
|
||||
@ -352,7 +407,7 @@ func newTitlesOverlapExisting(rules []ngmodels.UpdateRule) bool {
|
||||
|
||||
// CountInFolders is a handler for retrieving the number of alert rules of
|
||||
// specific organisation associated with a given namespace (parent folder).
|
||||
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, u identity.Requester) (int64, error) {
|
||||
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, _ identity.Requester) (int64, error) {
|
||||
if len(folderUIDs) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
@ -1472,3 +1472,132 @@ func setupFolderService(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, features
|
||||
|
||||
return testutil.SetupFolderService(t, cfg, sqlStore, dashboardStore, folderStore, inProcBus, features, &actest.FakeAccessControl{ExpectedEvaluate: true})
|
||||
}
|
||||
|
||||
func TestIntegration_AlertRuleVersionsCleanup(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test")
|
||||
}
|
||||
cfg := setting.NewCfg()
|
||||
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
|
||||
BaseInterval: time.Duration(rand.Int63n(100)+1) * time.Second,
|
||||
}
|
||||
sqlStore := db.InitTestDB(t)
|
||||
store := &DBstore{
|
||||
SQLStore: sqlStore,
|
||||
Cfg: cfg.UnifiedAlerting,
|
||||
FolderService: setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()),
|
||||
Logger: &logtest.Fake{},
|
||||
}
|
||||
generator := models.RuleGen
|
||||
generator = generator.With(generator.WithIntervalMatching(store.Cfg.BaseInterval), generator.WithUniqueOrgID())
|
||||
|
||||
t.Run("when calling the cleanup with fewer records than the limit all records should stay", func(t *testing.T) {
|
||||
alertingCfgSnapshot := cfg.UnifiedAlerting
|
||||
defer func() {
|
||||
cfg.UnifiedAlerting = alertingCfgSnapshot
|
||||
}()
|
||||
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 10}
|
||||
rule := createRule(t, store, generator)
|
||||
firstNewRule := models.CopyRule(rule)
|
||||
firstNewRule.Title = util.GenerateShortUID()
|
||||
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||
Existing: rule,
|
||||
New: *firstNewRule,
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
firstNewRule.Version = firstNewRule.Version + 1
|
||||
secondNewRule := models.CopyRule(firstNewRule)
|
||||
secondNewRule.Title = util.GenerateShortUID()
|
||||
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||
Existing: firstNewRule,
|
||||
New: *secondNewRule,
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
titleMap := map[string]bool{
|
||||
secondNewRule.Title: false,
|
||||
rule.Title: false,
|
||||
}
|
||||
|
||||
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
|
||||
alertRuleVersions := make([]*alertRuleVersion, 0)
|
||||
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, alertRuleVersions, 2)
|
||||
for _, value := range alertRuleVersions {
|
||||
assert.False(t, titleMap[value.Title])
|
||||
titleMap[value.Title] = true
|
||||
}
|
||||
assert.Equal(t, true, titleMap[firstNewRule.Title])
|
||||
assert.Equal(t, true, titleMap[secondNewRule.Title])
|
||||
return err
|
||||
})
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("only oldest records surpassing the limit should be deleted", func(t *testing.T) {
|
||||
alertingCfgSnapshot := cfg.UnifiedAlerting
|
||||
defer func() {
|
||||
cfg.UnifiedAlerting = alertingCfgSnapshot
|
||||
}()
|
||||
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 1}
|
||||
rule := createRule(t, store, generator)
|
||||
oldRule := models.CopyRule(rule)
|
||||
oldRule.Title = "old-record"
|
||||
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||
Existing: rule,
|
||||
New: *oldRule,
|
||||
}}) // first entry in `rule_version_history` table happens here
|
||||
require.NoError(t, err)
|
||||
|
||||
rule.Version = rule.Version + 1
|
||||
middleRule := models.CopyRule(rule)
|
||||
middleRule.Title = "middle-record"
|
||||
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||
Existing: rule,
|
||||
New: *middleRule,
|
||||
}}) //second entry in `rule_version_history` table happens here
|
||||
require.NoError(t, err)
|
||||
|
||||
rule.Version = rule.Version + 1
|
||||
newerRule := models.CopyRule(rule)
|
||||
newerRule.Title = "newer-record"
|
||||
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||
Existing: rule,
|
||||
New: *newerRule,
|
||||
}}) //second entry in `rule_version_history` table happens here
|
||||
require.NoError(t, err)
|
||||
|
||||
// only the `old-record` should be deleted since limit is set to 1 and there are total 2 records
|
||||
rowsAffected, err := store.deleteOldAlertRuleVersions(context.Background(), rule.UID, rule.OrgID, 1)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, int64(2), rowsAffected)
|
||||
|
||||
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
|
||||
var alertRuleVersions []*alertRuleVersion
|
||||
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, alertRuleVersions, 1)
|
||||
assert.Equal(t, "newer-record", alertRuleVersions[0].Title)
|
||||
return err
|
||||
})
|
||||
require.NoError(t, err)
|
||||
})
|
||||
|
||||
t.Run("limit set to 0 should not fail", func(t *testing.T) {
|
||||
count, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, 0)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, int64(0), count)
|
||||
})
|
||||
t.Run("limit set to negative should fail", func(t *testing.T) {
|
||||
_, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, -1)
|
||||
require.Error(t, err)
|
||||
})
|
||||
}
|
||||
|
@ -121,6 +121,11 @@ type UnifiedAlertingSettings struct {
|
||||
|
||||
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||
ResolvedAlertRetention time.Duration
|
||||
|
||||
// RuleVersionRecordLimit defines the limit of how many alert rule versions
|
||||
// should be stored in the database for each alert_rule in an organization including the current one.
|
||||
// 0 value means no limit
|
||||
RuleVersionRecordLimit int
|
||||
}
|
||||
|
||||
type RecordingRuleSettings struct {
|
||||
@ -455,6 +460,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
return err
|
||||
}
|
||||
|
||||
uaCfg.RuleVersionRecordLimit = ua.Key("rule_version_record_limit").MustInt(0)
|
||||
if uaCfg.RuleVersionRecordLimit < 0 {
|
||||
return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed")
|
||||
}
|
||||
|
||||
cfg.UnifiedAlerting = uaCfg
|
||||
return nil
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user