mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Limit and clean up old alert rules versions (#89754)
This commit is contained in:
parent
4ec75bcc60
commit
fbad76007d
@ -1346,6 +1346,11 @@ notification_log_retention = 5d
|
|||||||
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||||
resolved_alert_retention = 15m
|
resolved_alert_retention = 15m
|
||||||
|
|
||||||
|
# Defines the limit of how many alert rule versions
|
||||||
|
# should be stored in the database for each alert rule in an organization including the current one.
|
||||||
|
# A value of 0 means no limit (old versions are never cleaned up).
|
||||||
|
rule_version_record_limit = 0
|
||||||
|
|
||||||
[unified_alerting.screenshots]
|
[unified_alerting.screenshots]
|
||||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||||
# plugin, or set up Grafana to use a remote rendering service.
|
# plugin, or set up Grafana to use a remote rendering service.
|
||||||
@ -1560,8 +1565,8 @@ expire_time = 7
|
|||||||
#################################### Internal Grafana Metrics ############
|
#################################### Internal Grafana Metrics ############
|
||||||
# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
|
# Metrics available at HTTP URL /metrics and /metrics/plugins/:pluginId
|
||||||
[metrics]
|
[metrics]
|
||||||
enabled = true
|
enabled = true
|
||||||
interval_seconds = 10
|
interval_seconds = 10
|
||||||
# Disable total stats (stat_totals_*) metrics to be generated
|
# Disable total stats (stat_totals_*) metrics to be generated
|
||||||
disable_total_stats = false
|
disable_total_stats = false
|
||||||
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.
|
# The interval at which the total stats collector will update the stats. Default is 1800 seconds.
|
||||||
|
@ -1335,6 +1335,11 @@
|
|||||||
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||||
;resolved_alert_retention = 15m
|
;resolved_alert_retention = 15m
|
||||||
|
|
||||||
|
# Defines the limit of how many alert rule versions
|
||||||
|
# should be stored in the database for each alert rule in an organization including the current one.
|
||||||
|
# A value of 0 means no limit (old versions are never cleaned up).
|
||||||
|
;rule_version_record_limit = 0
|
||||||
|
|
||||||
[unified_alerting.screenshots]
|
[unified_alerting.screenshots]
|
||||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||||
# plugin, or set up Grafana to use a remote rendering service.
|
# plugin, or set up Grafana to use a remote rendering service.
|
||||||
|
@ -274,9 +274,64 @@ func (st DBstore) UpdateAlertRules(ctx context.Context, rules []ngmodels.UpdateR
|
|||||||
if _, err := sess.Insert(&ruleVersions); err != nil {
|
if _, err := sess.Insert(&ruleVersions); err != nil {
|
||||||
return fmt.Errorf("failed to create new rule versions: %w", err)
|
return fmt.Errorf("failed to create new rule versions: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, rule := range ruleVersions {
|
||||||
|
// delete old versions of alert rule
|
||||||
|
_, err = st.deleteOldAlertRuleVersions(ctx, rule.RuleUID, rule.RuleOrgID, st.Cfg.RuleVersionRecordLimit)
|
||||||
|
if err != nil {
|
||||||
|
st.Logger.Warn("Failed to delete old alert rule versions", "org", rule.RuleOrgID, "rule", rule.RuleUID, "error", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (st DBstore) deleteOldAlertRuleVersions(ctx context.Context, ruleUID string, orgID int64, limit int) (int64, error) {
|
||||||
|
if limit < 0 {
|
||||||
|
return 0, fmt.Errorf("failed to delete old alert rule versions: limit is set to '%d' but needs to be > 0", limit)
|
||||||
|
}
|
||||||
|
|
||||||
|
if limit < 1 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var affectedRows int64
|
||||||
|
err := st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error {
|
||||||
|
highest := &alertRuleVersion{}
|
||||||
|
ok, err := sess.Table("alert_rule_version").Desc("id").Where("rule_org_id = ?", orgID).Where("rule_uid = ?", ruleUID).Limit(1, limit).Get(highest)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
// No alert rule versions past the limit exist. Nothing to clean up.
|
||||||
|
affectedRows = 0
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
res, err := sess.Exec(`
|
||||||
|
DELETE FROM
|
||||||
|
alert_rule_version
|
||||||
|
WHERE
|
||||||
|
rule_org_id = ? AND rule_uid = ?
|
||||||
|
AND
|
||||||
|
id <= ?
|
||||||
|
`, orgID, ruleUID, highest.ID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
rows, err := res.RowsAffected()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
affectedRows = rows
|
||||||
|
if affectedRows > 0 {
|
||||||
|
st.Logger.Info("Deleted old alert_rule_version(s)", "org", orgID, "limit", limit, "delete_count", affectedRows)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
return affectedRows, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// preventIntermediateUniqueConstraintViolations prevents unique constraint violations caused by an intermediate update.
|
// preventIntermediateUniqueConstraintViolations prevents unique constraint violations caused by an intermediate update.
|
||||||
@ -352,7 +407,7 @@ func newTitlesOverlapExisting(rules []ngmodels.UpdateRule) bool {
|
|||||||
|
|
||||||
// CountInFolder is a handler for retrieving the number of alert rules of
|
// CountInFolder is a handler for retrieving the number of alert rules of
|
||||||
// specific organisation associated with a given namespace (parent folder).
|
// specific organisation associated with a given namespace (parent folder).
|
||||||
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, u identity.Requester) (int64, error) {
|
func (st DBstore) CountInFolders(ctx context.Context, orgID int64, folderUIDs []string, _ identity.Requester) (int64, error) {
|
||||||
if len(folderUIDs) == 0 {
|
if len(folderUIDs) == 0 {
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
@ -1472,3 +1472,132 @@ func setupFolderService(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, features
|
|||||||
|
|
||||||
return testutil.SetupFolderService(t, cfg, sqlStore, dashboardStore, folderStore, inProcBus, features, &actest.FakeAccessControl{ExpectedEvaluate: true})
|
return testutil.SetupFolderService(t, cfg, sqlStore, dashboardStore, folderStore, inProcBus, features, &actest.FakeAccessControl{ExpectedEvaluate: true})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestIntegration_AlertRuleVersionsCleanup(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("skipping integration test")
|
||||||
|
}
|
||||||
|
cfg := setting.NewCfg()
|
||||||
|
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
|
||||||
|
BaseInterval: time.Duration(rand.Int63n(100)+1) * time.Second,
|
||||||
|
}
|
||||||
|
sqlStore := db.InitTestDB(t)
|
||||||
|
store := &DBstore{
|
||||||
|
SQLStore: sqlStore,
|
||||||
|
Cfg: cfg.UnifiedAlerting,
|
||||||
|
FolderService: setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures()),
|
||||||
|
Logger: &logtest.Fake{},
|
||||||
|
}
|
||||||
|
generator := models.RuleGen
|
||||||
|
generator = generator.With(generator.WithIntervalMatching(store.Cfg.BaseInterval), generator.WithUniqueOrgID())
|
||||||
|
|
||||||
|
t.Run("when calling the cleanup with fewer records than the limit all records should stay", func(t *testing.T) {
|
||||||
|
alertingCfgSnapshot := cfg.UnifiedAlerting
|
||||||
|
defer func() {
|
||||||
|
cfg.UnifiedAlerting = alertingCfgSnapshot
|
||||||
|
}()
|
||||||
|
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 10}
|
||||||
|
rule := createRule(t, store, generator)
|
||||||
|
firstNewRule := models.CopyRule(rule)
|
||||||
|
firstNewRule.Title = util.GenerateShortUID()
|
||||||
|
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||||
|
Existing: rule,
|
||||||
|
New: *firstNewRule,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
firstNewRule.Version = firstNewRule.Version + 1
|
||||||
|
secondNewRule := models.CopyRule(firstNewRule)
|
||||||
|
secondNewRule.Title = util.GenerateShortUID()
|
||||||
|
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||||
|
Existing: firstNewRule,
|
||||||
|
New: *secondNewRule,
|
||||||
|
},
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
titleMap := map[string]bool{
|
||||||
|
secondNewRule.Title: false,
|
||||||
|
rule.Title: false,
|
||||||
|
}
|
||||||
|
|
||||||
|
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
|
||||||
|
alertRuleVersions := make([]*alertRuleVersion, 0)
|
||||||
|
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Len(t, alertRuleVersions, 2)
|
||||||
|
for _, value := range alertRuleVersions {
|
||||||
|
assert.False(t, titleMap[value.Title])
|
||||||
|
titleMap[value.Title] = true
|
||||||
|
}
|
||||||
|
assert.Equal(t, true, titleMap[firstNewRule.Title])
|
||||||
|
assert.Equal(t, true, titleMap[secondNewRule.Title])
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("only oldest records surpassing the limit should be deleted", func(t *testing.T) {
|
||||||
|
alertingCfgSnapshot := cfg.UnifiedAlerting
|
||||||
|
defer func() {
|
||||||
|
cfg.UnifiedAlerting = alertingCfgSnapshot
|
||||||
|
}()
|
||||||
|
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{BaseInterval: alertingCfgSnapshot.BaseInterval, RuleVersionRecordLimit: 1}
|
||||||
|
rule := createRule(t, store, generator)
|
||||||
|
oldRule := models.CopyRule(rule)
|
||||||
|
oldRule.Title = "old-record"
|
||||||
|
err := store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||||
|
Existing: rule,
|
||||||
|
New: *oldRule,
|
||||||
|
}}) // first entry in `rule_version_history` table happens here
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
rule.Version = rule.Version + 1
|
||||||
|
middleRule := models.CopyRule(rule)
|
||||||
|
middleRule.Title = "middle-record"
|
||||||
|
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||||
|
Existing: rule,
|
||||||
|
New: *middleRule,
|
||||||
|
}}) //second entry in `rule_version_history` table happens here
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
rule.Version = rule.Version + 1
|
||||||
|
newerRule := models.CopyRule(rule)
|
||||||
|
newerRule.Title = "newer-record"
|
||||||
|
err = store.UpdateAlertRules(context.Background(), []models.UpdateRule{{
|
||||||
|
Existing: rule,
|
||||||
|
New: *newerRule,
|
||||||
|
}}) //second entry in `rule_version_history` table happens here
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// only the `old-record` should be deleted since limit is set to 1 and there are total 2 records
|
||||||
|
rowsAffected, err := store.deleteOldAlertRuleVersions(context.Background(), rule.UID, rule.OrgID, 1)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, int64(2), rowsAffected)
|
||||||
|
|
||||||
|
err = sqlStore.WithDbSession(context.Background(), func(sess *db.Session) error {
|
||||||
|
var alertRuleVersions []*alertRuleVersion
|
||||||
|
err := sess.Table(alertRuleVersion{}).Desc("id").Where("rule_org_id = ? and rule_uid = ?", rule.OrgID, rule.UID).Find(&alertRuleVersions)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.Len(t, alertRuleVersions, 1)
|
||||||
|
assert.Equal(t, "newer-record", alertRuleVersions[0].Title)
|
||||||
|
return err
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("limit set to 0 should not fail", func(t *testing.T) {
|
||||||
|
count, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, 0)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, int64(0), count)
|
||||||
|
})
|
||||||
|
t.Run("limit set to negative should fail", func(t *testing.T) {
|
||||||
|
_, err := store.deleteOldAlertRuleVersions(context.Background(), "", 1, -1)
|
||||||
|
require.Error(t, err)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
@ -121,6 +121,11 @@ type UnifiedAlertingSettings struct {
|
|||||||
|
|
||||||
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
|
||||||
ResolvedAlertRetention time.Duration
|
ResolvedAlertRetention time.Duration
|
||||||
|
|
||||||
|
// RuleVersionRecordLimit defines the limit of how many alert rule versions
|
||||||
|
// should be stored in the database for each alert_rule in an organization including the current one.
|
||||||
|
// 0 value means no limit
|
||||||
|
RuleVersionRecordLimit int
|
||||||
}
|
}
|
||||||
|
|
||||||
type RecordingRuleSettings struct {
|
type RecordingRuleSettings struct {
|
||||||
@ -455,6 +460,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uaCfg.RuleVersionRecordLimit = ua.Key("rule_version_record_limit").MustInt(0)
|
||||||
|
if uaCfg.RuleVersionRecordLimit < 0 {
|
||||||
|
return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed")
|
||||||
|
}
|
||||||
|
|
||||||
cfg.UnifiedAlerting = uaCfg
|
cfg.UnifiedAlerting = uaCfg
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user