Unified Alerting, Issue 41156: Clean up expired silences. (#46740)

Expired silences older than the retention period were not being cleaned up. The root problem was that notifier.Alertmanager overrides the Prometheus Alertmanager's silence maintenance function and was not calling Silences.GC() in the overridden function.
This commit is contained in:
Joe Blubaugh 2022-03-23 16:49:02 +08:00 committed by GitHub
parent a80f04c949
commit c5b39dd3cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 84 additions and 3 deletions

View File

@ -49,8 +49,6 @@ const (
silencesFilename = "silences"
workingDir = "alerting"
// How long should we keep silences and notification entries on-disk after they've served their purpose.
retentionNotificationsAndSilences = 5 * 24 * time.Hour
// maintenanceNotificationAndSilences is how often we flush and garbage collect notifications and silences.
maintenanceNotificationAndSilences = 15 * time.Minute
// defaultResolveTimeout is the default timeout used for resolving an alert
@ -60,6 +58,10 @@ const (
memoryAlertsGCInterval = 30 * time.Minute
)
// Tunables for silence and notification housekeeping. They are declared as
// variables, not constants, so their values can be reassigned at runtime.
var (
	// retentionNotificationsAndSilences is how long silences and notification
	// entries are kept on disk after they have served their purpose.
	retentionNotificationsAndSilences = 5 * 24 * time.Hour

	// silenceMaintenanceInterval is the interval handed to the silence
	// maintenance loop.
	silenceMaintenanceInterval = 15 * time.Minute
)
func init() {
silence.ValidateMatcher = func(m *pb.Matcher) error {
switch m.Type {
@ -185,7 +187,14 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s
am.wg.Add(1)
go func() {
am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) {
am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) {
// Delete silences older than the retention period.
if _, err := am.silences.GC(); err != nil {
am.logger.Error("Silence Garbage Collection Failed at %v: %v", time.Now(), err)
// Don't return here - we need to snapshot our state first.
}
// Snapshot our silences to the Grafana KV store
return am.fileStore.Persist(ctx, silencesFilename, am.silences)
})
am.wg.Done()

View File

@ -340,3 +340,75 @@ func TestPutAlert(t *testing.T) {
})
}
}
// TestSilenceCleanup verifies that expired silences are removed once they are
// older than the retention period. We rely on prometheus/alertmanager for our
// alert silencing functionality, and therefore on its tests. However, we
// implement a custom maintenance function for silences, because we snapshot
// our data differently, so that function is what this test exercises.
func TestSilenceCleanup(t *testing.T) {
	// Named req (not require) so the testify package itself is not shadowed.
	req := require.New(t)

	// Shrink the retention period and maintenance interval so the test
	// finishes quickly; restore the package-level values when it ends.
	oldRetention := retentionNotificationsAndSilences
	retentionNotificationsAndSilences = 30 * time.Millisecond
	oldMaintenance := silenceMaintenanceInterval
	silenceMaintenanceInterval = 15 * time.Millisecond
	t.Cleanup(func() {
		retentionNotificationsAndSilences = oldRetention
		silenceMaintenanceInterval = oldMaintenance
	})

	am := setupAMTest(t)
	now := time.Now()
	dt := func(ts time.Time) strfmt.DateTime { return strfmt.DateTime(ts) }

	// makeSilence builds a silence with no ID, so the alertmanager creates a
	// new one rather than updating an existing silence.
	makeSilence := func(comment string, createdBy string,
		startsAt, endsAt strfmt.DateTime, matchers models.Matchers) *apimodels.PostableSilence {
		return &apimodels.PostableSilence{
			ID: "",
			Silence: models.Silence{
				Comment:   &comment,
				CreatedBy: &createdBy,
				StartsAt:  &startsAt,
				EndsAt:    &endsAt,
				Matchers:  matchers,
			},
		}
	}

	tru := true
	testString := "testName"
	matchers := models.Matchers{&models.Matcher{Name: &testString, IsEqual: &tru, IsRegex: &tru, Value: &testString}}

	// Create four silences: one that starts in the future, one that is active
	// now, one that expires in two seconds, and one that expires almost
	// immediately.
	silences := []*apimodels.PostableSilence{
		// Active in future
		makeSilence("", "tests", dt(now.Add(5*time.Hour)), dt(now.Add(6*time.Hour)), matchers),
		// Active now
		makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(6*time.Hour)), matchers),
		// Expiring soon
		makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(2*time.Second)), matchers),
		// Expiring *very* soon
		makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(20*time.Millisecond)), matchers),
	}
	for _, s := range silences {
		_, err := am.CreateSilence(s)
		req.NoError(err)
	}

	// waitForSilences polls until exactly want silences are listed, failing
	// the test if that does not happen within waitFor.
	waitForSilences := func(want int, waitFor time.Duration) {
		req.Eventually(func() bool {
			found, err := am.ListSilences(nil)
			if err != nil {
				// testify evaluates this condition on its own goroutine, and
				// FailNow (used by require) must only be called from the test
				// goroutine. Record the failure with Errorf and keep polling.
				t.Errorf("listing silences: %v", err)
				return false
			}
			return len(found) == want
		}, waitFor, 150*time.Millisecond)
	}

	// Let enough time pass for the maintenance loop to collect the silence
	// that expired almost immediately.
	waitForSilences(3, 1500*time.Millisecond)

	// Wait again for the two-second silence to expire and be collected.
	waitForSilences(2, 2*time.Second)
}