mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Unified Alerting, Issue 41156: Clean up expired silences. (#46740)
Expired silences older than the retention period were not being cleaned up. The root problem was that notifier.Alertmanager overrides the Prometheus alert manager's silence maintenance function and was not calling Silences.GC() in the overridden function.
This commit is contained in:
parent
a80f04c949
commit
c5b39dd3cd
@ -49,8 +49,6 @@ const (
|
||||
silencesFilename = "silences"
|
||||
|
||||
workingDir = "alerting"
|
||||
// How long should we keep silences and notification entries on-disk after they've served their purpose.
|
||||
retentionNotificationsAndSilences = 5 * 24 * time.Hour
|
||||
// maintenanceNotificationAndSilences how often should we flush and garbage collect notifications and silences
|
||||
maintenanceNotificationAndSilences = 15 * time.Minute
|
||||
// defaultResolveTimeout is the default timeout used for resolving an alert
|
||||
@ -60,6 +58,10 @@ const (
|
||||
memoryAlertsGCInterval = 30 * time.Minute
|
||||
)
|
||||
|
||||
// How long should we keep silences and notification entries on-disk after they've served their purpose.
|
||||
var retentionNotificationsAndSilences = 5 * 24 * time.Hour
|
||||
var silenceMaintenanceInterval = 15 * time.Minute
|
||||
|
||||
func init() {
|
||||
silence.ValidateMatcher = func(m *pb.Matcher) error {
|
||||
switch m.Type {
|
||||
@ -185,7 +187,14 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store s
|
||||
|
||||
am.wg.Add(1)
|
||||
go func() {
|
||||
am.silences.Maintenance(15*time.Minute, silencesFilePath, am.stopc, func() (int64, error) {
|
||||
am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) {
|
||||
// Delete silences older than the retention period.
|
||||
if _, err := am.silences.GC(); err != nil {
|
||||
am.logger.Error("Silence Garbage Collection Failed at %v: %v", time.Now(), err)
|
||||
// Don't return here - we need to snapshot our state first.
|
||||
}
|
||||
|
||||
// Snapshot our silences to the Grafana KV store
|
||||
return am.fileStore.Persist(ctx, silencesFilename, am.silences)
|
||||
})
|
||||
am.wg.Done()
|
||||
|
@ -340,3 +340,75 @@ func TestPutAlert(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Tests cleanup of expired Silences. We rely on prometheus/alertmanager for
|
||||
// our alert silencing functionality, so we rely on its tests. However, we
|
||||
// implement a custom maintenance function for silences, because we snapshot
|
||||
// our data differently, so we test that functionality.
|
||||
func TestSilenceCleanup(t *testing.T) {
|
||||
require := require.New(t)
|
||||
|
||||
oldRetention := retentionNotificationsAndSilences
|
||||
retentionNotificationsAndSilences = 30 * time.Millisecond
|
||||
oldMaintenance := silenceMaintenanceInterval
|
||||
silenceMaintenanceInterval = 15 * time.Millisecond
|
||||
t.Cleanup(
|
||||
func() {
|
||||
retentionNotificationsAndSilences = oldRetention
|
||||
silenceMaintenanceInterval = oldMaintenance
|
||||
})
|
||||
|
||||
am := setupAMTest(t)
|
||||
now := time.Now()
|
||||
dt := func(t time.Time) strfmt.DateTime { return strfmt.DateTime(t) }
|
||||
|
||||
makeSilence := func(comment string, createdBy string,
|
||||
startsAt, endsAt strfmt.DateTime, matchers models.Matchers) *apimodels.PostableSilence {
|
||||
return &apimodels.PostableSilence{
|
||||
ID: "",
|
||||
Silence: models.Silence{
|
||||
Comment: &comment,
|
||||
CreatedBy: &createdBy,
|
||||
StartsAt: &startsAt,
|
||||
EndsAt: &endsAt,
|
||||
Matchers: matchers,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
tru := true
|
||||
testString := "testName"
|
||||
matchers := models.Matchers{&models.Matcher{Name: &testString, IsEqual: &tru, IsRegex: &tru, Value: &testString}}
|
||||
// Create silences - one active in the future, one currently active, one
|
||||
// expiring soon, and one expiring very soon.
|
||||
silences := []*apimodels.PostableSilence{
|
||||
// Active in future
|
||||
makeSilence("", "tests", dt(now.Add(5*time.Hour)), dt(now.Add(6*time.Hour)), matchers),
|
||||
// Active now
|
||||
makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(6*time.Hour)), matchers),
|
||||
// Expiring soon
|
||||
makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(2*time.Second)), matchers),
|
||||
// Expiring *very* soon
|
||||
makeSilence("", "tests", dt(now.Add(-5*time.Hour)), dt(now.Add(20*time.Millisecond)), matchers),
|
||||
}
|
||||
|
||||
for _, s := range silences {
|
||||
_, err := am.CreateSilence(s)
|
||||
require.NoError(err)
|
||||
}
|
||||
|
||||
// Let enough time pass for the maintenance window to run.
|
||||
require.Eventually(func() bool {
|
||||
// So, what silences do we have now?
|
||||
found, err := am.ListSilences(nil)
|
||||
require.NoError(err)
|
||||
return len(found) == 3
|
||||
}, 1500*time.Millisecond, 150*time.Millisecond)
|
||||
|
||||
// Wait again for another silence to expire.
|
||||
require.Eventually(func() bool {
|
||||
found, err := am.ListSilences(nil)
|
||||
require.NoError(err)
|
||||
return len(found) == 2
|
||||
}, 2*time.Second, 150*time.Millisecond)
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user