Alerting Unification: Use alerting.MaintenanceOptions to configure silences and nflog (#61384)

This commit is contained in:
gotjosh 2023-01-12 12:31:38 -04:00 committed by GitHub
parent 9055e1993d
commit 2d1faae0b5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -15,6 +15,7 @@ import (
"time"
"unicode/utf8"
"github.com/grafana/alerting/alerting"
"github.com/grafana/alerting/alerting/notifier/channels"
amv2 "github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/cluster"
@ -26,14 +27,13 @@ import (
"github.com/prometheus/alertmanager/notify"
"github.com/prometheus/alertmanager/provider/mem"
"github.com/prometheus/alertmanager/silence"
pb "github.com/prometheus/alertmanager/silence/silencepb"
"github.com/prometheus/alertmanager/template"
"github.com/prometheus/alertmanager/timeinterval"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
pb "github.com/prometheus/alertmanager/silence/silencepb"
alertingModels "github.com/grafana/alerting/alerting/models"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
@ -51,8 +51,8 @@ const (
silencesFilename = "silences"
workingDir = "alerting"
// maintenanceNotificationAndSilences how often should we flush and garbage collect notifications and silences
maintenanceNotificationAndSilences = 15 * time.Minute
// notificationLogMaintenanceInterval how often should we flush and garbage collect notifications
notificationLogMaintenanceInterval = 15 * time.Minute
// defaultResolveTimeout is the default timeout used for resolving an alert
// if the end time is not specified.
defaultResolveTimeout = 5 * time.Minute
@ -137,6 +137,31 @@ type Alertmanager struct {
decryptFn channels.GetDecryptedValueFn
}
// maintenanceOptions represents the options for components that need maintenance on a frequency within the Alertmanager.
// It implements the alerting.MaintenanceOptions interface.
type maintenanceOptions struct {
	// filepath is the snapshot file location used by the maintained component.
	filepath string
	// retention is the retention period handed to the maintained component
	// (passed to nflog.WithRetention / silence.Options.Retention).
	retention time.Duration
	// maintenanceFrequency is how often the maintenance function should run.
	maintenanceFrequency time.Duration
	// maintenanceFunc persists the given state and returns an int64
	// (presumably the snapshot size in bytes — confirm against fileStore.Persist) and any error.
	maintenanceFunc func(alerting.State) (int64, error)
}
// Filepath returns the snapshot file path for the maintained component.
func (m maintenanceOptions) Filepath() string {
	return m.filepath
}
// Retention returns how long the maintained component's state is retained.
func (m maintenanceOptions) Retention() time.Duration {
	return m.retention
}
// MaintenanceFrequency returns how often maintenance should be performed.
func (m maintenanceOptions) MaintenanceFrequency() time.Duration {
	return m.maintenanceFrequency
}
// MaintenanceFunc runs the configured maintenance function against the given
// state, returning its int64 result and any error.
func (m maintenanceOptions) MaintenanceFunc(state alerting.State) (int64, error) {
	return m.maintenanceFunc(state)
}
func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store AlertingStore, kvStore kvstore.KVStore,
peer ClusterPeer, decryptFn channels.GetDecryptedValueFn, ns notifications.Service, m *metrics.Alertmanager) (*Alertmanager, error) {
am := &Alertmanager{
@ -166,13 +191,31 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A
return nil, err
}
// Configure the maintenance options for silences.
silencesOptions := maintenanceOptions{
filepath: silencesFilePath,
retention: retentionNotificationsAndSilences,
maintenanceFrequency: silenceMaintenanceInterval,
maintenanceFunc: func(state alerting.State) (int64, error) {
return am.fileStore.Persist(ctx, silencesFilename, state)
},
}
nflogOptions := maintenanceOptions{
filepath: nflogFilepath,
retention: retentionNotificationsAndSilences,
maintenanceFrequency: notificationLogMaintenanceInterval,
maintenanceFunc: func(state alerting.State) (int64, error) {
return am.fileStore.Persist(ctx, notificationLogFilename, state)
},
}
// Initialize the notification log.
am.wg.Add(1)
am.notificationLog, err = nflog.New(
nflog.WithRetention(retentionNotificationsAndSilences),
nflog.WithSnapshot(nflogFilepath),
nflog.WithMaintenance(maintenanceNotificationAndSilences, am.stopc, am.wg.Done, func() (int64, error) {
return am.fileStore.Persist(ctx, notificationLogFilename, am.notificationLog)
nflog.WithRetention(nflogOptions.Retention()),
nflog.WithSnapshot(nflogOptions.Filepath()),
nflog.WithMaintenance(nflogOptions.MaintenanceFrequency(), am.stopc, am.wg.Done, func() (int64, error) {
return nflogOptions.MaintenanceFunc(am.notificationLog)
}),
)
if err != nil {
@ -184,8 +227,8 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A
// Initialize silences
am.silences, err = silence.New(silence.Options{
Metrics: m.Registerer,
SnapshotFile: silencesFilePath,
Retention: retentionNotificationsAndSilences,
SnapshotFile: silencesOptions.Filepath(),
Retention: silencesOptions.Retention(),
})
if err != nil {
return nil, fmt.Errorf("unable to initialize the silencing component of alerting: %w", err)
@ -196,16 +239,19 @@ func newAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A
am.wg.Add(1)
go func() {
am.silences.Maintenance(silenceMaintenanceInterval, silencesFilePath, am.stopc, func() (int64, error) {
// Delete silences older than the retention period.
if _, err := am.silences.GC(); err != nil {
am.logger.Error("silence garbage collection", "error", err)
// Don't return here - we need to snapshot our state first.
}
// Snapshot our silences to the Grafana KV store
return am.fileStore.Persist(ctx, silencesFilename, am.silences)
})
am.silences.Maintenance(
silencesOptions.MaintenanceFrequency(),
silencesOptions.Filepath(),
am.stopc,
func() (int64, error) {
// Delete silences older than the retention period.
if _, err := am.silences.GC(); err != nil {
am.logger.Error("silence garbage collection", "error", err)
// Don't return here - we need to snapshot our state first.
}
return silencesOptions.maintenanceFunc(am.silences)
},
)
am.wg.Done()
}()