From 0be6e1bb86613b6466e531119fe1cfec67d35aac Mon Sep 17 00:00:00 2001 From: Yuri Tseretyan Date: Fri, 31 Jan 2025 11:12:38 -0500 Subject: [PATCH] Alerting: Extra dedup stage in Grafana Alertmanager (#99825) * add feature flags * update alerting module * update grafana alertmanager to configure the extra dedup stage --------- Co-authored-by: Santiago --- go.mod | 2 +- go.sum | 4 +-- .../src/types/featureToggles.gen.ts | 2 ++ pkg/services/featuremgmt/registry.go | 18 +++++++++++ pkg/services/featuremgmt/toggles_gen.csv | 2 ++ pkg/services/featuremgmt/toggles_gen.go | 8 +++++ pkg/services/featuremgmt/toggles_gen.json | 30 +++++++++++++++++++ pkg/services/ngalert/notifier/alertmanager.go | 18 +++++++++-- .../ngalert/notifier/alertmanager_test.go | 3 +- .../ngalert/notifier/multiorg_alertmanager.go | 2 +- pkg/storage/unified/apistore/go.mod | 2 +- pkg/storage/unified/apistore/go.sum | 4 +-- pkg/storage/unified/resource/go.mod | 2 +- pkg/storage/unified/resource/go.sum | 4 +-- 14 files changed, 87 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index 69b092f4f60..e0b576dc50f 100644 --- a/go.mod +++ b/go.mod @@ -69,7 +69,7 @@ require ( github.com/googleapis/gax-go/v2 v2.14.1 // @grafana/grafana-backend-group github.com/gorilla/mux v1.8.1 // @grafana/grafana-backend-group github.com/gorilla/websocket v1.5.3 // @grafana/grafana-app-platform-squad - github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a // @grafana/alerting-backend + github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 // @grafana/alerting-backend github.com/grafana/authlib v0.0.0-20250123104008-e99947858901 // @grafana/identity-access-team github.com/grafana/authlib/types v0.0.0-20250120145936-5f0e28e7a87c // @grafana/identity-access-team github.com/grafana/dataplane/examples v0.0.1 // @grafana/observability-metrics diff --git a/go.sum b/go.sum index c1825644258..02b3b377551 100644 --- a/go.sum +++ b/go.sum @@ -1498,8 +1498,8 @@ github.com/gorilla/sessions v1.2.1 h1:DHd3rPN5lE3Ts3D8rKkQ8x/0kqfeNmBAaiSi+o7Fsg github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a h1:44E+I3EPdh/W02Uyfyig86EJKPjvzcF3y0A+FEi1fBk= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 h1:dmsycYQzl5JexuV8UxQpT3B79maSvhiIahid4/tezAM= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901 h1:nqV1YrtX+ZG+EYB5dcmFMWhg2Y038OMaAHAADbOC9RA= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901/go.mod h1:/gYfphsNu9v1qYWXxpv1NSvMEMSwvdf8qb8YlgwIRl8= github.com/grafana/authlib/types v0.0.0-20250120145936-5f0e28e7a87c h1:b0sPDtt33uFdmvUJjSCld3kwE2E49dUvevuUDSJsEuo= diff --git a/packages/grafana-data/src/types/featureToggles.gen.ts b/packages/grafana-data/src/types/featureToggles.gen.ts index f983b75f5fe..2792ff8a74a 100644 --- a/packages/grafana-data/src/types/featureToggles.gen.ts +++ b/packages/grafana-data/src/types/featureToggles.gen.ts @@ -255,4 +255,6 @@ export interface FeatureToggles { elasticsearchImprovedParsing?: boolean; datasourceConnectionsTab?: boolean; fetchRulesUsingPost?: boolean; + alertingAlertmanagerExtraDedupStage?: boolean; + alertingAlertmanagerExtraDedupStageStopPipeline?: boolean; } diff --git a/pkg/services/featuremgmt/registry.go b/pkg/services/featuremgmt/registry.go index 340946621be..f1cd1fe8cf8 100644 --- a/pkg/services/featuremgmt/registry.go +++ b/pkg/services/featuremgmt/registry.go @@ -1772,6 +1772,24 @@ var ( HideFromAdminPage: true, HideFromDocs: true, }, + { + Name: "alertingAlertmanagerExtraDedupStage", + Description: "enables extra deduplication stage in alertmanager that checks that timestamps of the pipeline and the current state are matching", + Stage: FeatureStageExperimental, + Owner: grafanaAlertingSquad, + HideFromAdminPage: true, + HideFromDocs: true, + RequiresRestart: true, + }, + { + Name: "alertingAlertmanagerExtraDedupStageStopPipeline", + Description: "works together with alertingAlertmanagerExtraDedupStage, if enabled, it will stop the pipeline if the timestamps are not matching. Otherwise, it will emit a warning", + Stage: FeatureStageExperimental, + Owner: grafanaAlertingSquad, + HideFromAdminPage: true, + HideFromDocs: true, + RequiresRestart: true, + }, } ) diff --git a/pkg/services/featuremgmt/toggles_gen.csv b/pkg/services/featuremgmt/toggles_gen.csv index 965d4b1fbe5..4d9fae9c95c 100644 --- a/pkg/services/featuremgmt/toggles_gen.csv +++ b/pkg/services/featuremgmt/toggles_gen.csv @@ -236,3 +236,5 @@ grafanaAdvisor,experimental,@grafana/plugins-platform-backend,false,false,false elasticsearchImprovedParsing,experimental,@grafana/aws-datasources,false,false,false datasourceConnectionsTab,experimental,@grafana/plugins-platform-backend,false,false,true fetchRulesUsingPost,experimental,@grafana/alerting-squad,false,false,false +alertingAlertmanagerExtraDedupStage,experimental,@grafana/alerting-squad,false,true,false +alertingAlertmanagerExtraDedupStageStopPipeline,experimental,@grafana/alerting-squad,false,true,false diff --git a/pkg/services/featuremgmt/toggles_gen.go b/pkg/services/featuremgmt/toggles_gen.go index 64e4b7b2fc3..e149127edf7 100644 --- a/pkg/services/featuremgmt/toggles_gen.go +++ b/pkg/services/featuremgmt/toggles_gen.go @@ -954,4 +954,12 @@ const ( // FlagFetchRulesUsingPost // Use a POST request to list rules by passing down the namespaces user has access to FlagFetchRulesUsingPost = "fetchRulesUsingPost" + + // FlagAlertingAlertmanagerExtraDedupStage + // enables extra deduplication stage in alertmanager that checks that timestamps of the pipeline and the current state are matching + FlagAlertingAlertmanagerExtraDedupStage = "alertingAlertmanagerExtraDedupStage" + + // FlagAlertingAlertmanagerExtraDedupStageStopPipeline + // works together with alertingAlertmanagerExtraDedupStage, if enabled, it will stop the pipeline if the timestamps are not matching. Otherwise, it will emit a warning + FlagAlertingAlertmanagerExtraDedupStageStopPipeline = "alertingAlertmanagerExtraDedupStageStopPipeline" ) diff --git a/pkg/services/featuremgmt/toggles_gen.json b/pkg/services/featuremgmt/toggles_gen.json index 5c3dcb7040e..580121c5120 100644 --- a/pkg/services/featuremgmt/toggles_gen.json +++ b/pkg/services/featuremgmt/toggles_gen.json @@ -143,6 +143,36 @@ "codeowner": "@grafana/alerting-squad" } }, + { + "metadata": { + "name": "alertingAlertmanagerExtraDedupStage", + "resourceVersion": "1738251165994", + "creationTimestamp": "2025-01-30T15:32:45Z" + }, + "spec": { + "description": "enables extra deduplication stage in alertmanager that checks that timestamps of the pipeline and the current state are matching", + "stage": "experimental", + "codeowner": "@grafana/alerting-squad", + "requiresRestart": true, + "hideFromAdminPage": true, + "hideFromDocs": true + } + }, + { + "metadata": { + "name": "alertingAlertmanagerExtraDedupStageStopPipeline", + "resourceVersion": "1738251165994", + "creationTimestamp": "2025-01-30T15:32:45Z" + }, + "spec": { + "description": "works together with alertingAlertmanagerExtraDedupStage, if enabled, it will stop the pipeline if the timestamps are not matching. Otherwise, it will emit a warning", + "stage": "experimental", + "codeowner": "@grafana/alerting-squad", + "requiresRestart": true, + "hideFromAdminPage": true, + "hideFromDocs": true + } + }, { "metadata": { "name": "alertingApiServer", diff --git a/pkg/services/ngalert/notifier/alertmanager.go b/pkg/services/ngalert/notifier/alertmanager.go index f80a2ae2a95..63c05aaefdf 100644 --- a/pkg/services/ngalert/notifier/alertmanager.go +++ b/pkg/services/ngalert/notifier/alertmanager.go @@ -10,6 +10,7 @@ import ( "time" alertingNotify "github.com/grafana/alerting/notify" + "github.com/grafana/alerting/notify/stages" "github.com/grafana/alerting/receivers" alertingTemplates "github.com/grafana/alerting/templates" "github.com/prometheus/alertmanager/config" @@ -17,6 +18,7 @@ import ( amv2 "github.com/prometheus/alertmanager/api/v2/models" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/services/featuremgmt" apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -91,7 +93,7 @@ func (m maintenanceOptions) MaintenanceFunc(state alertingNotify.State) (int64, func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store AlertingStore, stateStore stateStore, peer alertingNotify.ClusterPeer, decryptFn alertingNotify.GetDecryptedValueFn, ns notifications.Service, - m *metrics.Alertmanager, withAutogen bool, + m *metrics.Alertmanager, featureToggles featuremgmt.FeatureToggles, ) (*alertmanager, error) { nflog, err := stateStore.GetNotificationLog(ctx) if err != nil { @@ -121,6 +123,16 @@ func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A return stateStore.SaveNotificationLog(context.Background(), state) }, } + l := log.New("ngalert.notifier.alertmanager", "org", orgID) + action := stages.Disabled + if featureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingAlertmanagerExtraDedupStage) { + if featureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingAlertmanagerExtraDedupStageStopPipeline) { + action = stages.StopPipeline + } else { + action = stages.LogOnly + } + l.Info("Initializing Alertmanager", "extra_dedup_stage", action) + } amcfg := &alertingNotify.GrafanaAlertmanagerConfig{ ExternalURL: cfg.AppURL, @@ -132,9 +144,9 @@ func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A MaxSilences: cfg.UnifiedAlerting.AlertmanagerMaxSilencesCount, MaxSilenceSizeBytes: cfg.UnifiedAlerting.AlertmanagerMaxSilenceSizeBytes, }, + PipelineAndStateTimestampsMismatchAction: action, } - l := log.New("ngalert.notifier.alertmanager", "org", orgID) gam, err := alertingNotify.NewGrafanaAlertmanager("orgID", orgID, amcfg, peer, l, alertingNotify.NewGrafanaAlertmanagerMetrics(m.Registerer, l)) if err != nil { return nil, err @@ -152,7 +164,7 @@ func NewAlertmanager(ctx context.Context, orgID int64, cfg *setting.Cfg, store A logger: l, // TODO: Preferably, logic around autogen would be outside of the specific alertmanager implementation so that remote alertmanager will get it for free. - withAutogen: withAutogen, + withAutogen: featureToggles.IsEnabled(ctx, featuremgmt.FlagAlertingSimplifiedRouting), } return am, nil diff --git a/pkg/services/ngalert/notifier/alertmanager_test.go b/pkg/services/ngalert/notifier/alertmanager_test.go index 4f372d0fdda..7033e5f0b8a 100644 --- a/pkg/services/ngalert/notifier/alertmanager_test.go +++ b/pkg/services/ngalert/notifier/alertmanager_test.go @@ -11,6 +11,7 @@ import ( "github.com/grafana/grafana/pkg/infra/db" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/services/dashboards" + "github.com/grafana/grafana/pkg/services/featuremgmt" "github.com/grafana/grafana/pkg/services/ngalert/metrics" "github.com/grafana/grafana/pkg/services/ngalert/store" "github.com/grafana/grafana/pkg/services/ngalert/tests/fakes" @@ -52,7 +53,7 @@ func setupAMTest(t *testing.T) *alertmanager { orgID := 1 stateStore := NewFileStore(int64(orgID), kvStore) - am, err := NewAlertmanager(context.Background(), 1, cfg, s, stateStore, &NilPeer{}, decryptFn, nil, m, false) + am, err := NewAlertmanager(context.Background(), 1, cfg, s, stateStore, &NilPeer{}, decryptFn, nil, m, featuremgmt.WithFeatures()) require.NoError(t, err) return am } diff --git a/pkg/services/ngalert/notifier/multiorg_alertmanager.go b/pkg/services/ngalert/notifier/multiorg_alertmanager.go index 71140200558..9c7f2cd3fa5 100644 --- a/pkg/services/ngalert/notifier/multiorg_alertmanager.go +++ b/pkg/services/ngalert/notifier/multiorg_alertmanager.go @@ -160,7 +160,7 @@ func NewMultiOrgAlertmanager( moa.factory = func(ctx context.Context, orgID int64) (Alertmanager, error) { m := metrics.NewAlertmanagerMetrics(moa.metrics.GetOrCreateOrgRegistry(orgID), l) stateStore := NewFileStore(orgID, kvStore) - return NewAlertmanager(ctx, orgID, moa.settings, moa.configStore, stateStore, moa.peer, moa.decryptFn, moa.ns, m, featureManager.IsEnabled(ctx, featuremgmt.FlagAlertingSimplifiedRouting)) + return NewAlertmanager(ctx, orgID, moa.settings, moa.configStore, stateStore, moa.peer, moa.decryptFn, moa.ns, m, featureManager) } for _, opt := range opts { diff --git a/pkg/storage/unified/apistore/go.mod b/pkg/storage/unified/apistore/go.mod index 48c6c358f39..1d4c20ae747 100644 --- a/pkg/storage/unified/apistore/go.mod +++ b/pkg/storage/unified/apistore/go.mod @@ -170,7 +170,7 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go/v2 v2.14.1 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a // indirect + github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 // indirect github.com/grafana/authlib v0.0.0-20250123104008-e99947858901 // indirect github.com/grafana/dataplane/sdata v0.0.9 // indirect github.com/grafana/dskit v0.0.0-20241105154643-a6b453a88040 // indirect diff --git a/pkg/storage/unified/apistore/go.sum b/pkg/storage/unified/apistore/go.sum index 1416f8d41f7..66c9b0db362 100644 --- a/pkg/storage/unified/apistore/go.sum +++ b/pkg/storage/unified/apistore/go.sum @@ -547,8 +547,8 @@ github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg= github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a h1:44E+I3EPdh/W02Uyfyig86EJKPjvzcF3y0A+FEi1fBk= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 h1:dmsycYQzl5JexuV8UxQpT3B79maSvhiIahid4/tezAM= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901 h1:nqV1YrtX+ZG+EYB5dcmFMWhg2Y038OMaAHAADbOC9RA= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901/go.mod h1:/gYfphsNu9v1qYWXxpv1NSvMEMSwvdf8qb8YlgwIRl8= github.com/grafana/authlib/types v0.0.0-20250120145936-5f0e28e7a87c h1:b0sPDtt33uFdmvUJjSCld3kwE2E49dUvevuUDSJsEuo= diff --git a/pkg/storage/unified/resource/go.mod b/pkg/storage/unified/resource/go.mod index a955e2d2476..44e8fe66e30 100644 --- a/pkg/storage/unified/resource/go.mod +++ b/pkg/storage/unified/resource/go.mod @@ -115,7 +115,7 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect github.com/googleapis/gax-go/v2 v2.14.1 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a // indirect + github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 // indirect github.com/grafana/dataplane/sdata v0.0.9 // indirect github.com/grafana/grafana-app-sdk/logging v0.30.0 // indirect github.com/grafana/grafana-aws-sdk v0.31.5 // indirect diff --git a/pkg/storage/unified/resource/go.sum b/pkg/storage/unified/resource/go.sum index 4e45d508a62..0231cdede72 100644 --- a/pkg/storage/unified/resource/go.sum +++ b/pkg/storage/unified/resource/go.sum @@ -403,8 +403,8 @@ github.com/gorilla/mux v1.6.2/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2z github.com/gorilla/mux v1.7.1/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a h1:44E+I3EPdh/W02Uyfyig86EJKPjvzcF3y0A+FEi1fBk= -github.com/grafana/alerting v0.0.0-20250129195454-3e5b80036b7a/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65 h1:dmsycYQzl5JexuV8UxQpT3B79maSvhiIahid4/tezAM= +github.com/grafana/alerting v0.0.0-20250130152446-d49e2e0b7d65/go.mod h1:QsnoKX/iYZxA4Cv+H+wC7uxutBD8qi8ZW5UJvD2TYmU= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901 h1:nqV1YrtX+ZG+EYB5dcmFMWhg2Y038OMaAHAADbOC9RA= github.com/grafana/authlib v0.0.0-20250123104008-e99947858901/go.mod h1:/gYfphsNu9v1qYWXxpv1NSvMEMSwvdf8qb8YlgwIRl8= github.com/grafana/authlib/types v0.0.0-20250120145936-5f0e28e7a87c h1:b0sPDtt33uFdmvUJjSCld3kwE2E49dUvevuUDSJsEuo=