Alerting: Promote configuration in the remote Alertmanager (#87388)

This commit is contained in:
Santiago
2024-05-16 12:06:03 +02:00
committed by GitHub
parent fbda55316d
commit e41434c332
6 changed files with 79 additions and 50 deletions

View File

@@ -191,20 +191,27 @@ func (ng *AlertNG) init() error {
}
// Create remote Alertmanager.
remoteAM, err := createRemoteAlertmanager(orgID, ng.Cfg.UnifiedAlerting.RemoteAlertmanager, ng.KVStore, ng.SecretsService.Decrypt, ng.Cfg.UnifiedAlerting.DefaultConfiguration, m)
cfg := remote.AlertmanagerConfig{
BasicAuthPassword: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.Password,
DefaultConfig: ng.Cfg.UnifiedAlerting.DefaultConfiguration,
OrgID: orgID,
TenantID: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.TenantID,
URL: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.URL,
}
remoteAM, err := createRemoteAlertmanager(cfg, ng.KVStore, ng.SecretsService.Decrypt, m)
if err != nil {
moaLogger.Error("Failed to create remote Alertmanager, falling back to using only the internal one", "err", err)
return internalAM, nil
}
// Use both Alertmanager implementations in the forked Alertmanager.
cfg := remote.RemoteSecondaryConfig{
rsCfg := remote.RemoteSecondaryConfig{
Logger: log.New("ngalert.forked-alertmanager.remote-secondary"),
OrgID: orgID,
Store: ng.store,
SyncInterval: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.SyncInterval,
}
return remote.NewRemoteSecondaryForkedAlertmanager(cfg, internalAM, remoteAM)
return remote.NewRemoteSecondaryForkedAlertmanager(rsCfg, internalAM, remoteAM)
}
})
@@ -540,12 +547,6 @@ func ApplyStateHistoryFeatureToggles(cfg *setting.UnifiedAlertingStateHistorySet
}
}
func createRemoteAlertmanager(orgID int64, amCfg setting.RemoteAlertmanagerSettings, kvstore kvstore.KVStore, decryptFn remote.DecryptFn, defaultConfig string, m *metrics.RemoteAlertmanager) (*remote.Alertmanager, error) {
externalAMCfg := remote.AlertmanagerConfig{
OrgID: orgID,
URL: amCfg.URL,
TenantID: amCfg.TenantID,
BasicAuthPassword: amCfg.Password,
}
return remote.NewAlertmanager(externalAMCfg, notifier.NewFileStore(orgID, kvstore), decryptFn, defaultConfig, m)
// createRemoteAlertmanager builds a remote Alertmanager from cfg.
// The state store handed to the remote Alertmanager is a file store created
// from kvstore and keyed by cfg.OrgID. decryptFn and m are passed through
// unchanged. Any error returned by remote.NewAlertmanager is returned as is.
func createRemoteAlertmanager(cfg remote.AlertmanagerConfig, kvstore kvstore.KVStore, decryptFn remote.DecryptFn, m *metrics.RemoteAlertmanager) (*remote.Alertmanager, error) {
	// The file store is scoped to the same organization as the Alertmanager config.
	stateStore := notifier.NewFileStore(cfg.OrgID, kvstore)
	return remote.NewAlertmanager(cfg, stateStore, decryptFn, m)
}

View File

@@ -63,9 +63,10 @@ func TestMultiorgAlertmanager_RemoteSecondaryMode(t *testing.T) {
URL: testsrv.URL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: setting.GetAlertmanagerDefaultConfiguration(),
}
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
remoteAM, err := remote.NewAlertmanager(externalAMCfg, notifier.NewFileStore(orgID, kvStore), secretsService.Decrypt, setting.GetAlertmanagerDefaultConfiguration(), m)
remoteAM, err := remote.NewAlertmanager(externalAMCfg, notifier.NewFileStore(orgID, kvStore), secretsService.Decrypt, m)
require.NoError(t, err)
// Use both Alertmanager implementations in the forked Alertmanager.

View File

@@ -61,6 +61,11 @@ type AlertmanagerConfig struct {
URL string
TenantID string
BasicAuthPassword string
DefaultConfig string
// PromoteConfig is a flag that determines whether the configuration should be used in the remote Alertmanager.
// The same flag is used for promoting state.
PromoteConfig bool
}
func (cfg *AlertmanagerConfig) Validate() error {
@@ -78,7 +83,7 @@ func (cfg *AlertmanagerConfig) Validate() error {
return nil
}
func NewAlertmanager(cfg AlertmanagerConfig, store stateStore, decryptFn DecryptFn, defaultConfig string, metrics *metrics.RemoteAlertmanager) (*Alertmanager, error) {
func NewAlertmanager(cfg AlertmanagerConfig, store stateStore, decryptFn DecryptFn, metrics *metrics.RemoteAlertmanager) (*Alertmanager, error) {
if err := cfg.Validate(); err != nil {
return nil, err
}
@@ -90,10 +95,11 @@ func NewAlertmanager(cfg AlertmanagerConfig, store stateStore, decryptFn Decrypt
logger := log.New("ngalert.remote.alertmanager")
mcCfg := &remoteClient.Config{
URL: u,
TenantID: cfg.TenantID,
Password: cfg.BasicAuthPassword,
Logger: logger,
Logger: logger,
Password: cfg.BasicAuthPassword,
TenantID: cfg.TenantID,
URL: u,
PromoteConfig: cfg.PromoteConfig,
}
mc, err := remoteClient.New(mcCfg, metrics)
if err != nil {
@@ -124,7 +130,7 @@ func NewAlertmanager(cfg AlertmanagerConfig, store stateStore, decryptFn Decrypt
}
// Parse the default configuration into a postable config.
pCfg, err := notifier.Load([]byte(defaultConfig))
pCfg, err := notifier.Load([]byte(cfg.DefaultConfig))
if err != nil {
return nil, err
}
@@ -529,6 +535,10 @@ func (am *Alertmanager) shouldSendConfig(ctx context.Context, config *apimodels.
return true
}
if rc.Promoted != am.mimirClient.ShouldPromoteConfig() {
return true
}
rawRemote, err := json.Marshal(rc.GrafanaAlertmanagerConfig)
if err != nil {
am.log.Error("Unable to marshal the remote Alertmanager configuration for comparison", "err", err)

View File

@@ -99,9 +99,10 @@ func TestNewAlertmanager(t *testing.T) {
URL: test.url,
TenantID: test.tenantID,
BasicAuthPassword: test.password,
DefaultConfig: defaultGrafanaConfig,
}
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, m)
if test.expErr != "" {
require.EqualError(tt, err, test.expErr)
return
@@ -121,16 +122,11 @@ func TestApplyConfig(t *testing.T) {
w.WriteHeader(http.StatusInternalServerError)
})
var configSent string
var configSent client.UserGrafanaConfig
okHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.Method == http.MethodPost && strings.Contains(r.URL.Path, "/config") {
var c client.UserGrafanaConfig
require.NoError(t, json.NewDecoder(r.Body).Decode(&c))
amCfg, err := json.Marshal(c.GrafanaAlertmanagerConfig)
require.NoError(t, err)
configSent = string(amCfg)
require.NoError(t, json.NewDecoder(r.Body).Decode(&configSent))
}
w.WriteHeader(http.StatusOK)
})
@@ -152,9 +148,11 @@ func TestApplyConfig(t *testing.T) {
// A non-200 response should result in an error.
server := httptest.NewServer(errorHandler)
cfg := AlertmanagerConfig{
OrgID: 1,
TenantID: "test",
URL: server.URL,
OrgID: 1,
TenantID: "test",
URL: server.URL,
DefaultConfig: defaultGrafanaConfig,
PromoteConfig: true,
}
ctx := context.Background()
@@ -165,7 +163,7 @@ func TestApplyConfig(t *testing.T) {
// An error response from the remote Alertmanager should result in the readiness check failing.
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, fstore, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, fstore, secretsService.Decrypt, m)
require.NoError(t, err)
config := &ngmodels.AlertConfiguration{
@@ -179,8 +177,11 @@ func TestApplyConfig(t *testing.T) {
require.NoError(t, am.ApplyConfig(ctx, config))
require.True(t, am.Ready())
// Secrets in the sent configuration should be unencrypted.
require.JSONEq(t, testGrafanaConfigWithSecret, configSent)
// The sent configuration should be unencrypted and promoted.
amCfg, err := json.Marshal(configSent.GrafanaAlertmanagerConfig)
require.NoError(t, err)
require.JSONEq(t, testGrafanaConfigWithSecret, string(amCfg))
require.True(t, configSent.Promoted)
// If we already got a 200 status code response, we shouldn't make the HTTP request again.
server.Config.Handler = errorHandler
@@ -216,14 +217,14 @@ func TestCompareAndSendConfiguration(t *testing.T) {
fstore := notifier.NewFileStore(1, ngfakes.NewFakeKVStore(t))
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
cfg := AlertmanagerConfig{
OrgID: 1,
TenantID: "test",
URL: server.URL,
OrgID: 1,
TenantID: "test",
URL: server.URL,
DefaultConfig: defaultGrafanaConfig,
}
am, err := NewAlertmanager(cfg,
fstore,
decryptFn,
defaultGrafanaConfig,
m,
)
require.NoError(t, err)
@@ -298,6 +299,7 @@ func TestIntegrationRemoteAlertmanagerConfiguration(t *testing.T) {
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: defaultGrafanaConfig,
}
testConfigHash := fmt.Sprintf("%x", md5.Sum([]byte(testGrafanaConfig)))
@@ -319,7 +321,7 @@ func TestIntegrationRemoteAlertmanagerConfiguration(t *testing.T) {
secretsService := secretsManager.SetupTestService(t, database.ProvideSecretsStore(db.InitTestDB(t)))
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, fstore, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, fstore, secretsService.Decrypt, m)
require.NoError(t, err)
encodedFullState, err := am.getFullState(ctx)
@@ -461,11 +463,12 @@ func TestIntegrationRemoteAlertmanagerGetStatus(t *testing.T) {
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: defaultGrafanaConfig,
}
secretsService := secretsManager.SetupTestService(t, fakes.NewFakeSecretsStore())
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, m)
require.NoError(t, err)
// We should get the default Cloud Alertmanager configuration.
@@ -494,11 +497,12 @@ func TestIntegrationRemoteAlertmanagerSilences(t *testing.T) {
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: defaultGrafanaConfig,
}
secretsService := secretsManager.SetupTestService(t, fakes.NewFakeSecretsStore())
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, m)
require.NoError(t, err)
// We should have no silences at first.
@@ -578,11 +582,12 @@ func TestIntegrationRemoteAlertmanagerAlerts(t *testing.T) {
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: defaultGrafanaConfig,
}
secretsService := secretsManager.SetupTestService(t, fakes.NewFakeSecretsStore())
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, m)
require.NoError(t, err)
// Wait until the Alertmanager is ready to send alerts.
@@ -646,11 +651,12 @@ func TestIntegrationRemoteAlertmanagerReceivers(t *testing.T) {
URL: amURL,
TenantID: tenantID,
BasicAuthPassword: password,
DefaultConfig: defaultGrafanaConfig,
}
secretsService := secretsManager.SetupTestService(t, fakes.NewFakeSecretsStore())
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, defaultGrafanaConfig, m)
am, err := NewAlertmanager(cfg, nil, secretsService.Decrypt, m)
require.NoError(t, err)
// We should start with the default config.

View File

@@ -19,6 +19,11 @@ type UserGrafanaConfig struct {
Hash string `json:"configuration_hash"`
CreatedAt int64 `json:"created"`
Default bool `json:"default"`
Promoted bool `json:"promoted"`
}
// ShouldPromoteConfig reports the value of the client's promoteConfig flag,
// i.e. whether this client marks the configurations it sends as promoted.
func (mc *Mimir) ShouldPromoteConfig() bool {
return mc.promoteConfig
}
func (mc *Mimir) GetGrafanaAlertmanagerConfig(ctx context.Context) (*UserGrafanaConfig, error) {
@@ -46,6 +51,7 @@ func (mc *Mimir) CreateGrafanaAlertmanagerConfig(ctx context.Context, cfg *apimo
Hash: hash,
CreatedAt: createdAt,
Default: isDefault,
Promoted: mc.promoteConfig,
})
if err != nil {
return err

View File

@@ -26,13 +26,16 @@ type MimirClient interface {
GetGrafanaAlertmanagerConfig(ctx context.Context) (*UserGrafanaConfig, error)
CreateGrafanaAlertmanagerConfig(ctx context.Context, configuration *apimodels.PostableUserConfig, hash string, createdAt int64, isDefault bool) error
DeleteGrafanaAlertmanagerConfig(ctx context.Context) error
ShouldPromoteConfig() bool
}
type Mimir struct {
client client.Requester
endpoint *url.URL
logger log.Logger
metrics *metrics.RemoteAlertmanager
client client.Requester
endpoint *url.URL
logger log.Logger
metrics *metrics.RemoteAlertmanager
promoteConfig bool
}
type Config struct {
@@ -40,7 +43,8 @@ type Config struct {
TenantID string
Password string
Logger log.Logger
Logger log.Logger
PromoteConfig bool
}
// successResponse represents a successful response from the Mimir API.
@@ -76,10 +80,11 @@ func New(cfg *Config, metrics *metrics.RemoteAlertmanager) (*Mimir, error) {
}
return &Mimir{
endpoint: cfg.URL,
client: client.NewTimedClient(c, metrics.RequestLatency),
logger: cfg.Logger,
metrics: metrics,
endpoint: cfg.URL,
client: client.NewTimedClient(c, metrics.RequestLatency),
logger: cfg.Logger,
metrics: metrics,
promoteConfig: cfg.PromoteConfig,
}, nil
}