Alerting: Add ha_reconnect_timeout configuration option (#88823)

* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration --------- Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
2025-02-25 18:55:37 -06:00 · 2024-06-11 19:25:48 +02:00 · 2024-06-11 19:25:48 +02:00 · eb76ea47a0
commit eb76ea47a0
parent 2d370f3983
7 changed files with 23 additions and 1 deletions
--- a/conf/defaults.ini
+++ b/conf/defaults.ini
@ -1265,6 +1265,10 @@ ha_label =
 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
 ha_gossip_interval = 200ms

+# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster.
+# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
+ha_reconnect_timeout = 6h
+
 # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
 # across larger clusters at the expense of increased bandwidth usage.
 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
--- a/conf/sample.ini
+++ b/conf/sample.ini
@ -1251,6 +1251,10 @@
 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
 ;ha_gossip_interval = "200ms"

+# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster.
+# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
+;ha_reconnect_timeout = 6h
+
 # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
 # across larger clusters at the expense of increased bandwidth usage.
 # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
--- a/docs/sources/alerting/set-up/configure-high-availability/_index.md
+++ b/docs/sources/alerting/set-up/configure-high-availability/_index.md
@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics`
   ha_peers = "grafana-alerting.grafana:9094"
   ha_advertise_address = "${POD_IP}:9094"
   ha_peer_timeout = 15s
+   ha_reconnect_timeout = 2m
   ```
--- a/docs/sources/setup-grafana/configure-grafana/_index.md
+++ b/docs/sources/setup-grafana/configure-grafana/_index.md
@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def

 The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

+### ha_reconnect_timeout
+
+Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`.
+
+The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`.
+
 ### ha_push_pull_interval

 The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
--- a/pkg/services/ngalert/notifier/multiorg_alertmanager.go
+++ b/pkg/services/ngalert/notifier/multiorg_alertmanager.go
@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
 			return fmt.Errorf("unable to initialize gossip mesh: %w", err)
 		}

-		err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout)
+		err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout)
 		if err != nil {
 			moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
 		}
--- a/pkg/setting/setting_unified_alerting.go
+++ b/pkg/setting/setting_unified_alerting.go
@ -18,6 +18,7 @@ const (
 	alertmanagerDefaultClusterAddr        = "0.0.0.0:9094"
 	alertmanagerDefaultPeerTimeout        = 15 * time.Second
 	alertmanagerDefaultGossipInterval     = alertingCluster.DefaultGossipInterval
+	alertmanagerDefaultReconnectTimeout   = alertingCluster.DefaultReconnectTimeout
 	alertmanagerDefaultPushPullInterval   = alertingCluster.DefaultPushPullInterval
 	alertmanagerDefaultConfigPollInterval = time.Minute
 	alertmanagerRedisDefaultMaxConns      = 5
@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct {
 	HAPeers                        []string
 	HAPeerTimeout                  time.Duration
 	HAGossipInterval               time.Duration
+	HAReconnectTimeout             time.Duration
 	HAPushPullInterval             time.Duration
 	HALabel                        string
 	HARedisClusterModeEnabled      bool
@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
 	if err != nil {
 		return err
 	}
+	uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String()))
+	if err != nil {
+		return err
+	}
 	uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
 	if err != nil {
 		return err
--- a/pkg/setting/setting_unified_alerting_test.go
+++ b/pkg/setting/setting_unified_alerting_test.go
@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
 		require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
 		require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
 		require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
+		require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout)
 	}

 	// With peers set, it correctly parses them.