Alerting: Add ha_reconnect_timeout configuration option (#88823)

* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration

---------

Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
Jacob Valdemar 2024-06-11 19:25:48 +02:00 committed by GitHub
parent 2d370f3983
commit eb76ea47a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 23 additions and 1 deletions

View File

@ -1265,6 +1265,10 @@ ha_label =
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_gossip_interval = 200ms ha_gossip_interval = 200ms
# Length of time to keep attempting to reconnect to a lost peer. A short value (less than 15m) is recommended when Grafana is running in a Kubernetes cluster.
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_reconnect_timeout = 6h
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage. # across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -1251,6 +1251,10 @@
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_gossip_interval = "200ms" ;ha_gossip_interval = "200ms"
# Length of time to keep attempting to reconnect to a lost peer. A short value (less than 15m) is recommended when Grafana is running in a Kubernetes cluster.
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_reconnect_timeout = 6h
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds # The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage. # across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics`
ha_peers = "grafana-alerting.grafana:9094" ha_peers = "grafana-alerting.grafana:9094"
ha_advertise_address = "${POD_IP}:9094" ha_advertise_address = "${POD_IP}:9094"
ha_peer_timeout = 15s ha_peer_timeout = 15s
ha_reconnect_timeout = 2m
``` ```

View File

@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### ha_reconnect_timeout
Length of time to attempt to reconnect to a lost peer. The default value is `6h`. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`.
The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`.
### ha_push_pull_interval ### ha_push_pull_interval
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds

View File

@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
return fmt.Errorf("unable to initialize gossip mesh: %w", err) return fmt.Errorf("unable to initialize gossip mesh: %w", err)
} }
err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout) err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout)
if err != nil { if err != nil {
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err) moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
} }

View File

@ -18,6 +18,7 @@ const (
alertmanagerDefaultClusterAddr = "0.0.0.0:9094" alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
alertmanagerDefaultPeerTimeout = 15 * time.Second alertmanagerDefaultPeerTimeout = 15 * time.Second
alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval
alertmanagerDefaultReconnectTimeout = alertingCluster.DefaultReconnectTimeout
alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval
alertmanagerDefaultConfigPollInterval = time.Minute alertmanagerDefaultConfigPollInterval = time.Minute
alertmanagerRedisDefaultMaxConns = 5 alertmanagerRedisDefaultMaxConns = 5
@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct {
HAPeers []string HAPeers []string
HAPeerTimeout time.Duration HAPeerTimeout time.Duration
HAGossipInterval time.Duration HAGossipInterval time.Duration
HAReconnectTimeout time.Duration
HAPushPullInterval time.Duration HAPushPullInterval time.Duration
HALabel string HALabel string
HARedisClusterModeEnabled bool HARedisClusterModeEnabled bool
@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
if err != nil { if err != nil {
return err return err
} }
uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String()))
if err != nil {
return err
}
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String())) uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
if err != nil { if err != nil {
return err return err

View File

@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
require.Len(t, cfg.UnifiedAlerting.HAPeers, 0) require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval) require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval) require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout)
} }
// With peers set, it correctly parses them. // With peers set, it correctly parses them.