Alerting: Add ha_reconnect_timeout configuration option (#88823)

* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration

---------

Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
Jacob Valdemar 2024-06-11 19:25:48 +02:00 committed by GitHub
parent 2d370f3983
commit eb76ea47a0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 23 additions and 1 deletions

View File

@ -1265,6 +1265,10 @@ ha_label =
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_gossip_interval = 200ms
# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster.
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_reconnect_timeout = 6h
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -1251,6 +1251,10 @@
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_gossip_interval = "200ms"
# Length of time to attempt to reconnect to a lost peer. Recommended to be short (<15m) when Grafana is running in a Kubernetes cluster.
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;ha_reconnect_timeout = 6h
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
# across larger clusters at the expense of increased bandwidth usage.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics`
ha_peers = "grafana-alerting.grafana:9094"
ha_advertise_address = "${POD_IP}:9094"
ha_peer_timeout = 15s
ha_reconnect_timeout = 2m
```

View File

@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### ha_reconnect_timeout
Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`.
The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`.
### ha_push_pull_interval
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds

View File

@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
return fmt.Errorf("unable to initialize gossip mesh: %w", err)
}
err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout)
err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout)
if err != nil {
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
}

View File

@ -18,6 +18,7 @@ const (
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
alertmanagerDefaultPeerTimeout = 15 * time.Second
alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval
alertmanagerDefaultReconnectTimeout = alertingCluster.DefaultReconnectTimeout
alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval
alertmanagerDefaultConfigPollInterval = time.Minute
alertmanagerRedisDefaultMaxConns = 5
@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct {
HAPeers []string
HAPeerTimeout time.Duration
HAGossipInterval time.Duration
HAReconnectTimeout time.Duration
HAPushPullInterval time.Duration
HALabel string
HARedisClusterModeEnabled bool
@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
if err != nil {
return err
}
uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String()))
if err != nil {
return err
}
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
if err != nil {
return err

View File

@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout)
}
// With peers set, it correctly parses them.