mirror of
https://github.com/grafana/grafana.git
synced 2025-01-26 16:27:02 -06:00
Alerting: Add ha_reconnect_timeout configuration option (#88823)
* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration --------- Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
parent
2d370f3983
commit
eb76ea47a0
@ -1265,6 +1265,10 @@ ha_label =
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
ha_gossip_interval = 200ms
|
||||
|
||||
# Length of time to keep trying to reconnect to a lost peer. A short value (<15m) is recommended when Grafana is running in a Kubernetes cluster.
|
||||
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
ha_reconnect_timeout = 6h
|
||||
|
||||
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||
# across larger clusters at the expense of increased bandwidth usage.
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
@ -1251,6 +1251,10 @@
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
;ha_gossip_interval = "200ms"
|
||||
|
||||
# Length of time to keep trying to reconnect to a lost peer. A short value (<15m) is recommended when Grafana is running in a Kubernetes cluster.
|
||||
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
;ha_reconnect_timeout = 6h
|
||||
|
||||
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||
# across larger clusters at the expense of increased bandwidth usage.
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics`
|
||||
ha_peers = "grafana-alerting.grafana:9094"
|
||||
ha_advertise_address = "${POD_IP}:9094"
|
||||
ha_peer_timeout = 15s
|
||||
ha_reconnect_timeout = 2m
|
||||
```
|
||||
|
@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def
|
||||
|
||||
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
||||
### ha_reconnect_timeout
|
||||
|
||||
Length of time to attempt to reconnect to a lost peer. The default value is `6h`. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`.
|
||||
|
||||
The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`.
|
||||
|
||||
### ha_push_pull_interval
|
||||
|
||||
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||
|
@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
|
||||
return fmt.Errorf("unable to initialize gossip mesh: %w", err)
|
||||
}
|
||||
|
||||
err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout)
|
||||
err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout)
|
||||
if err != nil {
|
||||
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
|
||||
}
|
||||
|
@ -18,6 +18,7 @@ const (
|
||||
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
|
||||
alertmanagerDefaultPeerTimeout = 15 * time.Second
|
||||
alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval
|
||||
alertmanagerDefaultReconnectTimeout = alertingCluster.DefaultReconnectTimeout
|
||||
alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval
|
||||
alertmanagerDefaultConfigPollInterval = time.Minute
|
||||
alertmanagerRedisDefaultMaxConns = 5
|
||||
@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct {
|
||||
HAPeers []string
|
||||
HAPeerTimeout time.Duration
|
||||
HAGossipInterval time.Duration
|
||||
HAReconnectTimeout time.Duration
|
||||
HAPushPullInterval time.Duration
|
||||
HALabel string
|
||||
HARedisClusterModeEnabled bool
|
||||
@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
|
||||
require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
|
||||
require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
|
||||
require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
|
||||
require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout)
|
||||
}
|
||||
|
||||
// With peers set, it correctly parses them.
|
||||
|
Loading…
Reference in New Issue
Block a user