mirror of
https://github.com/grafana/grafana.git
synced 2024-11-25 02:10:45 -06:00
Alerting: Add ha_reconnect_timeout configuration option (#88823)
* Docs: Update "Configure high availability" guide with ha_reconnect_timeout configuration --------- Co-authored-by: Christopher Moyer <35463610+chri2547@users.noreply.github.com>
This commit is contained in:
parent
2d370f3983
commit
eb76ea47a0
@ -1265,6 +1265,10 @@ ha_label =
|
|||||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
ha_gossip_interval = 200ms
|
ha_gossip_interval = 200ms
|
||||||
|
|
||||||
|
# Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this to a short duration (less than 15m).
|
||||||
|
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
ha_reconnect_timeout = 6h
|
||||||
|
|
||||||
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||||
# across larger clusters at the expense of increased bandwidth usage.
|
# across larger clusters at the expense of increased bandwidth usage.
|
||||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
@ -1251,6 +1251,10 @@
|
|||||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
;ha_gossip_interval = "200ms"
|
;ha_gossip_interval = "200ms"
|
||||||
|
|
||||||
|
# Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this to a short duration (less than 15m).
|
||||||
|
# The string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
;ha_reconnect_timeout = 6h
|
||||||
|
|
||||||
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
# The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||||
# across larger clusters at the expense of increased bandwidth usage.
|
# across larger clusters at the expense of increased bandwidth usage.
|
||||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
@ -147,4 +147,5 @@ The following metrics can be used for meta monitoring, exposed by the `/metrics`
|
|||||||
ha_peers = "grafana-alerting.grafana:9094"
|
ha_peers = "grafana-alerting.grafana:9094"
|
||||||
ha_advertise_address = "${POD_IP}:9094"
|
ha_advertise_address = "${POD_IP}:9094"
|
||||||
ha_peer_timeout = 15s
|
ha_peer_timeout = 15s
|
||||||
|
ha_reconnect_timeout = 2m
|
||||||
```
|
```
|
||||||
|
@ -1635,6 +1635,12 @@ across cluster more quickly at the expense of increased bandwidth usage. The def
|
|||||||
|
|
||||||
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||||
|
|
||||||
|
### ha_reconnect_timeout
|
||||||
|
|
||||||
|
Length of time to attempt to reconnect to a lost peer. When running Grafana in a Kubernetes cluster, set this duration to less than `15m`. The default value is `6h`.
|
||||||
|
|
||||||
|
The string is a possibly signed sequence of decimal numbers followed by a unit suffix (ms, s, m, h, d), such as `30s` or `1m`.
|
||||||
|
|
||||||
### ha_push_pull_interval
|
### ha_push_pull_interval
|
||||||
|
|
||||||
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
The interval between gossip full state syncs. Setting this interval lower (more frequent) will increase convergence speeds
|
||||||
|
@ -215,7 +215,7 @@ func (moa *MultiOrgAlertmanager) setupClustering(cfg *setting.Cfg) error {
|
|||||||
return fmt.Errorf("unable to initialize gossip mesh: %w", err)
|
return fmt.Errorf("unable to initialize gossip mesh: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = peer.Join(alertingCluster.DefaultReconnectInterval, alertingCluster.DefaultReconnectTimeout)
|
err = peer.Join(alertingCluster.DefaultReconnectInterval, cfg.UnifiedAlerting.HAReconnectTimeout)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
|
moa.logger.Error("Msg", "Unable to join gossip mesh while initializing cluster for high availability mode", "error", err)
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ const (
|
|||||||
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
|
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
|
||||||
alertmanagerDefaultPeerTimeout = 15 * time.Second
|
alertmanagerDefaultPeerTimeout = 15 * time.Second
|
||||||
alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval
|
alertmanagerDefaultGossipInterval = alertingCluster.DefaultGossipInterval
|
||||||
|
alertmanagerDefaultReconnectTimeout = alertingCluster.DefaultReconnectTimeout
|
||||||
alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval
|
alertmanagerDefaultPushPullInterval = alertingCluster.DefaultPushPullInterval
|
||||||
alertmanagerDefaultConfigPollInterval = time.Minute
|
alertmanagerDefaultConfigPollInterval = time.Minute
|
||||||
alertmanagerRedisDefaultMaxConns = 5
|
alertmanagerRedisDefaultMaxConns = 5
|
||||||
@ -71,6 +72,7 @@ type UnifiedAlertingSettings struct {
|
|||||||
HAPeers []string
|
HAPeers []string
|
||||||
HAPeerTimeout time.Duration
|
HAPeerTimeout time.Duration
|
||||||
HAGossipInterval time.Duration
|
HAGossipInterval time.Duration
|
||||||
|
HAReconnectTimeout time.Duration
|
||||||
HAPushPullInterval time.Duration
|
HAPushPullInterval time.Duration
|
||||||
HALabel string
|
HALabel string
|
||||||
HARedisClusterModeEnabled bool
|
HARedisClusterModeEnabled bool
|
||||||
@ -217,6 +219,10 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
uaCfg.HAReconnectTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_reconnect_timeout", (alertmanagerDefaultReconnectTimeout).String()))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
|
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -25,6 +25,7 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
|
|||||||
require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
|
require.Len(t, cfg.UnifiedAlerting.HAPeers, 0)
|
||||||
require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
|
require.Equal(t, 200*time.Millisecond, cfg.UnifiedAlerting.HAGossipInterval)
|
||||||
require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
|
require.Equal(t, time.Minute, cfg.UnifiedAlerting.HAPushPullInterval)
|
||||||
|
require.Equal(t, 6*time.Hour, cfg.UnifiedAlerting.HAReconnectTimeout)
|
||||||
}
|
}
|
||||||
|
|
||||||
// With peers set, it correctly parses them.
|
// With peers set, it correctly parses them.
|
||||||
|
Loading…
Reference in New Issue
Block a user