Unified Alerting: Set max_attempts to 1 by default (#79095)

* Unified Alerting: Set `max_attempts` to 1 by default

The retry logic for unified alerting has been broken as far as v9.4.x, rather than fixing it in one go and causing a headache to our users with rules putting extra load on their datasources - I think a better approach is to simply set 1 as a default and then let our users change it.

I see two cons with this approach:

- Configuration for legacy to unified alerting cannot be ported over automatically, users will have to manually set `max_attempts` to 3 when migrating.
- Users expecting to get any sort of retrying (as with legacy alerting) will not have it out of the box and will have to manually edit the configuration.

Signed-off-by: gotjosh <josue.abreu@gmail.com>
---------

Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
gotjosh 2023-12-05 17:42:34 +00:00 committed by GitHub
parent 7cdddb2790
commit 0c9356a3c7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 11 additions and 20 deletions

View File

@ -1147,8 +1147,8 @@ execute_alerts = true
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
max_attempts = 3
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
max_attempts = 1
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -1094,8 +1094,8 @@
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
;max_attempts = 3
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1.
;max_attempts = 1
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.

View File

@ -1595,7 +1595,7 @@ The timeout string is a possibly signed sequence of decimal numbers, followed by
### max_attempts
Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `3`. This option has a [legacy version in the alerting section]({{< relref "#max_attempts-1" >}}) that takes precedence.
Sets a maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is `1`.
### min_interval

View File

@ -47,7 +47,7 @@ const (
evaluatorDefaultEvaluationTimeout = 30 * time.Second
schedulerDefaultAdminConfigPollInterval = time.Minute
schedulereDefaultExecuteAlerts = true
schedulerDefaultMaxAttempts = 3
schedulerDefaultMaxAttempts = 1
schedulerDefaultLegacyMinInterval = 1
screenshotsDefaultCapture = false
screenshotsDefaultCaptureTimeout = 10 * time.Second
@ -294,15 +294,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
}
uaCfg.EvaluationTimeout = uaEvaluationTimeout
uaMaxAttempts := ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if uaMaxAttempts == schedulerDefaultMaxAttempts { // unified option or equals the default
legacyMaxAttempts := alerting.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if legacyMaxAttempts != schedulerDefaultMaxAttempts {
cfg.Logger.Warn("falling back to legacy setting of 'max_attempts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaMaxAttempts = legacyMaxAttempts
}
uaCfg.MaxAttempts = uaMaxAttempts
uaCfg.MaxAttempts = ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
uaCfg.BaseInterval = SchedulerBaseInterval

View File

@ -110,20 +110,19 @@ func TestUnifiedAlertingSettings(t *testing.T) {
desc: "when the unified options equal the defaults, it should apply the legacy ones",
unifiedAlertingOptions: map[string]string{
"admin_config_poll_interval": "120s",
"max_attempts": strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
"min_interval": SchedulerBaseInterval.String(),
"execute_alerts": strconv.FormatBool(schedulereDefaultExecuteAlerts),
"evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(),
},
alertingOptions: map[string]string{
"max_attempts": "12",
"max_attempts": "1",
"min_interval_seconds": "120",
"execute_alerts": "true",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
@ -164,14 +163,14 @@ func TestUnifiedAlertingSettings(t *testing.T) {
"evaluation_timeout": "invalid",
},
alertingOptions: map[string]string{
"max_attempts": "12",
"max_attempts": "1",
"min_interval_seconds": "120",
"execute_alerts": "false",
"evaluation_timeout_seconds": "160",
},
verifyCfg: func(t *testing.T, cfg Cfg) {
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts)
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)