Alerting: tune rule evaluation via configuration (#35623)

* Alerting: Configure max evaluation retries

* Alerting: Enforce minimum rule evaluation interval

* Alerting: Disable rule evaluation from configuration

* Update docs

* Alerting: Configure rule evaluation timeout

* Move options on unified_alerting config section

* Apply suggestions from code review

Co-authored-by: gotjosh <josue@grafana.com>
This commit is contained in:
Sofia Papagiannaki 2021-09-28 13:00:16 +03:00 committed by GitHub
parent cc94c55e48
commit f6f3a54742
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 393 additions and 43 deletions

View File

@ -764,11 +764,26 @@ ha_gossip_interval = 200ms
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
ha_push_pull_interval = 60s
# Enable or disable alerting rule execution. The alerting UI remains visible. This option has a legacy version in the `[alerting]` section that takes precedence.
execute_alerts = true
# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence.
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
max_attempts = 3
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
min_interval = 10s
#################################### Alerting ############################
[alerting]
# Disable alerting engine & UI features
enabled = true
# Makes it possible to turn off alert rule execution but alerting UI is visible
# Makes it possible to turn off alert execution but alerting UI is visible
execute_alerts = true
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)

View File

@ -742,11 +742,26 @@
;ha_push_pull_interval = "60s"
# Makes it possible to turn off alert rule execution but alerting UI is visible. If it's true (the default) then the respective legacy option is applied.
;execute_alerts = true
# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence.
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;evaluation_timeout = 30s
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. Default value is 3. If it's 3 (the default) then the respective legacy option is applied.
;max_attempts = 3
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value. Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
;min_interval = 10s
#################################### Alerting ############################
[alerting]
# Disable alerting engine & UI features
;enabled = true
# Makes it possible to turn off alert rule execution but alerting UI is visible
# Makes it possible to turn off alert execution but alerting UI is visible
;execute_alerts = true
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)
@ -759,7 +774,6 @@
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
;concurrent_render_limit = 5
# Default setting for alert calculation timeout. Default value is 30
;evaluation_timeout_seconds = 30

View File

@ -14,7 +14,7 @@ Grafana has default and custom configuration files. You can customize your Grafa
## Configuration file location
The default settings for a Grafana instance are stored in the `$WORKING_DIR/conf/defaults.ini` file. _Do not_ change the location in this file.
The default settings for a Grafana instance are stored in the `$WORKING_DIR/conf/defaults.ini` file. _Do not_ change the location in this file.
Depending on your OS, your custom configuration file is either the `$WORKING_DIR/conf/defaults.ini` file or the `/usr/local/etc/grafana/grafana.ini` file. The custom configuration file path can be overridden using the `--config` parameter.
@ -1159,6 +1159,28 @@ across larger clusters at the expense of increased bandwidth usage. The default
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### execute_alerts
Enable or disable alerting rule execution. Default is `true`. The alerting UI remains visible. This option has a [legacy version in the alerting section]({{< relref "#execute_alerts-1">}}) that takes precedence.
### evaluation_timeout
Sets the alert evaluation timeout when fetching data from the datasource. Default value is `30s`. This option has a [legacy version in the alerting section]({{< relref "#evaluation_timeout_seconds">}}) that takes precedence.
The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
### max_attempts
Sets the maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. Default value is `3`. If it's `3` (the default) then the [respective legacy option]({{< relref "#max_attempts-1">}}) is applied.
### min_interval
Sets the minimum interval to enforce between rule evaluations. Default value is `10s` which equals the scheduler interval. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has [a legacy version in the alerting section]({{< relref "#min_interval_seconds">}}) that takes precedence.
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
> **Note.** This setting has precedence over each individual rule frequency. If a rule frequency is lower than this value, then this value is enforced.
<hr>
## [alerting]

View File

@ -21,8 +21,6 @@ import (
"github.com/grafana/grafana/pkg/expr"
)
const alertingEvaluationTimeout = 30 * time.Second
type Evaluator struct {
Cfg *setting.Cfg
Log log.Logger
@ -434,7 +432,7 @@ func (evalResults Results) AsDataFrame() data.Frame {
// ConditionEval executes conditions and evaluates the result.
func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, dataService *tsdb.Service) (Results, error) {
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
defer cancelFn()
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
@ -447,7 +445,7 @@ func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, da
// QueriesAndExpressionsEval executes queries and expressions and returns the result.
func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (*backend.QueryDataResponse, error) {
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
defer cancelFn()
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}

View File

@ -26,7 +26,6 @@ import (
)
const (
maxAttempts int64 = 3
// scheduler interval
// changing this value is discouraged
// because this could cause existing alert definition
@ -93,10 +92,10 @@ func (ng *AlertNG) init() error {
baseInterval *= time.Second
store := &store.DBstore{
BaseInterval: baseInterval,
DefaultIntervalSeconds: defaultIntervalSeconds,
SQLStore: ng.SQLStore,
Logger: ng.Log,
BaseInterval: baseInterval,
DefaultInterval: ng.getRuleDefaultInterval(),
SQLStore: ng.SQLStore,
Logger: ng.Log,
}
multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics()
@ -113,8 +112,8 @@ func (ng *AlertNG) init() error {
schedCfg := schedule.SchedulerCfg{
C: clock.New(),
BaseInterval: baseInterval,
Logger: log.New("ngalert.scheduler"),
MaxAttempts: maxAttempts,
Logger: ng.Log,
MaxAttempts: ng.Cfg.UnifiedAlerting.MaxAttempts,
Evaluator: eval.Evaluator{Cfg: ng.Cfg, Log: ng.Log},
InstanceStore: store,
RuleStore: store,
@ -123,6 +122,7 @@ func (ng *AlertNG) init() error {
MultiOrgNotifier: ng.MultiOrgAlertmanager,
Metrics: ng.Metrics.GetSchedulerMetrics(),
AdminConfigPollInterval: ng.Cfg.UnifiedAlerting.AdminConfigPollInterval,
MinRuleInterval: ng.getRuleMinInterval(),
}
stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
schedule := schedule.NewScheduler(schedCfg, ng.DataService, ng.Cfg.AppURL, stateManager)
@ -156,9 +156,12 @@ func (ng *AlertNG) Run(ctx context.Context) error {
ng.stateManager.Warm()
children, subCtx := errgroup.WithContext(ctx)
children.Go(func() error {
return ng.schedule.Run(subCtx)
})
if ng.Cfg.UnifiedAlerting.ExecuteAlerts {
children.Go(func() error {
return ng.schedule.Run(subCtx)
})
}
children.Go(func() error {
return ng.MultiOrgAlertmanager.Run(subCtx)
})
@ -172,3 +175,30 @@ func (ng *AlertNG) IsDisabled() bool {
}
return !ng.Cfg.IsNgAlertEnabled()
}
// getRuleDefaultInterval returns the interval applied to rules that do not
// specify one. This is the built-in default (defaultIntervalSeconds) unless the
// effective minimum evaluation interval is larger, in which case the minimum
// is returned instead.
func (ng *AlertNG) getRuleDefaultInterval() time.Duration {
	minInterval := ng.getRuleMinInterval()
	// Compare at whole-second granularity, matching the unit of the default.
	if int64(minInterval.Seconds()) > defaultIntervalSeconds {
		return minInterval
	}
	return time.Duration(defaultIntervalSeconds) * time.Second
}
// getRuleMinInterval returns the minimum rule evaluation interval to enforce.
// The configured value (UnifiedAlerting.MinInterval) is used when it is
// positive and divides exactly by defaultBaseIntervalSeconds; in every other
// case defaultBaseIntervalSeconds is returned. A configured value that does
// not divide exactly is additionally reported through the error log.
//
// NOTE(review): defaultBaseIntervalSeconds is declared outside this block. If
// it is a bare count of seconds rather than a time.Duration value, returning it
// and taking the modulo against it here operates in nanoseconds — confirm the
// intended unit.
func (ng *AlertNG) getRuleMinInterval() time.Duration {
	configured := ng.Cfg.UnifiedAlerting.MinInterval
	switch {
	case configured <= 0:
		// if it's not configured; apply default
		return defaultBaseIntervalSeconds
	case configured%defaultBaseIntervalSeconds != 0:
		// if it's invalid; apply default
		ng.Log.Error("Configured minimum evaluation interval is not divided exactly by the scheduler interval and it will fallback to default", "alertingMinInterval", configured, "baseIntervalSeconds", defaultBaseIntervalSeconds, "defaultIntervalSeconds", defaultIntervalSeconds)
		return defaultBaseIntervalSeconds
	default:
		return configured
	}
}

View File

@ -0,0 +1,73 @@
package ngalert
import (
"testing"
"time"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/setting"
"github.com/stretchr/testify/require"
)
// TestGetRuleDefaultIntervalSeconds checks, for a range of configured minimum
// intervals, both the minimum interval that AlertNG enforces and the default
// interval it derives from it.
func TestGetRuleDefaultIntervalSeconds(t *testing.T) {
	// Expectations shared by several cases.
	defaultInterval := time.Duration(defaultIntervalSeconds) * time.Second
	belowDefault := time.Duration(defaultIntervalSeconds-defaultBaseIntervalSeconds) * time.Second
	aboveDefault := time.Duration(defaultIntervalSeconds+defaultBaseIntervalSeconds) * time.Second

	type scenario struct {
		desc string
		// the value placed in UnifiedAlerting.MinInterval
		alertingMinIntervalCfg time.Duration
		// the expected default rule interval (applied if a rule interval is missing)
		expDefaultInterval time.Duration
		// the expected minimum rule interval (enforced if a rule interval is lower
		// than this value; it is used also for computing the default rule interval)
		expMinInterval time.Duration
	}

	scenarios := []scenario{
		{
			desc:                   "negative min rule interval",
			alertingMinIntervalCfg: -1,
			expDefaultInterval:     defaultInterval,
			expMinInterval:         defaultBaseIntervalSeconds,
		},
		{
			desc:                   "zero min rule interval",
			alertingMinIntervalCfg: 0,
			expDefaultInterval:     defaultInterval,
			expMinInterval:         defaultBaseIntervalSeconds,
		},
		{
			desc:                   "min rule interval not divided exactly by the scheduler interval",
			alertingMinIntervalCfg: 1,
			expDefaultInterval:     defaultInterval,
			expMinInterval:         defaultBaseIntervalSeconds,
		},
		{
			desc:                   "min rule interval equals base scheduler interval",
			alertingMinIntervalCfg: defaultBaseIntervalSeconds,
			expDefaultInterval:     defaultInterval,
			expMinInterval:         defaultBaseIntervalSeconds,
		},
		{
			desc:                   "valid min rule interval less than default rule interval",
			alertingMinIntervalCfg: belowDefault,
			expDefaultInterval:     defaultInterval,
			expMinInterval:         belowDefault,
		},
		{
			desc:                   "valid min rule interval greater than default rule interval",
			alertingMinIntervalCfg: aboveDefault,
			expDefaultInterval:     aboveDefault,
			expMinInterval:         aboveDefault,
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			settings := setting.UnifiedAlertingSettings{MinInterval: sc.alertingMinIntervalCfg}
			alertNG := AlertNG{
				Log: log.New("test"),
				Cfg: &setting.Cfg{UnifiedAlerting: settings},
			}

			require.Equal(t, sc.expDefaultInterval, alertNG.getRuleDefaultInterval())
			require.Equal(t, sc.expMinInterval, alertNG.getRuleMinInterval())
		})
	}
}

View File

@ -41,10 +41,10 @@ func setupAMTest(t *testing.T) *Alertmanager {
m := metrics.NewAlertmanagerMetrics(prometheus.NewRegistry())
sqlStore := sqlstore.InitTestDB(t)
s := &store.DBstore{
BaseInterval: 10 * time.Second,
DefaultIntervalSeconds: 60,
SQLStore: sqlStore,
Logger: log.New("alertmanager-test"),
BaseInterval: 10 * time.Second,
DefaultInterval: 60 * time.Second,
SQLStore: sqlStore,
Logger: log.New("alertmanager-test"),
}
kvStore := newFakeKVStore(t)

View File

@ -33,7 +33,7 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
m := metrics.NewNGAlert(reg)
cfg := &setting.Cfg{
DataPath: tmpDir,
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.AlertmanagerDefaultConfiguration}, // do not poll in tests.
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
}
mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
require.NoError(t, err)
@ -95,7 +95,7 @@ func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
require.NoError(t, err)
cfg := &setting.Cfg{
DataPath: tmpDir,
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.AlertmanagerDefaultConfiguration}, // do not poll in tests.
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
}
kvStore := newFakeKVStore(t)
reg := prometheus.NewPedanticRegistry()

View File

@ -84,6 +84,7 @@ type schedule struct {
sendersCfgHash map[int64]string
senders map[int64]*sender.Sender
adminConfigPollInterval time.Duration
minRuleInterval time.Duration
}
// SchedulerCfg is the scheduler configuration.
@ -102,12 +103,14 @@ type SchedulerCfg struct {
MultiOrgNotifier *notifier.MultiOrgAlertmanager
Metrics *metrics.Scheduler
AdminConfigPollInterval time.Duration
MinRuleInterval time.Duration
}
// NewScheduler returns a new schedule.
func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, stateManager *state.Manager) *schedule {
ticker := alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
sch := schedule{
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]alertRuleInfo)},
maxAttempts: cfg.MaxAttempts,
clock: cfg.C,
@ -129,6 +132,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
senders: map[int64]*sender.Sender{},
sendersCfgHash: map[int64]string{},
adminConfigPollInterval: cfg.AdminConfigPollInterval,
minRuleInterval: cfg.MinRuleInterval,
}
return &sch
}
@ -334,6 +338,13 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
itemVersion := item.Version
newRoutine := !sch.registry.exists(key)
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
// enforce minimum evaluation interval
if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
sch.log.Debug("interval adjusted", "rule_interval_seconds", item.IntervalSeconds, "min_interval_seconds", sch.minRuleInterval.Seconds(), "key", key)
item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
}
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
if newRoutine && !invalidInterval {
@ -344,7 +355,7 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
if invalidInterval {
// this is expected to be always false
// give that we validate interval during alert rule updates
// given that we validate interval during alert rule updates
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
continue
}

View File

@ -211,7 +211,7 @@ func (st DBstore) UpsertAlertRules(rules []UpsertRule) error {
r.New.UID = uid
if r.New.IntervalSeconds == 0 {
r.New.IntervalSeconds = st.DefaultIntervalSeconds
r.New.IntervalSeconds = int64(st.DefaultInterval.Seconds())
}
r.New.Version = 1

View File

@ -28,7 +28,7 @@ type DBstore struct {
// the base scheduler tick rate; it's used for validating definition interval
BaseInterval time.Duration
// default alert definition interval
DefaultIntervalSeconds int64
SQLStore *sqlstore.SQLStore
Logger log.Logger
DefaultInterval time.Duration
SQLStore *sqlstore.SQLStore
Logger log.Logger
}

View File

@ -11,15 +11,14 @@ import (
)
const (
AlertmanagerDefaultClusterAddr = "0.0.0.0:9094"
AlertmanagerDefaultPeerTimeout = 15 * time.Second
AlertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
AlertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
SchedulerDefaultAdminConfigPollInterval = 60 * time.Second
AlertmanagerDefaultConfigPollInterval = 60 * time.Second
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
alertmanagerDefaultPeerTimeout = 15 * time.Second
alertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
alertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
alertmanagerDefaultConfigPollInterval = 60 * time.Second
// To start, the alertmanager needs at least one route defined.
// TODO: we should move this to Grafana settings and define this as the default.
AlertmanagerDefaultConfiguration = `{
alertmanagerDefaultConfiguration = `{
"alertmanager_config": {
"route": {
"receiver": "grafana-default-email"
@ -39,6 +38,12 @@ const (
}
}
`
evaluatorDefaultEvaluationTimeout = 30 * time.Second
schedulerDefaultAdminConfigPollInterval = 60 * time.Second
schedulereDefaultExecuteAlerts = true
schedulerDefaultMaxAttempts = 3
schedulerDefaultLegacyMinInterval = 1
schedulerDefaultMinInterval = 10 * time.Second
)
type UnifiedAlertingSettings struct {
@ -50,34 +55,40 @@ type UnifiedAlertingSettings struct {
HAPeerTimeout time.Duration
HAGossipInterval time.Duration
HAPushPullInterval time.Duration
MaxAttempts int64
MinInterval time.Duration
EvaluationTimeout time.Duration
ExecuteAlerts bool
DefaultConfiguration string
}
// ReadUnifiedAlertingSettings reads both the `unified_alerting` and `alerting` sections of the configuration while preferring configuration from the `alerting` section.
// It first reads the `unified_alerting` section, then looks for non-defaults on the `alerting` section and prefers those.
func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
uaCfg := UnifiedAlertingSettings{}
ua := iniFile.Section("unified_alerting")
var err error
uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String()))
uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (schedulerDefaultAdminConfigPollInterval).String()))
if err != nil {
return err
}
uaCfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String()))
uaCfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (alertmanagerDefaultConfigPollInterval).String()))
if err != nil {
return err
}
uaCfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String()))
uaCfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (alertmanagerDefaultPeerTimeout).String()))
if err != nil {
return err
}
uaCfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String()))
uaCfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (alertmanagerDefaultGossipInterval).String()))
if err != nil {
return err
}
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String()))
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
if err != nil {
return err
}
uaCfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr)
uaCfg.HAListenAddr = ua.Key("ha_listen_address").MustString(alertmanagerDefaultClusterAddr)
uaCfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("")
peers := ua.Key("ha_peers").MustString("")
uaCfg.HAPeers = make([]string, 0)
@ -87,8 +98,58 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
uaCfg.HAPeers = append(uaCfg.HAPeers, peer)
}
}
// TODO load from ini file
uaCfg.DefaultConfiguration = AlertmanagerDefaultConfiguration
uaCfg.DefaultConfiguration = alertmanagerDefaultConfiguration
alerting := iniFile.Section("alerting")
uaExecuteAlerts := ua.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
if uaExecuteAlerts { // unified option equals the default (true)
legacyExecuteAlerts := alerting.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
if !legacyExecuteAlerts {
cfg.Logger.Warn("falling back to legacy setting of 'execute_alerts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaExecuteAlerts = legacyExecuteAlerts
}
uaCfg.ExecuteAlerts = uaExecuteAlerts
// if the unified alerting options equal the defaults, apply the respective legacy one
uaEvaluationTimeout, err := gtime.ParseDuration(valueAsString(ua, "evaluation_timeout", evaluatorDefaultEvaluationTimeout.String()))
if err != nil || uaEvaluationTimeout == evaluatorDefaultEvaluationTimeout { // unified option is invalid duration or equals the default
legaceEvaluationTimeout := time.Duration(alerting.Key("evaluation_timeout_seconds").MustInt64(int64(evaluatorDefaultEvaluationTimeout.Seconds()))) * time.Second
if legaceEvaluationTimeout != evaluatorDefaultEvaluationTimeout {
cfg.Logger.Warn("falling back to legacy setting of 'evaluation_timeout_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaEvaluationTimeout = legaceEvaluationTimeout
}
uaCfg.EvaluationTimeout = uaEvaluationTimeout
uaMaxAttempts := ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if uaMaxAttempts == schedulerDefaultMaxAttempts { // unified option or equals the default
legacyMaxAttempts := alerting.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
if legacyMaxAttempts != schedulerDefaultMaxAttempts {
cfg.Logger.Warn("falling back to legacy setting of 'max_attempts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaMaxAttempts = legacyMaxAttempts
}
uaCfg.MaxAttempts = uaMaxAttempts
uaMinInterval, err := gtime.ParseDuration(valueAsString(ua, "min_interval", schedulerDefaultMinInterval.String()))
if err != nil || uaMinInterval == schedulerDefaultMinInterval { // unified option is invalid duration or equals the default
// if the legacy option is invalid, fallback to 10 (unified alerting min interval default)
legacyMinInterval := time.Duration(alerting.Key("min_interval_seconds").MustInt64(int64(schedulerDefaultMinInterval.Seconds()))) * time.Second
if legacyMinInterval != schedulerDefaultLegacyMinInterval {
cfg.Logger.Warn("falling back to legacy setting of 'min_interval_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
}
uaMinInterval = legacyMinInterval
}
uaCfg.MinInterval = uaMinInterval
cfg.UnifiedAlerting = uaCfg
return nil
}
// GetAlertmanagerDefaultConfiguration returns the built-in default Alertmanager
// configuration held in the alertmanagerDefaultConfiguration constant.
func GetAlertmanagerDefaultConfiguration() string {
	return alertmanagerDefaultConfiguration
}

View File

@ -1,10 +1,12 @@
package setting
import (
"strconv"
"testing"
"time"
"github.com/stretchr/testify/require"
"gopkg.in/ini.v1"
)
func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
@ -37,3 +39,125 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.UnifiedAlerting.HAPeers)
}
}
// TestUnifiedAlertingSettings exercises how ReadUnifiedAlertingSettings combines
// the `unified_alerting` options with their legacy counterparts in the
// `alerting` section. Each case builds an in-memory ini file with both
// sections, loads it, and verifies the resulting cfg.UnifiedAlerting values.
func TestUnifiedAlertingSettings(t *testing.T) {
	testCases := []struct {
		desc string
		// keys/values written to the `unified_alerting` section
		unifiedAlertingOptions map[string]string
		// keys/values written to the legacy `alerting` section
		alertingOptions map[string]string
		// assertions run against the loaded configuration
		verifyCfg func(*testing.T, Cfg)
	}{
		{
			desc: "when the unified options do not equal the defaults, it should not apply the legacy ones",
			unifiedAlertingOptions: map[string]string{
				"admin_config_poll_interval": "120s",
				"max_attempts":               "6",
				"min_interval":               "60s",
				"execute_alerts":             "false",
				"evaluation_timeout":         "90s",
			},
			alertingOptions: map[string]string{
				"max_attempts":               strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
				"min_interval_seconds":       strconv.FormatInt(schedulerDefaultLegacyMinInterval, 10),
				"execute_alerts":             strconv.FormatBool(schedulereDefaultExecuteAlerts),
				"evaluation_timeout_seconds": strconv.FormatInt(int64(evaluatorDefaultEvaluationTimeout.Seconds()), 10),
			},
			verifyCfg: func(t *testing.T, cfg Cfg) {
				require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
				require.Equal(t, int64(6), cfg.UnifiedAlerting.MaxAttempts)
				require.Equal(t, 60*time.Second, cfg.UnifiedAlerting.MinInterval)
				require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
				require.Equal(t, 90*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
			},
		},
		{
			desc: "when the unified options equal the defaults, it should apply the legacy ones",
			unifiedAlertingOptions: map[string]string{
				"admin_config_poll_interval": "120s",
				"max_attempts":               strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
				"min_interval":               schedulerDefaultMinInterval.String(),
				"execute_alerts":             strconv.FormatBool(schedulereDefaultExecuteAlerts),
				"evaluation_timeout":         evaluatorDefaultEvaluationTimeout.String(),
			},
			alertingOptions: map[string]string{
				"max_attempts":               "12",
				"min_interval_seconds":       "120",
				"execute_alerts":             "true",
				"evaluation_timeout_seconds": "160",
			},
			verifyCfg: func(t *testing.T, cfg Cfg) {
				require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
				require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
				require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
				require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
				require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
			},
		},
		{
			desc: "when both unified and legacy options are invalid, apply the defaults",
			unifiedAlertingOptions: map[string]string{
				"max_attempts":   "invalid",
				"min_interval":   "invalid",
				"execute_alerts": "invalid",
				// The key must match the real option name, otherwise the invalid
				// value is never read and this case only covers a missing option.
				"evaluation_timeout": "invalid",
			},
			alertingOptions: map[string]string{
				"max_attempts":               "invalid",
				"min_interval_seconds":       "invalid",
				"execute_alerts":             "invalid",
				"evaluation_timeout_seconds": "invalid",
			},
			verifyCfg: func(t *testing.T, cfg Cfg) {
				// admin_config_poll_interval is not set, so its own default applies.
				require.Equal(t, schedulerDefaultAdminConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
				require.Equal(t, int64(schedulerDefaultMaxAttempts), cfg.UnifiedAlerting.MaxAttempts)
				require.Equal(t, schedulerDefaultMinInterval, cfg.UnifiedAlerting.MinInterval)
				require.Equal(t, schedulereDefaultExecuteAlerts, cfg.UnifiedAlerting.ExecuteAlerts)
				require.Equal(t, evaluatorDefaultEvaluationTimeout, cfg.UnifiedAlerting.EvaluationTimeout)
			},
		},
		{
			desc: "when unified alerting options are invalid, apply legacy options",
			unifiedAlertingOptions: map[string]string{
				"max_attempts":       "invalid",
				"min_interval":       "invalid",
				"execute_alerts":     "invalid",
				"evaluation_timeout": "invalid",
			},
			alertingOptions: map[string]string{
				"max_attempts":               "12",
				"min_interval_seconds":       "120",
				"execute_alerts":             "false",
				"evaluation_timeout_seconds": "160",
			},
			verifyCfg: func(t *testing.T, cfg Cfg) {
				// admin_config_poll_interval is not set, so its own default applies.
				require.Equal(t, schedulerDefaultAdminConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
				require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
				require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
				require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
				require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.desc, func(t *testing.T) {
			f := ini.Empty()
			cfg := NewCfg()

			// Populate the `unified_alerting` section.
			unifiedAlertingSec, err := f.NewSection("unified_alerting")
			require.NoError(t, err)
			for k, v := range tc.unifiedAlertingOptions {
				_, err = unifiedAlertingSec.NewKey(k, v)
				require.NoError(t, err)
			}

			// Populate the legacy `alerting` section.
			alertingSec, err := f.NewSection("alerting")
			require.NoError(t, err)
			for k, v := range tc.alertingOptions {
				_, err = alertingSec.NewKey(k, v)
				require.NoError(t, err)
			}

			err = cfg.ReadUnifiedAlertingSettings(f)
			require.NoError(t, err)
			tc.verifyCfg(t, *cfg)
		})
	}
}

View File

@ -191,6 +191,8 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
require.NoError(t, err)
_, err = alertingSect.NewKey("notification_timeout_seconds", "1")
require.NoError(t, err)
_, err = alertingSect.NewKey("max_attempts", "3")
require.NoError(t, err)
for _, o := range opts {
if o.EnableCSP {