mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: tune rule evaluation via configuration (#35623)
* Alerting: Configure max evaluation retries * Alerting: Enforce minimum rule evaluation interval * Alerting: Disable rule evaluation from configuration * Update docs * Alerting: Configure rule evaluation timeout * Move options on unified_alerting config section * Apply suggestions from code review Co-authored-by: gotjosh <josue@grafana.com>
This commit is contained in:
parent
cc94c55e48
commit
f6f3a54742
@ -764,11 +764,26 @@ ha_gossip_interval = 200ms
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
ha_push_pull_interval = 60s
|
||||
|
||||
# Enable or disable alerting rule execution. The alerting UI remains visible. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
execute_alerts = true
|
||||
|
||||
# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
evaluation_timeout = 30s
|
||||
|
||||
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
max_attempts = 3
|
||||
|
||||
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
min_interval = 10s
|
||||
|
||||
#################################### Alerting ############################
|
||||
[alerting]
|
||||
# Disable alerting engine & UI features
|
||||
enabled = true
|
||||
# Makes it possible to turn off alert rule execution but alerting UI is visible
|
||||
|
||||
# Makes it possible to turn off alert execution but alerting UI is visible
|
||||
execute_alerts = true
|
||||
|
||||
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)
|
||||
|
@ -742,11 +742,26 @@
|
||||
;ha_push_pull_interval = "60s"
|
||||
|
||||
|
||||
# Makes it possible to turn off alert rule execution but alerting UI is visible. If it's true (the default) then the respective legacy option is applied.
|
||||
;execute_alerts = true
|
||||
|
||||
# Alert evaluation timeout when fetching data from the datasource. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
# The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
;evaluation_timeout = 30s
|
||||
|
||||
# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. Default value is 3. If it's 3 (the default) then the respective legacy option in the `[alerting]` section is applied.
|
||||
;max_attempts = 3
|
||||
|
||||
# Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value. Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has a legacy version in the `[alerting]` section that takes precedence.
|
||||
# The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
;min_interval = 10s
|
||||
|
||||
#################################### Alerting ############################
|
||||
[alerting]
|
||||
# Disable alerting engine & UI features
|
||||
;enabled = true
|
||||
# Makes it possible to turn off alert rule execution but alerting UI is visible
|
||||
|
||||
# Makes it possible to turn off alert execution but alerting UI is visible
|
||||
;execute_alerts = true
|
||||
|
||||
# Default setting for new alert rules. Defaults to categorize error and timeouts as alerting. (alerting, keep_state)
|
||||
@ -759,7 +774,6 @@
|
||||
# This limit will protect the server from render overloading and make sure notifications are sent out quickly
|
||||
;concurrent_render_limit = 5
|
||||
|
||||
|
||||
# Default setting for alert calculation timeout. Default value is 30
|
||||
;evaluation_timeout_seconds = 30
|
||||
|
||||
|
@ -14,7 +14,7 @@ Grafana has default and custom configuration files. You can customize your Grafa
|
||||
|
||||
## Configuration file location
|
||||
|
||||
The default settings for a Grafana instance are stored in the `$WORKING_DIR/conf/defaults.ini` file. _Do not_ change the location in this file.
|
||||
The default settings for a Grafana instance are stored in the `$WORKING_DIR/conf/defaults.ini` file. _Do not_ change the location in this file.
|
||||
|
||||
Depending on your OS, your custom configuration file is either the `$WORKING_DIR/conf/custom.ini` file or the `/usr/local/etc/grafana/grafana.ini` file. The custom configuration file path can be overridden using the `--config` parameter.
|
||||
|
||||
@ -1159,6 +1159,28 @@ across larger clusters at the expense of increased bandwidth usage. The default
|
||||
|
||||
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
||||
### execute_alerts
|
||||
|
||||
Enable or disable alerting rule execution. Default is `true`. The alerting UI remains visible. This option has a [legacy version in the alerting section]({{< relref "#execute_alerts-1">}}) that takes precedence.
|
||||
|
||||
### evaluation_timeout
|
||||
|
||||
Sets the alert evaluation timeout when fetching data from the datasource. Default value is `30s`. This option has a [legacy version in the alerting section]({{< relref "#evaluation_timeout_seconds">}}) that takes precedence.
|
||||
|
||||
The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
||||
### max_attempts
|
||||
|
||||
Sets the maximum number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. Default value is `3`. If it's `3` (the default) then the [respective legacy option]({{< relref "#max_attempts-1">}}) is applied.
|
||||
|
||||
### min_interval
|
||||
|
||||
Sets the minimum interval to enforce between rule evaluations. Default value is `10s` which equals the scheduler interval. Rules will be adjusted if they are less than this value or if they are not a multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. This option has [a legacy version in the alerting section]({{< relref "#min_interval_seconds">}}) that takes precedence.
|
||||
|
||||
The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m.
|
||||
|
||||
> **Note.** This setting has precedence over each individual rule frequency. If a rule frequency is lower than this value, then this value is enforced.
|
||||
|
||||
<hr>
|
||||
|
||||
## [alerting]
|
||||
|
@ -21,8 +21,6 @@ import (
|
||||
"github.com/grafana/grafana/pkg/expr"
|
||||
)
|
||||
|
||||
const alertingEvaluationTimeout = 30 * time.Second
|
||||
|
||||
type Evaluator struct {
|
||||
Cfg *setting.Cfg
|
||||
Log log.Logger
|
||||
@ -434,7 +432,7 @@ func (evalResults Results) AsDataFrame() data.Frame {
|
||||
|
||||
// ConditionEval executes conditions and evaluates the result.
|
||||
func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, dataService *tsdb.Service) (Results, error) {
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
defer cancelFn()
|
||||
|
||||
alertExecCtx := AlertExecCtx{OrgID: condition.OrgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
|
||||
@ -447,7 +445,7 @@ func (e *Evaluator) ConditionEval(condition *models.Condition, now time.Time, da
|
||||
|
||||
// QueriesAndExpressionsEval executes queries and expressions and returns the result.
|
||||
func (e *Evaluator) QueriesAndExpressionsEval(orgID int64, data []models.AlertQuery, now time.Time, dataService *tsdb.Service) (*backend.QueryDataResponse, error) {
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), alertingEvaluationTimeout)
|
||||
alertCtx, cancelFn := context.WithTimeout(context.Background(), e.Cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
defer cancelFn()
|
||||
|
||||
alertExecCtx := AlertExecCtx{OrgID: orgID, Ctx: alertCtx, ExpressionsEnabled: e.Cfg.ExpressionsEnabled, Log: e.Log}
|
||||
|
@ -26,7 +26,6 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
maxAttempts int64 = 3
|
||||
// scheduler interval
|
||||
// changing this value is discouraged
|
||||
// because this could cause existing alert definition
|
||||
@ -93,10 +92,10 @@ func (ng *AlertNG) init() error {
|
||||
baseInterval *= time.Second
|
||||
|
||||
store := &store.DBstore{
|
||||
BaseInterval: baseInterval,
|
||||
DefaultIntervalSeconds: defaultIntervalSeconds,
|
||||
SQLStore: ng.SQLStore,
|
||||
Logger: ng.Log,
|
||||
BaseInterval: baseInterval,
|
||||
DefaultInterval: ng.getRuleDefaultInterval(),
|
||||
SQLStore: ng.SQLStore,
|
||||
Logger: ng.Log,
|
||||
}
|
||||
|
||||
multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics()
|
||||
@ -113,8 +112,8 @@ func (ng *AlertNG) init() error {
|
||||
schedCfg := schedule.SchedulerCfg{
|
||||
C: clock.New(),
|
||||
BaseInterval: baseInterval,
|
||||
Logger: log.New("ngalert.scheduler"),
|
||||
MaxAttempts: maxAttempts,
|
||||
Logger: ng.Log,
|
||||
MaxAttempts: ng.Cfg.UnifiedAlerting.MaxAttempts,
|
||||
Evaluator: eval.Evaluator{Cfg: ng.Cfg, Log: ng.Log},
|
||||
InstanceStore: store,
|
||||
RuleStore: store,
|
||||
@ -123,6 +122,7 @@ func (ng *AlertNG) init() error {
|
||||
MultiOrgNotifier: ng.MultiOrgAlertmanager,
|
||||
Metrics: ng.Metrics.GetSchedulerMetrics(),
|
||||
AdminConfigPollInterval: ng.Cfg.UnifiedAlerting.AdminConfigPollInterval,
|
||||
MinRuleInterval: ng.getRuleMinInterval(),
|
||||
}
|
||||
stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
|
||||
schedule := schedule.NewScheduler(schedCfg, ng.DataService, ng.Cfg.AppURL, stateManager)
|
||||
@ -156,9 +156,12 @@ func (ng *AlertNG) Run(ctx context.Context) error {
|
||||
ng.stateManager.Warm()
|
||||
|
||||
children, subCtx := errgroup.WithContext(ctx)
|
||||
children.Go(func() error {
|
||||
return ng.schedule.Run(subCtx)
|
||||
})
|
||||
|
||||
if ng.Cfg.UnifiedAlerting.ExecuteAlerts {
|
||||
children.Go(func() error {
|
||||
return ng.schedule.Run(subCtx)
|
||||
})
|
||||
}
|
||||
children.Go(func() error {
|
||||
return ng.MultiOrgAlertmanager.Run(subCtx)
|
||||
})
|
||||
@ -172,3 +175,30 @@ func (ng *AlertNG) IsDisabled() bool {
|
||||
}
|
||||
return !ng.Cfg.IsNgAlertEnabled()
|
||||
}
|
||||
|
||||
// getRuleDefaultInterval returns the default rule interval that is applied if a rule's interval is not set.
|
||||
// If this constant (1 minute) is lower than the configured minimum evaluation interval then
|
||||
// this configuration is returned.
|
||||
func (ng *AlertNG) getRuleDefaultInterval() time.Duration {
|
||||
ruleMinInterval := ng.getRuleMinInterval()
|
||||
if defaultIntervalSeconds < int64(ruleMinInterval.Seconds()) {
|
||||
return ruleMinInterval
|
||||
}
|
||||
return time.Duration(defaultIntervalSeconds) * time.Second
|
||||
}
|
||||
|
||||
// getRuleMinInterval returns the configured minimum rule interval.
|
||||
// If this value is less than or equal to zero, or is not an exact multiple of the scheduler interval,
|
||||
// the scheduler interval (10 seconds) is returned.
|
||||
func (ng *AlertNG) getRuleMinInterval() time.Duration {
|
||||
if ng.Cfg.UnifiedAlerting.MinInterval <= 0 {
|
||||
return defaultBaseIntervalSeconds // if it's not configured; apply default
|
||||
}
|
||||
|
||||
if ng.Cfg.UnifiedAlerting.MinInterval%defaultBaseIntervalSeconds != 0 {
|
||||
ng.Log.Error("Configured minimum evaluation interval is not divided exactly by the scheduler interval and it will fallback to default", "alertingMinInterval", ng.Cfg.UnifiedAlerting.MinInterval, "baseIntervalSeconds", defaultBaseIntervalSeconds, "defaultIntervalSeconds", defaultIntervalSeconds)
|
||||
return defaultBaseIntervalSeconds // if it's invalid; apply default
|
||||
}
|
||||
|
||||
return ng.Cfg.UnifiedAlerting.MinInterval
|
||||
}
|
||||
|
73
pkg/services/ngalert/ngalert_test.go
Normal file
73
pkg/services/ngalert/ngalert_test.go
Normal file
@ -0,0 +1,73 @@
|
||||
package ngalert
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/infra/log"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestGetRuleDefaultIntervalSeconds(t *testing.T) {
|
||||
testCases := []struct {
|
||||
desc string
|
||||
alertingMinIntervalCfg time.Duration
|
||||
// the expected default rule interval (applied if a rule interval is missing)
|
||||
expDefaultInterval time.Duration
|
||||
// the expected minimum rule interval (enforced if a rule interval is lower than this value; it is used also for computing the default rule interval)
|
||||
expMinInterval time.Duration
|
||||
}{
|
||||
{
|
||||
desc: "negative min rule interval",
|
||||
alertingMinIntervalCfg: -1,
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds) * time.Second, // 60s
|
||||
expMinInterval: defaultBaseIntervalSeconds, // 10s
|
||||
},
|
||||
{
|
||||
desc: "zero min rule interval",
|
||||
alertingMinIntervalCfg: 0,
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds) * time.Second, // 60s
|
||||
expMinInterval: defaultBaseIntervalSeconds, // 10s
|
||||
},
|
||||
{
|
||||
desc: "min rule interval not divided exactly by the scheduler interval",
|
||||
alertingMinIntervalCfg: 1,
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds) * time.Second, // 60s
|
||||
expMinInterval: defaultBaseIntervalSeconds, // 10s
|
||||
},
|
||||
{
|
||||
desc: "min rule interval equals base scheduler interval",
|
||||
alertingMinIntervalCfg: defaultBaseIntervalSeconds, // 10s
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds) * time.Second, // 60s
|
||||
expMinInterval: defaultBaseIntervalSeconds, // 10s
|
||||
},
|
||||
{
|
||||
desc: "valid min rule interval less than default rule interval",
|
||||
alertingMinIntervalCfg: time.Duration(defaultIntervalSeconds-defaultBaseIntervalSeconds) * time.Second, // 50s
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds) * time.Second, // 60s
|
||||
expMinInterval: time.Duration(defaultIntervalSeconds-defaultBaseIntervalSeconds) * time.Second, // 50s
|
||||
},
|
||||
{
|
||||
desc: "valid min rule interval greater than default rule interval",
|
||||
alertingMinIntervalCfg: time.Duration(defaultIntervalSeconds+defaultBaseIntervalSeconds) * time.Second, // 70s
|
||||
expDefaultInterval: time.Duration(defaultIntervalSeconds+defaultBaseIntervalSeconds) * time.Second, // 70s
|
||||
expMinInterval: time.Duration(defaultIntervalSeconds+defaultBaseIntervalSeconds) * time.Second, // 70s
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
alertNG := AlertNG{
|
||||
Log: log.New("test"),
|
||||
Cfg: &setting.Cfg{
|
||||
UnifiedAlerting: setting.UnifiedAlertingSettings{
|
||||
MinInterval: tc.alertingMinIntervalCfg,
|
||||
},
|
||||
},
|
||||
}
|
||||
require.Equal(t, tc.expDefaultInterval, alertNG.getRuleDefaultInterval())
|
||||
require.Equal(t, tc.expMinInterval, alertNG.getRuleMinInterval())
|
||||
})
|
||||
}
|
||||
}
|
@ -41,10 +41,10 @@ func setupAMTest(t *testing.T) *Alertmanager {
|
||||
m := metrics.NewAlertmanagerMetrics(prometheus.NewRegistry())
|
||||
sqlStore := sqlstore.InitTestDB(t)
|
||||
s := &store.DBstore{
|
||||
BaseInterval: 10 * time.Second,
|
||||
DefaultIntervalSeconds: 60,
|
||||
SQLStore: sqlStore,
|
||||
Logger: log.New("alertmanager-test"),
|
||||
BaseInterval: 10 * time.Second,
|
||||
DefaultInterval: 60 * time.Second,
|
||||
SQLStore: sqlStore,
|
||||
Logger: log.New("alertmanager-test"),
|
||||
}
|
||||
|
||||
kvStore := newFakeKVStore(t)
|
||||
|
@ -33,7 +33,7 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
|
||||
m := metrics.NewNGAlert(reg)
|
||||
cfg := &setting.Cfg{
|
||||
DataPath: tmpDir,
|
||||
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.AlertmanagerDefaultConfiguration}, // do not poll in tests.
|
||||
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
|
||||
}
|
||||
mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
|
||||
require.NoError(t, err)
|
||||
@ -95,7 +95,7 @@ func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
|
||||
require.NoError(t, err)
|
||||
cfg := &setting.Cfg{
|
||||
DataPath: tmpDir,
|
||||
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.AlertmanagerDefaultConfiguration}, // do not poll in tests.
|
||||
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
|
||||
}
|
||||
kvStore := newFakeKVStore(t)
|
||||
reg := prometheus.NewPedanticRegistry()
|
||||
|
@ -84,6 +84,7 @@ type schedule struct {
|
||||
sendersCfgHash map[int64]string
|
||||
senders map[int64]*sender.Sender
|
||||
adminConfigPollInterval time.Duration
|
||||
minRuleInterval time.Duration
|
||||
}
|
||||
|
||||
// SchedulerCfg is the scheduler configuration.
|
||||
@ -102,12 +103,14 @@ type SchedulerCfg struct {
|
||||
MultiOrgNotifier *notifier.MultiOrgAlertmanager
|
||||
Metrics *metrics.Scheduler
|
||||
AdminConfigPollInterval time.Duration
|
||||
MinRuleInterval time.Duration
|
||||
}
|
||||
|
||||
// NewScheduler returns a new schedule.
|
||||
func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, stateManager *state.Manager) *schedule {
|
||||
ticker := alerting.NewTicker(cfg.C.Now(), time.Second*0, cfg.C, int64(cfg.BaseInterval.Seconds()))
|
||||
sch := schedule{
|
||||
|
||||
registry: alertRuleRegistry{alertRuleInfo: make(map[models.AlertRuleKey]alertRuleInfo)},
|
||||
maxAttempts: cfg.MaxAttempts,
|
||||
clock: cfg.C,
|
||||
@ -129,6 +132,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
|
||||
senders: map[int64]*sender.Sender{},
|
||||
sendersCfgHash: map[int64]string{},
|
||||
adminConfigPollInterval: cfg.AdminConfigPollInterval,
|
||||
minRuleInterval: cfg.MinRuleInterval,
|
||||
}
|
||||
return &sch
|
||||
}
|
||||
@ -334,6 +338,13 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
|
||||
itemVersion := item.Version
|
||||
newRoutine := !sch.registry.exists(key)
|
||||
ruleInfo := sch.registry.getOrCreateInfo(key, itemVersion)
|
||||
|
||||
// enforce minimum evaluation interval
|
||||
if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
|
||||
sch.log.Debug("interval adjusted", "rule_interval_seconds", item.IntervalSeconds, "min_interval_seconds", sch.minRuleInterval.Seconds(), "key", key)
|
||||
item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
|
||||
}
|
||||
|
||||
invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0
|
||||
|
||||
if newRoutine && !invalidInterval {
|
||||
@ -344,7 +355,7 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
|
||||
|
||||
if invalidInterval {
|
||||
// this is expected to be always false
|
||||
// give that we validate interval during alert rule updates
|
||||
// given that we validate interval during alert rule updates
|
||||
sch.log.Debug("alert rule with invalid interval will be ignored: interval should be divided exactly by scheduler interval", "key", key, "interval", time.Duration(item.IntervalSeconds)*time.Second, "scheduler interval", sch.baseInterval)
|
||||
continue
|
||||
}
|
||||
|
@ -211,7 +211,7 @@ func (st DBstore) UpsertAlertRules(rules []UpsertRule) error {
|
||||
r.New.UID = uid
|
||||
|
||||
if r.New.IntervalSeconds == 0 {
|
||||
r.New.IntervalSeconds = st.DefaultIntervalSeconds
|
||||
r.New.IntervalSeconds = int64(st.DefaultInterval.Seconds())
|
||||
}
|
||||
|
||||
r.New.Version = 1
|
||||
|
@ -28,7 +28,7 @@ type DBstore struct {
|
||||
// the base scheduler tick rate; it's used for validating definition interval
|
||||
BaseInterval time.Duration
|
||||
// default alert definition interval
|
||||
DefaultIntervalSeconds int64
|
||||
SQLStore *sqlstore.SQLStore
|
||||
Logger log.Logger
|
||||
DefaultInterval time.Duration
|
||||
SQLStore *sqlstore.SQLStore
|
||||
Logger log.Logger
|
||||
}
|
||||
|
@ -11,15 +11,14 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
AlertmanagerDefaultClusterAddr = "0.0.0.0:9094"
|
||||
AlertmanagerDefaultPeerTimeout = 15 * time.Second
|
||||
AlertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
|
||||
AlertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
|
||||
SchedulerDefaultAdminConfigPollInterval = 60 * time.Second
|
||||
AlertmanagerDefaultConfigPollInterval = 60 * time.Second
|
||||
alertmanagerDefaultClusterAddr = "0.0.0.0:9094"
|
||||
alertmanagerDefaultPeerTimeout = 15 * time.Second
|
||||
alertmanagerDefaultGossipInterval = cluster.DefaultGossipInterval
|
||||
alertmanagerDefaultPushPullInterval = cluster.DefaultPushPullInterval
|
||||
alertmanagerDefaultConfigPollInterval = 60 * time.Second
|
||||
// To start, the alertmanager needs at least one route defined.
|
||||
// TODO: we should move this to Grafana settings and define this as the default.
|
||||
AlertmanagerDefaultConfiguration = `{
|
||||
alertmanagerDefaultConfiguration = `{
|
||||
"alertmanager_config": {
|
||||
"route": {
|
||||
"receiver": "grafana-default-email"
|
||||
@ -39,6 +38,12 @@ const (
|
||||
}
|
||||
}
|
||||
`
|
||||
evaluatorDefaultEvaluationTimeout = 30 * time.Second
|
||||
schedulerDefaultAdminConfigPollInterval = 60 * time.Second
|
||||
schedulereDefaultExecuteAlerts = true
|
||||
schedulerDefaultMaxAttempts = 3
|
||||
schedulerDefaultLegacyMinInterval = 1
|
||||
schedulerDefaultMinInterval = 10 * time.Second
|
||||
)
|
||||
|
||||
type UnifiedAlertingSettings struct {
|
||||
@ -50,34 +55,40 @@ type UnifiedAlertingSettings struct {
|
||||
HAPeerTimeout time.Duration
|
||||
HAGossipInterval time.Duration
|
||||
HAPushPullInterval time.Duration
|
||||
MaxAttempts int64
|
||||
MinInterval time.Duration
|
||||
EvaluationTimeout time.Duration
|
||||
ExecuteAlerts bool
|
||||
DefaultConfiguration string
|
||||
}
|
||||
|
||||
// ReadUnifiedAlertingSettings reads both the `unified_alerting` and `alerting` sections of the configuration while preferring configuration from the `alerting` section.
|
||||
// It first reads the `unified_alerting` section, then looks for non-defaults on the `alerting` section and prefers those.
|
||||
func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
uaCfg := UnifiedAlertingSettings{}
|
||||
ua := iniFile.Section("unified_alerting")
|
||||
var err error
|
||||
uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (SchedulerDefaultAdminConfigPollInterval).String()))
|
||||
uaCfg.AdminConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "admin_config_poll_interval", (schedulerDefaultAdminConfigPollInterval).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (AlertmanagerDefaultConfigPollInterval).String()))
|
||||
uaCfg.AlertmanagerConfigPollInterval, err = gtime.ParseDuration(valueAsString(ua, "alertmanager_config_poll_interval", (alertmanagerDefaultConfigPollInterval).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (AlertmanagerDefaultPeerTimeout).String()))
|
||||
uaCfg.HAPeerTimeout, err = gtime.ParseDuration(valueAsString(ua, "ha_peer_timeout", (alertmanagerDefaultPeerTimeout).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (AlertmanagerDefaultGossipInterval).String()))
|
||||
uaCfg.HAGossipInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_gossip_interval", (alertmanagerDefaultGossipInterval).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (AlertmanagerDefaultPushPullInterval).String()))
|
||||
uaCfg.HAPushPullInterval, err = gtime.ParseDuration(valueAsString(ua, "ha_push_pull_interval", (alertmanagerDefaultPushPullInterval).String()))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
uaCfg.HAListenAddr = ua.Key("ha_listen_address").MustString(AlertmanagerDefaultClusterAddr)
|
||||
uaCfg.HAListenAddr = ua.Key("ha_listen_address").MustString(alertmanagerDefaultClusterAddr)
|
||||
uaCfg.HAAdvertiseAddr = ua.Key("ha_advertise_address").MustString("")
|
||||
peers := ua.Key("ha_peers").MustString("")
|
||||
uaCfg.HAPeers = make([]string, 0)
|
||||
@ -87,8 +98,58 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
uaCfg.HAPeers = append(uaCfg.HAPeers, peer)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO load from ini file
|
||||
uaCfg.DefaultConfiguration = AlertmanagerDefaultConfiguration
|
||||
uaCfg.DefaultConfiguration = alertmanagerDefaultConfiguration
|
||||
|
||||
alerting := iniFile.Section("alerting")
|
||||
|
||||
uaExecuteAlerts := ua.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
|
||||
if uaExecuteAlerts { // unified option equals the default (true)
|
||||
legacyExecuteAlerts := alerting.Key("execute_alerts").MustBool(schedulereDefaultExecuteAlerts)
|
||||
if !legacyExecuteAlerts {
|
||||
cfg.Logger.Warn("falling back to legacy setting of 'execute_alerts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
|
||||
}
|
||||
uaExecuteAlerts = legacyExecuteAlerts
|
||||
}
|
||||
uaCfg.ExecuteAlerts = uaExecuteAlerts
|
||||
|
||||
// if the unified alerting options equal the defaults, apply the respective legacy one
|
||||
uaEvaluationTimeout, err := gtime.ParseDuration(valueAsString(ua, "evaluation_timeout", evaluatorDefaultEvaluationTimeout.String()))
|
||||
if err != nil || uaEvaluationTimeout == evaluatorDefaultEvaluationTimeout { // unified option is invalid duration or equals the default
|
||||
legaceEvaluationTimeout := time.Duration(alerting.Key("evaluation_timeout_seconds").MustInt64(int64(evaluatorDefaultEvaluationTimeout.Seconds()))) * time.Second
|
||||
if legaceEvaluationTimeout != evaluatorDefaultEvaluationTimeout {
|
||||
cfg.Logger.Warn("falling back to legacy setting of 'evaluation_timeout_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
|
||||
}
|
||||
uaEvaluationTimeout = legaceEvaluationTimeout
|
||||
}
|
||||
uaCfg.EvaluationTimeout = uaEvaluationTimeout
|
||||
|
||||
uaMaxAttempts := ua.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
|
||||
if uaMaxAttempts == schedulerDefaultMaxAttempts { // unified option or equals the default
|
||||
legacyMaxAttempts := alerting.Key("max_attempts").MustInt64(schedulerDefaultMaxAttempts)
|
||||
if legacyMaxAttempts != schedulerDefaultMaxAttempts {
|
||||
cfg.Logger.Warn("falling back to legacy setting of 'max_attempts'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
|
||||
}
|
||||
uaMaxAttempts = legacyMaxAttempts
|
||||
}
|
||||
uaCfg.MaxAttempts = uaMaxAttempts
|
||||
|
||||
uaMinInterval, err := gtime.ParseDuration(valueAsString(ua, "min_interval", schedulerDefaultMinInterval.String()))
|
||||
if err != nil || uaMinInterval == schedulerDefaultMinInterval { // unified option is invalid duration or equals the default
|
||||
// if the legacy option is invalid, fallback to 10 (unified alerting min interval default)
|
||||
legacyMinInterval := time.Duration(alerting.Key("min_interval_seconds").MustInt64(int64(schedulerDefaultMinInterval.Seconds()))) * time.Second
|
||||
if legacyMinInterval != schedulerDefaultLegacyMinInterval {
|
||||
cfg.Logger.Warn("falling back to legacy setting of 'min_interval_seconds'; please use the configuration option in the `unified_alerting` section if Grafana 8 alerts are enabled.")
|
||||
}
|
||||
uaMinInterval = legacyMinInterval
|
||||
}
|
||||
uaCfg.MinInterval = uaMinInterval
|
||||
|
||||
cfg.UnifiedAlerting = uaCfg
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetAlertmanagerDefaultConfiguration() string {
|
||||
return alertmanagerDefaultConfiguration
|
||||
}
|
||||
|
@ -1,10 +1,12 @@
|
||||
package setting
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"gopkg.in/ini.v1"
|
||||
)
|
||||
|
||||
func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
|
||||
@ -37,3 +39,125 @@ func TestCfg_ReadUnifiedAlertingSettings(t *testing.T) {
|
||||
require.ElementsMatch(t, []string{"hostname1:9090", "hostname2:9090", "hostname3:9090"}, cfg.UnifiedAlerting.HAPeers)
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnifiedAlertingSettings(t *testing.T) {
|
||||
testCases := []struct {
|
||||
desc string
|
||||
unifiedAlertingOptions map[string]string
|
||||
alertingOptions map[string]string
|
||||
verifyCfg func(*testing.T, Cfg)
|
||||
}{
|
||||
{
|
||||
desc: "when the unified options do not equal the defaults, it should not apply the legacy ones",
|
||||
unifiedAlertingOptions: map[string]string{
|
||||
"admin_config_poll_interval": "120s",
|
||||
"max_attempts": "6",
|
||||
"min_interval": "60s",
|
||||
"execute_alerts": "false",
|
||||
"evaluation_timeout": "90s",
|
||||
},
|
||||
alertingOptions: map[string]string{
|
||||
"max_attempts": strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
|
||||
"min_interval_seconds": strconv.FormatInt(schedulerDefaultLegacyMinInterval, 10),
|
||||
"execute_alerts": strconv.FormatBool(schedulereDefaultExecuteAlerts),
|
||||
"evaluation_timeout_seconds": strconv.FormatInt(int64(evaluatorDefaultEvaluationTimeout.Seconds()), 10),
|
||||
},
|
||||
verifyCfg: func(t *testing.T, cfg Cfg) {
|
||||
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
|
||||
require.Equal(t, int64(6), cfg.UnifiedAlerting.MaxAttempts)
|
||||
require.Equal(t, 60*time.Second, cfg.UnifiedAlerting.MinInterval)
|
||||
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
|
||||
require.Equal(t, 90*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "when the unified options equal the defaults, it should apply the legacy ones",
|
||||
unifiedAlertingOptions: map[string]string{
|
||||
"admin_config_poll_interval": "120s",
|
||||
"max_attempts": strconv.FormatInt(schedulerDefaultMaxAttempts, 10),
|
||||
"min_interval": schedulerDefaultMinInterval.String(),
|
||||
"execute_alerts": strconv.FormatBool(schedulereDefaultExecuteAlerts),
|
||||
"evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(),
|
||||
},
|
||||
alertingOptions: map[string]string{
|
||||
"max_attempts": "12",
|
||||
"min_interval_seconds": "120",
|
||||
"execute_alerts": "true",
|
||||
"evaluation_timeout_seconds": "160",
|
||||
},
|
||||
verifyCfg: func(t *testing.T, cfg Cfg) {
|
||||
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval)
|
||||
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
|
||||
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
|
||||
require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts)
|
||||
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "when both unified and legacy options are invalid, apply the defaults",
|
||||
unifiedAlertingOptions: map[string]string{
|
||||
"max_attempts": "invalid",
|
||||
"min_interval": "invalid",
|
||||
"execute_alerts": "invalid",
|
||||
"evaluation_timeouts": "invalid",
|
||||
},
|
||||
alertingOptions: map[string]string{
|
||||
"max_attempts": "invalid",
|
||||
"min_interval_seconds": "invalid",
|
||||
"execute_alerts": "invalid",
|
||||
"evaluation_timeout_seconds": "invalid",
|
||||
},
|
||||
verifyCfg: func(t *testing.T, cfg Cfg) {
|
||||
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
|
||||
require.Equal(t, int64(schedulerDefaultMaxAttempts), cfg.UnifiedAlerting.MaxAttempts)
|
||||
require.Equal(t, schedulerDefaultMinInterval, cfg.UnifiedAlerting.MinInterval)
|
||||
require.Equal(t, schedulereDefaultExecuteAlerts, cfg.UnifiedAlerting.ExecuteAlerts)
|
||||
require.Equal(t, evaluatorDefaultEvaluationTimeout, cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
},
|
||||
},
|
||||
{
|
||||
desc: "when unified alerting options are invalid, apply legacy options",
|
||||
unifiedAlertingOptions: map[string]string{
|
||||
"max_attempts": "invalid",
|
||||
"min_interval": "invalid",
|
||||
"execute_alerts": "invalid",
|
||||
"evaluation_timeout": "invalid",
|
||||
},
|
||||
alertingOptions: map[string]string{
|
||||
"max_attempts": "12",
|
||||
"min_interval_seconds": "120",
|
||||
"execute_alerts": "false",
|
||||
"evaluation_timeout_seconds": "160",
|
||||
},
|
||||
verifyCfg: func(t *testing.T, cfg Cfg) {
|
||||
require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval)
|
||||
require.Equal(t, int64(12), cfg.UnifiedAlerting.MaxAttempts)
|
||||
require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval)
|
||||
require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts)
|
||||
require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.desc, func(t *testing.T) {
|
||||
f := ini.Empty()
|
||||
cfg := NewCfg()
|
||||
unifiedAlertingSec, err := f.NewSection("unified_alerting")
|
||||
require.NoError(t, err)
|
||||
for k, v := range tc.unifiedAlertingOptions {
|
||||
_, err = unifiedAlertingSec.NewKey(k, v)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
alertingSec, err := f.NewSection("alerting")
|
||||
require.NoError(t, err)
|
||||
for k, v := range tc.alertingOptions {
|
||||
_, err = alertingSec.NewKey(k, v)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
err = cfg.ReadUnifiedAlertingSettings(f)
|
||||
require.NoError(t, err)
|
||||
tc.verifyCfg(t, *cfg)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -191,6 +191,8 @@ func CreateGrafDir(t *testing.T, opts ...GrafanaOpts) (string, string) {
|
||||
require.NoError(t, err)
|
||||
_, err = alertingSect.NewKey("notification_timeout_seconds", "1")
|
||||
require.NoError(t, err)
|
||||
_, err = alertingSect.NewKey("max_attempts", "3")
|
||||
require.NoError(t, err)
|
||||
|
||||
for _, o := range opts {
|
||||
if o.EnableCSP {
|
||||
|
Loading…
Reference in New Issue
Block a user