Alerting: Remove ngalert feature toggle and introduce two new settings for enabling Grafana 8 alerts and disabling them for specific organisations (#38746)
* Remove `ngalert` feature toggle
* Update frontend: remove all references of ngalert feature toggle
* Update docs
* Disable unified alerting for specific orgs
* Add backend tests
* Apply suggestions from code review
  Co-authored-by: achatterjee-grafana <70489351+achatterjee-grafana@users.noreply.github.com>
* Disabled unified alerting by default
* Ensure backward compatibility with old ngalert feature toggle
* Apply suggestions from code review
  Co-authored-by: gotjosh <josue@grafana.com>
Committed by GitHub. Parent: 2dedbcd3c3. Commit: 012d4f0905.
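The diff below threads a new DisabledOrgs map[int64]struct{} setting from UnifiedAlertingSettings through the scheduler and the multi-org Alertmanager. As a minimal sketch of how a comma-separated list of org IDs might be parsed into that set; the parseDisabledOrgs helper and the input format are illustrative assumptions, not code from this commit:

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseDisabledOrgs converts a comma-separated list such as "3,5" into the
// set type used by UnifiedAlertingSettings.DisabledOrgs in the diff below.
// Hypothetical helper for illustration only.
func parseDisabledOrgs(raw string) (map[int64]struct{}, error) {
	disabled := map[int64]struct{}{}
	for _, part := range strings.Split(raw, ",") {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}
		orgID, err := strconv.ParseInt(part, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid org ID %q: %w", part, err)
		}
		disabled[orgID] = struct{}{}
	}
	return disabled, nil
}

func main() {
	disabled, err := parseDisabledOrgs("3,5")
	if err != nil {
		panic(err)
	}
	_, isDisabled := disabled[5]
	fmt.Println(isDisabled) // true
}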
pkg/services/ngalert/api/test-data/admin_config.http (new file, 28 lines)
@@ -0,0 +1,28 @@
+###
+# set external Alertmanager
+POST http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
+content-type: application/json
+
+{
+	"alertmanagers": ["http://localhost:9093"]
+}
+
+###
+GET http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
+
+###
+# after a few minutes it should be discovered
+GET http://admin:admin@localhost:3000/api/v1/ngalert/alertmanagers
+
+###
+# remove it
+POST http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
+content-type: application/json
+
+{
+	"alertmanagers": []
+}
+
+###
+# check again
+GET http://admin:admin@localhost:3000/api/v1/ngalert/alertmanagers
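The first request in the .http file can also be issued from Go. A small sketch mirroring that POST; the endpoint and payload are taken from the file above, and the default local admin credentials are assumed:

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// Payload from the .http file above: register one external Alertmanager.
	body := []byte(`{"alertmanagers": ["http://localhost:9093"]}`)

	req, err := http.NewRequest(http.MethodPost,
		"http://localhost:3000/api/v1/ngalert/admin_config", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.SetBasicAuth("admin", "admin") // default local credentials, as in the .http file
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.Status)
}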
@@ -135,6 +135,7 @@ type GetAlertRuleByUIDQuery struct {
 
 type ListAlertRulesQuery struct {
 	OrgID         int64
 	NamespaceUIDs []string
+	ExcludeOrgs   []int64
 
 	Result []*AlertRule
 }
@@ -122,6 +122,7 @@ func (ng *AlertNG) init() error {
 		MultiOrgNotifier:        ng.MultiOrgAlertmanager,
 		Metrics:                 ng.Metrics.GetSchedulerMetrics(),
 		AdminConfigPollInterval: ng.Cfg.UnifiedAlerting.AdminConfigPollInterval,
+		DisabledOrgs:            ng.Cfg.UnifiedAlerting.DisabledOrgs,
 		MinRuleInterval:         ng.getRuleMinInterval(),
 	}
 	stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
@@ -173,7 +174,7 @@ func (ng *AlertNG) IsDisabled() bool {
 	if ng.Cfg == nil {
 		return true
 	}
-	return !ng.Cfg.IsNgAlertEnabled()
+	return !ng.Cfg.UnifiedAlerting.Enabled
 }
 
 // getRuleDefaultIntervalSeconds returns the default rule interval if the interval is not set.
@@ -149,9 +149,14 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
 	}
 	moa.alertmanagersMtx.Lock()
 	for _, orgID := range orgIDs {
+		if _, isDisabledOrg := moa.settings.UnifiedAlerting.DisabledOrgs[orgID]; isDisabledOrg {
+			moa.logger.Debug("skipping syncing Alertmanager for disabled org", "org", orgID)
+			continue
+		}
 		orgsFound[orgID] = struct{}{}
 
 		alertmanager, found := moa.alertmanagers[orgID]
 
 		if !found {
 			// These metrics are not exported by Grafana and are mostly a placeholder.
 			// To export them, we need to translate the metrics from each individual registry and,
@@ -32,8 +32,12 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
 	reg := prometheus.NewPedanticRegistry()
 	m := metrics.NewNGAlert(reg)
 	cfg := &setting.Cfg{
-		DataPath:        tmpDir,
-		UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
+		DataPath: tmpDir,
+		UnifiedAlerting: setting.UnifiedAlertingSettings{
+			AlertmanagerConfigPollInterval: 3 * time.Minute,
+			DefaultConfiguration:           setting.GetAlertmanagerDefaultConfiguration(),
+			DisabledOrgs:                   map[int64]struct{}{5: {}},
+		}, // do not poll in tests.
 	}
 	mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
 	require.NoError(t, err)
@@ -82,6 +86,12 @@ grafana_alerting_active_configurations 4
 		grafana_alerting_discovered_configurations 4
 	`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
 	}
+	// if the disabled org comes back, it should not detect it.
+	{
+		orgStore.orgs = []int64{1, 2, 3, 4, 5}
+		require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
+		require.Len(t, mam.alertmanagers, 4)
+	}
 }
 
 func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {
@@ -4,8 +4,10 @@ import (
 	"github.com/grafana/grafana/pkg/services/ngalert/models"
 )
 
-func (sch *schedule) fetchAllDetails() []*models.AlertRule {
-	q := models.ListAlertRulesQuery{}
+func (sch *schedule) fetchAllDetails(disabledOrgs []int64) []*models.AlertRule {
+	q := models.ListAlertRulesQuery{
+		ExcludeOrgs: disabledOrgs,
+	}
 	err := sch.ruleStore.GetAlertRulesForScheduling(&q)
 	if err != nil {
 		sch.log.Error("failed to fetch alert definitions", "err", err)
@@ -84,6 +84,7 @@ type schedule struct {
 	sendersCfgHash          map[int64]string
 	senders                 map[int64]*sender.Sender
 	adminConfigPollInterval time.Duration
+	disabledOrgs            map[int64]struct{}
 	minRuleInterval         time.Duration
 }
@@ -103,6 +104,7 @@ type SchedulerCfg struct {
 	MultiOrgNotifier        *notifier.MultiOrgAlertmanager
 	Metrics                 *metrics.Scheduler
 	AdminConfigPollInterval time.Duration
+	DisabledOrgs            map[int64]struct{}
 	MinRuleInterval         time.Duration
 }
@@ -132,6 +134,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
 		senders:                 map[int64]*sender.Sender{},
 		sendersCfgHash:          map[int64]string{},
 		adminConfigPollInterval: cfg.AdminConfigPollInterval,
+		disabledOrgs:            cfg.DisabledOrgs,
 		minRuleInterval:         cfg.MinRuleInterval,
 	}
 	return &sch
@@ -190,6 +193,12 @@ func (sch *schedule) SyncAndApplyConfigFromDatabase() error {
 	orgsFound := make(map[int64]struct{}, len(cfgs))
 	sch.sendersMtx.Lock()
 	for _, cfg := range cfgs {
+		_, isDisabledOrg := sch.disabledOrgs[cfg.OrgID]
+		if isDisabledOrg {
+			sch.log.Debug("skipping starting sender for disabled org", "org", cfg.OrgID)
+			continue
+		}
+
 		orgsFound[cfg.OrgID] = struct{}{} // keep track of which senders we need to keep.
 
 		existing, ok := sch.senders[cfg.OrgID]
@@ -318,8 +327,12 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
 		select {
 		case tick := <-sch.heartbeat.C:
 			tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
-			alertRules := sch.fetchAllDetails()
-			sch.log.Debug("alert rules fetched", "count", len(alertRules))
+			disabledOrgs := make([]int64, 0, len(sch.disabledOrgs))
+			for disabledOrg := range sch.disabledOrgs {
+				disabledOrgs = append(disabledOrgs, disabledOrg)
+			}
+			alertRules := sch.fetchAllDetails(disabledOrgs)
+			sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
 
 			// registeredDefinitions is a map used for finding deleted alert rules
 			// initially it is assigned to all known alert rules from the previous cycle
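The loop above inlines a common Go pattern: flattening the keys of a map[int64]struct{} set into a slice before each fetch. A standalone sketch of that pattern, under the same types as the scheduler:

package main

import "fmt"

// keys collects the members of a map-based set into a slice, mirroring the
// disabledOrgs conversion in the rule evaluation loop above.
func keys(set map[int64]struct{}) []int64 {
	out := make([]int64, 0, len(set))
	for k := range set {
		out = append(out, k)
	}
	return out
}

func main() {
	disabledOrgs := map[int64]struct{}{3: {}, 5: {}}
	fmt.Println(len(keys(disabledOrgs))) // 2
}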
@@ -37,7 +37,8 @@ func TestWarmStateCache(t *testing.T) {
 	require.NoError(t, err)
 	_, dbstore := tests.SetupTestEnv(t, 1)
 
-	rule := tests.CreateTestAlertRule(t, dbstore, 600)
+	const mainOrgID int64 = 1
+	rule := tests.CreateTestAlertRule(t, dbstore, 600, mainOrgID)
 
 	expectedEntries := []*state.State{
 		{
@@ -123,8 +124,11 @@ func TestAlertingTicker(t *testing.T) {
 
 	alerts := make([]*models.AlertRule, 0)
 
-	// create alert rule with one second interval
-	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1))
+	const mainOrgID int64 = 1
+	// create alert rule under main org with one second interval
+	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, mainOrgID))
+
+	const disabledOrgID int64 = 3
 
 	evalAppliedCh := make(chan evalAppliedInfo, len(alerts))
 	stopAppliedCh := make(chan models.AlertRuleKey, len(alerts))
@@ -146,6 +150,9 @@ func TestAlertingTicker(t *testing.T) {
 		Logger:                  log.New("ngalert schedule test"),
 		Metrics:                 testMetrics.GetSchedulerMetrics(),
 		AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
+		DisabledOrgs: map[int64]struct{}{
+			disabledOrgID: {},
+		},
 	}
 	st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
 	sched := schedule.NewScheduler(schedCfg, nil, "http://localhost", st)
@@ -164,9 +171,9 @@ func TestAlertingTicker(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
 	})
 
-	// change alert rule interval to three seconds
+	// add alert rule under main org with three seconds interval
 	var threeSecInterval int64 = 3
-	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, threeSecInterval))
+	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, threeSecInterval, mainOrgID))
 	t.Logf("alert rule: %v added with interval: %d", alerts[1].GetKey(), threeSecInterval)
 
 	expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[0].GetKey()}
@@ -187,9 +194,10 @@ func TestAlertingTicker(t *testing.T) {
 		assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
 	})
 
+	key := alerts[0].GetKey()
 	err := dbstore.DeleteAlertRuleByUID(alerts[0].OrgID, alerts[0].UID)
 	require.NoError(t, err)
-	t.Logf("alert rule: %v deleted", alerts[1].GetKey())
+	t.Logf("alert rule: %v deleted", key)
 
 	expectedAlertRulesEvaluated = []models.AlertRuleKey{}
 	t.Run(fmt.Sprintf("on 5th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
@@ -208,13 +216,22 @@ func TestAlertingTicker(t *testing.T) {
 	})
 
 	// create alert rule with one second interval
-	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1))
+	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, mainOrgID))
 
 	expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[2].GetKey()}
 	t.Run(fmt.Sprintf("on 7th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
 		tick := advanceClock(t, mockedClock)
 		assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
 	})
+
+	// create alert rule with one second interval under disabled org
+	alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, disabledOrgID))
+
+	expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[2].GetKey()}
+	t.Run(fmt.Sprintf("on 8th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
+		tick := advanceClock(t, mockedClock)
+		assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
+	})
 }
 
 func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys ...models.AlertRuleKey) {
@@ -229,13 +246,12 @@ func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys
 	select {
 	case info := <-ch:
 		_, ok := expected[info.alertDefKey]
+		if !ok {
+			t.Fatal(fmt.Sprintf("alert rule: %v should not have been evaluated at: %v", info.alertDefKey, info.now))
+		}
 		t.Logf("alert rule: %v evaluated at: %v", info.alertDefKey, info.now)
-		assert.True(t, ok)
 		assert.Equal(t, tick, info.now)
 		delete(expected, info.alertDefKey)
 		if len(expected) == 0 {
 			return
 		}
 	case <-timeout:
 		if len(expected) == 0 {
 			return
@@ -873,7 +873,8 @@ func TestStaleResultsHandler(t *testing.T) {
 
 	_, dbstore := tests.SetupTestEnv(t, 1)
 
-	rule := tests.CreateTestAlertRule(t, dbstore, 600)
+	const mainOrgID int64 = 1
+	rule := tests.CreateTestAlertRule(t, dbstore, 600, mainOrgID)
 
 	saveCmd1 := &models.SaveAlertInstanceCommand{
 		RuleOrgID: rule.OrgID,
@@ -422,10 +422,12 @@ func (st DBstore) GetAlertRulesForScheduling(query *ngmodels.ListAlertRulesQuery
 	return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
 		alerts := make([]*ngmodels.AlertRule, 0)
 		q := "SELECT uid, org_id, interval_seconds, version FROM alert_rule"
+		if len(query.ExcludeOrgs) > 0 {
+			q = fmt.Sprintf("%s WHERE org_id NOT IN (%s)", q, strings.Join(strings.Split(strings.Trim(fmt.Sprint(query.ExcludeOrgs), "[]"), " "), ","))
+		}
 		if err := sess.SQL(q).Find(&alerts); err != nil {
 			return err
 		}
 
 		query.Result = alerts
 		return nil
 	})
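The fmt.Sprint/Trim/Split/Join chain above is terse. A runnable sketch of the same string gymnastics, showing how []int64{3, 5} becomes the comma-separated list injected into the NOT IN clause (column list shortened for illustration):

package main

import (
	"fmt"
	"strings"
)

func main() {
	excludeOrgs := []int64{3, 5}

	// fmt.Sprint renders the slice as "[3 5]"; trimming the brackets and
	// re-joining the space-separated parts on commas yields "3,5".
	csv := strings.Join(strings.Split(strings.Trim(fmt.Sprint(excludeOrgs), "[]"), " "), ",")
	fmt.Println(csv) // 3,5

	q := fmt.Sprintf("SELECT uid, org_id FROM alert_rule WHERE org_id NOT IN (%s)", csv)
	fmt.Println(q)
}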
@@ -28,16 +28,18 @@ func mockTimeNow() {
 func TestAlertInstanceOperations(t *testing.T) {
 	_, dbstore := tests.SetupTestEnv(t, baseIntervalSeconds)
 
-	alertRule1 := tests.CreateTestAlertRule(t, dbstore, 60)
+	const mainOrgID int64 = 1
+
+	alertRule1 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
 	orgID := alertRule1.OrgID
 
-	alertRule2 := tests.CreateTestAlertRule(t, dbstore, 60)
+	alertRule2 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
 	require.Equal(t, orgID, alertRule2.OrgID)
 
-	alertRule3 := tests.CreateTestAlertRule(t, dbstore, 60)
+	alertRule3 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
 	require.Equal(t, orgID, alertRule3.OrgID)
 
-	alertRule4 := tests.CreateTestAlertRule(t, dbstore, 60)
+	alertRule4 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
 	require.Equal(t, orgID, alertRule4.OrgID)
 
 	t.Run("can save and read new alert instance", func(t *testing.T) {
@@ -30,9 +30,8 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
 
 	cfg := setting.NewCfg()
 	cfg.AlertingBaseInterval = baseInterval
-	// AlertNG is disabled by default and only if it's enabled
-	// its database migrations run and the relative database tables are created
-	cfg.FeatureToggles = map[string]bool{"ngalert": true}
+	// AlertNG database migrations run and the relative database tables are created only when it's enabled
+	cfg.UnifiedAlerting.Enabled = true
 
 	m := metrics.NewNGAlert(prometheus.NewRegistry())
 	ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil, nil, m)
@@ -45,11 +44,11 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
 }
 
 // CreateTestAlertRule creates a dummy alert definition to be used by the tests.
-func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds int64) *models.AlertRule {
+func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds int64, orgID int64) *models.AlertRule {
 	d := rand.Intn(1000)
 	ruleGroup := fmt.Sprintf("ruleGroup-%d", d)
 	err := dbstore.UpdateRuleGroup(store.UpdateRuleGroupCmd{
-		OrgID:        1,
+		OrgID:        orgID,
 		NamespaceUID: "namespace",
 		RuleGroupConfig: apimodels.PostableRuleGroupConfig{
 			Name: ruleGroup,
@@ -84,7 +83,7 @@ func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds i
 	require.NoError(t, err)
 
 	q := models.ListRuleGroupAlertRulesQuery{
-		OrgID:        1,
+		OrgID:        orgID,
 		NamespaceUID: "namespace",
 		RuleGroup:    ruleGroup,
 	}