Alerting: Remove ngalert feature toggle and introduce two new settings for enabling Grafana 8 alerts and disabling them for specific organisations (#38746)

* Remove `ngalert` feature toggle

* Update frontend

Remove all references to the `ngalert` feature toggle

* Update docs

* Disable unified alerting for specific orgs

* Add backend tests

* Apply suggestions from code review

Co-authored-by: achatterjee-grafana <70489351+achatterjee-grafana@users.noreply.github.com>

* Disable unified alerting by default

* Ensure backward compatibility with old ngalert feature toggle

* Apply suggestions from code review

Co-authored-by: gotjosh <josue@grafana.com>
Sofia Papagiannaki
2021-09-29 17:16:40 +03:00
committed by GitHub
parent 2dedbcd3c3
commit 012d4f0905
57 changed files with 705 additions and 183 deletions
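
The hunks below thread a new `UnifiedAlerting` settings struct (an `Enabled` flag plus a `DisabledOrgs` set) through the scheduler, the Alertmanager sync, and the rule store. As a rough sketch of the shape of the two new settings, the snippet below parses a comma-separated list of org IDs into the `map[int64]struct{}` consumed throughout the diff; the `[unified_alerting]` section and the `enabled`/`disabled_orgs` key names are assumptions based on the commit description, not something shown in these hunks.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// Assumed grafana.ini shape (section and key names are an assumption, they
// are not part of this diff):
//
//   [unified_alerting]
//   enabled       = true
//   disabled_orgs = 3,5
//
// parseDisabledOrgs turns the comma-separated value into the set type
// (map[int64]struct{}) that the scheduler and MultiOrgAlertmanager consume.
func parseDisabledOrgs(raw string) (map[int64]struct{}, error) {
	orgs := map[int64]struct{}{}
	for _, field := range strings.Split(raw, ",") {
		field = strings.TrimSpace(field)
		if field == "" {
			continue
		}
		orgID, err := strconv.ParseInt(field, 10, 64)
		if err != nil {
			return nil, fmt.Errorf("invalid org id %q: %w", field, err)
		}
		orgs[orgID] = struct{}{}
	}
	return orgs, nil
}

func main() {
	disabledOrgs, err := parseDisabledOrgs("3,5")
	if err != nil {
		panic(err)
	}
	fmt.Println(disabledOrgs) // map[3:{} 5:{}]
}
```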

View File

@@ -0,0 +1,28 @@
###
# set external Alertmanager
POST http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
content-type: application/json
{
"alertmanagers": ["http://localhost:9093"]
}
###
GET http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
###
# after a few minutes it should be discovered
GET http://admin:admin@localhost:3000/api/v1/ngalert/alertmanagers
###
# remove it
POST http://admin:admin@localhost:3000/api/v1/ngalert/admin_config
content-type: application/json
{
"alertmanagers": []
}
###
# check again
GET http://admin:admin@localhost:3000/api/v1/ngalert/alertmanagers

View File

@@ -135,6 +135,7 @@ type GetAlertRuleByUIDQuery struct {
type ListAlertRulesQuery struct {
OrgID int64
NamespaceUIDs []string
ExcludeOrgs []int64
Result []*AlertRule
}
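
A hypothetical caller of the new `ExcludeOrgs` field might look like the sketch below; the interface and helper names are illustrative only and assume the Grafana module is on the import path. The `DBstore` change further down in this diff provides the same method.

```go
package sketch

import (
	"github.com/grafana/grafana/pkg/services/ngalert/models"
)

// rulesForSchedulingStore is a minimal interface assumed for illustration;
// DBstore (later in this diff) satisfies it.
type rulesForSchedulingStore interface {
	GetAlertRulesForScheduling(query *models.ListAlertRulesQuery) error
}

// fetchSchedulableRules lists every alert rule except those belonging to orgs
// with unified alerting disabled (hypothetical helper, not part of this diff).
func fetchSchedulableRules(store rulesForSchedulingStore, disabledOrgs map[int64]struct{}) ([]*models.AlertRule, error) {
	exclude := make([]int64, 0, len(disabledOrgs))
	for orgID := range disabledOrgs {
		exclude = append(exclude, orgID)
	}
	q := models.ListAlertRulesQuery{ExcludeOrgs: exclude}
	if err := store.GetAlertRulesForScheduling(&q); err != nil {
		return nil, err
	}
	return q.Result, nil
}
```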

View File

@@ -122,6 +122,7 @@ func (ng *AlertNG) init() error {
MultiOrgNotifier: ng.MultiOrgAlertmanager,
Metrics: ng.Metrics.GetSchedulerMetrics(),
AdminConfigPollInterval: ng.Cfg.UnifiedAlerting.AdminConfigPollInterval,
DisabledOrgs: ng.Cfg.UnifiedAlerting.DisabledOrgs,
MinRuleInterval: ng.getRuleMinInterval(),
}
stateManager := state.NewManager(ng.Log, ng.Metrics.GetStateMetrics(), store, store)
@@ -173,7 +174,7 @@ func (ng *AlertNG) IsDisabled() bool {
if ng.Cfg == nil {
return true
}
return !ng.Cfg.IsNgAlertEnabled()
return !ng.Cfg.UnifiedAlerting.Enabled
}
// getRuleDefaultIntervalSeconds returns the default rule interval if the interval is not set.
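
The hunk above switches `IsDisabled` from the removed `IsNgAlertEnabled()` helper to `Cfg.UnifiedAlerting.Enabled`. The commit message also promises backward compatibility with the old `ngalert` feature toggle; the settings file is not part of the hunks shown here, so the following is only a plausible sketch of that fallback, with every name assumed.

```go
package main

import "fmt"

// Plausible sketch only: how the settings layer might honour the deprecated
// `ngalert` feature toggle while preferring the new flag. Field and key names
// are assumptions; the real fallback lives in the settings package, which is
// not shown in this diff.
func resolveUnifiedAlertingEnabled(explicitlySet, newFlag bool, featureToggles map[string]bool) bool {
	if explicitlySet {
		return newFlag // the new [unified_alerting] enabled setting wins when set
	}
	// Otherwise fall back to the legacy toggle so existing installs keep
	// their Grafana 8 alerts after upgrading.
	return featureToggles["ngalert"]
}

func main() {
	// Old install that only ever set the ngalert feature toggle.
	fmt.Println(resolveUnifiedAlertingEnabled(false, false, map[string]bool{"ngalert": true})) // true
	// New install that explicitly disables unified alerting.
	fmt.Println(resolveUnifiedAlertingEnabled(true, false, nil)) // false
}
```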

View File

@@ -149,9 +149,14 @@ func (moa *MultiOrgAlertmanager) SyncAlertmanagersForOrgs(ctx context.Context, o
}
moa.alertmanagersMtx.Lock()
for _, orgID := range orgIDs {
if _, isDisabledOrg := moa.settings.UnifiedAlerting.DisabledOrgs[orgID]; isDisabledOrg {
moa.logger.Debug("skipping syncing Alertmanager for disabled org", "org", orgID)
continue
}
orgsFound[orgID] = struct{}{}
alertmanager, found := moa.alertmanagers[orgID]
if !found {
// These metrics are not exported by Grafana and are mostly a placeholder.
// To export them, we need to translate the metrics from each individual registry and,

View File

@@ -32,8 +32,12 @@ func TestMultiOrgAlertmanager_SyncAlertmanagersForOrgs(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
m := metrics.NewNGAlert(reg)
cfg := &setting.Cfg{
DataPath: tmpDir,
UnifiedAlerting: setting.UnifiedAlertingSettings{AlertmanagerConfigPollInterval: 3 * time.Minute, DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration()}, // do not poll in tests.
DataPath: tmpDir,
UnifiedAlerting: setting.UnifiedAlertingSettings{
AlertmanagerConfigPollInterval: 3 * time.Minute,
DefaultConfiguration: setting.GetAlertmanagerDefaultConfiguration(),
DisabledOrgs: map[int64]struct{}{5: {}},
}, // do not poll in tests.
}
mam, err := NewMultiOrgAlertmanager(cfg, configStore, orgStore, kvStore, m.GetMultiOrgAlertmanagerMetrics(), log.New("testlogger"))
require.NoError(t, err)
@@ -82,6 +86,12 @@ grafana_alerting_active_configurations 4
grafana_alerting_discovered_configurations 4
`), "grafana_alerting_discovered_configurations", "grafana_alerting_active_configurations"))
}
// if the disabled org comes back, the sync should not pick it up.
{
orgStore.orgs = []int64{1, 2, 3, 4, 5}
require.NoError(t, mam.LoadAndSyncAlertmanagersForOrgs(ctx))
require.Len(t, mam.alertmanagers, 4)
}
}
func TestMultiOrgAlertmanager_AlertmanagerFor(t *testing.T) {

View File

@@ -4,8 +4,10 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func (sch *schedule) fetchAllDetails() []*models.AlertRule {
q := models.ListAlertRulesQuery{}
func (sch *schedule) fetchAllDetails(disabledOrgs []int64) []*models.AlertRule {
q := models.ListAlertRulesQuery{
ExcludeOrgs: disabledOrgs,
}
err := sch.ruleStore.GetAlertRulesForScheduling(&q)
if err != nil {
sch.log.Error("failed to fetch alert definitions", "err", err)

View File

@@ -84,6 +84,7 @@ type schedule struct {
sendersCfgHash map[int64]string
senders map[int64]*sender.Sender
adminConfigPollInterval time.Duration
disabledOrgs map[int64]struct{}
minRuleInterval time.Duration
}
@@ -103,6 +104,7 @@ type SchedulerCfg struct {
MultiOrgNotifier *notifier.MultiOrgAlertmanager
Metrics *metrics.Scheduler
AdminConfigPollInterval time.Duration
DisabledOrgs map[int64]struct{}
MinRuleInterval time.Duration
}
@@ -132,6 +134,7 @@ func NewScheduler(cfg SchedulerCfg, dataService *tsdb.Service, appURL string, st
senders: map[int64]*sender.Sender{},
sendersCfgHash: map[int64]string{},
adminConfigPollInterval: cfg.AdminConfigPollInterval,
disabledOrgs: cfg.DisabledOrgs,
minRuleInterval: cfg.MinRuleInterval,
}
return &sch
@@ -190,6 +193,12 @@ func (sch *schedule) SyncAndApplyConfigFromDatabase() error {
orgsFound := make(map[int64]struct{}, len(cfgs))
sch.sendersMtx.Lock()
for _, cfg := range cfgs {
_, isDisabledOrg := sch.disabledOrgs[cfg.OrgID]
if isDisabledOrg {
sch.log.Debug("skipping starting sender for disabled org", "org", cfg.OrgID)
continue
}
orgsFound[cfg.OrgID] = struct{}{} // keep track of which senders we need to keep.
existing, ok := sch.senders[cfg.OrgID]
@@ -318,8 +327,12 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
select {
case tick := <-sch.heartbeat.C:
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
alertRules := sch.fetchAllDetails()
sch.log.Debug("alert rules fetched", "count", len(alertRules))
disabledOrgs := make([]int64, 0, len(sch.disabledOrgs))
for disabledOrg := range sch.disabledOrgs {
disabledOrgs = append(disabledOrgs, disabledOrg)
}
alertRules := sch.fetchAllDetails(disabledOrgs)
sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
// registeredDefinitions is a map used for finding deleted alert rules
// initially it is assigned to all known alert rules from the previous cycle

View File

@@ -37,7 +37,8 @@ func TestWarmStateCache(t *testing.T) {
require.NoError(t, err)
_, dbstore := tests.SetupTestEnv(t, 1)
rule := tests.CreateTestAlertRule(t, dbstore, 600)
const mainOrgID int64 = 1
rule := tests.CreateTestAlertRule(t, dbstore, 600, mainOrgID)
expectedEntries := []*state.State{
{
@@ -123,8 +124,11 @@ func TestAlertingTicker(t *testing.T) {
alerts := make([]*models.AlertRule, 0)
// create alert rule with one second interval
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1))
const mainOrgID int64 = 1
// create alert rule under main org with one second interval
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, mainOrgID))
const disabledOrgID int64 = 3
evalAppliedCh := make(chan evalAppliedInfo, len(alerts))
stopAppliedCh := make(chan models.AlertRuleKey, len(alerts))
@@ -146,6 +150,9 @@ func TestAlertingTicker(t *testing.T) {
Logger: log.New("ngalert schedule test"),
Metrics: testMetrics.GetSchedulerMetrics(),
AdminConfigPollInterval: 10 * time.Minute, // do not poll in unit tests.
DisabledOrgs: map[int64]struct{}{
disabledOrgID: {},
},
}
st := state.NewManager(schedCfg.Logger, testMetrics.GetStateMetrics(), dbstore, dbstore)
sched := schedule.NewScheduler(schedCfg, nil, "http://localhost", st)
@@ -164,9 +171,9 @@ func TestAlertingTicker(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
// change alert rule interval to three seconds
// add alert rule under main org with three seconds interval
var threeSecInterval int64 = 3
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, threeSecInterval))
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, threeSecInterval, mainOrgID))
t.Logf("alert rule: %v added with interval: %d", alerts[1].GetKey(), threeSecInterval)
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[0].GetKey()}
@@ -187,9 +194,10 @@ func TestAlertingTicker(t *testing.T) {
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
key := alerts[0].GetKey()
err := dbstore.DeleteAlertRuleByUID(alerts[0].OrgID, alerts[0].UID)
require.NoError(t, err)
t.Logf("alert rule: %v deleted", alerts[1].GetKey())
t.Logf("alert rule: %v deleted", key)
expectedAlertRulesEvaluated = []models.AlertRuleKey{}
t.Run(fmt.Sprintf("on 5th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
@@ -208,13 +216,22 @@ func TestAlertingTicker(t *testing.T) {
})
// create alert rule with one second interval
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1))
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, mainOrgID))
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 7th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
// create alert rule with one second interval under disabled org
alerts = append(alerts, tests.CreateTestAlertRule(t, dbstore, 1, disabledOrgID))
expectedAlertRulesEvaluated = []models.AlertRuleKey{alerts[2].GetKey()}
t.Run(fmt.Sprintf("on 8th tick alert rules: %s should be evaluated", concatenate(expectedAlertRulesEvaluated)), func(t *testing.T) {
tick := advanceClock(t, mockedClock)
assertEvalRun(t, evalAppliedCh, tick, expectedAlertRulesEvaluated...)
})
}
func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys ...models.AlertRuleKey) {
@@ -229,13 +246,12 @@ func assertEvalRun(t *testing.T, ch <-chan evalAppliedInfo, tick time.Time, keys
select {
case info := <-ch:
_, ok := expected[info.alertDefKey]
if !ok {
t.Fatal(fmt.Sprintf("alert rule: %v should not have been evaluated at: %v", info.alertDefKey, info.now))
}
t.Logf("alert rule: %v evaluated at: %v", info.alertDefKey, info.now)
assert.True(t, ok)
assert.Equal(t, tick, info.now)
delete(expected, info.alertDefKey)
if len(expected) == 0 {
return
}
case <-timeout:
if len(expected) == 0 {
return

View File

@@ -873,7 +873,8 @@ func TestStaleResultsHandler(t *testing.T) {
_, dbstore := tests.SetupTestEnv(t, 1)
rule := tests.CreateTestAlertRule(t, dbstore, 600)
const mainOrgID int64 = 1
rule := tests.CreateTestAlertRule(t, dbstore, 600, mainOrgID)
saveCmd1 := &models.SaveAlertInstanceCommand{
RuleOrgID: rule.OrgID,

View File

@@ -422,10 +422,12 @@ func (st DBstore) GetAlertRulesForScheduling(query *ngmodels.ListAlertRulesQuery
return st.SQLStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
alerts := make([]*ngmodels.AlertRule, 0)
q := "SELECT uid, org_id, interval_seconds, version FROM alert_rule"
if len(query.ExcludeOrgs) > 0 {
q = fmt.Sprintf("%s WHERE org_id NOT IN (%s)", q, strings.Join(strings.Split(strings.Trim(fmt.Sprint(query.ExcludeOrgs), "[]"), " "), ","))
}
if err := sess.SQL(q).Find(&alerts); err != nil {
return err
}
query.Result = alerts
return nil
})
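
The `fmt.Sprint`/`strings.Trim`/`strings.Split`/`strings.Join` chain above renders a `[]int64` such as `[2 5]` as the comma-separated list `2,5` for the `NOT IN` clause. A standalone sketch of the same conversion (the helper name is hypothetical):

```go
package main

import (
	"fmt"
	"strings"
)

// int64sToCSV mirrors the conversion used in GetAlertRulesForScheduling:
// fmt.Sprint renders the slice as "[2 5]", Trim drops the brackets, and the
// spaces are swapped for commas.
func int64sToCSV(ids []int64) string {
	return strings.Join(strings.Split(strings.Trim(fmt.Sprint(ids), "[]"), " "), ",")
}

func main() {
	exclude := []int64{2, 5}
	q := "SELECT uid, org_id, interval_seconds, version FROM alert_rule"
	if len(exclude) > 0 {
		q = fmt.Sprintf("%s WHERE org_id NOT IN (%s)", q, int64sToCSV(exclude))
	}
	fmt.Println(q) // ... WHERE org_id NOT IN (2,5)
}
```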

View File

@@ -28,16 +28,18 @@ func mockTimeNow() {
func TestAlertInstanceOperations(t *testing.T) {
_, dbstore := tests.SetupTestEnv(t, baseIntervalSeconds)
alertRule1 := tests.CreateTestAlertRule(t, dbstore, 60)
const mainOrgID int64 = 1
alertRule1 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
orgID := alertRule1.OrgID
alertRule2 := tests.CreateTestAlertRule(t, dbstore, 60)
alertRule2 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
require.Equal(t, orgID, alertRule2.OrgID)
alertRule3 := tests.CreateTestAlertRule(t, dbstore, 60)
alertRule3 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
require.Equal(t, orgID, alertRule3.OrgID)
alertRule4 := tests.CreateTestAlertRule(t, dbstore, 60)
alertRule4 := tests.CreateTestAlertRule(t, dbstore, 60, mainOrgID)
require.Equal(t, orgID, alertRule4.OrgID)
t.Run("can save and read new alert instance", func(t *testing.T) {

View File

@@ -30,9 +30,8 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
cfg := setting.NewCfg()
cfg.AlertingBaseInterval = baseInterval
// AlertNG is disabled by default and only if it's enabled
// its database migrations run and the relative database tables are created
cfg.FeatureToggles = map[string]bool{"ngalert": true}
// AlertNG database migrations run and the relevant database tables are created only when it's enabled
cfg.UnifiedAlerting.Enabled = true
m := metrics.NewNGAlert(prometheus.NewRegistry())
ng, err := ngalert.ProvideService(cfg, nil, routing.NewRouteRegister(), sqlstore.InitTestDB(t), nil, nil, nil, nil, m)
@@ -45,11 +44,11 @@ func SetupTestEnv(t *testing.T, baseInterval time.Duration) (*ngalert.AlertNG, *
}
// CreateTestAlertRule creates a dummy alert definition to be used by the tests.
func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds int64) *models.AlertRule {
func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds int64, orgID int64) *models.AlertRule {
d := rand.Intn(1000)
ruleGroup := fmt.Sprintf("ruleGroup-%d", d)
err := dbstore.UpdateRuleGroup(store.UpdateRuleGroupCmd{
OrgID: 1,
OrgID: orgID,
NamespaceUID: "namespace",
RuleGroupConfig: apimodels.PostableRuleGroupConfig{
Name: ruleGroup,
@@ -84,7 +83,7 @@ func CreateTestAlertRule(t *testing.T, dbstore *store.DBstore, intervalSeconds i
require.NoError(t, err)
q := models.ListRuleGroupAlertRulesQuery{
OrgID: 1,
OrgID: orgID,
NamespaceUID: "namespace",
RuleGroup: ruleGroup,
}