From 4b439b7f52fc2ef567ee1383400132a61479e005 Mon Sep 17 00:00:00 2001 From: Matthew Jacobson Date: Fri, 24 Nov 2023 11:27:44 -0500 Subject: [PATCH] Alerting: In migration, fallback to '1s' for malformed min interval (#78614) * Alerting: In migration, fallback to '1s' for malformed min interval During legacy migration, when we encounter an alert datasource query with a min interval (interval field in the query model) that is not parseable, instead of failing the migration we fallback to a min interval of 1s and continue. The reason for this is a bug in legacy alerting (existing for a few major versions) which allows arbitrary dashboard variables to be used as the min interval, even though those variables do not work and will cause the legacy alert to fail with `interval calculation failed: time: invalid duration`. --- pkg/services/ngalert/migration/alert_rule.go | 2 +- pkg/services/ngalert/migration/cond_trans.go | 31 ++++++--- .../ngalert/migration/cond_trans_test.go | 5 +- .../ngalert/migration/migration_test.go | 66 +++++++++++++++++-- 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/pkg/services/ngalert/migration/alert_rule.go b/pkg/services/ngalert/migration/alert_rule.go index 3ef0b56bf33..11eceb10620 100644 --- a/pkg/services/ngalert/migration/alert_rule.go +++ b/pkg/services/ngalert/migration/alert_rule.go @@ -44,7 +44,7 @@ func addMigrationInfo(da *migrationStore.DashAlert, dashboardUID string) (map[st // MigrateAlert migrates a single dashboard alert from legacy alerting to unified alerting. func (om *OrgMigration) migrateAlert(ctx context.Context, l log.Logger, da *migrationStore.DashAlert, info migmodels.DashboardUpgradeInfo) (*ngmodels.AlertRule, error) { l.Debug("Migrating alert rule to Unified Alerting") - cond, err := transConditions(ctx, da, om.migrationStore) + cond, err := transConditions(ctx, l, da, om.migrationStore) if err != nil { return nil, fmt.Errorf("transform conditions: %w", err) } diff --git a/pkg/services/ngalert/migration/cond_trans.go b/pkg/services/ngalert/migration/cond_trans.go index d43a3d31e74..fbd9a8b3b44 100644 --- a/pkg/services/ngalert/migration/cond_trans.go +++ b/pkg/services/ngalert/migration/cond_trans.go @@ -10,6 +10,7 @@ import ( "time" "github.com/grafana/grafana/pkg/components/simplejson" + "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/services/datasources" migrationStore "github.com/grafana/grafana/pkg/services/ngalert/migration/store" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -22,7 +23,7 @@ import ( const expressionDatasourceUID = "__expr__" //nolint:gocyclo -func transConditions(ctx context.Context, alert *migrationStore.DashAlert, store migrationStore.Store) (*condition, error) { +func transConditions(ctx context.Context, l log.Logger, alert *migrationStore.DashAlert, store migrationStore.Store) (*condition, error) { // TODO: needs a significant refactor to reduce complexity. usr := getMigrationUser(alert.OrgID) set := alert.ParsedSettings @@ -142,13 +143,9 @@ func transConditions(ctx context.Context, alert *migrationStore.DashAlert, store } // Could have an alert saved but datasource deleted, so can not require match. - dsUid := "" - if ds, err := store.GetDatasource(ctx, set.Conditions[condIdx].Query.DatasourceID, usr); err == nil { - dsUid = ds.UID - } else { - if !errors.Is(err, datasources.ErrDataSourceNotFound) { - return nil, err - } + ds, err := store.GetDatasource(ctx, set.Conditions[condIdx].Query.DatasourceID, usr) + if err != nil && !errors.Is(err, datasources.ErrDataSourceNotFound) { + return nil, err } queryObj["refId"] = refID @@ -163,7 +160,17 @@ func transConditions(ctx context.Context, alert *migrationStore.DashAlert, store rawFrom := newRefIDsToTimeRanges[refID][0] rawTo := newRefIDsToTimeRanges[refID][1] - calculatedInterval, err := calculateInterval(legacydata.NewDataTimeRange(rawFrom, rawTo), simpleJson, nil) + + // We check if the minInterval stored in the model is parseable. If it's not, we use "1s" instead. + // The reason for this is because of a bug in legacy alerting which allows arbitrary variables to be used + // as the min interval, even though those variables do not work and will cause the legacy alert + // to fail with `interval calculation failed: time: invalid duration`. + if _, err := interval.GetIntervalFrom(ds, simpleJson, time.Millisecond*1); err != nil { + l.Warn("failed to parse min interval from query model, using '1s' instead", "interval", simpleJson.Get("interval").MustString(), "err", err) + simpleJson.Set("interval", "1s") + } + + calculatedInterval, err := calculateInterval(legacydata.NewDataTimeRange(rawFrom, rawTo), simpleJson, ds) if err != nil { return nil, err } @@ -183,9 +190,13 @@ func transConditions(ctx context.Context, alert *migrationStore.DashAlert, store RefID: refID, Model: encodedObj, RelativeTimeRange: *rTR, - DatasourceUID: dsUid, QueryType: queryType, } + + if ds != nil { + alertQuery.DatasourceUID = ds.UID + } + newCond.Data = append(newCond.Data, alertQuery) } } diff --git a/pkg/services/ngalert/migration/cond_trans_test.go b/pkg/services/ngalert/migration/cond_trans_test.go index fb2cc31b706..9e19c99ebcb 100644 --- a/pkg/services/ngalert/migration/cond_trans_test.go +++ b/pkg/services/ngalert/migration/cond_trans_test.go @@ -9,6 +9,7 @@ import ( "github.com/grafana/grafana/pkg/expr" "github.com/grafana/grafana/pkg/infra/db" + "github.com/grafana/grafana/pkg/infra/log/logtest" legacymodels "github.com/grafana/grafana/pkg/services/alerting/models" "github.com/grafana/grafana/pkg/services/ngalert/migration/store" "github.com/grafana/grafana/pkg/services/ngalert/models" @@ -74,7 +75,7 @@ func TestCondTransMultiCondOnSingleQuery(t *testing.T) { } migrationStore := store.NewTestMigrationStore(t, db.InitTestDB(t), &setting.Cfg{}) - c, err := transConditions(context.Background(), &dashAlert, migrationStore) + c, err := transConditions(context.Background(), &logtest.Fake{}, &dashAlert, migrationStore) require.NoError(t, err) require.Equal(t, expected, c) @@ -194,7 +195,7 @@ func TestCondTransExtended(t *testing.T) { } migrationStore := store.NewTestMigrationStore(t, db.InitTestDB(t), &setting.Cfg{}) - c, err := transConditions(context.Background(), &dashAlert, migrationStore) + c, err := transConditions(context.Background(), &logtest.Fake{}, &dashAlert, migrationStore) require.NoError(t, err) require.Equal(t, expected, c) diff --git a/pkg/services/ngalert/migration/migration_test.go b/pkg/services/ngalert/migration/migration_test.go index 7acb4ec380d..81e097d7363 100644 --- a/pkg/services/ngalert/migration/migration_test.go +++ b/pkg/services/ngalert/migration/migration_test.go @@ -14,6 +14,7 @@ import ( "github.com/prometheus/alertmanager/pkg/labels" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" + "xorm.io/xorm" "github.com/grafana/grafana/pkg/components/simplejson" @@ -635,18 +636,22 @@ func TestDashAlertQueryMigration(t *testing.T) { x := sqlStore.GetEngine() service := NewTestMigrationService(t, sqlStore, &setting.Cfg{}) - createAlertQuery := func(refId string, ds string, from string, to string) ngModels.AlertQuery { - dur, _ := calculateInterval(legacydata.NewDataTimeRange(from, to), simplejson.New(), nil) - intervalMs := strconv.FormatInt(dur.Milliseconds(), 10) + newQueryModel := `{"datasource":{"type":"prometheus","uid":"gdev-prometheus"},"expr":"up{job=\"fake-data-gen\"}","instant":false,"interval":"%s","intervalMs":%d,"maxDataPoints":1500,"refId":"%s"}` + createAlertQueryWithModel := func(refId string, ds string, from string, to string, model string) ngModels.AlertQuery { rel, _ := getRelativeDuration(from, to) return ngModels.AlertQuery{ RefID: refId, RelativeTimeRange: ngModels.RelativeTimeRange{From: rel.From, To: rel.To}, DatasourceUID: ds, - Model: []byte(fmt.Sprintf(`{"datasource":{"type":"prometheus","uid":"gdev-prometheus"},"expr":"up{job=\"fake-data-gen\"}","instant":false,"intervalMs":%s,"maxDataPoints":1500,"refId":"%s"}`, intervalMs, refId)), + Model: []byte(model), } } + createAlertQuery := func(refId string, ds string, from string, to string) ngModels.AlertQuery { + dur, _ := calculateInterval(legacydata.NewDataTimeRange(from, to), simplejson.New(), nil) + return createAlertQueryWithModel(refId, ds, from, to, fmt.Sprintf(newQueryModel, "", dur.Milliseconds(), refId)) + } + createClassicConditionQuery := func(refId string, conditions []classicConditionJSON) ngModels.AlertQuery { exprModel := struct { Type string `json:"type"` @@ -1000,6 +1005,50 @@ func TestDashAlertQueryMigration(t *testing.T) { }, }, }, + { + name: "simple query with interval, calculates intervalMs using it as min interval", + alerts: []*models.Alert{ + createAlertWithCond(t, 1, 1, 1, "alert1", nil, + []migrationStore.DashAlertCondition{ + withQueryModel( + createCondition("A", "max", "gt", 42, 1, "5m", "now"), + fmt.Sprintf(queryModel, "A", "1s"), + ), + }), + }, + expected: map[int64][]*ngModels.AlertRule{ + int64(1): { + genAlert(func(rule *ngModels.AlertRule) { + rule.Data = append(rule.Data, createAlertQueryWithModel("A", "ds1-1", "5m", "now", fmt.Sprintf(newQueryModel, "1s", 1000, "A"))) + rule.Data = append(rule.Data, createClassicConditionQuery("B", []classicConditionJSON{ + cond("A", "max", "gt", 42), + })) + }), + }, + }, + }, + { + name: "simple query with interval as variable, calculates intervalMs using default as min interval", + alerts: []*models.Alert{ + createAlertWithCond(t, 1, 1, 1, "alert1", nil, + []migrationStore.DashAlertCondition{ + withQueryModel( + createCondition("A", "max", "gt", 42, 1, "5m", "now"), + fmt.Sprintf(queryModel, "A", "$min_interval"), + ), + }), + }, + expected: map[int64][]*ngModels.AlertRule{ + int64(1): { + genAlert(func(rule *ngModels.AlertRule) { + rule.Data = append(rule.Data, createAlertQueryWithModel("A", "ds1-1", "5m", "now", fmt.Sprintf(newQueryModel, "$min_interval", 1000, "A"))) + rule.Data = append(rule.Data, createClassicConditionQuery("B", []classicConditionJSON{ + cond("A", "max", "gt", 42), + })) + }), + }, + }, + }, } for _, tt := range tc { t.Run(tt.name, func(t *testing.T) { @@ -1107,7 +1156,12 @@ func createAlertNotification(t *testing.T, orgId int64, uid string, channelType return createAlertNotificationWithReminder(t, orgId, uid, channelType, settings, defaultChannel, false, time.Duration(0)) } -var queryModel = `{"datasource":{"type":"prometheus","uid":"gdev-prometheus"},"expr":"up{job=\"fake-data-gen\"}","instant":false,"refId":"%s"}` +func withQueryModel(base migrationStore.DashAlertCondition, model string) migrationStore.DashAlertCondition { + base.Query.Model = []byte(model) + return base +} + +var queryModel = `{"datasource":{"type":"prometheus","uid":"gdev-prometheus"},"expr":"up{job=\"fake-data-gen\"}","instant":false,"refId":"%s","interval":"%s"}` func createCondition(refId string, reducer string, evalType string, thresh float64, datasourceId int64, from string, to string) migrationStore.DashAlertCondition { return migrationStore.DashAlertCondition{ @@ -1127,7 +1181,7 @@ func createCondition(refId string, reducer string, evalType string, thresh float }{ Params: []string{refId, from, to}, DatasourceID: datasourceId, - Model: []byte(fmt.Sprintf(queryModel, refId)), + Model: []byte(fmt.Sprintf(queryModel, refId, "")), }, Reducer: struct { Type string `json:"type"`