From 63187fae0ccb925ac603acf66da84d4695be3480 Mon Sep 17 00:00:00 2001 From: Matthew Jacobson Date: Thu, 6 Apr 2023 12:06:25 -0400 Subject: [PATCH] Alerting: Remove and revert flag alertingBigTransactions (#65976) * Alerting: Remove and revert flag alertingBigTransactions This is a partial revert of #56575 and a removal of the `alertingBigTransactions` flag. Real-word use has seen no clear performance incentive to maintain this flag. Lowered db connection count came at the cost of significant increase in CPU usage and query latency. * Fix lint backend * Removed last bits of alertingBigTransactions --------- Co-authored-by: Armand Grillet <2117580+armandgrillet@users.noreply.github.com> --- .../feature-toggles/index.md | 1 - .../src/types/featureToggles.gen.ts | 1 - pkg/services/featuremgmt/registry.go | 6 -- pkg/services/featuremgmt/toggles_gen.csv | 1 - pkg/services/featuremgmt/toggles_gen.go | 4 - pkg/services/ngalert/state/manager.go | 22 +---- pkg/services/ngalert/state/manager_test.go | 36 ++++--- pkg/services/ngalert/state/persist.go | 2 +- pkg/services/ngalert/state/testing.go | 6 +- .../ngalert/store/instance_database.go | 93 ------------------- .../ngalert/store/instance_database_test.go | 88 +++--------------- 11 files changed, 42 insertions(+), 218 deletions(-) diff --git a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md index 9d7b628917a..7b9aff01b69 100644 --- a/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md +++ b/docs/sources/setup-grafana/configure-grafana/feature-toggles/index.md @@ -60,7 +60,6 @@ Alpha features might be changed or removed without prior notice. | Feature toggle name | Description | | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `alertingBigTransactions` | Use big transactions for alerting database writes | | `dashboardPreviews` | Create and show thumbnails for dashboard search results | | `live-service-web-worker` | This will use a webworker thread to processes events rather than the main thread | | `queryOverLive` | Use Grafana Live WebSocket to execute backend queries | diff --git a/packages/grafana-data/src/types/featureToggles.gen.ts b/packages/grafana-data/src/types/featureToggles.gen.ts index e62f5f0f32d..d2414f1c493 100644 --- a/packages/grafana-data/src/types/featureToggles.gen.ts +++ b/packages/grafana-data/src/types/featureToggles.gen.ts @@ -18,7 +18,6 @@ * @public */ export interface FeatureToggles { - alertingBigTransactions?: boolean; trimDefaults?: boolean; disableEnvelopeEncryption?: boolean; database_metrics?: boolean; diff --git a/pkg/services/featuremgmt/registry.go b/pkg/services/featuremgmt/registry.go index 2143c9d4ad3..dd8d6e5241f 100644 --- a/pkg/services/featuremgmt/registry.go +++ b/pkg/services/featuremgmt/registry.go @@ -9,12 +9,6 @@ package featuremgmt var ( // Register each toggle here standardFeatureFlags = []FeatureFlag{ - { - Name: "alertingBigTransactions", - Description: "Use big transactions for alerting database writes", - State: FeatureStateAlpha, - Owner: grafanaAlertingSquad, - }, { Name: "trimDefaults", Description: "Use cue schema to remove values that will be applied automatically", diff --git a/pkg/services/featuremgmt/toggles_gen.csv b/pkg/services/featuremgmt/toggles_gen.csv index 0286f61d16e..43e42e059e5 100644 --- a/pkg/services/featuremgmt/toggles_gen.csv +++ b/pkg/services/featuremgmt/toggles_gen.csv @@ -1,5 +1,4 @@ Name,State,Owner,requiresDevMode,RequiresLicense,RequiresRestart,FrontendOnly -alertingBigTransactions,alpha,@grafana/alerting-squad,false,false,false,false trimDefaults,beta,@grafana/grafana-as-code,false,false,false,false disableEnvelopeEncryption,stable,@grafana/grafana-as-code,false,false,false,false database_metrics,stable,@grafana/hosted-grafana-team,false,false,false,false diff --git a/pkg/services/featuremgmt/toggles_gen.go b/pkg/services/featuremgmt/toggles_gen.go index a648556ad6f..90fad46a56f 100644 --- a/pkg/services/featuremgmt/toggles_gen.go +++ b/pkg/services/featuremgmt/toggles_gen.go @@ -7,10 +7,6 @@ package featuremgmt const ( - // FlagAlertingBigTransactions - // Use big transactions for alerting database writes - FlagAlertingBigTransactions = "alertingBigTransactions" - // FlagTrimDefaults // Use cue schema to remove values that will be applied automatically FlagTrimDefaults = "trimDefaults" diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index 8b523828cb6..5161ba957cf 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -351,8 +351,6 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state } logger.Debug("Saving alert states", "count", len(states)) - instances := make([]ngModels.AlertInstance, 0, len(states)) - for _, s := range states { // Do not save normal state to database and remove transition to Normal state but keep mapped states if st.doNotSaveNormalState && IsNormalStateWithNoReason(s.State) && !s.Changed() { @@ -364,7 +362,7 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err, "labels", s.Labels.String()) continue } - fields := ngModels.AlertInstance{ + instance := ngModels.AlertInstance{ AlertInstanceKey: key, Labels: ngModels.InstanceLabels(s.Labels), CurrentState: ngModels.InstanceStateType(s.State.State.String()), @@ -373,23 +371,11 @@ func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, state CurrentStateSince: s.StartsAt, CurrentStateEnd: s.EndsAt, } - instances = append(instances, fields) - } - if len(instances) == 0 { - return - } - - if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil { - type debugInfo struct { - State string - Labels string + err = st.instanceStore.SaveAlertInstance(ctx, instance) + if err != nil { + logger.Error("Failed to save alert state", "labels", s.Labels.String(), "state", s.State, "error", err) } - debug := make([]debugInfo, 0) - for _, inst := range instances { - debug = append(debug, debugInfo{string(inst.CurrentState), data.Labels(inst.Labels).String()}) - } - logger.Error("Failed to save alert states", "states", debug, "error", err) } } diff --git a/pkg/services/ngalert/state/manager_test.go b/pkg/services/ngalert/state/manager_test.go index 67b82976d1a..b4abfee70f1 100644 --- a/pkg/services/ngalert/state/manager_test.go +++ b/pkg/services/ngalert/state/manager_test.go @@ -113,9 +113,11 @@ func TestWarmStateCache(t *testing.T) { }, } + instances := make([]models.AlertInstance, 0) + labels := models.InstanceLabels{"test1": "testValue1"} _, hash, _ := labels.StringAndHash() - instance1 := models.AlertInstance{ + instances = append(instances, models.AlertInstance{ AlertInstanceKey: models.AlertInstanceKey{ RuleOrgID: rule.OrgID, RuleUID: rule.UID, @@ -126,11 +128,11 @@ func TestWarmStateCache(t *testing.T) { CurrentStateSince: evaluationTime.Add(-1 * time.Minute), CurrentStateEnd: evaluationTime.Add(1 * time.Minute), Labels: labels, - } + }) labels = models.InstanceLabels{"test2": "testValue2"} _, hash, _ = labels.StringAndHash() - instance2 := models.AlertInstance{ + instances = append(instances, models.AlertInstance{ AlertInstanceKey: models.AlertInstanceKey{ RuleOrgID: rule.OrgID, RuleUID: rule.UID, @@ -141,11 +143,11 @@ func TestWarmStateCache(t *testing.T) { CurrentStateSince: evaluationTime.Add(-1 * time.Minute), CurrentStateEnd: evaluationTime.Add(1 * time.Minute), Labels: labels, - } + }) labels = models.InstanceLabels{"test3": "testValue3"} _, hash, _ = labels.StringAndHash() - instance3 := models.AlertInstance{ + instances = append(instances, models.AlertInstance{ AlertInstanceKey: models.AlertInstanceKey{ RuleOrgID: rule.OrgID, RuleUID: rule.UID, @@ -156,11 +158,11 @@ func TestWarmStateCache(t *testing.T) { CurrentStateSince: evaluationTime.Add(-1 * time.Minute), CurrentStateEnd: evaluationTime.Add(1 * time.Minute), Labels: labels, - } + }) labels = models.InstanceLabels{"test4": "testValue4"} _, hash, _ = labels.StringAndHash() - instance4 := models.AlertInstance{ + instances = append(instances, models.AlertInstance{ AlertInstanceKey: models.AlertInstanceKey{ RuleOrgID: rule.OrgID, RuleUID: rule.UID, @@ -171,11 +173,11 @@ func TestWarmStateCache(t *testing.T) { CurrentStateSince: evaluationTime.Add(-1 * time.Minute), CurrentStateEnd: evaluationTime.Add(1 * time.Minute), Labels: labels, - } + }) labels = models.InstanceLabels{"test5": "testValue5"} _, hash, _ = labels.StringAndHash() - instance5 := models.AlertInstance{ + instances = append(instances, models.AlertInstance{ AlertInstanceKey: models.AlertInstanceKey{ RuleOrgID: rule.OrgID, RuleUID: rule.UID, @@ -186,8 +188,10 @@ func TestWarmStateCache(t *testing.T) { CurrentStateSince: evaluationTime.Add(-1 * time.Minute), CurrentStateEnd: evaluationTime.Add(1 * time.Minute), Labels: labels, + }) + for _, instance := range instances { + _ = dbstore.SaveAlertInstance(ctx, instance) } - _ = dbstore.SaveAlertInstances(ctx, instance1, instance2, instance3, instance4, instance5) cfg := state.ManagerCfg{ Metrics: testMetrics.GetStateMetrics(), @@ -2370,7 +2374,9 @@ func TestStaleResultsHandler(t *testing.T) { }, } - _ = dbstore.SaveAlertInstances(ctx, instances...) + for _, instance := range instances { + _ = dbstore.SaveAlertInstance(ctx, instance) + } testCases := []struct { desc string @@ -2621,7 +2627,9 @@ func TestDeleteStateByRuleUID(t *testing.T) { }, } - _ = dbstore.SaveAlertInstances(ctx, instances...) + for _, instance := range instances { + _ = dbstore.SaveAlertInstance(ctx, instance) + } testCases := []struct { desc string @@ -2755,7 +2763,9 @@ func TestResetStateByRuleUID(t *testing.T) { }, } - _ = dbstore.SaveAlertInstances(ctx, instances...) + for _, instance := range instances { + _ = dbstore.SaveAlertInstance(ctx, instance) + } testCases := []struct { desc string diff --git a/pkg/services/ngalert/state/persist.go b/pkg/services/ngalert/state/persist.go index c6a23980fbc..b12deba5af6 100644 --- a/pkg/services/ngalert/state/persist.go +++ b/pkg/services/ngalert/state/persist.go @@ -11,7 +11,7 @@ import ( type InstanceStore interface { FetchOrgIds(ctx context.Context) ([]int64, error) ListAlertInstances(ctx context.Context, cmd *models.ListAlertInstancesQuery) ([]*models.AlertInstance, error) - SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error + SaveAlertInstance(ctx context.Context, instance models.AlertInstance) error DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error } diff --git a/pkg/services/ngalert/state/testing.go b/pkg/services/ngalert/state/testing.go index e14b8e88d78..1b9a6fa694c 100644 --- a/pkg/services/ngalert/state/testing.go +++ b/pkg/services/ngalert/state/testing.go @@ -28,12 +28,10 @@ func (f *FakeInstanceStore) ListAlertInstances(_ context.Context, q *models.List return nil, nil } -func (f *FakeInstanceStore) SaveAlertInstances(_ context.Context, q ...models.AlertInstance) error { +func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, q models.AlertInstance) error { f.mtx.Lock() defer f.mtx.Unlock() - for _, inst := range q { - f.RecordedOps = append(f.RecordedOps, inst) - } + f.RecordedOps = append(f.RecordedOps, q) return nil } diff --git a/pkg/services/ngalert/store/instance_database.go b/pkg/services/ngalert/store/instance_database.go index 8583a40e474..0c31bc620fe 100644 --- a/pkg/services/ngalert/store/instance_database.go +++ b/pkg/services/ngalert/store/instance_database.go @@ -43,99 +43,6 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI return result, err } -// SaveAlertInstances saves all the provided alert instances to the store. -func (st DBstore) SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error { - if !st.FeatureToggles.IsEnabled(featuremgmt.FlagAlertingBigTransactions) { - // This mimics the replace code-path by calling SaveAlertInstance in a loop, with a transaction per call. - for _, c := range cmd { - err := st.SaveAlertInstance(ctx, c) - if err != nil { - return err - } - } - return nil - } else { - // Batches write into statements with `maxRows` instances per statements. - // This makes sure we don't create statements that are too long for some - // databases to process. For example, SQLite has a limit of 999 variables - // per write. - keyNames := []string{"rule_org_id", "rule_uid", "labels_hash"} - fieldNames := []string{ - "rule_org_id", "rule_uid", "labels", "labels_hash", "current_state", - "current_reason", "current_state_since", "current_state_end", "last_eval_time", - } - fieldsPerRow := len(fieldNames) - maxRows := 20 - maxArgs := maxRows * fieldsPerRow - - bigUpsertSQL, err := st.SQLStore.GetDialect().UpsertMultipleSQL( - "alert_instance", keyNames, fieldNames, maxRows) - if err != nil { - return err - } - - // Args contains the SQL statement, and the values to fill into the SQL statement. - args := make([]interface{}, 0, maxArgs) - args = append(args, bigUpsertSQL) - values := func(a []interface{}) int { - return len(a) - 1 - } - - // Generate batches of `maxRows` and write the statements when full. - for _, alertInstance := range cmd { - labelTupleJSON, err := alertInstance.Labels.StringKey() - if err != nil { - return err - } - - if err := models.ValidateAlertInstance(alertInstance); err != nil { - return err - } - - args = append(args, - alertInstance.RuleOrgID, alertInstance.RuleUID, labelTupleJSON, alertInstance.LabelsHash, - alertInstance.CurrentState, alertInstance.CurrentReason, alertInstance.CurrentStateSince.Unix(), - alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix()) - - // If we've reached the maximum batch size, write to the database. - if values(args) >= maxArgs { - err = st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error { - _, err := sess.Exec(args...) - return err - }) - if err != nil { - return fmt.Errorf("failed to save alert instances: %w", err) - } - - // Reset args so we can re-use the allocated interface pointers. - args = args[:1] - } - } - - // Write the final batch of up to maxRows in size. - if values(args) != 0 && values(args)%fieldsPerRow == 0 { - upsertSQL, err := st.SQLStore.GetDialect().UpsertMultipleSQL( - "alert_instance", keyNames, fieldNames, values(args)/fieldsPerRow) - if err != nil { - return err - } - - args[0] = upsertSQL - err = st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error { - _, err := sess.Exec(args...) - return err - }) - if err != nil { - return fmt.Errorf("failed to save alert instances: %w", err) - } - } else if values(args) != 0 { - return fmt.Errorf("failed to upsert alert instances. Last statements had %v fields, which is not a multiple of the number of fields, %v", len(args), fieldsPerRow) - } - - return nil - } -} - // SaveAlertInstance is a handler for saving a new alert instance. func (st DBstore) SaveAlertInstance(ctx context.Context, alertInstance models.AlertInstance) error { return st.SQLStore.WithDbSession(ctx, func(sess *db.Session) error { diff --git a/pkg/services/ngalert/store/instance_database_test.go b/pkg/services/ngalert/store/instance_database_test.go index 5cb8088bcf9..3a055dab3cc 100644 --- a/pkg/services/ngalert/store/instance_database_test.go +++ b/pkg/services/ngalert/store/instance_database_test.go @@ -19,7 +19,6 @@ func BenchmarkAlertInstanceOperations(b *testing.B) { b.StopTimer() ctx := context.Background() _, dbstore := tests.SetupTestEnv(b, baseIntervalSeconds) - dbstore.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertingBigTransactions) const mainOrgID int64 = 1 @@ -48,78 +47,13 @@ func BenchmarkAlertInstanceOperations(b *testing.B) { b.StartTimer() for i := 0; i < b.N; i++ { - _ = dbstore.SaveAlertInstances(ctx, instances...) + for _, instance := range instances { + _ = dbstore.SaveAlertInstance(ctx, instance) + } _ = dbstore.DeleteAlertInstances(ctx, keys...) } } -func TestIntegrationAlertInstanceBulkWrite(t *testing.T) { - if testing.Short() { - t.Skip("skipping integration test") - } - ctx := context.Background() - _, dbstore := tests.SetupTestEnv(t, baseIntervalSeconds) - - orgIDs := []int64{1, 2, 3, 4, 5} - counts := []int{10_000, 200, 503, 0, 1256} - instances := make([]models.AlertInstance, 0, 10_000+200+503+0+1256) - keys := make([]models.AlertInstanceKey, 0, 10_000+200+503+0+1256) - - for i, id := range orgIDs { - alertRule := tests.CreateTestAlertRule(t, ctx, dbstore, 60, id) - - // Create some instances to write down and then delete. - for j := 0; j < counts[i]; j++ { - labels := models.InstanceLabels{"test": fmt.Sprint(j)} - _, labelsHash, _ := labels.StringAndHash() - instance := models.AlertInstance{ - AlertInstanceKey: models.AlertInstanceKey{ - RuleOrgID: alertRule.OrgID, - RuleUID: alertRule.UID, - LabelsHash: labelsHash, - }, - CurrentState: models.InstanceStateFiring, - CurrentReason: string(models.InstanceStateError), - Labels: labels, - } - instances = append(instances, instance) - keys = append(keys, instance.AlertInstanceKey) - } - } - - for _, bigStmts := range []bool{false, true} { - dbstore.FeatureToggles = featuremgmt.WithFeatures([]interface{}{featuremgmt.FlagAlertingBigTransactions, bigStmts}) - t.Log("Saving") - err := dbstore.SaveAlertInstances(ctx, instances...) - require.NoError(t, err) - t.Log("Finished database write") - - // List our instances. Make sure we have the right count. - for i, id := range orgIDs { - q := &models.ListAlertInstancesQuery{ - RuleOrgID: id, - } - alerts, err := dbstore.ListAlertInstances(ctx, q) - require.NoError(t, err) - require.Equal(t, counts[i], len(alerts), "Org %v: Expected %v instances but got %v", id, counts[i], len(alerts)) - } - t.Log("Finished database read") - - err = dbstore.DeleteAlertInstances(ctx, keys...) - require.NoError(t, err) - t.Log("Finished database delete") - - for _, id := range orgIDs { - q := &models.ListAlertInstancesQuery{ - RuleOrgID: id, - } - alerts, err := dbstore.ListAlertInstances(ctx, q) - require.NoError(t, err) - require.Zero(t, len(alerts), "Org %v: Deleted instances but still had %v", id, len(alerts)) - } - } -} - func TestIntegrationAlertInstanceOperations(t *testing.T) { if testing.Short() { t.Skip("skipping integration test") @@ -164,7 +98,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { CurrentReason: string(models.InstanceStateError), Labels: labels, } - err := dbstore.SaveAlertInstances(ctx, instance) + err := dbstore.SaveAlertInstance(ctx, instance) require.NoError(t, err) listCmd := &models.ListAlertInstancesQuery{ @@ -193,7 +127,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { CurrentState: models.InstanceStateNormal, Labels: labels, } - err := dbstore.SaveAlertInstances(ctx, instance) + err := dbstore.SaveAlertInstance(ctx, instance) require.NoError(t, err) listCmd := &models.ListAlertInstancesQuery{ @@ -223,7 +157,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { Labels: labels, } - err := dbstore.SaveAlertInstances(ctx, instance1) + err := dbstore.SaveAlertInstance(ctx, instance1) require.NoError(t, err) labels = models.InstanceLabels{"test": "testValue2"} @@ -237,7 +171,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { CurrentState: models.InstanceStateFiring, Labels: labels, } - err = dbstore.SaveAlertInstances(ctx, instance2) + err = dbstore.SaveAlertInstance(ctx, instance2) require.NoError(t, err) listQuery := &models.ListAlertInstancesQuery{ @@ -284,7 +218,9 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { CurrentReason: models.StateReasonError, Labels: labels, } - err := dbstore.SaveAlertInstances(ctx, instance1, instance2) + err := dbstore.SaveAlertInstance(ctx, instance1) + require.NoError(t, err) + err = dbstore.SaveAlertInstance(ctx, instance2) require.NoError(t, err) listQuery := &models.ListAlertInstancesQuery{ @@ -327,7 +263,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { Labels: labels, } - err := dbstore.SaveAlertInstances(ctx, instance1) + err := dbstore.SaveAlertInstance(ctx, instance1) require.NoError(t, err) instance2 := models.AlertInstance{ @@ -339,7 +275,7 @@ func TestIntegrationAlertInstanceOperations(t *testing.T) { CurrentState: models.InstanceStateNormal, Labels: instance1.Labels, } - err = dbstore.SaveAlertInstances(ctx, instance2) + err = dbstore.SaveAlertInstance(ctx, instance2) require.NoError(t, err) listQuery := &models.ListAlertInstancesQuery{