Alerting: Write and Delete multiple alert instances. (#54072)

Prior to this change, all alert instance writes and deletes happened
individually, each in its own database transaction. This change batches up
writes or deletes for a given rule's evaluation loop into a single
transaction before applying them.
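
As a rough sketch (not code from this PR; the function and variable names are illustrative), a caller that previously looped over `SaveAlertInstance`/`DeleteAlertInstance` once per instance can now hand the whole batch to the store:

```go
package example

import (
	"context"

	"github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/ngalert/store"
)

// persistEvaluation is illustrative only: one transaction covers all upserts
// for an evaluation, and one covers all deletes of stale instances, instead
// of one transaction per instance.
func persistEvaluation(ctx context.Context, st store.InstanceStore,
	instances []models.AlertInstance, stale []models.AlertInstanceKey) error {
	if err := st.SaveAlertInstances(ctx, instances...); err != nil {
		return err
	}
	return st.DeleteAlertInstances(ctx, stale...)
}
```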

Before:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8           398           2991381 ns/op         1133537 B/op      27703 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
    util.go:127: alert definition: {orgID: 1, UID: FovKXiRVzm} with title: "an alert definition FTvFXmRVkz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: foDFXmRVkm} with title: "an alert definition fovFXmRVkz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: VQvFuigVkm} with title: "an alert definition VwDKXmR4kz" interval: 60 created
PASS
ok      github.com/grafana/grafana/pkg/services/ngalert/store   1.619s
```

After:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8          1440            816484 ns/op          352297 B/op       6529 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
    util.go:127: alert definition: {orgID: 1, UID: 302r_igVzm} with title: "an alert definition q0h9lmR4zz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: 71hrlmR4km} with title: "an alert definition nJ29_mR4zz" interval: 60 created
    util.go:127: alert definition: {orgID: 1, UID: Cahr_mR4zm} with title: "an alert definition ja2rlmg4zz" interval: 60 created
PASS
ok      github.com/grafana/grafana/pkg/services/ngalert/store   1.383s
```

So we cut time per iteration by about 73%, allocated bytes by about 69%, and
allocations by about 76% when storing and deleting 100 instances.

This change also updates some of our tests so that they run successfully against PostgreSQL. The tests were generating random int64 IDs, but the Postgres `integer` columns our tables use max out at 2^31-1.
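
A minimal sketch of that kind of bound, assuming a test helper (the name `randomID` and package are illustrative, not from this PR):

```go
package example

import (
	"math"
	"math/rand"
)

// randomID returns an ID that fits in a Postgres `integer` column (max 2^31-1),
// unlike an unconstrained rand.Int63().
func randomID() int64 {
	return rand.Int63n(math.MaxInt32)
}
```
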
Commit 5e4fd94413 (parent d706320d0a) by Joe Blubaugh, 2022-09-02 11:17:20 +08:00, committed by GitHub.
15 changed files with 559 additions and 210 deletions.

@@ -3,6 +3,7 @@ package store
import (
"context"
"fmt"
"sort"
"strings"
"github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -12,9 +13,9 @@ import (
type InstanceStore interface {
GetAlertInstance(ctx context.Context, cmd *models.GetAlertInstanceQuery) error
ListAlertInstances(ctx context.Context, cmd *models.ListAlertInstancesQuery) error
SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error
SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error
FetchOrgIds(ctx context.Context) ([]int64, error)
DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error
DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error
DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error
}
@@ -65,7 +66,7 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
params = append(params, p...)
}
addToQuery("SELECT alert_instance.*, alert_rule.title AS rule_title FROM alert_instance LEFT JOIN alert_rule ON alert_instance.rule_org_id = alert_rule.org_id AND alert_instance.rule_uid = alert_rule.uid WHERE rule_org_id = ?", cmd.RuleOrgID)
addToQuery("SELECT * FROM alert_instance WHERE rule_org_id = ?", cmd.RuleOrgID)
if cmd.RuleUID != "" {
addToQuery(` AND rule_uid = ?`, cmd.RuleUID)
@@ -88,43 +89,89 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
})
}
// SaveAlertInstance is a handler for saving a new alert instance.
func (st DBstore) SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error {
return st.SQLStore.WithDbSession(ctx, func(sess *sqlstore.DBSession) error {
labelTupleJSON, labelsHash, err := cmd.Labels.StringAndHash()
// SaveAlertInstances saves all the provided alert instances to the store in a single transaction.
func (st DBstore) SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error {
// The function starts a single transaction and batches writes into
// statements with `maxRows` instances per statement. This makes for a
// fairly efficient transaction without creating statements that are too long
// for some databases to process. For example, SQLite has a limit of 999
// variables per write.
err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
keyNames := []string{"rule_org_id", "rule_uid", "labels_hash"}
fieldNames := []string{
"rule_org_id", "rule_uid", "labels", "labels_hash", "current_state",
"current_reason", "current_state_since", "current_state_end", "last_eval_time",
}
fieldsPerRow := len(fieldNames)
maxRows := 20
maxArgs := maxRows * fieldsPerRow
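// With 9 fields per row and maxRows = 20, a full batch binds 180 variables,
// comfortably below SQLite's 999-variable limit.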
// Prepare a statement for the maximum batch size.
bigUpsertSQL, err := st.SQLStore.Dialect.UpsertMultipleSQL(
"alert_instance", keyNames, fieldNames, maxRows)
if err != nil {
return err
}
alertInstance := &models.AlertInstance{
RuleOrgID: cmd.RuleOrgID,
RuleUID: cmd.RuleUID,
Labels: cmd.Labels,
LabelsHash: labelsHash,
CurrentState: cmd.State,
CurrentReason: cmd.StateReason,
CurrentStateSince: cmd.CurrentStateSince,
CurrentStateEnd: cmd.CurrentStateEnd,
LastEvalTime: cmd.LastEvalTime,
}
if err := models.ValidateAlertInstance(alertInstance); err != nil {
return err
}
params := append(make([]interface{}, 0), alertInstance.RuleOrgID, alertInstance.RuleUID, labelTupleJSON, alertInstance.LabelsHash, alertInstance.CurrentState, alertInstance.CurrentReason, alertInstance.CurrentStateSince.Unix(), alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix())
upsertSQL := st.SQLStore.Dialect.UpsertSQL(
"alert_instance",
[]string{"rule_org_id", "rule_uid", "labels_hash"},
[]string{"rule_org_id", "rule_uid", "labels", "labels_hash", "current_state", "current_reason", "current_state_since", "current_state_end", "last_eval_time"})
_, err = sess.SQL(upsertSQL, params...).Query()
bigStmt, err := sess.DB().Prepare(bigUpsertSQL)
if err != nil {
return err
}
// Generate batches of `maxRows` and write the statements when full.
args := make([]interface{}, 0, maxArgs)
for _, alertInstance := range cmd {
if len(args) >= maxArgs {
if _, err = bigStmt.ExecContext(ctx, args...); err != nil {
return err
}
args = args[:0]
}
labelTupleJSON, err := alertInstance.Labels.StringKey()
if err != nil {
return err
}
if err := models.ValidateAlertInstance(alertInstance); err != nil {
return err
}
args = append(args,
alertInstance.RuleOrgID, alertInstance.RuleUID, labelTupleJSON, alertInstance.LabelsHash,
alertInstance.CurrentState, alertInstance.CurrentReason, alertInstance.CurrentStateSince.Unix(),
alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix())
}
// Write the final batch of up to maxRows in size.
if len(args) > 0 && len(args)%fieldsPerRow == 0 {
upsertSQL, err := st.SQLStore.Dialect.UpsertMultipleSQL(
"alert_instance", keyNames, fieldNames, len(args)/fieldsPerRow)
if err != nil {
return err
}
stmt, err := sess.DB().Prepare(upsertSQL)
if err != nil {
return err
}
_, err = stmt.ExecContext(ctx, args...)
if err != nil {
return err
}
} else {
return fmt.Errorf("failed to upsert alert instances: last statement had %v arguments, which is not a multiple of the number of fields per row, %v", len(args), fieldsPerRow)
}
return nil
})
if err != nil {
return err
}
return nil
}
func (st DBstore) FetchOrgIds(ctx context.Context) ([]int64, error) {
@@ -150,14 +197,103 @@ func (st DBstore) FetchOrgIds(ctx context.Context) ([]int64, error) {
return orgIds, err
}
func (st DBstore) DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error {
return st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
_, err := sess.Exec("DELETE FROM alert_instance WHERE rule_org_id = ? AND rule_uid = ? AND labels_hash = ?", orgID, ruleUID, labelsHash)
// DeleteAlertInstances deletes instances with the provided keys in a single transaction.
func (st DBstore) DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error {
if len(keys) == 0 {
return nil
}
type data struct {
ruleOrgID int64
ruleUID string
labelHashes []interface{}
}
// Sort by org and rule UID. Most callers will have grouped already, but it's
// cheap to verify and leads to more compact transactions.
sort.Slice(keys, func(i, j int) bool {
aye := keys[i]
jay := keys[j]
if aye.RuleOrgID < jay.RuleOrgID {
return true
}
if aye.RuleOrgID == jay.RuleOrgID && aye.RuleUID < jay.RuleUID {
return true
}
return false
})
maxRows := 200
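// Each DELETE binds the rule org ID, the rule UID, and up to maxRows label
// hashes (202 variables at most), still well under SQLite's 999-variable limit.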
rowData := data{
0, "", make([]interface{}, 0, maxRows),
}
placeholdersBuilder := strings.Builder{}
placeholdersBuilder.WriteString("(")
execQuery := func(s *sqlstore.DBSession, rd data, placeholders string) error {
if len(rd.labelHashes) == 0 {
return nil
}
placeholders = strings.TrimRight(placeholders, ", ")
placeholders = placeholders + ")"
queryString := fmt.Sprintf(
"DELETE FROM alert_instance WHERE rule_org_id = ? AND rule_uid = ? AND labels_hash IN %s;",
placeholders,
)
execArgs := make([]interface{}, 0, 3+len(rd.labelHashes))
execArgs = append(execArgs, queryString, rd.ruleOrgID, rd.ruleUID)
execArgs = append(execArgs, rd.labelHashes...)
_, err := s.Exec(execArgs...)
if err != nil {
return err
}
return nil
}
err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
counter := 0
// Create batches of up to 200 items and execute a new delete statement for each batch.
for _, k := range keys {
counter++
// When the rule org ID or UID changes, or we hit maxRows hashes, issue a statement.
if rowData.ruleOrgID != k.RuleOrgID || rowData.ruleUID != k.RuleUID || len(rowData.labelHashes) >= maxRows {
err := execQuery(sess, rowData, placeholdersBuilder.String())
if err != nil {
return err
}
// reset our reused data.
rowData.ruleOrgID = k.RuleOrgID
rowData.ruleUID = k.RuleUID
rowData.labelHashes = rowData.labelHashes[:0]
placeholdersBuilder.Reset()
placeholdersBuilder.WriteString("(")
}
// Accumulate new values.
rowData.labelHashes = append(rowData.labelHashes, k.LabelsHash)
placeholdersBuilder.WriteString("?, ")
}
// Delete any remaining rows.
if len(rowData.labelHashes) != 0 {
err := execQuery(sess, rowData, placeholdersBuilder.String())
if err != nil {
return err
}
}
return nil
})
return err
}
func (st DBstore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error {