mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Write and Delete multiple alert instances. (#54072)
Prior to this change, all alert instance writes and deletes happened
individually, in their own database transaction. This change batches up
writes or deletes for a given rule's evaluation loop into a single
transaction before applying it.
Before:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8 398 2991381 ns/op 1133537 B/op 27703 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
util.go:127: alert definition: {orgID: 1, UID: FovKXiRVzm} with title: "an alert definition FTvFXmRVkz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: foDFXmRVkm} with title: "an alert definition fovFXmRVkz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: VQvFuigVkm} with title: "an alert definition VwDKXmR4kz" interval: 60 created
PASS
ok github.com/grafana/grafana/pkg/services/ngalert/store 1.619s
```
After:
```
goos: darwin
goarch: arm64
pkg: github.com/grafana/grafana/pkg/services/ngalert/store
BenchmarkAlertInstanceOperations-8 1440 816484 ns/op 352297 B/op 6529 allocs/op
--- BENCH: BenchmarkAlertInstanceOperations-8
util.go:127: alert definition: {orgID: 1, UID: 302r_igVzm} with title: "an alert definition q0h9lmR4zz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: 71hrlmR4km} with title: "an alert definition nJ29_mR4zz" interval: 60 created
util.go:127: alert definition: {orgID: 1, UID: Cahr_mR4zm} with title: "an alert definition ja2rlmg4zz" interval: 60 created
PASS
ok github.com/grafana/grafana/pkg/services/ngalert/store 1.383s
```
So we cut time by about 75% and memory allocations by about 60% when
storing and deleting 100 instances.
This change also updates some of our tests so that they run successfully against PostgreSQL. We were previously using random int64 values, but Postgres `integer` columns, which our tables use, max out at 2^31-1.
This commit is contained in:
@@ -3,6 +3,7 @@ package store
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
@@ -12,9 +13,9 @@ import (
|
||||
type InstanceStore interface {
|
||||
GetAlertInstance(ctx context.Context, cmd *models.GetAlertInstanceQuery) error
|
||||
ListAlertInstances(ctx context.Context, cmd *models.ListAlertInstancesQuery) error
|
||||
SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error
|
||||
SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error
|
||||
FetchOrgIds(ctx context.Context) ([]int64, error)
|
||||
DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error
|
||||
DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error
|
||||
DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error
|
||||
}
|
||||
|
||||
@@ -65,7 +66,7 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
|
||||
params = append(params, p...)
|
||||
}
|
||||
|
||||
addToQuery("SELECT alert_instance.*, alert_rule.title AS rule_title FROM alert_instance LEFT JOIN alert_rule ON alert_instance.rule_org_id = alert_rule.org_id AND alert_instance.rule_uid = alert_rule.uid WHERE rule_org_id = ?", cmd.RuleOrgID)
|
||||
addToQuery("SELECT * FROM alert_instance WHERE rule_org_id = ?", cmd.RuleOrgID)
|
||||
|
||||
if cmd.RuleUID != "" {
|
||||
addToQuery(` AND rule_uid = ?`, cmd.RuleUID)
|
||||
@@ -88,43 +89,89 @@ func (st DBstore) ListAlertInstances(ctx context.Context, cmd *models.ListAlertI
|
||||
})
|
||||
}
|
||||
|
||||
// SaveAlertInstance is a handler for saving a new alert instance.
|
||||
func (st DBstore) SaveAlertInstance(ctx context.Context, cmd *models.SaveAlertInstanceCommand) error {
|
||||
return st.SQLStore.WithDbSession(ctx, func(sess *sqlstore.DBSession) error {
|
||||
labelTupleJSON, labelsHash, err := cmd.Labels.StringAndHash()
|
||||
// SaveAlertInstances saves all the provided alert instances to the store in a single transaction.
|
||||
func (st DBstore) SaveAlertInstances(ctx context.Context, cmd ...models.AlertInstance) error {
|
||||
// The function starts a single transaction and batches writes into
|
||||
// statements with `maxRows` instances per statements. This makes for a
|
||||
// fairly efficient transcation without creating statements that are too long
|
||||
// for some databases to process. For example, SQLite has a limit of 999
|
||||
// variables per write.
|
||||
|
||||
err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
|
||||
keyNames := []string{"rule_org_id", "rule_uid", "labels_hash"}
|
||||
fieldNames := []string{
|
||||
"rule_org_id", "rule_uid", "labels", "labels_hash", "current_state",
|
||||
"current_reason", "current_state_since", "current_state_end", "last_eval_time",
|
||||
}
|
||||
fieldsPerRow := len(fieldNames)
|
||||
maxRows := 20
|
||||
maxArgs := maxRows * fieldsPerRow
|
||||
|
||||
// Prepare a statement for the maximum batch size.
|
||||
bigUpsertSQL, err := st.SQLStore.Dialect.UpsertMultipleSQL(
|
||||
"alert_instance", keyNames, fieldNames, maxRows)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
alertInstance := &models.AlertInstance{
|
||||
RuleOrgID: cmd.RuleOrgID,
|
||||
RuleUID: cmd.RuleUID,
|
||||
Labels: cmd.Labels,
|
||||
LabelsHash: labelsHash,
|
||||
CurrentState: cmd.State,
|
||||
CurrentReason: cmd.StateReason,
|
||||
CurrentStateSince: cmd.CurrentStateSince,
|
||||
CurrentStateEnd: cmd.CurrentStateEnd,
|
||||
LastEvalTime: cmd.LastEvalTime,
|
||||
}
|
||||
|
||||
if err := models.ValidateAlertInstance(alertInstance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
params := append(make([]interface{}, 0), alertInstance.RuleOrgID, alertInstance.RuleUID, labelTupleJSON, alertInstance.LabelsHash, alertInstance.CurrentState, alertInstance.CurrentReason, alertInstance.CurrentStateSince.Unix(), alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix())
|
||||
|
||||
upsertSQL := st.SQLStore.Dialect.UpsertSQL(
|
||||
"alert_instance",
|
||||
[]string{"rule_org_id", "rule_uid", "labels_hash"},
|
||||
[]string{"rule_org_id", "rule_uid", "labels", "labels_hash", "current_state", "current_reason", "current_state_since", "current_state_end", "last_eval_time"})
|
||||
_, err = sess.SQL(upsertSQL, params...).Query()
|
||||
bigStmt, err := sess.DB().Prepare(bigUpsertSQL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Generate batches of `maxRows` and write the statements when full.
|
||||
args := make([]interface{}, 0, maxArgs)
|
||||
for _, alertInstance := range cmd {
|
||||
if len(args) >= maxArgs {
|
||||
if _, err = bigStmt.ExecContext(ctx, args...); err != nil {
|
||||
return err
|
||||
}
|
||||
args = args[:0]
|
||||
}
|
||||
|
||||
labelTupleJSON, err := alertInstance.Labels.StringKey()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := models.ValidateAlertInstance(alertInstance); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
args = append(args,
|
||||
alertInstance.RuleOrgID, alertInstance.RuleUID, labelTupleJSON, alertInstance.LabelsHash,
|
||||
alertInstance.CurrentState, alertInstance.CurrentReason, alertInstance.CurrentStateSince.Unix(),
|
||||
alertInstance.CurrentStateEnd.Unix(), alertInstance.LastEvalTime.Unix())
|
||||
}
|
||||
|
||||
// Write the final batch of up to maxRows in size.
|
||||
if len(args) > 0 && len(args)%fieldsPerRow == 0 {
|
||||
upsertSQL, err := st.SQLStore.Dialect.UpsertMultipleSQL(
|
||||
"alert_instance", keyNames, fieldNames, len(args)/fieldsPerRow)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
stmt, err := sess.DB().Prepare(upsertSQL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = stmt.ExecContext(ctx, args...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
return fmt.Errorf("failed to upsert alert instances. Last statements had %v fields, which is not a multiple of the number of fields, %v", len(args), fieldsPerRow)
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (st DBstore) FetchOrgIds(ctx context.Context) ([]int64, error) {
|
||||
@@ -150,14 +197,103 @@ func (st DBstore) FetchOrgIds(ctx context.Context) ([]int64, error) {
|
||||
return orgIds, err
|
||||
}
|
||||
|
||||
func (st DBstore) DeleteAlertInstance(ctx context.Context, orgID int64, ruleUID, labelsHash string) error {
|
||||
return st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
|
||||
_, err := sess.Exec("DELETE FROM alert_instance WHERE rule_org_id = ? AND rule_uid = ? AND labels_hash = ?", orgID, ruleUID, labelsHash)
|
||||
// DeleteAlertInstances deletes instances with the provided keys in a single transaction.
|
||||
func (st DBstore) DeleteAlertInstances(ctx context.Context, keys ...models.AlertInstanceKey) error {
|
||||
if len(keys) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
type data struct {
|
||||
ruleOrgID int64
|
||||
ruleUID string
|
||||
labelHashes []interface{}
|
||||
}
|
||||
|
||||
// Sort by org and rule UID. Most callers will have grouped already, but it's
|
||||
// cheap to verify and leads to more compact transactions.
|
||||
sort.Slice(keys, func(i, j int) bool {
|
||||
aye := keys[i]
|
||||
jay := keys[j]
|
||||
|
||||
if aye.RuleOrgID < jay.RuleOrgID {
|
||||
return true
|
||||
}
|
||||
|
||||
if aye.RuleOrgID == jay.RuleOrgID && aye.RuleUID < jay.RuleUID {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
})
|
||||
|
||||
maxRows := 200
|
||||
rowData := data{
|
||||
0, "", make([]interface{}, 0, maxRows),
|
||||
}
|
||||
placeholdersBuilder := strings.Builder{}
|
||||
placeholdersBuilder.WriteString("(")
|
||||
|
||||
execQuery := func(s *sqlstore.DBSession, rd data, placeholders string) error {
|
||||
if len(rd.labelHashes) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
placeholders = strings.TrimRight(placeholders, ", ")
|
||||
placeholders = placeholders + ")"
|
||||
|
||||
queryString := fmt.Sprintf(
|
||||
"DELETE FROM alert_instance WHERE rule_org_id = ? AND rule_uid = ? AND labels_hash IN %s;",
|
||||
placeholders,
|
||||
)
|
||||
|
||||
execArgs := make([]interface{}, 0, 3+len(rd.labelHashes))
|
||||
execArgs = append(execArgs, queryString, rd.ruleOrgID, rd.ruleUID)
|
||||
execArgs = append(execArgs, rd.labelHashes...)
|
||||
_, err := s.Exec(execArgs...)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
|
||||
counter := 0
|
||||
|
||||
// Create batches of up to 200 items and execute a new delete statement for each batch.
|
||||
for _, k := range keys {
|
||||
counter++
|
||||
// When a rule ID changes or we hit 200 hashes, issue a statement.
|
||||
if rowData.ruleOrgID != k.RuleOrgID || rowData.ruleUID != k.RuleUID || len(rowData.labelHashes) >= 200 {
|
||||
err := execQuery(sess, rowData, placeholdersBuilder.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// reset our reused data.
|
||||
rowData.ruleOrgID = k.RuleOrgID
|
||||
rowData.ruleUID = k.RuleUID
|
||||
rowData.labelHashes = rowData.labelHashes[:0]
|
||||
placeholdersBuilder.Reset()
|
||||
placeholdersBuilder.WriteString("(")
|
||||
}
|
||||
|
||||
// Accumulate new values.
|
||||
rowData.labelHashes = append(rowData.labelHashes, k.LabelsHash)
|
||||
placeholdersBuilder.WriteString("?, ")
|
||||
}
|
||||
|
||||
// Delete any remaining rows.
|
||||
if len(rowData.labelHashes) != 0 {
|
||||
err := execQuery(sess, rowData, placeholdersBuilder.String())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
})
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (st DBstore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error {
|
||||
|
||||
Reference in New Issue
Block a user