grafana/pkg/services/ngalert/state/testing.go

103 lines
2.7 KiB
Go
Raw Normal View History

package state
import (
"context"
"slices"
"sync"
"github.com/grafana/grafana/pkg/services/ngalert/models"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/grafana/grafana/pkg/services/screenshot"
)
Alerting: Write and Delete multiple alert instances. (#55350) Prior to this change, all alert instance writes and deletes happened individually, in their own database transaction. This change batches up writes or deletes for a given rule's evaluation loop into a single transaction before applying it. These new transactions are off by default, guarded by the feature toggle "alertingBigTransactions" Before: ``` goos: darwin goarch: arm64 pkg: github.com/grafana/grafana/pkg/services/ngalert/store BenchmarkAlertInstanceOperations-8 398 2991381 ns/op 1133537 B/op 27703 allocs/op --- BENCH: BenchmarkAlertInstanceOperations-8 util.go:127: alert definition: {orgID: 1, UID: FovKXiRVzm} with title: "an alert definition FTvFXmRVkz" interval: 60 created util.go:127: alert definition: {orgID: 1, UID: foDFXmRVkm} with title: "an alert definition fovFXmRVkz" interval: 60 created util.go:127: alert definition: {orgID: 1, UID: VQvFuigVkm} with title: "an alert definition VwDKXmR4kz" interval: 60 created PASS ok github.com/grafana/grafana/pkg/services/ngalert/store 1.619s ``` After: ``` goos: darwin goarch: arm64 pkg: github.com/grafana/grafana/pkg/services/ngalert/store BenchmarkAlertInstanceOperations-8 1440 816484 ns/op 352297 B/op 6529 allocs/op --- BENCH: BenchmarkAlertInstanceOperations-8 util.go:127: alert definition: {orgID: 1, UID: 302r_igVzm} with title: "an alert definition q0h9lmR4zz" interval: 60 created util.go:127: alert definition: {orgID: 1, UID: 71hrlmR4km} with title: "an alert definition nJ29_mR4zz" interval: 60 created util.go:127: alert definition: {orgID: 1, UID: Cahr_mR4zm} with title: "an alert definition ja2rlmg4zz" interval: 60 created PASS ok github.com/grafana/grafana/pkg/services/ngalert/store 1.383s ``` So we cut time by about 75% and memory allocations by about 60% when storing and deleting 100 instances.
2022-10-06 01:22:58 -05:00
var _ InstanceStore = &FakeInstanceStore{}
type FakeInstanceStore struct {
mtx sync.Mutex
recordedOps []any
}
type FakeInstanceStoreOp struct {
Name string
Args []any
}
func (f *FakeInstanceStore) RecordedOps() []any {
f.mtx.Lock()
defer f.mtx.Unlock()
return slices.Clone(f.recordedOps)
}
func (f *FakeInstanceStore) ListAlertInstances(_ context.Context, q *models.ListAlertInstancesQuery) ([]*models.AlertInstance, error) {
f.mtx.Lock()
defer f.mtx.Unlock()
f.recordedOps = append(f.recordedOps, *q)
return nil, nil
}
func (f *FakeInstanceStore) SaveAlertInstance(_ context.Context, q models.AlertInstance) error {
f.mtx.Lock()
defer f.mtx.Unlock()
f.recordedOps = append(f.recordedOps, q)
return nil
}
func (f *FakeInstanceStore) FetchOrgIds(_ context.Context) ([]int64, error) { return []int64{}, nil }
func (f *FakeInstanceStore) DeleteAlertInstances(ctx context.Context, q ...models.AlertInstanceKey) error {
f.mtx.Lock()
defer f.mtx.Unlock()
f.recordedOps = append(f.recordedOps, FakeInstanceStoreOp{
Name: "DeleteAlertInstances", Args: []any{
ctx,
q,
},
})
return nil
}
func (f *FakeInstanceStore) DeleteAlertInstancesByRule(ctx context.Context, key models.AlertRuleKey) error {
return nil
}
func (f *FakeInstanceStore) FullSync(ctx context.Context, instances []models.AlertInstance) error {
f.mtx.Lock()
defer f.mtx.Unlock()
f.recordedOps = []any{}
for _, instance := range instances {
f.recordedOps = append(f.recordedOps, instance)
}
return nil
}
type FakeRuleReader struct{}
func (f *FakeRuleReader) ListAlertRules(_ context.Context, q *models.ListAlertRulesQuery) (models.RulesGroup, error) {
return nil, nil
}
Alerting: Add alert pausing feature (#60734) * Add field in alert_rule model, add state to alert_instance model, and state to eval * Remove paused state from eval package * Skip paused alert rules in scheduler * Add migration to add is_paused field to alert_rule table * Convert to postable alerts only if not normal, pernding, or paused * Handle paused eval results in state manager * Add Paused state to eval package * Add paused alerts logic in scheduler * Skip alert on scheduler * Remove paused status from eval package * Apply suggestions from code review Co-authored-by: George Robinson <george.robinson@grafana.com> * Remove state * Rethink schedule and manager for paused alerts * Change return to continue * Remove unused var * Rethink alert pausing * Paused alerts storing annotations * Only add one state transition * Revert boolean method renaming refactor * Revert take image refactor * Make registry errors public * Revert method extraction for getting a folder title * Revert variable renaming refactor * Undo unnecessary changes * Revert changes in test * Remove IsPause check in PatchPartiLAlertRule function * Use SetNormal to set state * Fix text by returning to old behaviour on alert rule deletion * Add test in schedule_unit_test.go to test ticks with paused alerts * Add coment to clarify usage of context.Background() * Add comment to clarify resetStateByRuleUID method usage * Move rule get to a more limited scope * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * rum gofmt on pkg/services/ngalert/schedule/schedule.go * Remove defer cancel for context * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/testing.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * skip scheduler rule state clean up on paused alert rule * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Fix mock in test * Add (hopefully) final suggestions * Use error channel from recordAnnotationsSync to cancel context * Run make gen-cue * Place pause alert check in channel update after version check * Reduce branching un update channel select * Add if for error and move code inside if in state manager ResetStateByRuleUID * Add reason to logs * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * Do not delete alert rule routine, just exit on eval if is paused * Reduce branching and create-close a channel to avoid deadlocks * Separate state deletion and state reset (includes history saving) * Add current pause state in rule route in scheduler * Split clearState and bring errCh closer to RecordStatesAsync call * Change rule to ruleMeta in RecordStatesAsync * copy state to be able to modify it * Add timeout to context creation * Shorten the timeout * Use resetState is rule is paused and deleteState if rule is not paused * Remove Empty state reason * Save every rule change in historian * Add tests for DeleteStateByRuleUID and ResetStateByRuleUID * Remove useless line * Remove outdated comment Co-authored-by: George Robinson <george.robinson@grafana.com> Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> Co-authored-by: Armand Grillet <2117580+armandgrillet@users.noreply.github.com>
2023-01-26 11:29:10 -06:00
type FakeHistorian struct {
StateTransitions []StateTransition
}
func (f *FakeHistorian) Record(ctx context.Context, rule history_model.RuleMeta, states []StateTransition) <-chan error {
Alerting: Add alert pausing feature (#60734) * Add field in alert_rule model, add state to alert_instance model, and state to eval * Remove paused state from eval package * Skip paused alert rules in scheduler * Add migration to add is_paused field to alert_rule table * Convert to postable alerts only if not normal, pernding, or paused * Handle paused eval results in state manager * Add Paused state to eval package * Add paused alerts logic in scheduler * Skip alert on scheduler * Remove paused status from eval package * Apply suggestions from code review Co-authored-by: George Robinson <george.robinson@grafana.com> * Remove state * Rethink schedule and manager for paused alerts * Change return to continue * Remove unused var * Rethink alert pausing * Paused alerts storing annotations * Only add one state transition * Revert boolean method renaming refactor * Revert take image refactor * Make registry errors public * Revert method extraction for getting a folder title * Revert variable renaming refactor * Undo unnecessary changes * Revert changes in test * Remove IsPause check in PatchPartiLAlertRule function * Use SetNormal to set state * Fix text by returning to old behaviour on alert rule deletion * Add test in schedule_unit_test.go to test ticks with paused alerts * Add coment to clarify usage of context.Background() * Add comment to clarify resetStateByRuleUID method usage * Move rule get to a more limited scope * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * rum gofmt on pkg/services/ngalert/schedule/schedule.go * Remove defer cancel for context * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/testing.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/schedule/schedule_unit_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Update pkg/services/ngalert/models/instance_test.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * skip scheduler rule state clean up on paused alert rule * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> * Fix mock in test * Add (hopefully) final suggestions * Use error channel from recordAnnotationsSync to cancel context * Run make gen-cue * Place pause alert check in channel update after version check * Reduce branching un update channel select * Add if for error and move code inside if in state manager ResetStateByRuleUID * Add reason to logs * Update pkg/services/ngalert/schedule/schedule.go Co-authored-by: George Robinson <george.robinson@grafana.com> * Do not delete alert rule routine, just exit on eval if is paused * Reduce branching and create-close a channel to avoid deadlocks * Separate state deletion and state reset (includes history saving) * Add current pause state in rule route in scheduler * Split clearState and bring errCh closer to RecordStatesAsync call * Change rule to ruleMeta in RecordStatesAsync * copy state to be able to modify it * Add timeout to context creation * Shorten the timeout * Use resetState is rule is paused and deleteState if rule is not paused * Remove Empty state reason * Save every rule change in historian * Add tests for DeleteStateByRuleUID and ResetStateByRuleUID * Remove useless line * Remove outdated comment Co-authored-by: George Robinson <george.robinson@grafana.com> Co-authored-by: Santiago <santiagohernandez.1997@gmail.com> Co-authored-by: Armand Grillet <2117580+armandgrillet@users.noreply.github.com>
2023-01-26 11:29:10 -06:00
f.StateTransitions = append(f.StateTransitions, states...)
errCh := make(chan error)
close(errCh)
return errCh
}
// NotAvailableImageService is a service that returns ErrScreenshotsUnavailable.
type NotAvailableImageService struct{}
func (s *NotAvailableImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
return nil, screenshot.ErrScreenshotsUnavailable
}
// NoopImageService is a no-op image service.
type NoopImageService struct{}
func (s *NoopImageService) NewImage(_ context.Context, _ *models.AlertRule) (*models.Image, error) {
return &models.Image{}, nil
}