Mirror of https://github.com/grafana/grafana.git (synced 2025-02-25 18:55:37 -06:00)
Alerting: Move rule evaluation status logic out of prometheus API and into scheduler (#89141)
* Add health fields to rules and an aggregator method to the scheduler
* Move health, last error, and last eval time in together to minimize state processing
* Wire up a readonly scheduler to prom api
* Extract to exported function
* Use health in api_prometheus and fix up tests
* Rename health struct to status
* Fix tests one more time
* Several new tests
* Handle inactive rules
* Push state mapping into state manager
* rename to StatusReader
* Rectify cyclo complexity rebase
* Convert existing package local status implementation to models one
* fix tests
* undo RuleDefs rename
parent 6a3eb276ef
commit 393faa8732
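
At its core, the change gives the scheduler a read-only status surface (StatusReader) that the Prometheus-compatible API consults instead of recomputing rule health itself, falling back to an "ok" status for rules the scheduler has not created yet. The following is a minimal, self-contained Go sketch of that pattern; RuleStatus and StatusReader mirror the types added in the diff below, while AlertRuleKey, mapReader, and statusOrDefault are simplified stand-ins for illustration, not the actual Grafana packages.

package main

import (
    "fmt"
    "time"
)

// RuleStatus mirrors the ngmodels.RuleStatus added by this commit (simplified copy).
type RuleStatus struct {
    Health              string
    LastError           error
    EvaluationTimestamp time.Time
    EvaluationDuration  time.Duration
}

// AlertRuleKey is a simplified stand-in for ngmodels.AlertRuleKey.
type AlertRuleKey struct {
    OrgID int64
    UID   string
}

// StatusReader is the read-only view of the scheduler that the Prometheus API now depends on.
type StatusReader interface {
    Status(key AlertRuleKey) (RuleStatus, bool)
}

// statusOrDefault shows the API-side fallback: rules the scheduler has not seen
// report "ok" health with zero-valued fields, which intentionally differs from Prometheus.
func statusOrDefault(sr StatusReader, key AlertRuleKey) RuleStatus {
    if status, ok := sr.Status(key); ok {
        return status
    }
    return RuleStatus{Health: "ok"}
}

// mapReader is a toy StatusReader backed by a map, standing in for the scheduler's rule registry.
type mapReader map[AlertRuleKey]RuleStatus

func (m mapReader) Status(key AlertRuleKey) (RuleStatus, bool) {
    s, ok := m[key]
    return s, ok
}

func main() {
    scheduled := AlertRuleKey{OrgID: 1, UID: "scheduled-rule"}
    reader := mapReader{scheduled: {Health: "error", LastError: fmt.Errorf("query failed")}}

    fmt.Println(statusOrDefault(reader, scheduled).Health)                             // error
    fmt.Println(statusOrDefault(reader, AlertRuleKey{OrgID: 1, UID: "unseen"}).Health) // ok
}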
@@ -63,6 +63,7 @@ type API struct {
     DataProxy            *datasourceproxy.DataSourceProxyService
     MultiOrgAlertmanager *notifier.MultiOrgAlertmanager
     StateManager         *state.Manager
+    Scheduler            StatusReader
     AccessControl        ac.AccessControl
     Policies             *provisioning.NotificationPolicyService
     ReceiverService      *notifier.ReceiverService
@@ -115,7 +116,7 @@ func (api *API) RegisterAPIEndpoints(m *metrics.API) {
     api.RegisterPrometheusApiEndpoints(NewForkingProm(
         api.DatasourceCache,
         NewLotexProm(proxy, logger),
-        &PrometheusSrv{log: logger, manager: api.StateManager, store: api.RuleStore, authz: ruleAuthzService},
+        &PrometheusSrv{log: logger, manager: api.StateManager, status: api.Scheduler, store: api.RuleStore, authz: ruleAuthzService},
     ), m)
     // Register endpoints for proxying to Cortex Ruler-compatible backends.
     api.RegisterRulerApiEndpoints(NewForkingRuler(
@@ -9,7 +9,6 @@ import (
     "sort"
     "strconv"
     "strings"
-    "time"
 
     "github.com/prometheus/alertmanager/pkg/labels"
     apiv1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -24,9 +23,14 @@ import (
     "github.com/grafana/grafana/pkg/util"
 )
 
+type StatusReader interface {
+    Status(key ngmodels.AlertRuleKey) (ngmodels.RuleStatus, bool)
+}
+
 type PrometheusSrv struct {
     log     log.Logger
     manager state.AlertInstanceManager
+    status  StatusReader
     store   RuleStore
     authz   RuleAccessControlService
 }
@@ -222,7 +226,7 @@ func (srv PrometheusSrv) RouteGetRuleStatuses(c *contextmodel.ReqContext) respon
         namespaces[namespaceUID] = folder.Fullpath
     }
 
-    ruleResponse = PrepareRuleGroupStatuses(srv.log, srv.manager, srv.store, RuleGroupStatusesOptions{
+    ruleResponse = PrepareRuleGroupStatuses(srv.log, srv.manager, srv.status, srv.store, RuleGroupStatusesOptions{
         Ctx:   c.Req.Context(),
         OrgID: c.OrgID,
         Query: c.Req.Form,
@@ -235,7 +239,7 @@ func (srv PrometheusSrv) RouteGetRuleStatuses(c *contextmodel.ReqContext) respon
     return response.JSON(ruleResponse.HTTPStatusCode(), ruleResponse)
 }
 
-func PrepareRuleGroupStatuses(log log.Logger, manager state.AlertInstanceManager, store ListAlertRulesStore, opts RuleGroupStatusesOptions) apimodels.RuleResponse {
+func PrepareRuleGroupStatuses(log log.Logger, manager state.AlertInstanceManager, status StatusReader, store ListAlertRulesStore, opts RuleGroupStatusesOptions) apimodels.RuleResponse {
     ruleResponse := apimodels.RuleResponse{
         DiscoveryBase: apimodels.DiscoveryBase{
             Status: "success",
@@ -346,7 +350,7 @@ func PrepareRuleGroupStatuses(log log.Logger, manager state.AlertInstanceManager
             continue
         }
 
-        ruleGroup, totals := toRuleGroup(log, manager, groupKey, folder, rules, limitAlertsPerRule, withStatesFast, matchers, labelOptions)
+        ruleGroup, totals := toRuleGroup(log, manager, status, groupKey, folder, rules, limitAlertsPerRule, withStatesFast, matchers, labelOptions)
         ruleGroup.Totals = totals
         for k, v := range totals {
             rulesTotals[k] += v
@@ -432,7 +436,7 @@ func matchersMatch(matchers []*labels.Matcher, labels map[string]string) bool {
     return true
 }
 
-func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, groupKey ngmodels.AlertRuleGroupKey, folderFullPath string, rules []*ngmodels.AlertRule, limitAlerts int64, withStates map[eval.State]struct{}, matchers labels.Matchers, labelOptions []ngmodels.LabelOption) (*apimodels.RuleGroup, map[string]int64) {
+func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, sr StatusReader, groupKey ngmodels.AlertRuleGroupKey, folderFullPath string, rules []*ngmodels.AlertRule, limitAlerts int64, withStates map[eval.State]struct{}, matchers labels.Matchers, labelOptions []ngmodels.LabelOption) (*apimodels.RuleGroup, map[string]int64) {
     newGroup := &apimodels.RuleGroup{
         Name: groupKey.RuleGroup,
         // file is what Prometheus uses for provisioning, we replace it with namespace which is the folder in Grafana.
@@ -443,6 +447,15 @@ func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, groupKey ng
 
     ngmodels.RulesGroup(rules).SortByGroupIndex()
     for _, rule := range rules {
+        status, ok := sr.Status(rule.GetKey())
+        // Grafana by design return "ok" health and default other fields for unscheduled rules.
+        // This differs from Prometheus.
+        if !ok {
+            status = ngmodels.RuleStatus{
+                Health: "ok",
+            }
+        }
+
         alertingRule := apimodels.AlertingRule{
             State: "inactive",
             Name:  rule.Title,
@@ -454,9 +467,11 @@ func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, groupKey ng
         newRule := apimodels.Rule{
             Name:           rule.Title,
             Labels:         apimodels.LabelsFromMap(rule.GetLabels(labelOptions...)),
-            Health:         "ok",
+            Health:         status.Health,
+            LastError:      errorOrEmpty(status.LastError),
             Type:           rule.Type().String(),
-            LastEvaluation: time.Time{},
+            LastEvaluation: status.EvaluationTimestamp,
+            EvaluationTime: status.EvaluationDuration.Seconds(),
         }
 
         states := manager.GetStatesForRuleUID(rule.OrgID, rule.UID)
@@ -485,12 +500,6 @@ func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, groupKey ng
                 Value:       valString,
             }
 
-            if alertState.LastEvaluationTime.After(newRule.LastEvaluation) {
-                newRule.LastEvaluation = alertState.LastEvaluationTime
-            }
-
-            newRule.EvaluationTime = alertState.EvaluationDuration.Seconds()
-
             switch alertState.State {
             case eval.Normal:
             case eval.Pending:
@@ -503,14 +512,7 @@ func toRuleGroup(log log.Logger, manager state.AlertInstanceManager, groupKey ng
                 }
                 alertingRule.State = "firing"
             case eval.Error:
-                newRule.Health = "error"
             case eval.NoData:
-                newRule.Health = "nodata"
-            }
-
-            if alertState.Error != nil {
-                newRule.LastError = alertState.Error.Error()
-                newRule.Health = "error"
             }
 
             if len(withStates) > 0 {
@@ -604,3 +606,10 @@ func encodedQueriesOrError(rules []ngmodels.AlertQuery) string {
 
         return err.Error()
 }
+
+func errorOrEmpty(err error) string {
+    if err != nil {
+        return err.Error()
+    }
+    return ""
+}
@@ -489,6 +489,7 @@ func TestRouteGetRuleStatuses(t *testing.T) {
     t.Run("should return sorted", func(t *testing.T) {
         ruleStore := fakes.NewRuleStore(t)
         fakeAIM := NewFakeAlertInstanceManager(t)
+        fakeSch := newFakeSchedulerReader(t).setupStates(fakeAIM)
         groupKey := ngmodels.GenerateGroupKey(orgID)
         gen := ngmodels.RuleGen
         rules := gen.With(gen.WithGroupKey(groupKey), gen.WithUniqueGroupIndex()).GenerateManyRef(5, 10)
@@ -497,6 +498,7 @@ func TestRouteGetRuleStatuses(t *testing.T) {
         api := PrometheusSrv{
             log:     log.NewNopLogger(),
             manager: fakeAIM,
+            status:  fakeSch,
             store:   ruleStore,
             authz:   &fakeRuleAccessControlService{},
         }
@@ -558,6 +560,7 @@ func TestRouteGetRuleStatuses(t *testing.T) {
     api := PrometheusSrv{
         log:     log.NewNopLogger(),
         manager: fakeAIM,
+        status:  newFakeSchedulerReader(t).setupStates(fakeAIM),
        store:   ruleStore,
         authz:   accesscontrol.NewRuleService(acimpl.ProvideAccessControl(featuremgmt.WithFeatures(), zanzana.NewNoopClient())),
     }
@@ -673,6 +676,7 @@ func TestRouteGetRuleStatuses(t *testing.T) {
     api := PrometheusSrv{
         log:     log.NewNopLogger(),
         manager: fakeAIM,
+        status:  newFakeSchedulerReader(t).setupStates(fakeAIM),
         store:   ruleStore,
         authz:   accesscontrol.NewRuleService(acimpl.ProvideAccessControl(featuremgmt.WithFeatures(), zanzana.NewNoopClient())),
     }
@@ -1389,11 +1393,13 @@ func TestRouteGetRuleStatuses(t *testing.T) {
 func setupAPI(t *testing.T) (*fakes.RuleStore, *fakeAlertInstanceManager, PrometheusSrv) {
     fakeStore := fakes.NewRuleStore(t)
     fakeAIM := NewFakeAlertInstanceManager(t)
+    fakeSch := newFakeSchedulerReader(t).setupStates(fakeAIM)
     fakeAuthz := &fakeRuleAccessControlService{}
 
     api := PrometheusSrv{
         log:     log.NewNopLogger(),
         manager: fakeAIM,
+        status:  fakeSch,
         store:   fakeStore,
         authz:   fakeAuthz,
     }
@@ -166,3 +166,29 @@ func (f fakeRuleAccessControlService) AuthorizeDatasourceAccessForRule(ctx conte
 func (f fakeRuleAccessControlService) AuthorizeDatasourceAccessForRuleGroup(ctx context.Context, user identity.Requester, rules models.RulesGroup) error {
     return nil
 }
+
+type statesReader interface {
+    GetStatesForRuleUID(orgID int64, alertRuleUID string) []*state.State
+}
+
+type fakeSchedulerReader struct {
+    states statesReader
+}
+
+func newFakeSchedulerReader(t *testing.T) *fakeSchedulerReader {
+    return &fakeSchedulerReader{}
+}
+
+// setupStates allows the fake scheduler to return data consistent with states defined elsewhere.
+// This can be combined with fakeAlertInstanceManager, for instance.
+func (f *fakeSchedulerReader) setupStates(reader statesReader) *fakeSchedulerReader {
+    f.states = reader
+    return f
+}
+
+func (f *fakeSchedulerReader) Status(key models.AlertRuleKey) (models.RuleStatus, bool) {
+    if f.states == nil {
+        return models.RuleStatus{}, false
+    }
+    return state.StatesToRuleStatus(f.states.GetStatesForRuleUID(key.OrgID, key.UID)), true
+}
@@ -886,3 +886,11 @@ func (r *Record) Fingerprint() data.Fingerprint {
 func hasAnyCondition(rule *AlertRuleWithOptionals) bool {
     return rule.Condition != "" || (rule.Record != nil && rule.Record.From != "")
 }
+
+// RuleStatus contains info about a rule's current evaluation state.
+type RuleStatus struct {
+    Health              string
+    LastError           error
+    EvaluationTimestamp time.Time
+    EvaluationDuration  time.Duration
+}
@@ -478,6 +478,7 @@ func (ng *AlertNG) init() error {
         ProvenanceStore:      ng.store,
         MultiOrgAlertmanager: ng.MultiOrgAlertmanager,
         StateManager:         ng.stateManager,
+        Scheduler:            scheduler,
         AccessControl:        ng.accesscontrol,
         Policies:             policyService,
         ReceiverService:      receiverService,
@@ -40,6 +40,8 @@ type Rule interface {
     Update(lastVersion RuleVersionAndPauseStatus) bool
     // Type gives the type of the rule.
     Type() ngmodels.RuleType
+    // Status indicates the status of the evaluating rule.
+    Status() ngmodels.RuleStatus
 }
 
 type ruleFactoryFunc func(context.Context, *ngmodels.AlertRule) Rule
@@ -180,6 +182,10 @@ func (a *alertRule) Type() ngmodels.RuleType {
     return ngmodels.RuleTypeAlerting
 }
 
+func (a *alertRule) Status() ngmodels.RuleStatus {
+    return a.stateManager.GetStatusForRuleUID(a.key.OrgID, a.key.UID)
+}
+
 // eval signals the rule evaluation routine to perform the evaluation of the rule. Does nothing if the loop is stopped.
 // Before sending a message into the channel, it does non-blocking read to make sure that there is no concurrent send operation.
 // Returns a tuple where first element is
@@ -369,6 +369,17 @@ func TestRuleRoutine(t *testing.T) {
             require.Equal(t, s.Labels, data.Labels(cmd.Labels))
         })
 
+        t.Run("status should accurately reflect latest evaluation", func(t *testing.T) {
+            states := sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID)
+            require.NotEmpty(t, states)
+
+            status := ruleInfo.Status()
+            require.Equal(t, "ok", status.Health)
+            require.Nil(t, status.LastError)
+            require.Equal(t, states[0].LastEvaluationTime, status.EvaluationTimestamp)
+            require.Equal(t, states[0].EvaluationDuration, status.EvaluationDuration)
+        })
+
         t.Run("it reports metrics", func(t *testing.T) {
             // duration metric has 0 values because of mocked clock that do not advance
             expectedMetric := fmt.Sprintf(
@@ -700,6 +711,15 @@ func TestRuleRoutine(t *testing.T) {
             assert.Len(t, args.PostableAlerts, 1)
             assert.Equal(t, state.ErrorAlertName, args.PostableAlerts[0].Labels[prometheusModel.AlertNameLabel])
         })
+
+        t.Run("status should reflect unhealthy rule", func(t *testing.T) {
+            status := ruleInfo.Status()
+            require.Equal(t, "error", status.Health)
+            require.NotNil(t, status.LastError, "expected status to carry the latest evaluation error")
+            require.Contains(t, status.LastError.Error(), "cannot reference itself")
+            require.Equal(t, int64(0), status.EvaluationTimestamp.UTC().Unix())
+            require.Equal(t, time.Duration(0), status.EvaluationDuration)
+        })
     })
 
     t.Run("when there are alerts that should be firing", func(t *testing.T) {
@@ -84,8 +84,8 @@ func (r *recordingRule) Type() ngmodels.RuleType {
     return ngmodels.RuleTypeRecording
 }
 
-func (r *recordingRule) Status() RuleStatus {
-    return RuleStatus{
+func (r *recordingRule) Status() ngmodels.RuleStatus {
+    return ngmodels.RuleStatus{
         Health:              r.health.Load(),
         LastError:           r.lastError.Load(),
         EvaluationTimestamp: r.evaluationTimestamp.Load(),
@@ -56,6 +56,14 @@ func (r *ruleRegistry) exists(key models.AlertRuleKey) bool {
     return ok
 }
 
+// get fetches a rule from the registry by key. It returns (rule, ok) where ok is false if the rule did not exist.
+func (r *ruleRegistry) get(key models.AlertRuleKey) (Rule, bool) {
+    r.mu.Lock()
+    defer r.mu.Unlock()
+    ru, ok := r.rules[key]
+    return ru, ok
+}
+
 // del removes pair that has specific key from the registry.
 // Returns 2-tuple where the first element is value of the removed pair
 // and the second element indicates whether element with the specified key existed.
@@ -171,6 +171,14 @@ func (sch *schedule) Rules() ([]*ngmodels.AlertRule, map[ngmodels.FolderKey]stri
     return sch.schedulableAlertRules.all()
 }
 
+// Status fetches the health of a given scheduled rule, by key.
+func (sch *schedule) Status(key ngmodels.AlertRuleKey) (ngmodels.RuleStatus, bool) {
+    if rule, ok := sch.registry.get(key); ok {
+        return rule.Status(), true
+    }
+    return ngmodels.RuleStatus{}, false
+}
+
 // deleteAlertRule stops evaluation of the rule, deletes it from active rules, and cleans up state cache.
 func (sch *schedule) deleteAlertRule(keys ...ngmodels.AlertRuleKey) {
     for _, key := range keys {
@@ -113,6 +113,11 @@ func TestProcessTicks(t *testing.T) {
 
     folderWithRuleGroup1 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule1.NamespaceUID), alertRule1.RuleGroup)
 
+    t.Run("before 1st tick status should not be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.False(t, ok, "status for a rule should not be present before the scheduler has created it")
+    })
+
     t.Run("on 1st tick alert rule should be evaluated", func(t *testing.T) {
         tick = tick.Add(cfg.BaseInterval)
 
@@ -137,12 +142,25 @@ func TestProcessTicks(t *testing.T) {
         require.NoError(t, err)
     })
 
+    t.Run("after 1st tick status for rule should be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        // Interestingly, the rules in this test are randomised, and are sometimes invalid.
+        // Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
+        // We are only testing that things were scheduled, not that the rule routine worked internally.
+    })
+
     // add alert rule under main org with three base intervals
     alertRule2 := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(3*cfg.BaseInterval), gen.WithTitle("rule-2")).GenerateRef()
     ruleStore.PutRule(ctx, alertRule2)
 
     folderWithRuleGroup2 := fmt.Sprintf("%s;%s", ruleStore.getNamespaceTitle(alertRule2.NamespaceUID), alertRule2.RuleGroup)
 
+    t.Run("before 2nd tick status for rule should not be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule2.GetKey())
+        require.False(t, ok, "status for a rule should not be present before the scheduler has created it")
+    })
+
     t.Run("on 2nd tick first alert rule should be evaluated", func(t *testing.T) {
         tick = tick.Add(cfg.BaseInterval)
         scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
@@ -184,6 +202,16 @@ func TestProcessTicks(t *testing.T) {
         assertEvalRun(t, evalAppliedCh, tick, keys...)
     })
 
+    t.Run("after 3rd tick status for both rules should be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        _, ok = sched.Status(alertRule2.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        // Interestingly, the rules in this test are randomised, and are sometimes invalid.
+        // Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
+        // We are only testing that things were scheduled, not that the rule routine worked internally.
+    })
+
     t.Run("on 4th tick only one alert rule should be evaluated", func(t *testing.T) {
         tick = tick.Add(cfg.BaseInterval)
         scheduled, stopped, updated := sched.processTick(ctx, dispatcherGroup, tick)
@@ -223,6 +251,16 @@ func TestProcessTicks(t *testing.T) {
         require.NoError(t, err)
     })
 
+    t.Run("after 5th tick status for both rules should be available regardless of pause state", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        _, ok = sched.Status(alertRule2.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        // Interestingly, the rules in this test are randomised, and are sometimes invalid.
+        // Therefore, we can't reliably assert anything about the actual health. It might be error, it might not, depending on randomness.
+        // We are only testing that things were scheduled, not that the rule routine worked internally.
+    })
+
     t.Run("on 6th tick all alert rule are paused (it still enters evaluation but it is early skipped)", func(t *testing.T) {
         tick = tick.Add(cfg.BaseInterval)
 
@@ -309,6 +347,13 @@ func TestProcessTicks(t *testing.T) {
         require.NoError(t, err)
     })
 
+    t.Run("after 8th tick status for deleted rule should not be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.False(t, ok, "status for a rule that was deleted should not be available")
+        _, ok = sched.Status(alertRule2.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+    })
+
     t.Run("on 9th tick one alert rule should be evaluated", func(t *testing.T) {
         tick = tick.Add(cfg.BaseInterval)
 
@@ -338,6 +383,14 @@ func TestProcessTicks(t *testing.T) {
         require.Emptyf(t, updated, "None rules are expected to be updated")
         assertEvalRun(t, evalAppliedCh, tick, alertRule3.GetKey())
     })
+
+    t.Run("after 10th tick status for remaining rules should be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.False(t, ok, "status for a rule that was deleted should not be available")
+        _, ok = sched.Status(alertRule2.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+        _, ok = sched.Status(alertRule3.GetKey())
+        require.True(t, ok, "status for a rule that just evaluated was not available")
+    })
     t.Run("on 11th tick rule2 should be updated", func(t *testing.T) {
         newRule2 := models.CopyRule(alertRule2)
         newRule2.Version++
@@ -465,6 +518,14 @@ func TestProcessTicks(t *testing.T) {
 
         require.Emptyf(t, updated, "No rules should be updated")
     })
+    t.Run("after 12th tick no status should be available", func(t *testing.T) {
+        _, ok := sched.Status(alertRule1.GetKey())
+        require.False(t, ok, "status for a rule that was deleted should not be available")
+        _, ok = sched.Status(alertRule2.GetKey())
+        require.False(t, ok, "status for a rule that just evaluated was not available")
+        _, ok = sched.Status(alertRule3.GetKey())
+        require.False(t, ok, "status for a rule that just evaluated was not available")
+    })
 
     t.Run("scheduled rules should be sorted", func(t *testing.T) {
         rules := gen.With(gen.WithOrgID(mainOrgID), gen.WithInterval(cfg.BaseInterval)).GenerateManyRef(10, 20)
@@ -556,6 +556,11 @@ func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*Stat
     return st.cache.getStatesForRuleUID(orgID, alertRuleUID, st.doNotSaveNormalState)
 }
 
+func (st *Manager) GetStatusForRuleUID(orgID int64, alertRuleUID string) ngModels.RuleStatus {
+    states := st.GetStatesForRuleUID(orgID, alertRuleUID)
+    return StatesToRuleStatus(states)
+}
+
 func (st *Manager) Put(states []*State) {
     for _, s := range states {
         st.cache.set(s)
@@ -623,3 +628,34 @@ func (st *Manager) deleteStaleStatesFromCache(ctx context.Context, logger log.Lo
 func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
     return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt)
 }
+
+func StatesToRuleStatus(states []*State) ngModels.RuleStatus {
+    status := ngModels.RuleStatus{
+        Health:              "ok",
+        LastError:           nil,
+        EvaluationTimestamp: time.Time{},
+    }
+    for _, state := range states {
+        if state.LastEvaluationTime.After(status.EvaluationTimestamp) {
+            status.EvaluationTimestamp = state.LastEvaluationTime
+        }
+
+        status.EvaluationDuration = state.EvaluationDuration
+
+        switch state.State {
+        case eval.Normal:
+        case eval.Pending:
+        case eval.Alerting:
+        case eval.Error:
+            status.Health = "error"
+        case eval.NoData:
+            status.Health = "nodata"
+        }
+
+        if state.Error != nil {
+            status.LastError = state.Error
+            status.Health = "error"
+        }
+    }
+    return status
+}
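
For a quick sense of the aggregation semantics of StatesToRuleStatus above — the latest evaluation timestamp wins, and any error or NoData state degrades the overall health — here is a standalone sketch. The types evalState and ruleState and the aggregate helper are simplified stand-ins for illustration, not the real eval and state packages:

package main

import (
    "errors"
    "fmt"
    "time"
)

// evalState and ruleState are simplified stand-ins for eval.State and state.State.
type evalState int

const (
    stateNormal evalState = iota
    stateAlerting
    stateError
    stateNoData
)

type ruleState struct {
    State              evalState
    Error              error
    LastEvaluationTime time.Time
    EvaluationDuration time.Duration
}

// aggregate mirrors the shape of StatesToRuleStatus: start healthy, keep the newest
// evaluation time, and let error/nodata states (or an attached error) override the health.
func aggregate(states []*ruleState) (health string, lastError error, lastEval time.Time) {
    health = "ok"
    for _, s := range states {
        if s.LastEvaluationTime.After(lastEval) {
            lastEval = s.LastEvaluationTime
        }
        switch s.State {
        case stateError:
            health = "error"
        case stateNoData:
            health = "nodata"
        }
        if s.Error != nil {
            lastError = s.Error
            health = "error"
        }
    }
    return health, lastError, lastEval
}

func main() {
    now := time.Now()
    health, lastError, lastEval := aggregate([]*ruleState{
        {State: stateNormal, LastEvaluationTime: now.Add(-time.Minute)},
        {State: stateError, Error: errors.New("datasource unreachable"), LastEvaluationTime: now},
    })
    fmt.Println(health, lastError, lastEval.Equal(now)) // error datasource unreachable true
}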