Alerting: Enhancements to /rules (#33085)
* set processing time
* merge labels and set on response
* use state cache for adding alerts to rules
* minor cleanup
* pr feedback
* Do not initialize mutex unnecessarily
* linter

Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
This commit is contained in:
parent 7480d9e2be
commit 4be1d84f23
@@ -5,6 +5,7 @@ import (
     "net/http"
     "time"
 
+    "github.com/grafana/grafana/pkg/services/ngalert/eval"
     apiv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 
     ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@@ -84,18 +85,9 @@ func (srv PrometheusSrv) RouteGetRuleStatuses(c *models.ReqContext) response.Res
         LastEvaluation: time.Time{},
         EvaluationTime: 0, // TODO: see if we are able to pass this along with evaluation results
     }
-    for _, rule := range alertRuleQuery.Result {
-        instanceQuery := ngmodels.ListAlertInstancesQuery{
-            DefinitionOrgID: c.SignedInUser.OrgId,
-            DefinitionUID:   rule.UID,
-        }
-        if err := srv.store.ListAlertInstances(&instanceQuery); err != nil {
-            ruleResponse.DiscoveryBase.Status = "error"
-            ruleResponse.DiscoveryBase.Error = fmt.Sprintf("failure getting alerts for rule %s: %s", rule.UID, err.Error())
-            ruleResponse.DiscoveryBase.ErrorType = apiv1.ErrServer
-            return response.JSON(http.StatusInternalServerError, ruleResponse)
-        }
 
+    stateMap := srv.stateTracker.GetStatesByRuleUID()
+    for _, rule := range alertRuleQuery.Result {
         alertingRule := apimodels.AlertingRule{
             State: "inactive",
             Name:  rule.Title,
@@ -106,36 +98,46 @@ func (srv PrometheusSrv) RouteGetRuleStatuses(c *models.ReqContext) response.Res
 
         newRule := apimodels.Rule{
             Name:   rule.Title,
-            Labels: nil, // TODO: NG AlertRule does not have labels but does have annotations
+            Labels: rule.Labels,
             Health: "ok", // TODO: update this in the future when error and noData states are being evaluated and set
             Type:   apiv1.RuleTypeAlerting,
-            LastEvaluation: time.Time{}, // TODO: set this to be rule evaluation time once it is being set
-            EvaluationTime: 0,           // TODO: set this once we are saving it or adding it to evaluation results
+            LastEvaluation: time.Time{},
         }
-        for _, instance := range instanceQuery.Result {
-            activeAt := instance.CurrentStateSince
+        for _, alertState := range stateMap[rule.UID] {
+            activeAt := alertState.StartsAt
             alert := &apimodels.Alert{
-                Labels:      map[string]string(instance.Labels),
-                Annotations: nil, // TODO: set these once they are added to evaluation results
-                State:       translateInstanceState(instance.CurrentState),
+                Labels:      map[string]string(alertState.Labels),
+                Annotations: alertState.Annotations,
+                State:       alertState.State.String(),
                 ActiveAt:    &activeAt,
                 Value:       "", // TODO: set this once it is added to the evaluation results
             }
-            if instance.LastEvalTime.After(newRule.LastEvaluation) {
-                newRule.LastEvaluation = instance.LastEvalTime
-                newGroup.LastEvaluation = instance.LastEvalTime
+            if alertState.LastEvaluationTime.After(newRule.LastEvaluation) {
+                newRule.LastEvaluation = alertState.LastEvaluationTime
+                newGroup.LastEvaluation = alertState.LastEvaluationTime
             }
-            switch alert.State {
-            case "pending":
+            alertingRule.Duration = alertState.EvaluationDuration.Seconds()
+            newRule.EvaluationTime = alertState.EvaluationDuration.Seconds()
+
+            switch alertState.State {
+            case eval.Normal:
+            case eval.Pending:
                 if alertingRule.State == "inactive" {
                     alertingRule.State = "pending"
                 }
-            case "firing":
+            case eval.Alerting:
                 alertingRule.State = "firing"
+            case eval.Error:
+                // handle Error case based on configuration in alertRule
+            case eval.NoData:
+                // handle NoData case based on configuration in alertRule
             }
 
             alertingRule.Alerts = append(alertingRule.Alerts, alert)
         }
 
         alertingRule.Rule = newRule
         newGroup.Rules = append(newGroup.Rules, alertingRule)
         newGroup.Interval = float64(rule.IntervalSeconds)
@@ -144,16 +146,3 @@ func (srv PrometheusSrv) RouteGetRuleStatuses(c *models.ReqContext) response.Res
     }
     return response.JSON(http.StatusOK, ruleResponse)
 }
-
-func translateInstanceState(state ngmodels.InstanceStateType) string {
-    switch {
-    case state == ngmodels.InstanceStateFiring:
-        return "firing"
-    case state == ngmodels.InstanceStateNormal:
-        return "inactive"
-    case state == ngmodels.InstanceStatePending:
-        return "pending"
-    default:
-        return "inactive"
-    }
-}
@@ -76,6 +76,11 @@ const (
     // that evaluated to true (Alerting).
     Alerting
 
+    // Pending is the eval state for an alert instance condition
+    // that evaluated to true (Alerting) but has not yet met
+    // the For duration defined in AlertRule
+    Pending
+
     // NoData is the eval state for an alert rule condition
     // that evaluated to NoData.
     NoData
@@ -86,7 +91,7 @@ const (
 )
 
 func (s State) String() string {
-    return [...]string{"Normal", "Alerting", "NoData", "Error"}[s]
+    return [...]string{"Normal", "Alerting", "Pending", "NoData", "Error"}[s]
 }
 
 // AlertExecCtx is the context provided for executing an alert condition.
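For context, a small self-contained sketch (not part of the commit; the positions of Normal and Error are assumed from the String() table) showing why the new Pending constant and the String() lookup slice have to change together: the slice is indexed by the constant's iota value, so inserting Pending between Alerting and NoData shifts every later index by one.

package main

import "fmt"

type State int

const (
    Normal State = iota
    Alerting
    Pending // newly inserted; shifts NoData and Error up by one
    NoData
    Error
)

// String must list the names in the same order as the iota block above,
// otherwise NoData and Error would print the wrong name.
func (s State) String() string {
    return [...]string{"Normal", "Alerting", "Pending", "NoData", "Error"}[s]
}

func main() {
    fmt.Println(Pending, NoData, Error) // Pending NoData Error
}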
@@ -78,7 +78,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key models.AlertRul
             return err
         }
 
-        processedStates := stateTracker.ProcessEvalResults(alertRule, results)
+        processedStates := stateTracker.ProcessEvalResults(alertRule, results, end.Sub(start))
         sch.saveAlertStates(processedStates)
         alerts := FromAlertStateToPostableAlerts(processedStates)
         sch.log.Debug("sending alerts to notifier", "count", len(alerts.PostableAlerts))
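Not part of the commit, just a self-contained sketch (with stand-in names) of the "set processing time" idea behind the one-line ruleRoutine change above: the scheduler measures how long one evaluation took and passes that duration to ProcessEvalResults, so it can be stored on each alert state and reported by /rules as the rule's EvaluationTime.

package main

import (
    "fmt"
    "time"
)

// evaluate stands in for evaluating an alert rule's condition.
func evaluate() []string {
    time.Sleep(5 * time.Millisecond)
    return []string{"Normal"}
}

func main() {
    start := time.Now()
    results := evaluate()
    evaluationDuration := time.Since(start) // the real code passes end.Sub(start) to ProcessEvalResults
    fmt.Printf("processed %d result(s) in %s\n", len(results), evaluationDuration)
}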
@@ -321,7 +321,7 @@ func (sch *schedule) saveAlertStates(states []state.AlertState) {
     for _, s := range states {
         cmd := models.SaveAlertInstanceCommand{
             DefinitionOrgID: s.OrgID,
-            DefinitionUID:   s.UID,
+            DefinitionUID:   s.AlertRuleUID,
             Labels:          models.InstanceLabels(s.Labels),
             State:           models.InstanceStateType(s.State.String()),
             LastEvalTime:    s.LastEvaluationTime,
@@ -330,7 +330,7 @@ func (sch *schedule) saveAlertStates(states []state.AlertState) {
         }
         err := sch.store.SaveAlertInstance(&cmd)
         if err != nil {
-            sch.log.Error("failed to save alert state", "uid", s.UID, "orgId", s.OrgID, "labels", s.Labels.String(), "state", s.State.String(), "msg", err.Error())
+            sch.log.Error("failed to save alert state", "uid", s.AlertRuleUID, "orgId", s.OrgID, "labels", s.Labels.String(), "state", s.State.String(), "msg", err.Error())
         }
     }
 }
@@ -355,7 +355,7 @@ func (sch *schedule) WarmStateCache(st *state.StateTracker) {
     for _, entry := range cmd.Result {
         lbs := map[string]string(entry.Labels)
         stateForEntry := state.AlertState{
-            UID:     entry.DefinitionUID,
+            AlertRuleUID: entry.DefinitionUID,
             OrgID:   entry.DefinitionOrgID,
             CacheId: fmt.Sprintf("%s %s", entry.DefinitionUID, lbs),
             Labels:  lbs,
@@ -5,6 +5,8 @@ import (
     "sync"
     "time"
 
+    prometheusModel "github.com/prometheus/common/model"
+
     "github.com/grafana/grafana/pkg/infra/log"
 
     "github.com/grafana/grafana-plugin-sdk-go/data"
@@ -13,7 +15,7 @@ import (
 )
 
 type AlertState struct {
-    UID     string
+    AlertRuleUID string
     OrgID   int64
     CacheId string
     Labels  data.Labels
@@ -22,6 +24,7 @@ type AlertState struct {
     StartsAt           time.Time
     EndsAt             time.Time
     LastEvaluationTime time.Time
+    EvaluationDuration time.Duration
     Annotations        map[string]string
 }
 
@@ -31,21 +34,20 @@ type StateEvaluation struct {
 }
 
 type cache struct {
-    cacheMap map[string]AlertState
-    mu       sync.Mutex
+    states    map[string]AlertState
+    mtxStates sync.Mutex
 }
 
 type StateTracker struct {
-    stateCache cache
+    cache cache
     quit  chan struct{}
     Log   log.Logger
 }
 
 func NewStateTracker(logger log.Logger) *StateTracker {
     tracker := &StateTracker{
-        stateCache: cache{
-            cacheMap: make(map[string]AlertState),
-            mu:       sync.Mutex{},
+        cache: cache{
+            states: make(map[string]AlertState),
         },
         quit: make(chan struct{}),
         Log:  logger,
@@ -54,135 +56,158 @@ func NewStateTracker(logger log.Logger) *StateTracker {
     return tracker
 }
 
-func (st *StateTracker) getOrCreate(alertRule *ngModels.AlertRule, result eval.Result) AlertState {
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    lbs := data.Labels{}
-    if len(result.Instance) > 0 {
-        lbs = result.Instance
-    }
+func (st *StateTracker) getOrCreate(alertRule *ngModels.AlertRule, result eval.Result, evaluationDuration time.Duration) AlertState {
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    // if duplicate labels exist, alertRule label will take precedence
+    lbs := mergeLabels(alertRule.Labels, result.Instance)
     lbs["__alert_rule_uid__"] = alertRule.UID
     lbs["__alert_rule_namespace_uid__"] = alertRule.NamespaceUID
-    lbs["__alert_rule_title__"] = alertRule.Title
+    lbs[prometheusModel.AlertNameLabel] = alertRule.Title
 
+    id := fmt.Sprintf("%s", map[string]string(lbs))
+    if state, ok := st.cache.states[id]; ok {
+        return state
+    }
+
     annotations := map[string]string{}
     if len(alertRule.Annotations) > 0 {
         annotations = alertRule.Annotations
     }
 
-    idString := fmt.Sprintf("%s", map[string]string(lbs))
-    if state, ok := st.stateCache.cacheMap[idString]; ok {
-        return state
+    newResults := []StateEvaluation{
+        {
+            EvaluationTime:  result.EvaluatedAt,
+            EvaluationState: result.State,
+        },
     }
 
-    st.Log.Debug("adding new alert state cache entry", "cacheId", idString, "state", result.State.String(), "evaluatedAt", result.EvaluatedAt.String())
+    st.Log.Debug("adding new alert state cache entry", "cacheId", id, "state", result.State.String(), "evaluatedAt", result.EvaluatedAt.String())
     newState := AlertState{
-        UID:         alertRule.UID,
+        AlertRuleUID: alertRule.UID,
         OrgID:       alertRule.OrgID,
-        CacheId:     idString,
+        CacheId:     id,
         Labels:      lbs,
         State:       result.State,
-        Results:     []StateEvaluation{},
+        Results:     newResults,
         Annotations: annotations,
+        EvaluationDuration: evaluationDuration,
     }
     if result.State == eval.Alerting {
         newState.StartsAt = result.EvaluatedAt
     }
-    st.stateCache.cacheMap[idString] = newState
+    st.cache.states[id] = newState
     return newState
 }
 
-func (st *StateTracker) set(stateEntry AlertState) {
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    st.stateCache.cacheMap[stateEntry.CacheId] = stateEntry
+func (st *StateTracker) set(entry AlertState) {
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    st.cache.states[entry.CacheId] = entry
 }
 
-func (st *StateTracker) Get(stateId string) AlertState {
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    return st.stateCache.cacheMap[stateId]
+func (st *StateTracker) Get(id string) AlertState {
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    return st.cache.states[id]
 }
 
 //Used to ensure a clean cache on startup
 func (st *StateTracker) ResetCache() {
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    st.stateCache.cacheMap = make(map[string]AlertState)
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    st.cache.states = make(map[string]AlertState)
 }
 
-func (st *StateTracker) ProcessEvalResults(alertRule *ngModels.AlertRule, results eval.Results) []AlertState {
+func (st *StateTracker) ProcessEvalResults(alertRule *ngModels.AlertRule, results eval.Results, evaluationDuration time.Duration) []AlertState {
     st.Log.Info("state tracker processing evaluation results", "uid", alertRule.UID, "resultCount", len(results))
-    var changedStates []AlertState
+    var states []AlertState
     for _, result := range results {
-        s, _ := st.setNextState(alertRule, result)
-        changedStates = append(changedStates, s)
+        s := st.setNextState(alertRule, result, evaluationDuration)
+        states = append(states, s)
     }
-    st.Log.Debug("returning changed states to scheduler", "count", len(changedStates))
-    return changedStates
+    st.Log.Debug("returning changed states to scheduler", "count", len(states))
+    return states
 }
 
 //TODO: When calculating if an alert should not be firing anymore, we should take three things into account:
 // 1. The re-send the delay if any, we don't want to send every firing alert every time, we should have a fixed delay across all alerts to avoid saturating the notification system
-// 2. The evaluation interval defined for this particular alert - we don't support that yet but will eventually allow you to define how often do you want this alert to be evaluted
-// 3. The base interval defined by the scheduler - in the case where #2 is not yet an option we can use the base interval at which every alert runs.
 //Set the current state based on evaluation results
-//return the state and a bool indicating whether a state transition occurred
-func (st *StateTracker) setNextState(alertRule *ngModels.AlertRule, result eval.Result) (AlertState, bool) {
-    currentState := st.getOrCreate(alertRule, result)
+func (st *StateTracker) setNextState(alertRule *ngModels.AlertRule, result eval.Result, evaluationDuration time.Duration) AlertState {
+    currentState := st.getOrCreate(alertRule, result, evaluationDuration)
     st.Log.Debug("setting alert state", "uid", alertRule.UID)
     switch {
     case currentState.State == result.State:
         st.Log.Debug("no state transition", "cacheId", currentState.CacheId, "state", currentState.State.String())
         currentState.LastEvaluationTime = result.EvaluatedAt
+        currentState.EvaluationDuration = evaluationDuration
         currentState.Results = append(currentState.Results, StateEvaluation{
            EvaluationTime:  result.EvaluatedAt,
            EvaluationState: result.State,
         })
         if currentState.State == eval.Alerting {
-            currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
+            currentState.EndsAt = result.EvaluatedAt.Add(alertRule.For * time.Second)
         }
         st.set(currentState)
-        return currentState, false
+        return currentState
     case currentState.State == eval.Normal && result.State == eval.Alerting:
         st.Log.Debug("state transition from normal to alerting", "cacheId", currentState.CacheId)
         currentState.State = eval.Alerting
         currentState.LastEvaluationTime = result.EvaluatedAt
         currentState.StartsAt = result.EvaluatedAt
-        currentState.EndsAt = result.EvaluatedAt.Add(40 * time.Second)
+        currentState.EndsAt = result.EvaluatedAt.Add(alertRule.For * time.Second)
+        currentState.EvaluationDuration = evaluationDuration
         currentState.Results = append(currentState.Results, StateEvaluation{
            EvaluationTime:  result.EvaluatedAt,
            EvaluationState: result.State,
         })
         currentState.Annotations["alerting"] = result.EvaluatedAt.String()
         st.set(currentState)
-        return currentState, true
+        return currentState
    case currentState.State == eval.Alerting && result.State == eval.Normal:
         st.Log.Debug("state transition from alerting to normal", "cacheId", currentState.CacheId)
         currentState.State = eval.Normal
         currentState.LastEvaluationTime = result.EvaluatedAt
         currentState.EndsAt = result.EvaluatedAt
+        currentState.EvaluationDuration = evaluationDuration
         currentState.Results = append(currentState.Results, StateEvaluation{
            EvaluationTime:  result.EvaluatedAt,
            EvaluationState: result.State,
         })
         st.set(currentState)
-        return currentState, true
+        return currentState
     default:
-        return currentState, false
+        return currentState
     }
 }
 
 func (st *StateTracker) GetAll() []AlertState {
     var states []AlertState
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    for _, v := range st.stateCache.cacheMap {
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    for _, v := range st.cache.states {
         states = append(states, v)
     }
     return states
 }
 
+func (st *StateTracker) GetStatesByRuleUID() map[string][]AlertState {
+    ruleMap := make(map[string][]AlertState)
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    for _, state := range st.cache.states {
+        if ruleStates, ok := ruleMap[state.AlertRuleUID]; ok {
+            ruleStates = append(ruleStates, state)
+            ruleMap[state.AlertRuleUID] = ruleStates
+        } else {
+            ruleStates := []AlertState{state}
+            ruleMap[state.AlertRuleUID] = ruleStates
+        }
+    }
+    return ruleMap
+}
+
 func (st *StateTracker) cleanUp() {
     ticker := time.NewTicker(time.Duration(60) * time.Minute)
     st.Log.Debug("starting cleanup process", "intervalMinutes", 60)
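A standalone illustration (plain local types, hypothetical data; not a verbatim excerpt) of the grouping that the new GetStatesByRuleUID method performs: every cached state is bucketed under its AlertRuleUID, so RouteGetRuleStatuses can fetch all instances of a rule with a single map lookup instead of querying the alert instance store.

package main

import "fmt"

// alertState is a stripped-down stand-in for state.AlertState.
type alertState struct {
    AlertRuleUID string
    CacheId      string
}

// groupByRuleUID mirrors the bucketing done by GetStatesByRuleUID.
func groupByRuleUID(states []alertState) map[string][]alertState {
    ruleMap := make(map[string][]alertState)
    for _, s := range states {
        if ruleStates, ok := ruleMap[s.AlertRuleUID]; ok {
            ruleMap[s.AlertRuleUID] = append(ruleStates, s)
        } else {
            ruleMap[s.AlertRuleUID] = []alertState{s}
        }
    }
    return ruleMap
}

func main() {
    states := []alertState{
        {AlertRuleUID: "rule-a", CacheId: "rule-a map[instance:host-1]"},
        {AlertRuleUID: "rule-a", CacheId: "rule-a map[instance:host-2]"},
        {AlertRuleUID: "rule-b", CacheId: "rule-b map[instance:host-1]"},
    }
    for uid, ss := range groupByRuleUID(states) {
        fmt.Println(uid, len(ss)) // rule-a has 2 instances, rule-b has 1
    }
}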
@@ -200,9 +225,9 @@ func (st *StateTracker) cleanUp() {
 
 func (st *StateTracker) trim() {
     st.Log.Info("trimming alert state cache", "now", time.Now())
-    st.stateCache.mu.Lock()
-    defer st.stateCache.mu.Unlock()
-    for _, v := range st.stateCache.cacheMap {
+    st.cache.mtxStates.Lock()
+    defer st.cache.mtxStates.Unlock()
+    for _, v := range st.cache.states {
         if len(v.Results) > 100 {
             st.Log.Debug("trimming result set", "cacheId", v.CacheId, "count", len(v.Results)-100)
             newResults := make([]StateEvaluation, 100)
@@ -214,7 +239,7 @@ func (st *StateTracker) trim() {
 }
 
 func (a AlertState) Equals(b AlertState) bool {
-    return a.UID == b.UID &&
+    return a.AlertRuleUID == b.AlertRuleUID &&
         a.OrgID == b.OrgID &&
         a.CacheId == b.CacheId &&
         a.Labels.String() == b.Labels.String() &&
@@ -229,3 +254,17 @@ func (st *StateTracker) Put(states []AlertState) {
         st.set(s)
     }
 }
+
+// if duplicate labels exist, keep the value from the first set
+func mergeLabels(a, b data.Labels) data.Labels {
+    newLbs := data.Labels{}
+    for k, v := range a {
+        newLbs[k] = v
+    }
+    for k, v := range b {
+        if _, ok := newLbs[k]; !ok {
+            newLbs[k] = v
+        }
+    }
+    return newLbs
+}
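To make the precedence rule of the new mergeLabels helper concrete, here is a small self-contained example using plain maps (mergeLabels itself is unexported and takes data.Labels; the loop below mirrors its logic): when both sets define the same key, the first argument, i.e. the alert rule's own labels, wins, and keys unique to either set are kept.

package main

import "fmt"

// mergeLabels reproduces the helper added above on plain map[string]string:
// values from a take precedence over values from b for duplicate keys.
func mergeLabels(a, b map[string]string) map[string]string {
    merged := map[string]string{}
    for k, v := range a {
        merged[k] = v
    }
    for k, v := range b {
        if _, ok := merged[k]; !ok {
            merged[k] = v
        }
    }
    return merged
}

func main() {
    ruleLabels := map[string]string{"severity": "critical", "team": "alerting"}
    resultLabels := map[string]string{"severity": "warning", "instance": "host-1"}
    fmt.Println(mergeLabels(ruleLabels, resultLabels))
    // map[instance:host-1 severity:critical team:alerting]
}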
@@ -35,11 +35,11 @@ func TestWarmStateCache(t *testing.T) {
 
     expectedEntries := []state.AlertState{
         {
-            UID:     "test_uid",
+            AlertRuleUID: "test_uid",
             OrgID:   123,
             CacheId: "test_uid map[test1:testValue1]",
             Labels:  data.Labels{"test1": "testValue1"},
             State:   eval.Normal,
             Results: []state.StateEvaluation{
                 {EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
             },
@@ -47,11 +47,11 @@ func TestWarmStateCache(t *testing.T) {
             EndsAt:             evaluationTime.Add(1 * time.Minute),
             LastEvaluationTime: evaluationTime,
         }, {
-            UID:     "test_uid",
+            AlertRuleUID: "test_uid",
             OrgID:   123,
             CacheId: "test_uid map[test2:testValue2]",
             Labels:  data.Labels{"test2": "testValue2"},
             State:   eval.Alerting,
             Results: []state.StateEvaluation{
                 {EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
             },
@@ -20,7 +20,11 @@ func TestProcessEvalResults(t *testing.T) {
     if err != nil {
         t.Fatalf("error parsing date format: %s", err.Error())
     }
-    cacheId := "map[__alert_rule_namespace_uid__:test_namespace __alert_rule_title__:test_title __alert_rule_uid__:test_uid label1:value1 label2:value2]"
+    cacheId := "map[__alert_rule_namespace_uid__:test_namespace __alert_rule_uid__:test_uid alertname:test_title label1:value1 label2:value2 rule_label:rule_value]"
+
+    ruleLabels := map[string]string{
+        "rule_label": "rule_value",
+    }
     alertRule := models.AlertRule{
         ID:    1,
         OrgID: 123,
@@ -28,13 +32,17 @@ func TestProcessEvalResults(t *testing.T) {
         Condition:    "A",
         UID:          "test_uid",
         NamespaceUID: "test_namespace",
+        For:          10 * time.Second,
+        Labels:       ruleLabels,
     }
+    processingTime := 10 * time.Millisecond
     expectedLabels := data.Labels{
         "label1": "value1",
         "label2": "value2",
+        "rule_label": "rule_value",
         "__alert_rule_uid__": "test_uid",
         "__alert_rule_namespace_uid__": "test_namespace",
-        "__alert_rule_title__": "test_title",
+        "alertname": "test_title",
     }
     testCases := []struct {
         desc string
@@ -60,11 +68,11 @@ func TestProcessEvalResults(t *testing.T) {
             expectedResultCount: 1,
             expectedCacheEntries: []state.AlertState{
                 {
-                    UID:     "test_uid",
+                    AlertRuleUID: "test_uid",
                     OrgID:   123,
                     CacheId: cacheId,
                     Labels:  expectedLabels,
                     State:   eval.Normal,
                     Results: []state.StateEvaluation{
                         {EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
                     },
@@ -94,17 +102,17 @@ func TestProcessEvalResults(t *testing.T) {
             expectedResultCount: 2,
             expectedCacheEntries: []state.AlertState{
                 {
-                    UID:     "test_uid",
+                    AlertRuleUID: "test_uid",
                     OrgID:   123,
                     CacheId: cacheId,
                     Labels:  expectedLabels,
                     State:   eval.Alerting,
                     Results: []state.StateEvaluation{
                         {EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
                         {EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
                     },
                     StartsAt: evaluationTime.Add(1 * time.Minute),
-                    EndsAt:   evaluationTime.Add(100 * time.Second),
+                    EndsAt:   evaluationTime.Add(alertRule.For * time.Second).Add(1 * time.Minute),
                     LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
                 },
             },
@@ -129,11 +137,11 @@ func TestProcessEvalResults(t *testing.T) {
             expectedResultCount: 2,
             expectedCacheEntries: []state.AlertState{
                 {
-                    UID:     "test_uid",
+                    AlertRuleUID: "test_uid",
                     OrgID:   123,
                     CacheId: cacheId,
                     Labels:  expectedLabels,
                     State:   eval.Normal,
                     Results: []state.StateEvaluation{
                         {EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
                         {EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
@@ -164,17 +172,17 @@ func TestProcessEvalResults(t *testing.T) {
             expectedResultCount: 2,
             expectedCacheEntries: []state.AlertState{
                 {
-                    UID:     "test_uid",
+                    AlertRuleUID: "test_uid",
                     OrgID:   123,
                     CacheId: cacheId,
                     Labels:  expectedLabels,
                     State:   eval.Alerting,
                     Results: []state.StateEvaluation{
                         {EvaluationTime: evaluationTime, EvaluationState: eval.Alerting},
                         {EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Alerting},
                     },
                     StartsAt: evaluationTime,
-                    EndsAt:   evaluationTime.Add(100 * time.Second),
+                    EndsAt:   evaluationTime.Add(alertRule.For * time.Second).Add(1 * time.Minute),
                     LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
                 },
             },
@@ -199,11 +207,11 @@ func TestProcessEvalResults(t *testing.T) {
             expectedResultCount: 2,
             expectedCacheEntries: []state.AlertState{
                 {
-                    UID:     "test_uid",
+                    AlertRuleUID: "test_uid",
                     OrgID:   123,
                     CacheId: cacheId,
                     Labels:  expectedLabels,
                     State:   eval.Normal,
                     Results: []state.StateEvaluation{
                         {EvaluationTime: evaluationTime, EvaluationState: eval.Normal},
                         {EvaluationTime: evaluationTime.Add(1 * time.Minute), EvaluationState: eval.Normal},
@@ -219,7 +227,7 @@ func TestProcessEvalResults(t *testing.T) {
     for _, tc := range testCases {
         t.Run("all fields for a cache entry are set correctly", func(t *testing.T) {
             st := state.NewStateTracker(log.New("test_state_tracker"))
-            _ = st.ProcessEvalResults(&alertRule, tc.evalResults)
+            _ = st.ProcessEvalResults(&alertRule, tc.evalResults, processingTime)
             for _, entry := range tc.expectedCacheEntries {
                 if !entry.Equals(st.Get(entry.CacheId)) {
                     t.Log(tc.desc)
@ -231,7 +239,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
|
|
||||||
t.Run("the expected number of entries are added to the cache", func(t *testing.T) {
|
t.Run("the expected number of entries are added to the cache", func(t *testing.T) {
|
||||||
st := state.NewStateTracker(log.New("test_state_tracker"))
|
st := state.NewStateTracker(log.New("test_state_tracker"))
|
||||||
st.ProcessEvalResults(&alertRule, tc.evalResults)
|
st.ProcessEvalResults(&alertRule, tc.evalResults, processingTime)
|
||||||
assert.Equal(t, len(tc.expectedCacheEntries), len(st.GetAll()))
|
assert.Equal(t, len(tc.expectedCacheEntries), len(st.GetAll()))
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -240,15 +248,15 @@ func TestProcessEvalResults(t *testing.T) {
         //for a unique set of labels.
         t.Run("the expected number of states are returned to the caller", func(t *testing.T) {
             st := state.NewStateTracker(log.New("test_state_tracker"))
-            results := st.ProcessEvalResults(&alertRule, tc.evalResults)
+            results := st.ProcessEvalResults(&alertRule, tc.evalResults, processingTime)
             assert.Equal(t, len(tc.evalResults), len(results))
         })
     }
 }
 
 func printEntryDiff(a, b state.AlertState, t *testing.T) {
-    if a.UID != b.UID {
-        t.Log(fmt.Sprintf("%v \t %v\n", a.UID, b.UID))
+    if a.AlertRuleUID != b.AlertRuleUID {
+        t.Log(fmt.Sprintf("%v \t %v\n", a.AlertRuleUID, b.AlertRuleUID))
     }
     if a.OrgID != b.OrgID {
         t.Log(fmt.Sprintf("%v \t %v\n", a.OrgID, b.OrgID))