2021-04-23 14:32:25 -05:00
|
|
|
package state
|
|
|
|
|
|
|
|
import (
|
2021-09-14 09:08:04 -05:00
|
|
|
"context"
|
2021-10-04 13:04:37 -05:00
|
|
|
"net/url"
|
2021-04-23 14:32:25 -05:00
|
|
|
"time"
|
|
|
|
|
2022-06-22 11:18:42 -05:00
|
|
|
"github.com/benbjohnson/clock"
|
2022-02-24 04:58:54 -06:00
|
|
|
"github.com/grafana/grafana-plugin-sdk-go/data"
|
2021-07-13 11:50:10 -05:00
|
|
|
|
2021-04-23 14:32:25 -05:00
|
|
|
"github.com/grafana/grafana/pkg/infra/log"
|
|
|
|
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
2021-04-30 11:28:06 -05:00
|
|
|
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
|
2021-04-23 14:32:25 -05:00
|
|
|
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
|
|
)
|
|
|
|
|
2022-11-04 16:06:47 -05:00
|
|
|
// Package-level defaults used by the state manager.
var (
	// ResendDelay is the default copied into Manager.ResendDelay by NewManager.
	ResendDelay = 30 * time.Second

	// MetricsScrapeInterval is the period of the ticker that drives metric
	// recording in Manager.Run.
	// Setting to a reasonable default scrape interval for Prometheus.
	// TODO: parameterize?
	MetricsScrapeInterval = 15 * time.Second
)
|
2021-09-02 10:22:59 -05:00
|
|
|
|
2022-03-09 12:20:29 -06:00
|
|
|
// AlertInstanceManager defines the interface for querying the current alert instances.
|
|
|
|
type AlertInstanceManager interface {
|
|
|
|
GetAll(orgID int64) []*State
|
|
|
|
GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State
|
|
|
|
}
|
|
|
|
|
2021-04-23 14:32:25 -05:00
|
|
|
type Manager struct {
|
2021-07-07 11:18:31 -05:00
|
|
|
log log.Logger
|
2021-09-14 06:55:01 -05:00
|
|
|
metrics *metrics.State
|
2021-07-07 11:18:31 -05:00
|
|
|
|
2022-06-22 11:18:42 -05:00
|
|
|
clock clock.Clock
|
2021-05-19 15:15:09 -05:00
|
|
|
cache *cache
|
|
|
|
ResendDelay time.Duration
|
2021-07-07 11:18:31 -05:00
|
|
|
|
2022-10-05 15:32:20 -05:00
|
|
|
instanceStore InstanceStore
|
2022-11-09 15:06:49 -06:00
|
|
|
images ImageCapturer
|
2022-10-05 15:32:20 -05:00
|
|
|
historian Historian
|
2022-10-06 14:30:12 -05:00
|
|
|
externalURL *url.URL
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2022-11-09 15:06:49 -06:00
|
|
|
func NewManager(metrics *metrics.State, externalURL *url.URL, instanceStore InstanceStore, images ImageCapturer, clock clock.Clock, historian Historian) *Manager {
|
2022-11-04 16:06:47 -05:00
|
|
|
return &Manager{
|
2022-10-06 14:30:12 -05:00
|
|
|
cache: newCache(),
|
2022-10-05 15:32:20 -05:00
|
|
|
ResendDelay: ResendDelay, // TODO: make this configurable
|
2022-10-21 16:16:51 -05:00
|
|
|
log: log.New("ngalert.state.manager"),
|
2022-10-05 15:32:20 -05:00
|
|
|
metrics: metrics,
|
|
|
|
instanceStore: instanceStore,
|
2022-11-09 15:06:49 -06:00
|
|
|
images: images,
|
2022-10-05 15:32:20 -05:00
|
|
|
historian: historian,
|
|
|
|
clock: clock,
|
2022-10-06 14:30:12 -05:00
|
|
|
externalURL: externalURL,
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-04 16:06:47 -05:00
|
|
|
func (st *Manager) Run(ctx context.Context) error {
|
|
|
|
ticker := st.clock.Ticker(MetricsScrapeInterval)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ticker.C:
|
|
|
|
st.log.Debug("Recording state cache metrics", "now", st.clock.Now())
|
|
|
|
st.cache.recordMetrics(st.metrics)
|
|
|
|
case <-ctx.Done():
|
|
|
|
st.log.Debug("Stopping")
|
|
|
|
ticker.Stop()
|
|
|
|
return ctx.Err()
|
|
|
|
}
|
|
|
|
}
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2022-11-04 13:23:08 -05:00
|
|
|
func (st *Manager) Warm(ctx context.Context, rulesReader RuleReader) {
|
2022-10-28 12:10:28 -05:00
|
|
|
if st.instanceStore == nil {
|
|
|
|
st.log.Info("Skip warming the state because instance store is not configured")
|
2022-11-04 13:23:08 -05:00
|
|
|
return
|
2022-10-28 12:10:28 -05:00
|
|
|
}
|
2022-10-06 14:30:12 -05:00
|
|
|
startTime := time.Now()
|
|
|
|
st.log.Info("Warming state cache for startup")
|
2021-07-07 11:18:31 -05:00
|
|
|
|
2022-02-08 07:49:04 -06:00
|
|
|
orgIds, err := st.instanceStore.FetchOrgIds(ctx)
|
2021-07-07 11:18:31 -05:00
|
|
|
if err != nil {
|
2022-10-21 16:16:51 -05:00
|
|
|
st.log.Error("Unable to fetch orgIds", "error", err)
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
|
|
|
|
2022-10-06 14:30:12 -05:00
|
|
|
statesCount := 0
|
|
|
|
states := make(map[int64]map[string]*ruleStates, len(orgIds))
|
2021-07-07 11:18:31 -05:00
|
|
|
for _, orgId := range orgIds {
|
|
|
|
// Get Rules
|
|
|
|
ruleCmd := ngModels.ListAlertRulesQuery{
|
|
|
|
OrgID: orgId,
|
|
|
|
}
|
2022-11-04 13:23:08 -05:00
|
|
|
if err := rulesReader.ListAlertRules(ctx, &ruleCmd); err != nil {
|
2022-10-21 16:16:51 -05:00
|
|
|
st.log.Error("Unable to fetch previous state", "error", err)
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
ruleByUID := make(map[string]*ngModels.AlertRule, len(ruleCmd.Result))
|
|
|
|
for _, rule := range ruleCmd.Result {
|
|
|
|
ruleByUID[rule.UID] = rule
|
|
|
|
}
|
|
|
|
|
2022-10-06 14:30:12 -05:00
|
|
|
orgStates := make(map[string]*ruleStates, len(ruleByUID))
|
|
|
|
states[orgId] = orgStates
|
|
|
|
|
2021-07-07 11:18:31 -05:00
|
|
|
// Get Instances
|
|
|
|
cmd := ngModels.ListAlertInstancesQuery{
|
|
|
|
RuleOrgID: orgId,
|
|
|
|
}
|
2022-02-08 07:49:04 -06:00
|
|
|
if err := st.instanceStore.ListAlertInstances(ctx, &cmd); err != nil {
|
2022-10-21 16:16:51 -05:00
|
|
|
st.log.Error("Unable to fetch previous state", "error", err)
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, entry := range cmd.Result {
|
|
|
|
ruleForEntry, ok := ruleByUID[entry.RuleUID]
|
|
|
|
if !ok {
|
2022-10-06 14:30:12 -05:00
|
|
|
// TODO Should we delete the orphaned state from the db?
|
2021-07-07 11:18:31 -05:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2022-10-06 14:30:12 -05:00
|
|
|
rulesStates, ok := orgStates[entry.RuleUID]
|
|
|
|
if !ok {
|
|
|
|
rulesStates = &ruleStates{states: make(map[string]*State)}
|
|
|
|
orgStates[entry.RuleUID] = rulesStates
|
|
|
|
}
|
|
|
|
|
2021-07-07 11:18:31 -05:00
|
|
|
lbs := map[string]string(entry.Labels)
|
2022-10-11 03:30:33 -05:00
|
|
|
cacheID, err := entry.Labels.StringKey()
|
2021-07-07 11:18:31 -05:00
|
|
|
if err != nil {
|
2022-10-21 16:16:51 -05:00
|
|
|
st.log.Error("Error getting cacheId for entry", "error", err)
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
2022-10-11 03:30:33 -05:00
|
|
|
rulesStates.states[cacheID] = &State{
|
2022-02-02 12:18:20 -06:00
|
|
|
AlertRuleUID: entry.RuleUID,
|
|
|
|
OrgID: entry.RuleOrgID,
|
2022-10-11 03:30:33 -05:00
|
|
|
CacheID: cacheID,
|
2022-02-02 12:18:20 -06:00
|
|
|
Labels: lbs,
|
|
|
|
State: translateInstanceState(entry.CurrentState),
|
2022-05-23 03:49:49 -05:00
|
|
|
StateReason: entry.CurrentReason,
|
2022-02-02 12:18:20 -06:00
|
|
|
LastEvaluationString: "",
|
|
|
|
StartsAt: entry.CurrentStateSince,
|
|
|
|
EndsAt: entry.CurrentStateEnd,
|
|
|
|
LastEvaluationTime: entry.LastEvalTime,
|
|
|
|
Annotations: ruleForEntry.Annotations,
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
2022-10-06 14:30:12 -05:00
|
|
|
statesCount++
|
2021-07-07 11:18:31 -05:00
|
|
|
}
|
|
|
|
}
|
2022-10-06 14:30:12 -05:00
|
|
|
st.cache.setAllStates(states)
|
2022-10-21 16:16:51 -05:00
|
|
|
st.log.Info("State cache has been initialized", "states", statesCount, "duration", time.Since(startTime))
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2022-10-06 14:30:12 -05:00
|
|
|
func (st *Manager) Get(orgID int64, alertRuleUID, stateId string) *State {
|
2021-05-04 11:57:50 -05:00
|
|
|
return st.cache.get(orgID, alertRuleUID, stateId)
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2022-08-25 13:12:22 -05:00
|
|
|
// ResetStateByRuleUID deletes all entries in the state manager that match the given rule UID.
|
|
|
|
func (st *Manager) ResetStateByRuleUID(ctx context.Context, ruleKey ngModels.AlertRuleKey) []*State {
|
|
|
|
logger := st.log.New(ruleKey.LogContext()...)
|
2022-10-21 16:16:51 -05:00
|
|
|
logger.Debug("Resetting state of the rule")
|
2022-08-25 13:12:22 -05:00
|
|
|
states := st.cache.removeByRuleUID(ruleKey.OrgID, ruleKey.UID)
|
2022-10-28 12:10:28 -05:00
|
|
|
if len(states) > 0 && st.instanceStore != nil {
|
2022-08-25 13:12:22 -05:00
|
|
|
err := st.instanceStore.DeleteAlertInstancesByRule(ctx, ruleKey)
|
|
|
|
if err != nil {
|
2022-10-21 16:16:51 -05:00
|
|
|
logger.Error("Failed to delete states that belong to a rule from database", "error", err)
|
2022-08-25 13:12:22 -05:00
|
|
|
}
|
|
|
|
}
|
2022-10-21 16:16:51 -05:00
|
|
|
logger.Info("Rules state was reset", "states", len(states))
|
2022-08-25 13:12:22 -05:00
|
|
|
return states
|
2021-05-03 13:01:33 -05:00
|
|
|
}
|
|
|
|
|
2022-07-14 14:59:59 -05:00
|
|
|
// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
|
|
|
|
// if extraLabels is not empty, those labels will be added to every state. The extraLabels take precedence over rule labels and result labels
|
2022-12-06 12:07:39 -06:00
|
|
|
func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []StateTransition {
|
2022-10-26 18:16:02 -05:00
|
|
|
logger := st.log.FromContext(ctx)
|
2022-10-21 16:16:51 -05:00
|
|
|
logger.Debug("State manager processing evaluation results", "resultCount", len(results))
|
2022-11-04 10:39:26 -05:00
|
|
|
var states []StateTransition
|
2022-11-07 10:03:53 -06:00
|
|
|
|
2021-04-23 14:32:25 -05:00
|
|
|
for _, result := range results {
|
2022-10-21 16:16:51 -05:00
|
|
|
s := st.setNextState(ctx, alertRule, result, extraLabels, logger)
|
2021-04-23 14:32:25 -05:00
|
|
|
states = append(states, s)
|
|
|
|
}
|
2022-11-14 09:57:51 -06:00
|
|
|
staleStates := st.deleteStaleStatesFromCache(ctx, logger, evaluatedAt, alertRule)
|
|
|
|
st.deleteAlertStates(ctx, logger, staleStates)
|
2022-11-07 08:09:19 -06:00
|
|
|
|
|
|
|
st.saveAlertStates(ctx, logger, states...)
|
2022-11-04 10:39:26 -05:00
|
|
|
|
2022-12-06 11:33:15 -06:00
|
|
|
allChanges := append(states, staleStates...)
|
|
|
|
if st.historian != nil {
|
|
|
|
st.historian.RecordStatesAsync(ctx, alertRule, allChanges)
|
|
|
|
}
|
2022-12-06 12:07:39 -06:00
|
|
|
return allChanges
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2021-10-07 16:30:06 -05:00
|
|
|
// Set the current state based on evaluation results
|
2022-11-04 10:39:26 -05:00
|
|
|
func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, result eval.Result, extraLabels data.Labels, logger log.Logger) StateTransition {
|
2022-10-06 14:30:12 -05:00
|
|
|
currentState := st.cache.getOrCreate(ctx, st.log, alertRule, result, extraLabels, st.externalURL)
|
2021-04-23 14:32:25 -05:00
|
|
|
|
|
|
|
currentState.LastEvaluationTime = result.EvaluatedAt
|
|
|
|
currentState.EvaluationDuration = result.EvaluationDuration
|
|
|
|
currentState.Results = append(currentState.Results, Evaluation{
|
2022-02-02 12:18:20 -06:00
|
|
|
EvaluationTime: result.EvaluatedAt,
|
|
|
|
EvaluationState: result.State,
|
|
|
|
Values: NewEvaluationValues(result.Values),
|
2022-04-05 13:36:42 -05:00
|
|
|
Condition: alertRule.Condition,
|
2021-04-23 14:32:25 -05:00
|
|
|
})
|
2022-02-02 12:18:20 -06:00
|
|
|
currentState.LastEvaluationString = result.EvaluationString
|
2021-05-18 12:56:14 -05:00
|
|
|
currentState.TrimResults(alertRule)
|
2021-07-13 11:50:10 -05:00
|
|
|
oldState := currentState.State
|
2022-05-23 03:49:49 -05:00
|
|
|
oldReason := currentState.StateReason
|
2021-04-23 14:32:25 -05:00
|
|
|
|
2022-12-07 04:45:56 -06:00
|
|
|
// Add the instance to the log context to help correlate log lines for a state
|
|
|
|
logger = logger.New("instance", result.Instance)
|
|
|
|
|
2021-04-23 14:32:25 -05:00
|
|
|
switch result.State {
|
|
|
|
case eval.Normal:
|
2022-12-07 04:45:56 -06:00
|
|
|
logger.Debug("Setting next state", "handler", "resultNormal")
|
|
|
|
resultNormal(currentState, alertRule, result, logger)
|
2021-04-23 14:32:25 -05:00
|
|
|
case eval.Alerting:
|
2022-12-07 04:45:56 -06:00
|
|
|
logger.Debug("Setting next state", "handler", "resultAlerting")
|
|
|
|
resultAlerting(currentState, alertRule, result, logger)
|
2021-04-23 14:32:25 -05:00
|
|
|
case eval.Error:
|
2022-12-07 04:45:56 -06:00
|
|
|
logger.Debug("Setting next state", "handler", "resultError")
|
|
|
|
resultError(currentState, alertRule, result, logger)
|
2021-04-23 14:32:25 -05:00
|
|
|
case eval.NoData:
|
2022-12-07 04:45:56 -06:00
|
|
|
logger.Debug("Setting next state", "handler", "resultNoData")
|
|
|
|
resultNoData(currentState, alertRule, result, logger)
|
2021-04-23 14:32:25 -05:00
|
|
|
case eval.Pending: // we do not emit results with this state
|
2022-12-07 04:45:56 -06:00
|
|
|
logger.Debug("Ignoring set next state as result is pending")
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2022-05-23 03:49:49 -05:00
|
|
|
// Set reason iff: result is different than state, reason is not Alerting or Normal
|
|
|
|
currentState.StateReason = ""
|
|
|
|
|
|
|
|
if currentState.State != result.State &&
|
|
|
|
result.State != eval.Normal &&
|
|
|
|
result.State != eval.Alerting {
|
|
|
|
currentState.StateReason = result.State.String()
|
|
|
|
}
|
|
|
|
|
2021-07-29 13:29:17 -05:00
|
|
|
// Set Resolved property so the scheduler knows to send a postable alert
|
|
|
|
// to Alertmanager.
|
|
|
|
currentState.Resolved = oldState == eval.Alerting && currentState.State == eval.Normal
|
|
|
|
|
2022-11-02 17:14:22 -05:00
|
|
|
if shouldTakeImage(currentState.State, oldState, currentState.Image, currentState.Resolved) {
|
2022-11-09 15:06:49 -06:00
|
|
|
image, err := takeImage(ctx, st.images, alertRule)
|
2022-11-02 17:14:22 -05:00
|
|
|
if err != nil {
|
|
|
|
logger.Warn("Failed to take an image",
|
2022-11-10 03:58:38 -06:00
|
|
|
"dashboard", alertRule.GetDashboardUID(),
|
|
|
|
"panel", alertRule.GetPanelID(),
|
2022-11-02 17:14:22 -05:00
|
|
|
"error", err)
|
|
|
|
} else if image != nil {
|
|
|
|
currentState.Image = image
|
|
|
|
}
|
2022-05-22 21:53:41 -05:00
|
|
|
}
|
|
|
|
|
2022-10-06 14:30:12 -05:00
|
|
|
st.cache.set(currentState)
|
2022-05-23 03:49:49 -05:00
|
|
|
|
2022-11-04 10:39:26 -05:00
|
|
|
nextState := StateTransition{
|
|
|
|
State: currentState,
|
|
|
|
PreviousState: oldState,
|
|
|
|
PreviousStateReason: oldReason,
|
2021-07-13 11:50:10 -05:00
|
|
|
}
|
2022-11-04 10:39:26 -05:00
|
|
|
|
|
|
|
return nextState
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2021-05-04 11:57:50 -05:00
|
|
|
func (st *Manager) GetAll(orgID int64) []*State {
|
|
|
|
return st.cache.getAll(orgID)
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
2021-05-04 11:57:50 -05:00
|
|
|
func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
|
|
|
|
return st.cache.getStatesForRuleUID(orgID, alertRuleUID)
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
func (st *Manager) Put(states []*State) {
|
|
|
|
for _, s := range states {
|
2022-10-06 14:30:12 -05:00
|
|
|
st.cache.set(s)
|
2021-04-23 14:32:25 -05:00
|
|
|
}
|
|
|
|
}
|
2021-07-07 11:18:31 -05:00
|
|
|
|
2022-10-06 01:22:58 -05:00
|
|
|
// TODO: Is the `State` type necessary? Should it embed the instance?
|
2022-11-07 08:09:19 -06:00
|
|
|
func (st *Manager) saveAlertStates(ctx context.Context, logger log.Logger, states ...StateTransition) {
|
2022-11-14 09:57:51 -06:00
|
|
|
if st.instanceStore == nil || len(states) == 0 {
|
2022-11-07 08:09:19 -06:00
|
|
|
return
|
2022-10-28 12:10:28 -05:00
|
|
|
}
|
|
|
|
|
2022-11-04 10:39:26 -05:00
|
|
|
logger.Debug("Saving alert states", "count", len(states))
|
2022-10-06 01:22:58 -05:00
|
|
|
instances := make([]ngModels.AlertInstance, 0, len(states))
|
|
|
|
|
|
|
|
for _, s := range states {
|
2022-11-07 08:35:29 -06:00
|
|
|
key, err := s.GetAlertInstanceKey()
|
2022-10-06 01:22:58 -05:00
|
|
|
if err != nil {
|
2022-11-07 08:09:19 -06:00
|
|
|
logger.Error("Failed to create a key for alert state to save it to database. The state will be ignored ", "cacheID", s.CacheID, "error", err)
|
2022-10-06 01:22:58 -05:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
fields := ngModels.AlertInstance{
|
2022-11-07 08:35:29 -06:00
|
|
|
AlertInstanceKey: key,
|
2022-10-06 01:22:58 -05:00
|
|
|
Labels: ngModels.InstanceLabels(s.Labels),
|
2022-11-04 10:39:26 -05:00
|
|
|
CurrentState: ngModels.InstanceStateType(s.State.State.String()),
|
2022-10-06 01:22:58 -05:00
|
|
|
CurrentReason: s.StateReason,
|
|
|
|
LastEvalTime: s.LastEvaluationTime,
|
|
|
|
CurrentStateSince: s.StartsAt,
|
|
|
|
CurrentStateEnd: s.EndsAt,
|
|
|
|
}
|
|
|
|
instances = append(instances, fields)
|
2022-08-18 08:40:33 -05:00
|
|
|
}
|
2022-10-06 01:22:58 -05:00
|
|
|
|
|
|
|
if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
|
2022-11-07 08:09:19 -06:00
|
|
|
type debugInfo struct {
|
|
|
|
State string
|
|
|
|
Labels string
|
|
|
|
}
|
|
|
|
debug := make([]debugInfo, 0)
|
2022-10-06 01:22:58 -05:00
|
|
|
for _, inst := range instances {
|
2022-11-07 08:09:19 -06:00
|
|
|
debug = append(debug, debugInfo{string(inst.CurrentState), data.Labels(inst.Labels).String()})
|
2022-10-06 01:22:58 -05:00
|
|
|
}
|
2022-11-04 10:39:26 -05:00
|
|
|
logger.Error("Failed to save alert states", "states", debug, "error", err)
|
2022-10-06 01:22:58 -05:00
|
|
|
}
|
2022-08-18 08:40:33 -05:00
|
|
|
}
|
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
func (st *Manager) deleteAlertStates(ctx context.Context, logger log.Logger, states []StateTransition) {
|
|
|
|
if st.instanceStore == nil || len(states) == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
logger.Debug("Deleting alert states", "count", len(states))
|
|
|
|
toDelete := make([]ngModels.AlertInstanceKey, 0, len(states))
|
|
|
|
|
|
|
|
for _, s := range states {
|
|
|
|
key, err := s.GetAlertInstanceKey()
|
|
|
|
if err != nil {
|
|
|
|
logger.Error("Failed to delete alert instance with invalid labels", "cacheID", s.CacheID, "error", err)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
toDelete = append(toDelete, key)
|
|
|
|
}
|
|
|
|
|
|
|
|
err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...)
|
|
|
|
if err != nil {
|
|
|
|
logger.Error("Failed to delete stale states", "error", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-05-23 03:49:49 -05:00
|
|
|
// TODO: why wouldn't you allow other types like NoData or Error?
|
2021-07-07 11:18:31 -05:00
|
|
|
func translateInstanceState(state ngModels.InstanceStateType) eval.State {
|
|
|
|
switch {
|
|
|
|
case state == ngModels.InstanceStateFiring:
|
|
|
|
return eval.Alerting
|
|
|
|
case state == ngModels.InstanceStateNormal:
|
|
|
|
return eval.Normal
|
|
|
|
default:
|
|
|
|
return eval.Error
|
|
|
|
}
|
|
|
|
}
|
2021-07-13 11:50:10 -05:00
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
func (st *Manager) deleteStaleStatesFromCache(ctx context.Context, logger log.Logger, evaluatedAt time.Time, alertRule *ngModels.AlertRule) []StateTransition {
|
|
|
|
// If we are removing two or more stale series it makes sense to share the resolved image as the alert rule is the same.
|
|
|
|
// TODO: We will need to change this when we support images without screenshots as each series will have a different image
|
|
|
|
var resolvedImage *ngModels.Image
|
2022-11-02 17:14:22 -05:00
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
var resolvedStates []StateTransition
|
|
|
|
staleStates := st.cache.deleteRuleStates(alertRule.GetKey(), func(s *State) bool {
|
|
|
|
return stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds)
|
2022-11-09 05:08:32 -06:00
|
|
|
})
|
2022-10-06 01:22:58 -05:00
|
|
|
|
2022-11-07 10:03:53 -06:00
|
|
|
for _, s := range staleStates {
|
|
|
|
logger.Info("Detected stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
|
2022-11-14 09:57:51 -06:00
|
|
|
oldState := s.State
|
|
|
|
oldReason := s.StateReason
|
2022-11-07 08:35:29 -06:00
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
s.State = eval.Normal
|
|
|
|
s.StateReason = ngModels.StateReasonMissingSeries
|
|
|
|
s.EndsAt = evaluatedAt
|
|
|
|
s.LastEvaluationTime = evaluatedAt
|
2022-11-02 17:14:22 -05:00
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
if oldState == eval.Alerting {
|
|
|
|
s.Resolved = true
|
2022-11-07 10:03:53 -06:00
|
|
|
// If there is no resolved image for this rule then take one
|
|
|
|
if resolvedImage == nil {
|
2022-11-14 09:57:51 -06:00
|
|
|
image, err := takeImage(ctx, st.images, alertRule)
|
2022-11-07 10:03:53 -06:00
|
|
|
if err != nil {
|
|
|
|
logger.Warn("Failed to take an image",
|
2022-11-14 09:57:51 -06:00
|
|
|
"dashboard", alertRule.GetDashboardUID(),
|
|
|
|
"panel", alertRule.GetPanelID(),
|
2022-11-07 10:03:53 -06:00
|
|
|
"error", err)
|
|
|
|
} else if image != nil {
|
|
|
|
resolvedImage = image
|
2022-11-02 17:14:22 -05:00
|
|
|
}
|
2022-02-24 10:25:28 -06:00
|
|
|
}
|
2022-11-07 10:03:53 -06:00
|
|
|
s.Image = resolvedImage
|
2021-07-26 11:12:04 -05:00
|
|
|
}
|
2022-10-06 01:22:58 -05:00
|
|
|
|
2022-11-14 09:57:51 -06:00
|
|
|
record := StateTransition{
|
|
|
|
State: s,
|
|
|
|
PreviousState: oldState,
|
|
|
|
PreviousStateReason: oldReason,
|
2022-10-28 12:10:28 -05:00
|
|
|
}
|
2022-11-14 09:57:51 -06:00
|
|
|
resolvedStates = append(resolvedStates, record)
|
2022-10-06 01:22:58 -05:00
|
|
|
}
|
2022-09-21 12:24:47 -05:00
|
|
|
return resolvedStates
|
2021-07-26 11:12:04 -05:00
|
|
|
}
|
|
|
|
|
2022-10-06 01:22:58 -05:00
|
|
|
// stateIsStale reports whether a state last evaluated at lastEval is stale at
// evaluatedAt, i.e. whether at least two evaluation intervals of
// intervalSeconds seconds have elapsed since lastEval.
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
	gracePeriod := 2 * time.Duration(intervalSeconds) * time.Second
	deadline := lastEval.Add(gracePeriod)
	// Stale once the deadline is reached, not only after it has passed.
	return !deadline.After(evaluatedAt)
}
|