package state

import (
	"context"
	"errors"
	"fmt"
	"net/url"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/grafana/grafana-plugin-sdk-go/data"

	"github.com/grafana/grafana/pkg/infra/log"
	"github.com/grafana/grafana/pkg/services/ngalert/eval"
	"github.com/grafana/grafana/pkg/services/ngalert/image"
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
	"github.com/grafana/grafana/pkg/services/screenshot"
)

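// ResendDelay is the default minimum interval between re-sends of a still-firing alert
// to the Alertmanager.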
var ResendDelay = 30 * time.Second

// AlertInstanceManager defines the interface for querying the current alert instances.
type AlertInstanceManager interface {
	GetAll(orgID int64) []*State
	GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State
}

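// Manager tracks the in-memory state of alert instances, persists it through the
// instance store, and hands state transitions to the historian.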
type Manager struct {
	log     log.Logger
	metrics *metrics.State

	clock       clock.Clock
	cache       *cache
	quit        chan struct{}
	ResendDelay time.Duration

	ruleStore     RuleReader
	instanceStore InstanceStore
	imageService  image.ImageService
	historian     Historian
	externalURL   *url.URL
}

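// NewManager returns a Manager wired to the given stores and services and starts
// a goroutine that periodically records state cache metrics.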
func NewManager(metrics *metrics.State, externalURL *url.URL,
	ruleStore RuleReader, instanceStore InstanceStore, imageService image.ImageService, clock clock.Clock, historian Historian) *Manager {
	manager := &Manager{
		cache:         newCache(),
		quit:          make(chan struct{}),
		ResendDelay:   ResendDelay, // TODO: make this configurable
		log:           log.New("ngalert.state.manager"),
		metrics:       metrics,
		ruleStore:     ruleStore,
		instanceStore: instanceStore,
		imageService:  imageService,
		historian:     historian,
		clock:         clock,
		externalURL:   externalURL,
	}
	go manager.recordMetrics()
	return manager
}

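// Close stops the metrics recording goroutine.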
func (st *Manager) Close() {
	st.quit <- struct{}{}
}

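// Warm pre-populates the state cache from the alert instances persisted in the
// database so that alert state survives a restart.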
func (st *Manager) Warm(ctx context.Context) {
	startTime := time.Now()
	st.log.Info("Warming state cache for startup")

	orgIds, err := st.instanceStore.FetchOrgIds(ctx)
	if err != nil {
		st.log.Error("Unable to fetch orgIds", "error", err)
	}

	statesCount := 0
	states := make(map[int64]map[string]*ruleStates, len(orgIds))
	for _, orgId := range orgIds {
		// Get Rules
		ruleCmd := ngModels.ListAlertRulesQuery{
			OrgID: orgId,
		}
		if err := st.ruleStore.ListAlertRules(ctx, &ruleCmd); err != nil {
			st.log.Error("Unable to fetch previous state", "error", err)
		}

		ruleByUID := make(map[string]*ngModels.AlertRule, len(ruleCmd.Result))
		for _, rule := range ruleCmd.Result {
			ruleByUID[rule.UID] = rule
		}

		orgStates := make(map[string]*ruleStates, len(ruleByUID))
		states[orgId] = orgStates

		// Get Instances
		cmd := ngModels.ListAlertInstancesQuery{
			RuleOrgID: orgId,
		}
		if err := st.instanceStore.ListAlertInstances(ctx, &cmd); err != nil {
			st.log.Error("Unable to fetch previous state", "error", err)
		}

		for _, entry := range cmd.Result {
			ruleForEntry, ok := ruleByUID[entry.RuleUID]
			if !ok {
				// TODO Should we delete the orphaned state from the db?
				continue
			}

			rulesStates, ok := orgStates[entry.RuleUID]
			if !ok {
				rulesStates = &ruleStates{states: make(map[string]*State)}
				orgStates[entry.RuleUID] = rulesStates
			}

			lbs := map[string]string(entry.Labels)
			cacheID, err := entry.Labels.StringKey()
			if err != nil {
				st.log.Error("Error getting cacheId for entry", "error", err)
			}
			rulesStates.states[cacheID] = &State{
				AlertRuleUID:         entry.RuleUID,
				OrgID:                entry.RuleOrgID,
				CacheID:              cacheID,
				Labels:               lbs,
				State:                translateInstanceState(entry.CurrentState),
				StateReason:          entry.CurrentReason,
				LastEvaluationString: "",
				StartsAt:             entry.CurrentStateSince,
				EndsAt:               entry.CurrentStateEnd,
				LastEvaluationTime:   entry.LastEvalTime,
				Annotations:          ruleForEntry.Annotations,
			}
			statesCount++
		}
	}
	st.cache.setAllStates(states)
	st.log.Info("State cache has been initialized", "states", statesCount, "duration", time.Since(startTime))
}

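// Get returns the cached state for the given organization, alert rule UID and cache ID.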
func (st *Manager) Get(orgID int64, alertRuleUID, stateId string) *State {
	return st.cache.get(orgID, alertRuleUID, stateId)
}

// ResetStateByRuleUID deletes all entries in the state manager that match the given rule UID.
func (st *Manager) ResetStateByRuleUID(ctx context.Context, ruleKey ngModels.AlertRuleKey) []*State {
	logger := st.log.New(ruleKey.LogContext()...)
	logger.Debug("Resetting state of the rule")
	states := st.cache.removeByRuleUID(ruleKey.OrgID, ruleKey.UID)
	if len(states) > 0 {
		err := st.instanceStore.DeleteAlertInstancesByRule(ctx, ruleKey)
		if err != nil {
			logger.Error("Failed to delete states that belong to a rule from database", "error", err)
		}
	}
	logger.Info("Rules state was reset", "states", len(states))
	return states
}

// ProcessEvalResults updates the current states that belong to a rule with the evaluation results.
// If extraLabels is not empty, those labels are added to every state; they take precedence over rule labels and result labels.
func (st *Manager) ProcessEvalResults(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, results eval.Results, extraLabels data.Labels) []*State {
	logger := st.log.FromContext(ctx)
	logger.Debug("State manager processing evaluation results", "resultCount", len(results))
	var states []*State
	processedResults := make(map[string]*State, len(results))
	for _, result := range results {
		s := st.setNextState(ctx, alertRule, result, extraLabels, logger)
		states = append(states, s)
		processedResults[s.CacheID] = s
	}
	resolvedStates := st.staleResultsHandler(ctx, evaluatedAt, alertRule, processedResults, logger)
	if len(states) > 0 {
		logger.Debug("Saving new states to the database", "count", len(states))
		_, _ = st.saveAlertStates(ctx, states...)
	}
	return append(states, resolvedStates...)
}

// maybeTakeScreenshot takes a screenshot if:
// 1. The alert state is transitioning into the "Alerting" state from something else.
// 2. The alert state has just been resolved.
// 3. The state is Alerting and no image has been attached to it yet.
func (st *Manager) maybeTakeScreenshot(
	ctx context.Context,
	alertRule *ngModels.AlertRule,
	state *State,
	oldState eval.State,
) error {
	shouldScreenshot := state.Resolved ||
		state.State == eval.Alerting && oldState != eval.Alerting ||
		state.State == eval.Alerting && state.Image == nil
	if !shouldScreenshot {
		return nil
	}

	img, err := st.imageService.NewImage(ctx, alertRule)
	if err != nil &&
		(errors.Is(err, screenshot.ErrScreenshotsUnavailable) ||
			errors.Is(err, image.ErrNoDashboard) ||
			errors.Is(err, image.ErrNoPanel)) {
		// It's not an error if screenshots are disabled, or our rule isn't allowed to generate screenshots.
		return nil
	} else if err != nil {
		return err
	}
	state.Image = img
	return nil
}

// setNextState sets the next state of a rule instance based on a single evaluation result.
func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRule, result eval.Result, extraLabels data.Labels, logger log.Logger) *State {
	currentState := st.cache.getOrCreate(ctx, st.log, alertRule, result, extraLabels, st.externalURL)

	currentState.LastEvaluationTime = result.EvaluatedAt
	currentState.EvaluationDuration = result.EvaluationDuration
	currentState.Results = append(currentState.Results, Evaluation{
		EvaluationTime:  result.EvaluatedAt,
		EvaluationState: result.State,
		Values:          NewEvaluationValues(result.Values),
		Condition:       alertRule.Condition,
	})
	currentState.LastEvaluationString = result.EvaluationString
	currentState.TrimResults(alertRule)
	oldState := currentState.State
	oldReason := currentState.StateReason

	logger.Debug("Setting alert state")
	switch result.State {
	case eval.Normal:
		currentState.resultNormal(alertRule, result)
	case eval.Alerting:
		currentState.resultAlerting(alertRule, result)
	case eval.Error:
		currentState.resultError(alertRule, result)
	case eval.NoData:
		currentState.resultNoData(alertRule, result)
	case eval.Pending: // we do not emit results with this state
	}

	// Set the reason only if the resulting state differs from the result's state
	// and the result is neither Alerting nor Normal.
	currentState.StateReason = ""
	if currentState.State != result.State &&
		result.State != eval.Normal &&
		result.State != eval.Alerting {
		currentState.StateReason = result.State.String()
	}

	// Set Resolved property so the scheduler knows to send a postable alert
	// to Alertmanager.
	currentState.Resolved = oldState == eval.Alerting && currentState.State == eval.Normal

	err := st.maybeTakeScreenshot(ctx, alertRule, currentState, oldState)
	if err != nil {
		logger.Warn("Failed to generate a screenshot for an alert instance",
			"dashboard", alertRule.DashboardUID,
			"panel", alertRule.PanelID,
			"error", err)
	}

	st.cache.set(currentState)

	shouldUpdateAnnotation := oldState != currentState.State || oldReason != currentState.StateReason
	if shouldUpdateAnnotation {
		go st.historian.RecordState(ctx, alertRule, currentState.Labels, result.EvaluatedAt, InstanceStateAndReason{State: currentState.State, Reason: currentState.StateReason}, InstanceStateAndReason{State: oldState, Reason: oldReason})
	}
	return currentState
}

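// GetAll returns all cached states for the given organization.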
func (st *Manager) GetAll(orgID int64) []*State {
	return st.cache.getAll(orgID)
}

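// GetStatesForRuleUID returns all cached states for the given organization and alert rule.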
func (st *Manager) GetStatesForRuleUID(orgID int64, alertRuleUID string) []*State {
	return st.cache.getStatesForRuleUID(orgID, alertRuleUID)
}

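// recordMetrics periodically records state cache metrics until the Manager is closed.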
func (st *Manager) recordMetrics() {
	// TODO: parameterize?
	// Setting to a reasonable default scrape interval for Prometheus.
	dur := time.Duration(15) * time.Second
	ticker := st.clock.Ticker(dur)
	for {
		select {
		case <-ticker.C:
			st.log.Debug("Recording state cache metrics", "now", st.clock.Now())
			st.cache.recordMetrics(st.metrics)
		case <-st.quit:
			st.log.Debug("Stopping state cache metrics recording", "now", st.clock.Now())
			ticker.Stop()
			return
		}
	}
}

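// Put adds the given states to the state cache.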
func (st *Manager) Put(states []*State) {
	for _, s := range states {
		st.cache.set(s)
	}
}

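// saveAlertStates persists the given states as alert instances and returns the number
// of states saved and the number that failed.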
// TODO: Is the `State` type necessary? Should it embed the instance?
func (st *Manager) saveAlertStates(ctx context.Context, states ...*State) (saved, failed int) {
	st.log.Debug("Saving alert states", "count", len(states))
	instances := make([]ngModels.AlertInstance, 0, len(states))

	type debugInfo struct {
		OrgID  int64
		Uid    string
		State  string
		Labels string
	}
	debug := make([]debugInfo, 0)

	for _, s := range states {
		labels := ngModels.InstanceLabels(s.Labels)
		_, hash, err := labels.StringAndHash()
		if err != nil {
			debug = append(debug, debugInfo{s.OrgID, s.AlertRuleUID, s.State.String(), s.Labels.String()})
			st.log.Error("Failed to save alert instance with invalid labels", "orgID", s.OrgID, "rule", s.AlertRuleUID, "error", err)
			continue
		}
		fields := ngModels.AlertInstance{
			AlertInstanceKey: ngModels.AlertInstanceKey{
				RuleOrgID:  s.OrgID,
				RuleUID:    s.AlertRuleUID,
				LabelsHash: hash,
			},
			Labels:            ngModels.InstanceLabels(s.Labels),
			CurrentState:      ngModels.InstanceStateType(s.State.String()),
			CurrentReason:     s.StateReason,
			LastEvalTime:      s.LastEvaluationTime,
			CurrentStateSince: s.StartsAt,
			CurrentStateEnd:   s.EndsAt,
		}
		instances = append(instances, fields)
	}

	if err := st.instanceStore.SaveAlertInstances(ctx, instances...); err != nil {
		for _, inst := range instances {
			debug = append(debug, debugInfo{inst.RuleOrgID, inst.RuleUID, string(inst.CurrentState), data.Labels(inst.Labels).String()})
		}
		st.log.Error("Failed to save alert states", "states", debug, "error", err)
		return 0, len(debug)
	}

	return len(instances), len(debug)
}

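// translateInstanceState maps a persisted instance state back to an evaluation state.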
// TODO: why wouldn't you allow other types like NoData or Error?
func translateInstanceState(state ngModels.InstanceStateType) eval.State {
	switch {
	case state == ngModels.InstanceStateFiring:
		return eval.Alerting
	case state == ngModels.InstanceStateNormal:
		return eval.Normal
	default:
		return eval.Error
	}
}

// InstanceStateAndReason groups an evaluation state with its reason and provides string formatting.
type InstanceStateAndReason struct {
	State  eval.State
	Reason string
}

func (i InstanceStateAndReason) String() string {
	s := fmt.Sprintf("%v", i.State)
	if len(i.Reason) > 0 {
		s += fmt.Sprintf(" (%v)", i.Reason)
	}
	return s
}

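// staleResultsHandler removes cached states that were not part of the current evaluation
// and have become stale, deletes their persisted instances, and returns any states that
// were resolved as a result.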
func (st *Manager) staleResultsHandler(ctx context.Context, evaluatedAt time.Time, alertRule *ngModels.AlertRule, states map[string]*State, logger log.Logger) []*State {
	var resolvedStates []*State
	allStates := st.GetStatesForRuleUID(alertRule.OrgID, alertRule.UID)
	toDelete := make([]ngModels.AlertInstanceKey, 0)

	for _, s := range allStates {
		// Is the cached state in our recently processed results? If not, is it stale?
		if _, ok := states[s.CacheID]; !ok && stateIsStale(evaluatedAt, s.LastEvaluationTime, alertRule.IntervalSeconds) {
			logger.Info("Removing stale state entry", "cacheID", s.CacheID, "state", s.State, "reason", s.StateReason)
			st.cache.deleteEntry(s.OrgID, s.AlertRuleUID, s.CacheID)
			ilbs := ngModels.InstanceLabels(s.Labels)
			_, labelsHash, err := ilbs.StringAndHash()
			if err != nil {
				logger.Error("Unable to get labelsHash", "error", err.Error(), "rule", s.AlertRuleUID)
			}

			toDelete = append(toDelete, ngModels.AlertInstanceKey{RuleOrgID: s.OrgID, RuleUID: s.AlertRuleUID, LabelsHash: labelsHash})

			if s.State == eval.Alerting {
				previousState := InstanceStateAndReason{State: s.State, Reason: s.StateReason}
				s.State = eval.Normal
				s.StateReason = ngModels.StateReasonMissingSeries
				s.EndsAt = evaluatedAt
				s.Resolved = true
				st.historian.RecordState(ctx, alertRule, s.Labels, evaluatedAt,
					InstanceStateAndReason{State: eval.Normal, Reason: s.StateReason},
					previousState,
				)
				resolvedStates = append(resolvedStates, s)
			}
		}
	}

	if err := st.instanceStore.DeleteAlertInstances(ctx, toDelete...); err != nil {
		logger.Error("Unable to delete stale instances from database", "error", err, "count", len(toDelete))
	}
	return resolvedStates
}

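// stateIsStale reports whether the last evaluation is at least two evaluation intervals
// older than the current evaluation time.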
func stateIsStale(evaluatedAt time.Time, lastEval time.Time, intervalSeconds int64) bool {
	return !lastEval.Add(2 * time.Duration(intervalSeconds) * time.Second).After(evaluatedAt)
}