Alerting: Resend resolved notifications for ResolvedRetention duration (#88938)

* Simple replace of State.Resolved with State.ResolvedAt

* Retain the ResolvedAt time across Normal->Normal transitions

* Introduce ResolvedRetention to keep sending recently resolved alerts

* Make ResolvedRetention configurable with resolved_alert_retention

* Tick-based LastSentAt for testing of ResendDelay and ResolvedRetention

* Do not reset ResolvedAt during Normal->Pending transition

Initially this was done to be in line with the Prometheus ruler. However, the
Prometheus ruler doesn't track Inactive->Pending/Alerting using the same alert
instance, so it's understandable that it chooses not to retain ResolvedAt. In our
case, since we use the same cached instance to represent the transition, it
makes more sense to retain it.

This should help alleviate odd situations where temporarily entering
Pending would suppress future resolved notifications that ResolvedRetention
would otherwise have re-sent.
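
A rough, self-contained sketch of the intended behaviour (stand-in types for
illustration only, not the actual ngalert code; the real change is in
setNextState further down in this diff):

    package main

    import (
        "fmt"
        "time"
    )

    // Minimal stand-ins for the ngalert types (eval.State / state.State).
    type evalState int

    const (
        stateNormal evalState = iota
        statePending
        stateAlerting
    )

    type alertState struct {
        State      evalState
        ResolvedAt *time.Time
    }

    // applyTransition mirrors the intent above: ResolvedAt is set when Alerting
    // resolves to Normal, retained across Normal->Normal and Normal->Pending,
    // and cleared once the alert starts firing again.
    func applyTransition(s *alertState, next evalState, evaluatedAt time.Time) {
        switch {
        case s.State == stateAlerting && next == stateNormal:
            s.ResolvedAt = &evaluatedAt
        case next == stateNormal || next == statePending:
            // Keep the previous ResolvedAt.
        default:
            s.ResolvedAt = nil
        }
        s.State = next
    }

    func main() {
        now := time.Now()
        s := &alertState{State: stateAlerting}
        applyTransition(s, stateNormal, now)                   // resolved at now
        applyTransition(s, statePending, now.Add(time.Minute)) // ResolvedAt retained
        fmt.Println(s.ResolvedAt != nil)                       // true
    }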

* Pointers for ResolvedAt & LastSentAt

This avoids awkward time.Time{}.Unix() defaults when persisting unset values.
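
For context, a tiny sketch (illustration only, not code from this change) of why
the pointer representation persists more cleanly than a zero time.Time:

    package main

    import (
        "fmt"
        "time"
    )

    // With a value field, an unset time serializes as time.Time{}.Unix(), the
    // year-1 offset; with a pointer, "never resolved / never sent" is simply nil.
    type withValue struct {
        ResolvedAt time.Time // zero value persists as -62135596800
    }

    type withPointer struct {
        ResolvedAt *time.Time // nil means "never resolved"
    }

    func main() {
        fmt.Println(time.Time{}.Unix()) // -62135596800

        var p withPointer
        if p.ResolvedAt == nil {
            fmt.Println("not resolved; nothing awkward to persist")
        }
        _ = withValue{}
    }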
Matthew Jacobson 2024-06-20 16:33:03 -04:00 committed by GitHub
parent 3044319039
commit 3228b64fe6
14 changed files with 377 additions and 104 deletions

View File

@ -1304,6 +1304,9 @@ disable_jitter = false
# Retention period for Alertmanager notification log entries.
notification_log_retention = 5d
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
resolved_alert_retention = 15m
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.

View File

@ -792,7 +792,7 @@
;role_attribute_strict = false
;groups_attribute_path =
;id_token_attribute_name =
;team_ids_attribute_path
;team_ids_attribute_path
;auth_url = https://foo.bar/login/oauth/authorize
;token_url = https://foo.bar/login/oauth/access_token
;api_url = https://foo.bar/user
@ -1290,6 +1290,9 @@
# Retention period for Alertmanager notification log entries.
;notification_log_retention = 5d
# Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
;resolved_alert_retention = 15m
[unified_alerting.screenshots]
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
# plugin, or set up Grafana to use a remote rendering service.
@ -1837,4 +1840,4 @@ timeout = 30s
#################################### Public Dashboards #####################################
[public_dashboards]
# Set to false to disable public dashboards
;enabled = true
;enabled = true

View File

@ -377,6 +377,7 @@ func (ng *AlertNG) init() error {
RulesPerRuleGroupLimit: ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
Tracer: ng.tracer,
Log: log.New("ngalert.state.manager"),
ResolvedRetention: ng.Cfg.UnifiedAlerting.ResolvedAlertRetention,
}
logger := log.New("ngalert.state.manager.persist")
statePersister := state.NewSyncStatePersisiter(logger, cfg)

View File

@ -8,6 +8,10 @@ import (
"time"
"github.com/benbjohnson/clock"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/datasources"
@ -19,9 +23,6 @@ import (
"github.com/grafana/grafana/pkg/services/org"
"github.com/grafana/grafana/pkg/services/user"
"github.com/grafana/grafana/pkg/util"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/trace"
)
// Rule represents a single piece of work that is executed periodically by the ruler.
@ -418,7 +419,7 @@ func (a *alertRule) evaluate(ctx context.Context, key ngmodels.AlertRuleKey, f f
processDuration.Observe(a.clock.Now().Sub(start).Seconds())
start = a.clock.Now()
alerts := state.FromStateTransitionToPostableAlerts(processedStates, a.stateManager, a.appURL)
alerts := state.FromStateTransitionToPostableAlerts(e.scheduledAt, processedStates, a.stateManager, a.appURL)
span.AddEvent("results processed", trace.WithAttributes(
attribute.Int64("state_transitions", int64(len(processedStates))),
attribute.Int64("alerts_to_send", int64(len(alerts.PostableAlerts))),

View File

@ -2,7 +2,7 @@ package schedule
import (
"bytes"
context "context"
"context"
"fmt"
"math"
"math/rand"
@ -11,19 +11,21 @@ import (
"testing"
"time"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
prometheusModel "github.com/prometheus/common/model"
"github.com/stretchr/testify/assert"
mock "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana/pkg/infra/log"
definitions "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/infra/log/logtest"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
models "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/util"
)
@ -762,8 +764,94 @@ func TestRuleRoutine(t *testing.T) {
require.NotEmpty(t, sch.stateManager.GetStatesForRuleUID(rule.OrgID, rule.UID))
})
t.Run("when there are resolved alerts they should keep sending until retention period is over", func(t *testing.T) {
rule := gen.With(withQueryForState(t, eval.Normal), models.RuleMuts.WithInterval(time.Second)).GenerateRef()
evalAppliedChan := make(chan time.Time)
sender := NewSyncAlertsSenderMock()
sender.EXPECT().Send(mock.Anything, rule.GetKey(), mock.Anything).Return()
sch, ruleStore, _, _ := createSchedule(evalAppliedChan, sender)
sch.stateManager.ResolvedRetention = 4 * time.Second
sch.stateManager.ResendDelay = 2 * time.Second
sch.stateManager.Put([]*state.State{
stateForRule(rule, sch.clock.Now(), eval.Alerting), // Add existing Alerting state so evals will resolve.
})
ruleStore.PutRule(context.Background(), rule)
factory := ruleFactoryFromScheduler(sch)
ctx, cancel := context.WithCancel(context.Background())
t.Cleanup(cancel)
ruleInfo := factory.new(ctx, rule)
go func() {
_ = ruleInfo.Run(rule.GetKey())
}()
// Evaluate 10 times:
// 1. Send resolve #1.
// 2. 2s resend delay.
// 3. Send resolve #2.
// 4. 2s resend delay.
// 5. Send resolve #3.
// 6. No more sends, 4s retention period is over.
expectedResolves := map[time.Time]struct{}{
sch.clock.Now().Add(1 * time.Second): {},
sch.clock.Now().Add(3 * time.Second): {},
sch.clock.Now().Add(5 * time.Second): {},
}
calls := 0
for i := 1; i < 10; i++ {
ts := sch.clock.Now().Add(time.Duration(int64(i)*rule.IntervalSeconds) * time.Second)
ruleInfo.Eval(&Evaluation{
scheduledAt: ts,
rule: rule,
})
waitForTimeChannel(t, evalAppliedChan)
if _, ok := expectedResolves[ts]; ok {
calls++
prevCallAlerts, ok := sender.Calls()[calls-1].Arguments[2].(definitions.PostableAlerts)
assert.Truef(t, ok, "expected argument to be 'definitions.PostableAlerts' but got %T", sender.Calls()[calls-1].Arguments[2])
assert.Len(t, prevCallAlerts.PostableAlerts, 1)
}
sender.AssertNumberOfCalls(t, "Send", calls)
}
})
}
func ruleFactoryFromScheduler(sch *schedule) ruleFactory {
return newRuleFactory(sch.appURL, sch.disableGrafanaFolder, sch.maxAttempts, sch.alertsSender, sch.stateManager, sch.evaluatorFactory, &sch.schedulableAlertRules, sch.clock, sch.featureToggles, sch.metrics, sch.log, sch.tracer, sch.recordingWriter, sch.evalAppliedFunc, sch.stopAppliedFunc)
}
func stateForRule(rule *models.AlertRule, ts time.Time, evalState eval.State) *state.State {
s := &state.State{
OrgID: rule.OrgID,
AlertRuleUID: rule.UID,
CacheID: 0,
State: evalState,
Annotations: make(map[string]string),
Labels: make(map[string]string),
StartsAt: ts,
EndsAt: ts,
ResolvedAt: &ts,
LastSentAt: &ts,
LastEvaluationTime: ts,
}
for k, v := range rule.Labels {
s.Labels[k] = v
}
for k, v := range state.GetRuleExtraLabels(&logtest.Fake{}, rule, "", true) {
if _, ok := s.Labels[k]; !ok {
s.Labels[k] = v
}
}
il := models.InstanceLabels(s.Labels)
s.Labels = data.Labels(il)
id := il.Fingerprint()
s.CacheID = id
return s
}

View File

@ -7,15 +7,17 @@ import (
"testing"
"time"
definitions "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/stretchr/testify/mock"
"github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/models"
mock "github.com/stretchr/testify/mock"
)
// waitForTimeChannel blocks execution until either the channel ch has some data or a 10-second timeout expires.
// Timeout will cause the test to fail.
// Returns the data from the channel.
func waitForTimeChannel(t *testing.T, ch chan time.Time) time.Time {
t.Helper()
select {
case result := <-ch:
return result

View File

@ -10,11 +10,12 @@ import (
"github.com/benbjohnson/clock"
"github.com/go-openapi/strfmt"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/common/model"
alertingModels "github.com/grafana/alerting/models"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
@ -73,7 +74,7 @@ func StateToPostableAlert(transition StateTransition, appURL *url.URL) *models.P
}
state := alertState.State
if alertState.Resolved {
if alertState.ResolvedAt != nil {
// If this is a resolved alert, we need to send an alert with the correct labels such that they will expire the previous alert.
// In most cases the labels on the state will be correct, however when the previous alert was a NoData or Error alert, we need to
// ensure to modify it appropriately.
@ -139,13 +140,12 @@ func errorAlert(labels, annotations data.Labels, alertState *State, urlStr strin
}
}
func FromStateTransitionToPostableAlerts(firingStates []StateTransition, stateManager *Manager, appURL *url.URL) apimodels.PostableAlerts {
func FromStateTransitionToPostableAlerts(evaluatedAt time.Time, firingStates []StateTransition, stateManager *Manager, appURL *url.URL) apimodels.PostableAlerts {
alerts := apimodels.PostableAlerts{PostableAlerts: make([]models.PostableAlert, 0, len(firingStates))}
ts := time.Now()
sentAlerts := make([]*State, 0, len(firingStates))
for _, alertState := range firingStates {
if !alertState.NeedsSending(stateManager.ResendDelay) {
if !alertState.NeedsSending(stateManager.ResendDelay, stateManager.ResolvedRetention) {
continue
}
alert := StateToPostableAlert(alertState, appURL)
@ -153,7 +153,7 @@ func FromStateTransitionToPostableAlerts(firingStates []StateTransition, stateMa
if alertState.StateReason == ngModels.StateReasonMissingSeries { // do not put stale state back to state manager
continue
}
alertState.LastSentAt = ts
alertState.LastSentAt = &evaluatedAt
sentAlerts = append(sentAlerts, alertState.State)
}
stateManager.Put(sentAlerts)

View File

@ -9,12 +9,13 @@ import (
"github.com/benbjohnson/clock"
"github.com/go-openapi/strfmt"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
alertingModels "github.com/grafana/alerting/models"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/util"
@ -267,7 +268,9 @@ func TestStateToPostableAlertFromNodataError(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
alertState := randomTransition(tc.from, tc.to)
alertState.Resolved = tc.resolved
if tc.resolved {
alertState.ResolvedAt = &alertState.LastEvaluationTime
}
alertState.Labels = data.Labels(standardLabels)
result := StateToPostableAlert(alertState, appURL)
require.Equal(t, tc.expectedLabels, result.Labels)
@ -339,7 +342,7 @@ func randomTransition(from, to eval.State) StateTransition {
EndsAt: randomTimeInFuture(),
LastEvaluationTime: randomTimeInPast(),
EvaluationDuration: randomDuration(),
LastSentAt: randomTimeInPast(),
LastSentAt: util.Pointer(randomTimeInPast()),
Annotations: make(map[string]string),
Labels: make(map[string]string),
Values: make(map[string]float64),

View File

@ -39,9 +39,10 @@ type Manager struct {
metrics *metrics.State
tracer tracing.Tracer
clock clock.Clock
cache *cache
ResendDelay time.Duration
clock clock.Clock
cache *cache
ResendDelay time.Duration
ResolvedRetention time.Duration
instanceStore InstanceStore
images ImageCapturer
@ -73,6 +74,9 @@ type ManagerCfg struct {
DisableExecution bool
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
ResolvedRetention time.Duration
Tracer tracing.Tracer
Log log.Logger
}
@ -88,6 +92,7 @@ func NewManager(cfg ManagerCfg, statePersister StatePersister) *Manager {
m := &Manager{
cache: c,
ResendDelay: ResendDelay, // TODO: make this configurable
ResolvedRetention: cfg.ResolvedRetention,
log: cfg.Log,
metrics: cfg.Metrics,
instanceStore: cfg.InstanceStore,
@ -245,7 +250,11 @@ func (st *Manager) DeleteStateByRuleUID(ctx context.Context, ruleKey ngModels.Al
s.SetNormal(reason, startsAt, now)
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
s.Resolved = oldState == eval.Alerting || oldState == eval.Error || oldState == eval.NoData
if oldState == eval.Alerting || oldState == eval.Error || oldState == eval.NoData {
s.ResolvedAt = &now
} else {
s.ResolvedAt = nil
}
s.LastEvaluationTime = now
s.Values = map[string]float64{}
transitions = append(transitions, StateTransition{
@ -418,9 +427,15 @@ func (st *Manager) setNextState(ctx context.Context, alertRule *ngModels.AlertRu
// Set Resolved property so the scheduler knows to send a postable alert
// to Alertmanager.
currentState.Resolved = oldState == eval.Alerting && currentState.State == eval.Normal
newlyResolved := false
if oldState == eval.Alerting && currentState.State == eval.Normal {
currentState.ResolvedAt = &result.EvaluatedAt
newlyResolved = true
} else if currentState.State != eval.Normal && currentState.State != eval.Pending { // Retain the last resolved time for Normal->Normal and Normal->Pending.
currentState.ResolvedAt = nil
}
if shouldTakeImage(currentState.State, oldState, currentState.Image, currentState.Resolved) {
if shouldTakeImage(currentState.State, oldState, currentState.Image, newlyResolved) {
image, err := takeImage(ctx, st.images, alertRule)
if err != nil {
logger.Warn("Failed to take an image",
@ -505,7 +520,7 @@ func (st *Manager) deleteStaleStatesFromCache(ctx context.Context, logger log.Lo
s.LastEvaluationTime = evaluatedAt
if oldState == eval.Alerting {
s.Resolved = true
s.ResolvedAt = &evaluatedAt
image, err := takeImage(ctx, st.images, alertRule)
if err != nil {
logger.Warn("Failed to take an image",

View File

@ -563,7 +563,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
Resolved: true,
ResolvedAt: &t2,
},
},
},
@ -622,7 +622,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -1051,7 +1051,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -1091,7 +1091,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -1133,7 +1133,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -1175,7 +1175,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -1275,7 +1275,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
Resolved: true,
ResolvedAt: &t2,
},
},
},
@ -1304,6 +1304,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t3,
ResolvedAt: &t2,
},
},
},
@ -1935,7 +1936,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
},
@ -2163,7 +2164,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -2191,7 +2192,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -2221,7 +2222,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -2251,7 +2252,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
{
@ -2314,7 +2315,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
Resolved: true,
ResolvedAt: &t2,
},
},
},
@ -2330,6 +2331,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t3,
ResolvedAt: &t2,
},
},
},
@ -3060,7 +3062,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t1,
EndsAt: t3,
LastEvaluationTime: t3,
Resolved: true,
ResolvedAt: &t3,
},
},
},
@ -3480,7 +3482,7 @@ func TestProcessEvalResults_StateTransitions(t *testing.T) {
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
Resolved: true,
ResolvedAt: &t2,
},
},
},

View File

@ -7,6 +7,7 @@ import (
"fmt"
"math"
"math/rand"
"net/url"
"sort"
"strings"
"testing"
@ -294,7 +295,7 @@ func TestProcessEvalResults(t *testing.T) {
evaluationDuration := 10 * time.Millisecond
evaluationInterval := 10 * time.Second
t1 := time.Time{}.Add(evaluationInterval)
t1 := time.Unix(0, 0).Add(evaluationInterval)
tn := func(n int) time.Time {
return t1.Add(time.Duration(n-1) * evaluationInterval)
@ -424,6 +425,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t1,
EndsAt: t1.Add(state.ResendDelay * 4),
LastEvaluationTime: t1,
LastSentAt: &t1,
},
},
},
@ -471,6 +473,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
},
},
},
@ -501,6 +504,94 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: util.Pointer(tn(4)),
},
},
},
{
desc: "alerting -> normal resolves and sets ResolvedAt",
alertRule: baseRule,
evalResults: map[time.Time]eval.Results{
t1: {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)),
},
t2: {
newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)),
},
},
expectedAnnotations: 2,
expectedStates: []*state.State{
{
Labels: labels["system + rule + labels1"],
ResultFingerprint: labels1.Fingerprint(),
State: eval.Normal,
LatestResult: newEvaluation(t2, eval.Normal),
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t2,
ResolvedAt: &t2,
LastSentAt: &t2,
},
},
},
{
desc: "alerting -> normal -> normal resolves and maintains ResolvedAt",
alertRule: baseRule,
evalResults: map[time.Time]eval.Results{
t1: {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)),
},
t2: {
newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)),
},
t3: {
newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)),
},
},
expectedAnnotations: 2,
expectedStates: []*state.State{
{
Labels: labels["system + rule + labels1"],
ResultFingerprint: labels1.Fingerprint(),
State: eval.Normal,
LatestResult: newEvaluation(t3, eval.Normal),
StartsAt: t2,
EndsAt: t2,
LastEvaluationTime: t3,
ResolvedAt: &t2,
LastSentAt: &t2,
},
},
},
{
desc: "pending -> alerting -> normal -> pending resolves and resets ResolvedAt at t4",
alertRule: baseRuleWith(m.WithForNTimes(1)),
evalResults: map[time.Time]eval.Results{
t1: {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)),
},
t2: {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), // Alerting.
},
t3: {
newResult(eval.WithState(eval.Normal), eval.WithLabels(labels1)),
},
tn(4): {
newResult(eval.WithState(eval.Alerting), eval.WithLabels(labels1)), // Pending.
},
},
expectedAnnotations: 4,
expectedStates: []*state.State{
{
Labels: labels["system + rule + labels1"],
ResultFingerprint: labels1.Fingerprint(),
State: eval.Pending,
LatestResult: newEvaluation(tn(4), eval.Alerting),
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
ResolvedAt: &t3,
LastSentAt: &t3,
},
},
},
@ -534,6 +625,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(5),
LastSentAt: util.Pointer(tn(3)), // 30s resend delay causing the last sent at to be t3.
},
},
},
@ -564,6 +656,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: &t3, // Resend delay is 30s, so last sent at is t3.
},
},
},
@ -672,6 +765,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(5),
EndsAt: tn(5).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(5),
LastSentAt: util.Pointer(tn(5)),
},
},
},
@ -696,6 +790,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
},
},
},
@ -729,6 +824,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
},
},
},
@ -772,6 +868,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
},
},
},
@ -808,6 +905,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
},
},
},
@ -839,6 +937,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t3,
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: &t3, // Resend delay is 30s, so last sent at is t3.
},
},
},
@ -870,6 +969,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: util.Pointer(tn(4)),
},
},
},
@ -956,6 +1056,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(5),
EndsAt: tn(5).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(5),
LastSentAt: util.Pointer(tn(5)),
},
},
},
@ -988,6 +1089,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t2,
EndsAt: t2.Add(state.ResendDelay * 4),
LastEvaluationTime: t2,
LastSentAt: &t2,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test", "Error": "[sse.dataQueryError] failed to execute query [A]: this is an error"},
},
@ -1021,6 +1123,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t3,
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: &t3, // Resend delay is 30s, so last sent at is t3.
},
},
},
@ -1052,6 +1155,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(4).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(4),
LastSentAt: util.Pointer(tn(4)),
},
},
},
@ -1139,6 +1243,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(4),
EndsAt: tn(6).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(6),
LastSentAt: util.Pointer(tn(6)), // After 30s resend delay, last sent at is t6.
},
},
},
@ -1169,6 +1274,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(8),
EndsAt: tn(8).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(8),
LastSentAt: util.Pointer(tn(5)),
},
},
},
@ -1199,6 +1305,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: tn(6),
EndsAt: tn(6).Add(state.ResendDelay * 4),
LastEvaluationTime: tn(6),
LastSentAt: util.Pointer(tn(5)),
},
},
},
@ -1265,6 +1372,7 @@ func TestProcessEvalResults(t *testing.T) {
StartsAt: t3,
EndsAt: t3.Add(state.ResendDelay * 4),
LastEvaluationTime: t3,
LastSentAt: &t1, // Resend delay is 30s, so last sent at is t1.
},
},
},
@ -1306,8 +1414,9 @@ func TestProcessEvalResults(t *testing.T) {
res[i].EvaluatedAt = evalTime
}
clk.Set(evalTime)
_ = st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels)
processedStates := st.ProcessEvalResults(context.Background(), evalTime, tc.alertRule, res, systemLabels)
results += len(res)
_ = state.FromStateTransitionToPostableAlerts(evalTime, processedStates, st, &url.URL{}) // Set LastSentAt.
}
states := st.GetStatesForRuleUID(tc.alertRule.OrgID, tc.alertRule.UID)
@ -1670,7 +1779,7 @@ func TestStaleResults(t *testing.T) {
assert.Equal(t, models.StateReasonMissingSeries, s.StateReason)
assert.Equal(t, clk.Now(), s.EndsAt)
if s.CacheID == state2 {
assert.Truef(t, s.Resolved, "Returned stale state should have Resolved set to true")
assert.Equalf(t, clk.Now(), *s.ResolvedAt, "Returned stale state should have ResolvedAt set")
}
key, err := s.GetAlertInstanceKey()
require.NoError(t, err)
@ -1819,11 +1928,11 @@ func TestDeleteStateByRuleUID(t *testing.T) {
assert.Equal(t, expectedReason, s.StateReason)
if oldState.State == eval.Normal {
assert.Equal(t, oldState.StartsAt, s.StartsAt)
assert.False(t, s.Resolved)
assert.Zero(t, s.ResolvedAt)
} else {
assert.Equal(t, clk.Now(), s.StartsAt)
if oldState.State == eval.Alerting {
assert.True(t, s.Resolved)
assert.Equal(t, clk.Now(), *s.ResolvedAt)
}
}
assert.Equal(t, clk.Now(), s.EndsAt)
@ -1959,11 +2068,11 @@ func TestResetStateByRuleUID(t *testing.T) {
assert.Equal(t, models.StateReasonPaused, s.StateReason)
if oldState.State == eval.Normal {
assert.Equal(t, oldState.StartsAt, s.StartsAt)
assert.False(t, s.Resolved)
assert.Zero(t, s.ResolvedAt)
} else {
assert.Equal(t, clk.Now(), s.StartsAt)
if oldState.State == eval.Alerting {
assert.True(t, s.Resolved)
assert.Equal(t, clk.Now(), *s.ResolvedAt)
}
}
assert.Equal(t, clk.Now(), s.EndsAt)

View File

@ -45,10 +45,6 @@ type State struct {
// can still contain the results of previous evaluations.
Error error
// Resolved is set to true if this state is the transitional state between Firing and Normal.
// All subsequent states will be false until the next transition from Firing to Normal.
Resolved bool
// Image contains an optional image for the state. It tends to be included in notifications
// as a visualization to show why the alert fired.
Image *models.Image
@ -65,9 +61,15 @@ type State struct {
// conditions.
Values map[string]float64
StartsAt time.Time
EndsAt time.Time
LastSentAt time.Time
StartsAt time.Time
// EndsAt is different from the Prometheus EndsAt as EndsAt is updated for both Normal states
// and states that have been resolved. It cannot be used to determine when a state was resolved.
EndsAt time.Time
// ResolvedAt is set when the state is first resolved. That is to say, when the state first transitions
// from Alerting, NoData, or Error to Normal. It is reset to zero when the state transitions from Normal
// to any other state.
ResolvedAt *time.Time
LastSentAt *time.Time
LastEvaluationString string
LastEvaluationTime time.Time
EvaluationDuration time.Duration
@ -134,14 +136,6 @@ func (a *State) SetNormal(reason string, startsAt, endsAt time.Time) {
a.Error = nil
}
// Resolve sets the State to Normal. It updates the StateReason, the end time, and sets Resolved to true.
func (a *State) Resolve(reason string, endsAt time.Time) {
a.State = eval.Normal
a.StateReason = reason
a.Resolved = true
a.EndsAt = endsAt
}
// Maintain updates the end time using the most recent evaluation.
func (a *State) Maintain(interval int64, evaluatedAt time.Time) {
a.EndsAt = nextEndsTime(interval, evaluatedAt)
@ -400,19 +394,31 @@ func resultKeepLast(state *State, rule *models.AlertRule, result eval.Result, lo
}
}
func (a *State) NeedsSending(resendDelay time.Duration) bool {
switch a.State {
case eval.Pending:
// We do not send notifications for pending states
// NeedsSending returns true if the given state needs to be sent to the Alertmanager.
// Reasons for sending include:
// - The state has been resolved since the last notification.
// - The state is firing and the last notification was sent at least resendDelay ago.
// - The state was resolved within the resolvedRetention period, and the last notification was sent at least resendDelay ago.
func (a *State) NeedsSending(resendDelay time.Duration, resolvedRetention time.Duration) bool {
if a.State == eval.Pending {
// We do not send notifications for pending states.
return false
case eval.Normal:
// We should send a notification if the state is Normal because it was resolved
return a.Resolved
default:
// We should send, and re-send notifications, each time LastSentAt is <= LastEvaluationTime + resendDelay
nextSent := a.LastSentAt.Add(resendDelay)
return nextSent.Before(a.LastEvaluationTime) || nextSent.Equal(a.LastEvaluationTime)
}
// We should send a notification if the state has been resolved since the last notification.
if a.ResolvedAt != nil && (a.LastSentAt == nil || a.ResolvedAt.After(*a.LastSentAt)) {
return true
}
// For normal states, we should only be sending if this is a resolved notification or a re-send of the resolved
// notification within the resolvedRetention period.
if a.State == eval.Normal && (a.ResolvedAt == nil || a.LastEvaluationTime.Sub(*a.ResolvedAt) > resolvedRetention) {
return false
}
// We should send, and re-send notifications, each time LastSentAt is <= LastEvaluationTime + resendDelay.
// This can include normal->normal transitions that were resolved in recent past evaluations.
return a.LastSentAt == nil || !a.LastSentAt.Add(resendDelay).After(a.LastEvaluationTime)
}
func (a *State) Equals(b *State) bool {

View File

@ -11,11 +11,12 @@ import (
"github.com/benbjohnson/clock"
"github.com/golang/mock/gomock"
"github.com/google/uuid"
"github.com/grafana/alerting/models"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/grafana/alerting/models"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
@ -350,10 +351,11 @@ func TestEnd(t *testing.T) {
func TestNeedsSending(t *testing.T) {
evaluationTime, _ := time.Parse("2006-01-02", "2021-03-25")
testCases := []struct {
name string
resendDelay time.Duration
expected bool
testState *State
name string
resendDelay time.Duration
resolvedRetention time.Duration
expected bool
testState *State
}{
{
name: "state: alerting and LastSentAt before LastEvaluationTime + ResendDelay",
@ -362,7 +364,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Alerting,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-2 * time.Minute),
LastSentAt: util.Pointer(evaluationTime.Add(-2 * time.Minute)),
},
},
{
@ -372,7 +374,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Alerting,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime,
LastSentAt: util.Pointer(evaluationTime),
},
},
{
@ -382,7 +384,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Alerting,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-1 * time.Minute),
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
@ -400,18 +402,54 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Alerting,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime,
LastSentAt: util.Pointer(evaluationTime),
},
},
{
name: "state: normal + resolved should send without waiting",
name: "state: normal + resolved should send without waiting if ResolvedAt > LastSentAt",
resendDelay: 1 * time.Minute,
expected: true,
testState: &State{
State: eval.Normal,
Resolved: true,
ResolvedAt: util.Pointer(evaluationTime),
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime,
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
name: "state: normal + recently resolved should send with wait",
resendDelay: 1 * time.Minute,
resolvedRetention: 15 * time.Minute,
expected: true,
testState: &State{
State: eval.Normal,
ResolvedAt: util.Pointer(evaluationTime.Add(-2 * time.Minute)),
LastEvaluationTime: evaluationTime,
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
name: "state: normal + recently resolved should not send without wait",
resendDelay: 2 * time.Minute,
resolvedRetention: 15 * time.Minute,
expected: false,
testState: &State{
State: eval.Normal,
ResolvedAt: util.Pointer(evaluationTime.Add(-2 * time.Minute)),
LastEvaluationTime: evaluationTime,
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
name: "state: normal + not recently resolved should not send even with wait",
resendDelay: 1 * time.Minute,
resolvedRetention: 15 * time.Minute,
expected: false,
testState: &State{
State: eval.Normal,
ResolvedAt: util.Pointer(evaluationTime.Add(-16 * time.Minute)),
LastEvaluationTime: evaluationTime,
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
@ -420,9 +458,9 @@ func TestNeedsSending(t *testing.T) {
expected: false,
testState: &State{
State: eval.Normal,
Resolved: false,
ResolvedAt: util.Pointer(time.Time{}),
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-1 * time.Minute),
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
@ -432,7 +470,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.NoData,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-1 * time.Minute),
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
@ -442,7 +480,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.NoData,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-time.Duration(rand.Int63n(59)+1) * time.Second),
LastSentAt: util.Pointer(evaluationTime.Add(-time.Duration(rand.Int63n(59)+1) * time.Second)),
},
},
{
@ -452,7 +490,7 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Error,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-1 * time.Minute),
LastSentAt: util.Pointer(evaluationTime.Add(-1 * time.Minute)),
},
},
{
@ -462,14 +500,14 @@ func TestNeedsSending(t *testing.T) {
testState: &State{
State: eval.Error,
LastEvaluationTime: evaluationTime,
LastSentAt: evaluationTime.Add(-time.Duration(rand.Int63n(59)+1) * time.Second),
LastSentAt: util.Pointer(evaluationTime.Add(-time.Duration(rand.Int63n(59)+1) * time.Second)),
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
assert.Equal(t, tc.expected, tc.testState.NeedsSending(tc.resendDelay))
assert.Equal(t, tc.expected, tc.testState.NeedsSending(tc.resendDelay, tc.resolvedRetention))
})
}
}
@ -531,13 +569,6 @@ func TestGetLastEvaluationValuesForCondition(t *testing.T) {
})
}
func TestResolve(t *testing.T) {
s := State{State: eval.Alerting, EndsAt: time.Now().Add(time.Minute)}
expected := State{State: eval.Normal, StateReason: "This is a reason", EndsAt: time.Now(), Resolved: true}
s.Resolve("This is a reason", expected.EndsAt)
assert.Equal(t, expected, s)
}
func TestShouldTakeImage(t *testing.T) {
tests := []struct {
name string

View File

@ -6,11 +6,12 @@ import (
"strings"
"time"
alertingCluster "github.com/grafana/alerting/cluster"
dstls "github.com/grafana/dskit/crypto/tls"
"github.com/grafana/grafana-plugin-sdk-go/backend/gtime"
"gopkg.in/ini.v1"
alertingCluster "github.com/grafana/alerting/cluster"
"github.com/grafana/grafana/pkg/util"
)
@ -113,6 +114,9 @@ type UnifiedAlertingSettings struct {
// Retention period for Alertmanager notification log entries.
NotificationLogRetention time.Duration
// Duration for which a resolved alert state transition will continue to be sent to the Alertmanager.
ResolvedAlertRetention time.Duration
}
type RecordingRuleSettings struct {
@ -435,6 +439,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
return err
}
uaCfg.ResolvedAlertRetention, err = gtime.ParseDuration(valueAsString(ua, "resolved_alert_retention", (15 * time.Minute).String()))
if err != nil {
return err
}
cfg.UnifiedAlerting = uaCfg
return nil
}