mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
* Alerting: Fix alert flapping in the alertmanager. Fixes a bug that caused alerts that are evaluated at low intervals (sub 1 minute) to flap in the Alertmanager, mostly due to a combination of `EndsAt` and the resend delay. The Alertmanager uses `EndsAt` as a heuristic to know when it should resolve a firing alert, in the case that it hasn't heard back from the alert generation system. Because Grafana sent the alert with an `EndsAt` equal to the `For` of the alert itself, and we had a hard-coded 1 minute re-send delay (only applicable to firing alerts), a firing alert would resolve in the Alertmanager before we re-notified that it is still firing. This commit increases the `EndsAt` by 3x the resend delay or alert interval (depending on which one is higher). The resendDelay has been decreased to 30 seconds.
166 lines
4.8 KiB
Go
166 lines
4.8 KiB
Go
package state
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/grafana/grafana-plugin-sdk-go/data"
|
|
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
|
ngModels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
)
|
|
|
|
type State struct {
|
|
AlertRuleUID string
|
|
OrgID int64
|
|
CacheId string
|
|
State eval.State
|
|
Resolved bool
|
|
Results []Evaluation
|
|
StartsAt time.Time
|
|
EndsAt time.Time
|
|
LastEvaluationTime time.Time
|
|
EvaluationDuration time.Duration
|
|
LastSentAt time.Time
|
|
Annotations map[string]string
|
|
Labels data.Labels
|
|
Error error
|
|
}
|
|
|
|
type Evaluation struct {
|
|
EvaluationTime time.Time
|
|
EvaluationState eval.State
|
|
EvaluationString string
|
|
// Values contains the RefID and value of reduce and math expressions.
|
|
// It does not contain values for classic conditions as the values
|
|
// in classic conditions do not have a RefID.
|
|
Values map[string]EvaluationValue
|
|
}
|
|
|
|
// EvaluationValue contains the labels and value for a RefID in an evaluation.
|
|
type EvaluationValue struct {
|
|
Labels data.Labels
|
|
Value *float64
|
|
}
|
|
|
|
// NewEvaluationValues returns the labels and values for each RefID in the capture.
|
|
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]EvaluationValue {
|
|
result := make(map[string]EvaluationValue, len(m))
|
|
for k, v := range m {
|
|
result[k] = EvaluationValue{
|
|
Labels: v.Labels,
|
|
Value: v.Value,
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
func (a *State) resultNormal(alertRule *ngModels.AlertRule, result eval.Result) {
|
|
if a.State != eval.Normal {
|
|
a.EndsAt = result.EvaluatedAt
|
|
a.StartsAt = result.EvaluatedAt
|
|
}
|
|
a.Error = result.Error // should be nil since state is not error
|
|
a.State = eval.Normal
|
|
}
|
|
|
|
func (a *State) resultAlerting(alertRule *ngModels.AlertRule, result eval.Result) {
|
|
switch a.State {
|
|
case eval.Alerting:
|
|
a.setEndsAt(alertRule, result)
|
|
case eval.Pending:
|
|
if result.EvaluatedAt.Sub(a.StartsAt) > alertRule.For {
|
|
a.State = eval.Alerting
|
|
a.StartsAt = result.EvaluatedAt
|
|
a.setEndsAt(alertRule, result)
|
|
}
|
|
default:
|
|
a.StartsAt = result.EvaluatedAt
|
|
a.setEndsAt(alertRule, result)
|
|
if !(alertRule.For > 0) {
|
|
// If For is 0, immediately set Alerting
|
|
a.State = eval.Alerting
|
|
} else {
|
|
a.State = eval.Pending
|
|
}
|
|
}
|
|
}
|
|
|
|
func (a *State) resultError(alertRule *ngModels.AlertRule, result eval.Result) {
|
|
a.Error = result.Error
|
|
if a.StartsAt.IsZero() {
|
|
a.StartsAt = result.EvaluatedAt
|
|
}
|
|
a.setEndsAt(alertRule, result)
|
|
|
|
if alertRule.ExecErrState == ngModels.AlertingErrState {
|
|
a.State = eval.Alerting
|
|
}
|
|
}
|
|
|
|
func (a *State) resultNoData(alertRule *ngModels.AlertRule, result eval.Result) {
|
|
if a.StartsAt.IsZero() {
|
|
a.StartsAt = result.EvaluatedAt
|
|
}
|
|
a.setEndsAt(alertRule, result)
|
|
|
|
switch alertRule.NoDataState {
|
|
case ngModels.Alerting:
|
|
a.State = eval.Alerting
|
|
case ngModels.NoData:
|
|
a.State = eval.NoData
|
|
case ngModels.OK:
|
|
a.State = eval.Normal
|
|
}
|
|
}
|
|
|
|
func (a *State) NeedsSending(resendDelay time.Duration) bool {
|
|
if a.State != eval.Alerting && a.State != eval.Normal {
|
|
return false
|
|
}
|
|
|
|
if a.State == eval.Normal && !a.Resolved {
|
|
return false
|
|
}
|
|
// if LastSentAt is before or equal to LastEvaluationTime + resendDelay, send again
|
|
return a.LastSentAt.Add(resendDelay).Before(a.LastEvaluationTime) ||
|
|
a.LastSentAt.Add(resendDelay).Equal(a.LastEvaluationTime)
|
|
}
|
|
|
|
func (a *State) Equals(b *State) bool {
|
|
return a.AlertRuleUID == b.AlertRuleUID &&
|
|
a.OrgID == b.OrgID &&
|
|
a.CacheId == b.CacheId &&
|
|
a.Labels.String() == b.Labels.String() &&
|
|
a.State.String() == b.State.String() &&
|
|
a.StartsAt == b.StartsAt &&
|
|
a.EndsAt == b.EndsAt &&
|
|
a.LastEvaluationTime == b.LastEvaluationTime &&
|
|
data.Labels(a.Annotations).String() == data.Labels(b.Annotations).String()
|
|
}
|
|
|
|
func (a *State) TrimResults(alertRule *ngModels.AlertRule) {
|
|
numBuckets := 2 * (int64(alertRule.For.Seconds()) / alertRule.IntervalSeconds)
|
|
if numBuckets == 0 {
|
|
numBuckets = 10 // keep at least 10 evaluations in the event For is set to 0
|
|
}
|
|
|
|
if len(a.Results) < int(numBuckets) {
|
|
return
|
|
}
|
|
newResults := make([]Evaluation, numBuckets)
|
|
copy(newResults, a.Results[len(a.Results)-int(numBuckets):])
|
|
a.Results = newResults
|
|
}
|
|
|
|
// setEndsAt sets the ending timestamp of the alert.
|
|
// The internal Alertmanager will use this time to know when it should automatically resolve the alert
|
|
// in case it hasn't received additional alerts. Under regular operations the scheduler will continue to send the
|
|
// alert with an updated EndsAt, if the alert is resolved then a last alert is sent with EndsAt = last evaluation time.
|
|
func (a *State) setEndsAt(alertRule *ngModels.AlertRule, result eval.Result) {
|
|
ends := ResendDelay
|
|
if alertRule.IntervalSeconds > int64(ResendDelay.Seconds()) {
|
|
ends = time.Duration(alertRule.IntervalSeconds)
|
|
}
|
|
|
|
a.EndsAt = result.EvaluatedAt.Add(ends * 3)
|
|
}
|