Alerting: Fix database unavailable removes rules from scheduler (#49874)

This commit is contained in:
George Robinson
2022-06-07 16:20:06 +01:00
committed by GitHub
parent ae449cc823
commit c83f84348c
7 changed files with 231 additions and 34 deletions

View File

@@ -46,14 +46,16 @@ type NGAlert struct {
}
type Scheduler struct {
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GetAlertRulesDuration prometheus.Histogram
SchedulePeriodicDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
SchedulePeriodicDuration prometheus.Histogram
SchedulableAlertRules prometheus.Gauge
SchedulableAlertRulesHash prometheus.Gauge
UpdateSchedulableAlertRulesDuration prometheus.Histogram
Ticker *legacyMetrics.Ticker
}
type MultiOrgAlertmanager struct {
@@ -163,15 +165,6 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
GetAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "get_alert_rules_duration_seconds",
Help: "The time taken to get all alert rules.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
@@ -181,6 +174,30 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulableAlertRules: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules",
Help: "The number of alert rules being considered for evaluation each tick.",
},
),
SchedulableAlertRulesHash: promauto.With(r).NewGauge(
prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_alert_rules_hash",
Help: "A hash of the alert rules over time.",
}),
UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_query_alert_rules_duration_seconds",
Help: "The time taken to fetch alert rules from the database.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
Ticker: legacyMetrics.NewTickerMetrics(r),
}
}