mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Add metrics to ngalert scheduler (#44602)
This pull request adds metrics to the ngalert scheduler so we can see how long it takes to evaluate a tick.
This commit is contained in:
parent
793e3d3556
commit
5e2280ceee
@ -45,10 +45,13 @@ type NGAlert struct {
|
||||
}
|
||||
|
||||
type Scheduler struct {
|
||||
Registerer prometheus.Registerer
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
EvalDuration *prometheus.SummaryVec
|
||||
Registerer prometheus.Registerer
|
||||
BehindSeconds prometheus.Gauge
|
||||
EvalTotal *prometheus.CounterVec
|
||||
EvalFailures *prometheus.CounterVec
|
||||
EvalDuration *prometheus.SummaryVec
|
||||
GetAlertRulesDuration prometheus.Histogram
|
||||
SchedulePeriodicDuration prometheus.Histogram
|
||||
}
|
||||
|
||||
type MultiOrgAlertmanager struct {
|
||||
@ -120,6 +123,12 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg
|
||||
func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
return &Scheduler{
|
||||
Registerer: r,
|
||||
BehindSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "scheduler_behind_seconds",
|
||||
Help: "The total number of seconds the scheduler is behind.",
|
||||
}),
|
||||
// TODO: once rule groups support multiple rules, consider partitioning
|
||||
// on rule group as well as tenant, similar to loki|cortex.
|
||||
EvalTotal: promauto.With(r).NewCounterVec(
|
||||
@ -152,6 +161,24 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
|
||||
},
|
||||
[]string{"org"},
|
||||
),
|
||||
GetAlertRulesDuration: promauto.With(r).NewHistogram(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "get_alert_rules_duration_seconds",
|
||||
Help: "The time taken to get all alert rules.",
|
||||
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
||||
},
|
||||
),
|
||||
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: Namespace,
|
||||
Subsystem: Subsystem,
|
||||
Name: "schedule_periodic_duration_seconds",
|
||||
Help: "The time taken to run the scheduler.",
|
||||
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,10 +1,17 @@
|
||||
package schedule
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
)
|
||||
|
||||
func (sch *schedule) fetchAllDetails(disabledOrgs []int64) []*models.AlertRule {
|
||||
func (sch *schedule) getAlertRules(disabledOrgs []int64) []*models.AlertRule {
|
||||
start := time.Now()
|
||||
defer func() {
|
||||
sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
|
||||
}()
|
||||
|
||||
q := models.ListAlertRulesQuery{
|
||||
ExcludeOrgs: disabledOrgs,
|
||||
}
|
||||
|
@ -175,7 +175,7 @@ func (sch *schedule) Run(ctx context.Context) error {
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
if err := sch.ruleEvaluationLoop(ctx); err != nil {
|
||||
if err := sch.schedulePeriodic(ctx); err != nil {
|
||||
sch.log.Error("failure while running the rule evaluation loop", "err", err)
|
||||
}
|
||||
}()
|
||||
@ -352,17 +352,21 @@ func (sch *schedule) adminConfigSync(ctx context.Context) error {
|
||||
}
|
||||
}
|
||||
|
||||
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
|
||||
func (sch *schedule) schedulePeriodic(ctx context.Context) error {
|
||||
dispatcherGroup, ctx := errgroup.WithContext(ctx)
|
||||
for {
|
||||
select {
|
||||
case tick := <-sch.heartbeat.C:
|
||||
start := time.Now()
|
||||
sch.metrics.BehindSeconds.Set(start.Sub(tick).Seconds())
|
||||
|
||||
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
|
||||
disabledOrgs := make([]int64, 0, len(sch.disabledOrgs))
|
||||
for disabledOrg := range sch.disabledOrgs {
|
||||
disabledOrgs = append(disabledOrgs, disabledOrg)
|
||||
}
|
||||
alertRules := sch.fetchAllDetails(disabledOrgs)
|
||||
|
||||
alertRules := sch.getAlertRules(disabledOrgs)
|
||||
sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
|
||||
|
||||
// registeredDefinitions is a map used for finding deleted alert rules
|
||||
@ -433,6 +437,8 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
|
||||
for key := range registeredDefinitions {
|
||||
sch.DeleteAlertRule(key)
|
||||
}
|
||||
|
||||
sch.metrics.SchedulePeriodicDuration.Observe(time.Since(start).Seconds())
|
||||
case <-ctx.Done():
|
||||
waitErr := dispatcherGroup.Wait()
|
||||
|
||||
|
@ -79,7 +79,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
LastEvaluationTime: evaluationTime,
|
||||
@ -133,7 +133,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
LastEvaluationTime: evaluationTime,
|
||||
@ -156,7 +156,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime,
|
||||
@ -213,12 +213,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(1 * time.Minute),
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
|
||||
@ -274,12 +274,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(1 * time.Minute),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(1 * time.Minute),
|
||||
@ -346,17 +346,17 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(80 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(80 * time.Second),
|
||||
@ -440,22 +440,22 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(20 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(30 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(40 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(30 * time.Second),
|
||||
@ -531,22 +531,22 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(20 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(30 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime,
|
||||
@ -605,12 +605,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -669,12 +669,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime,
|
||||
@ -733,12 +733,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -797,12 +797,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -860,7 +860,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -924,7 +924,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -991,7 +991,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(20 * time.Second),
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: time.Time{},
|
||||
@ -1049,12 +1049,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -1114,12 +1114,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -1179,12 +1179,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Error,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -1258,12 +1258,12 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationState: eval.Error,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
@ -1339,22 +1339,22 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(30 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(40 * time.Second),
|
||||
EvaluationState: eval.Error,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(70 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(70 * time.Second),
|
||||
@ -1431,22 +1431,22 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(30 * time.Second),
|
||||
EvaluationState: eval.Alerting,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(40 * time.Second),
|
||||
EvaluationState: eval.Error,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(50 * time.Second),
|
||||
EvaluationState: eval.NoData,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(30 * time.Second),
|
||||
@ -1498,7 +1498,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime,
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
LastEvaluationTime: evaluationTime,
|
||||
@ -1603,7 +1603,7 @@ func TestStaleResultsHandler(t *testing.T) {
|
||||
{
|
||||
EvaluationTime: evaluationTime.Add(3 * time.Minute),
|
||||
EvaluationState: eval.Normal,
|
||||
Values: make(map[string]state.EvaluationValue),
|
||||
Values: make(map[string]*float64),
|
||||
},
|
||||
},
|
||||
LastEvaluationTime: evaluationTime.Add(3 * time.Minute),
|
||||
|
@ -35,23 +35,14 @@ type Evaluation struct {
|
||||
// Values contains the RefID and value of reduce and math expressions.
|
||||
// It does not contain values for classic conditions as the values
|
||||
// in classic conditions do not have a RefID.
|
||||
Values map[string]EvaluationValue
|
||||
}
|
||||
|
||||
// EvaluationValue contains the labels and value for a RefID in an evaluation.
|
||||
type EvaluationValue struct {
|
||||
Labels data.Labels
|
||||
Value *float64
|
||||
Values map[string]*float64
|
||||
}
|
||||
|
||||
// NewEvaluationValues returns the labels and values for each RefID in the capture.
|
||||
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]EvaluationValue {
|
||||
result := make(map[string]EvaluationValue, len(m))
|
||||
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]*float64 {
|
||||
result := make(map[string]*float64, len(m))
|
||||
for k, v := range m {
|
||||
result[k] = EvaluationValue{
|
||||
Labels: v.Labels,
|
||||
Value: v.Value,
|
||||
}
|
||||
result[k] = v.Value
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user