Add metrics to ngalert scheduler (#44602)

This pull request adds metrics to the ngalert scheduler so we can see how long it takes to evaluate a tick.
This commit is contained in:
George Robinson 2022-01-31 16:56:43 +00:00 committed by GitHub
parent 793e3d3556
commit 5e2280ceee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 99 additions and 68 deletions

View File

@ -45,10 +45,13 @@ type NGAlert struct {
}
type Scheduler struct {
Registerer prometheus.Registerer
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
Registerer prometheus.Registerer
BehindSeconds prometheus.Gauge
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
EvalDuration *prometheus.SummaryVec
GetAlertRulesDuration prometheus.Histogram
SchedulePeriodicDuration prometheus.Histogram
}
type MultiOrgAlertmanager struct {
@ -120,6 +123,12 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg
func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
return &Scheduler{
Registerer: r,
BehindSeconds: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "scheduler_behind_seconds",
Help: "The total number of seconds the scheduler is behind.",
}),
// TODO: once rule groups support multiple rules, consider partitioning
// on rule group as well as tenant, similar to loki|cortex.
EvalTotal: promauto.With(r).NewCounterVec(
@ -152,6 +161,24 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
},
[]string{"org"},
),
GetAlertRulesDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "get_alert_rules_duration_seconds",
Help: "The time taken to get all alert rules.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
SchedulePeriodicDuration: promauto.With(r).NewHistogram(
prometheus.HistogramOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "schedule_periodic_duration_seconds",
Help: "The time taken to run the scheduler.",
Buckets: []float64{0.1, 0.25, 0.5, 1, 2, 5, 10},
},
),
}
}

View File

@ -1,10 +1,17 @@
package schedule
import (
"time"
"github.com/grafana/grafana/pkg/services/ngalert/models"
)
func (sch *schedule) fetchAllDetails(disabledOrgs []int64) []*models.AlertRule {
func (sch *schedule) getAlertRules(disabledOrgs []int64) []*models.AlertRule {
start := time.Now()
defer func() {
sch.metrics.GetAlertRulesDuration.Observe(time.Since(start).Seconds())
}()
q := models.ListAlertRulesQuery{
ExcludeOrgs: disabledOrgs,
}

View File

@ -175,7 +175,7 @@ func (sch *schedule) Run(ctx context.Context) error {
go func() {
defer wg.Done()
if err := sch.ruleEvaluationLoop(ctx); err != nil {
if err := sch.schedulePeriodic(ctx); err != nil {
sch.log.Error("failure while running the rule evaluation loop", "err", err)
}
}()
@ -352,17 +352,21 @@ func (sch *schedule) adminConfigSync(ctx context.Context) error {
}
}
func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
func (sch *schedule) schedulePeriodic(ctx context.Context) error {
dispatcherGroup, ctx := errgroup.WithContext(ctx)
for {
select {
case tick := <-sch.heartbeat.C:
start := time.Now()
sch.metrics.BehindSeconds.Set(start.Sub(tick).Seconds())
tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())
disabledOrgs := make([]int64, 0, len(sch.disabledOrgs))
for disabledOrg := range sch.disabledOrgs {
disabledOrgs = append(disabledOrgs, disabledOrg)
}
alertRules := sch.fetchAllDetails(disabledOrgs)
alertRules := sch.getAlertRules(disabledOrgs)
sch.log.Debug("alert rules fetched", "count", len(alertRules), "disabled_orgs", disabledOrgs)
// registeredDefinitions is a map used for finding deleted alert rules
@ -433,6 +437,8 @@ func (sch *schedule) ruleEvaluationLoop(ctx context.Context) error {
for key := range registeredDefinitions {
sch.DeleteAlertRule(key)
}
sch.metrics.SchedulePeriodicDuration.Observe(time.Since(start).Seconds())
case <-ctx.Done():
waitErr := dispatcherGroup.Wait()

View File

@ -79,7 +79,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
LastEvaluationTime: evaluationTime,
@ -133,7 +133,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
LastEvaluationTime: evaluationTime,
@ -156,7 +156,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
@ -213,12 +213,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(1 * time.Minute),
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
@ -274,12 +274,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(1 * time.Minute),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(1 * time.Minute),
@ -346,17 +346,17 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(80 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(80 * time.Second),
@ -440,22 +440,22 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(20 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(30 * time.Second),
@ -531,22 +531,22 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(20 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
@ -605,12 +605,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -669,12 +669,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime,
@ -733,12 +733,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -797,12 +797,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -860,7 +860,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -924,7 +924,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -991,7 +991,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime.Add(20 * time.Second),
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: time.Time{},
@ -1049,12 +1049,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -1114,12 +1114,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -1179,12 +1179,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -1258,12 +1258,12 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
@ -1339,22 +1339,22 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(70 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(70 * time.Second),
@ -1431,22 +1431,22 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(30 * time.Second),
EvaluationState: eval.Alerting,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(40 * time.Second),
EvaluationState: eval.Error,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
{
EvaluationTime: evaluationTime.Add(50 * time.Second),
EvaluationState: eval.NoData,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
StartsAt: evaluationTime.Add(30 * time.Second),
@ -1498,7 +1498,7 @@ func TestProcessEvalResults(t *testing.T) {
{
EvaluationTime: evaluationTime,
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
LastEvaluationTime: evaluationTime,
@ -1603,7 +1603,7 @@ func TestStaleResultsHandler(t *testing.T) {
{
EvaluationTime: evaluationTime.Add(3 * time.Minute),
EvaluationState: eval.Normal,
Values: make(map[string]state.EvaluationValue),
Values: make(map[string]*float64),
},
},
LastEvaluationTime: evaluationTime.Add(3 * time.Minute),

View File

@ -35,23 +35,14 @@ type Evaluation struct {
// Values contains the RefID and value of reduce and math expressions.
// It does not contain values for classic conditions as the values
// in classic conditions do not have a RefID.
Values map[string]EvaluationValue
}
// EvaluationValue contains the labels and value for a RefID in an evaluation.
type EvaluationValue struct {
Labels data.Labels
Value *float64
Values map[string]*float64
}
// NewEvaluationValues returns the value for each RefID in the capture.
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]EvaluationValue {
result := make(map[string]EvaluationValue, len(m))
func NewEvaluationValues(m map[string]eval.NumberValueCapture) map[string]*float64 {
result := make(map[string]*float64, len(m))
for k, v := range m {
result[k] = EvaluationValue{
Labels: v.Labels,
Value: v.Value,
}
result[k] = v.Value
}
return result
}