Alerting: Schedule Alert rules metric tracking (#50415)

* Alerting: Schedule Alert rules metric tracking

Record the metrics in two places instead of one, as an attempt to keep a semi-accurate picture of the rules that will be scheduled.
gotjosh 2022-06-08 18:37:33 +01:00 committed by GitHub
parent 2813e49842
commit c59938b235
5 changed files with 21 additions and 5 deletions


@@ -56,7 +56,12 @@ Scopes must have an order to ensure consistency and ease of search, this helps u
 - [ENHANCEMENT] Create folder 'General Alerting' when Grafana starts from scratch #48866
 - [ENHANCEMENT] Rule changes authorization logic to use UID folder scope instead of ID scope #48970
 - [ENHANCEMENT] Scheduler: ticker to support stopping #48142
-- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled.
+- [ENHANCEMENT] Scheduler: Adds new metrics to track rules that might be scheduled #49874
+  - `grafana_alerting_schedule_alert_rules`
+  - `grafana_alerting_schedule_alert_rules_hash`
+- [CHANGE] Scheduler: Renaming of metrics to make them consistent with similar metrics exposed by the component #49874
+  - `grafana_alerting_get_alert_rules_duration_seconds` to `grafana_alerting_schedule_periodic_duration_seconds`
+  - `grafana_alerting_schedule_periodic_duration_seconds` to `grafana_alerting_schedule_query_alert_rules_duration_seconds`
 - [FEATURE] Indicate whether routes are provisioned when GETting Alertmanager configuration #47857
 - [FEATURE] Indicate whether contact point is provisioned when GETting Alertmanager configuration #48323
 - [FEATURE] Indicate whether alert rule is provisioned when GETting the rule #48458
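The two gauges introduced by this entry are exposed on Grafana's /metrics endpoint like any other Prometheus metric. As a usage illustration only (not part of this commit), here is a minimal Go sketch that reads them back through the Prometheus HTTP API; the server address is a placeholder and assumes a Prometheus server that already scrapes Grafana:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address of a Prometheus server that scrapes Grafana.
	client, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Number of rules the scheduler expects to consider at the next tick.
	count, _, err := promAPI.Query(ctx, "grafana_alerting_schedule_alert_rules", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("schedulable rules:", count)

	// The absolute value of the hash gauge carries no meaning on its own; it is
	// most useful through functions like changes() to detect churn in the rule
	// set, e.g. changes(grafana_alerting_schedule_alert_rules_hash[10m]) > 0.
	hash, _, err := promAPI.Query(ctx, "grafana_alerting_schedule_alert_rules_hash", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("rule set hash:", hash)
}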


@@ -180,7 +180,7 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 Namespace: Namespace,
 Subsystem: Subsystem,
 Name: "schedule_alert_rules",
-Help: "The number of alert rules being considered for evaluation each tick.",
+Help: "The number of alert rules that could be considered for evaluation at the next tick.",
 },
 ),
 SchedulableAlertRulesHash: promauto.With(r).NewGauge(
@@ -188,7 +188,7 @@ func newSchedulerMetrics(r prometheus.Registerer) *Scheduler {
 Namespace: Namespace,
 Subsystem: Subsystem,
 Name: "schedule_alert_rules_hash",
-Help: "A hash of the alert rules over time.",
+Help: "A hash of the alert rules that could be considered for evaluation at the next tick.",
 }),
 UpdateSchedulableAlertRulesDuration: promauto.With(r).NewHistogram(
 prometheus.HistogramOpts{


@@ -50,7 +50,5 @@ func (sch *schedule) updateSchedulableAlertRules(ctx context.Context, disabledOr
 return fmt.Errorf("failed to get alert rules: %w", err)
 }
 sch.schedulableAlertRules.set(q.Result)
-sch.metrics.SchedulableAlertRules.Set(float64(len(q.Result)))
-sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(q.Result)))
 return nil
 }
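The lines removed above, and the new call sites later in this commit, both rely on hashUIDs, which the diff does not show. A plausible standalone sketch, assuming an order-independent FNV-1a fingerprint of the rule UIDs (the real function presumably takes the rule slice rather than plain strings):

package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

// hashUIDs sketches a stable fingerprint of a set of rule UIDs. Sorting first
// makes the result independent of registry iteration order; the separator byte
// keeps {"ab", "c"} and {"a", "bc"} from colliding.
func hashUIDs(uids []string) uint64 {
	sorted := append([]string(nil), uids...)
	sort.Strings(sorted)
	h := fnv.New64a()
	for _, uid := range sorted {
		h.Write([]byte(uid))
		h.Write([]byte{0})
	}
	return h.Sum64()
}

func main() {
	// Same set, different order: identical hash.
	fmt.Println(hashUIDs([]string{"rule-a", "rule-b"}) == hashUIDs([]string{"rule-b", "rule-a"})) // true
}

Exposing such a hash as a gauge means a change in the schedulable rule set shows up as a change in the metric value, even when the rule count stays the same.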


@@ -163,6 +163,9 @@ func (r *schedulableAlertRulesRegistry) update(rule *models.SchedulableAlertRule
 r.rules[rule.GetKey()] = rule
 }
+// del removes the pair that has the specified key from schedulableAlertRulesRegistry.
+// Returns a 2-tuple where the first element is the value of the removed pair
+// and the second element indicates whether an element with the specified key existed.
 func (r *schedulableAlertRulesRegistry) del(k models.AlertRuleKey) (*models.SchedulableAlertRule, bool) {
 r.mu.Lock()
 defer r.mu.Unlock()
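This hunk documents del, and other hunks in the commit call set and all on the same registry. A rough sketch of the shape that code assumes, with simplified names and a stand-in rule type rather than Grafana's actual definitions:

package main

import (
	"fmt"
	"sync"
)

// Stand-in types; the real registry is keyed by models.AlertRuleKey and stores
// *models.SchedulableAlertRule.
type ruleKey struct {
	OrgID int64
	UID   string
}

type rule struct {
	Key   ruleKey
	Title string
}

type rulesRegistry struct {
	mu    sync.Mutex
	rules map[ruleKey]*rule
}

// set replaces the whole rule set with the result of the latest fetch.
func (r *rulesRegistry) set(rs []*rule) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.rules = make(map[ruleKey]*rule, len(rs))
	for _, rl := range rs {
		r.rules[rl.Key] = rl
	}
}

// all returns a snapshot slice, which is what the metric updates are based on.
func (r *rulesRegistry) all() []*rule {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*rule, 0, len(r.rules))
	for _, rl := range r.rules {
		out = append(out, rl)
	}
	return out
}

// del removes the pair with the given key and reports whether it existed.
func (r *rulesRegistry) del(k ruleKey) (*rule, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	rl, ok := r.rules[k]
	if ok {
		delete(r.rules, k)
	}
	return rl, ok
}

func main() {
	reg := &rulesRegistry{}
	reg.set([]*rule{{Key: ruleKey{OrgID: 1, UID: "abc"}, Title: "High CPU"}})
	_, existed := reg.del(ruleKey{OrgID: 1, UID: "abc"})
	fmt.Println(existed, len(reg.all())) // true 0
}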


@@ -338,6 +338,11 @@ func (sch *schedule) DeleteAlertRule(key models.AlertRuleKey) {
 }
 // stop rule evaluation
 ruleInfo.stop()
+// Our best bet at this point is that we update the metrics with what we hope to schedule in the next tick.
+alertRules := sch.schedulableAlertRules.all()
+sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
 }
 func (sch *schedule) adminConfigSync(ctx context.Context) error {
@@ -392,6 +397,11 @@ func (sch *schedule) schedulePeriodic(ctx context.Context) error {
 // so, at the end, the remaining registered alert rules are the deleted ones
 registeredDefinitions := sch.registry.keyMap()
+// While these are the rules that we iterate over, at the moment there's no 100% guarantee that they'll be
+// scheduled as rules could be removed before we get a chance to evaluate them.
+sch.metrics.SchedulableAlertRules.Set(float64(len(alertRules)))
+sch.metrics.SchedulableAlertRulesHash.Set(float64(hashUIDs(alertRules)))
 type readyToRunItem struct {
 key models.AlertRuleKey
 ruleName string
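Because both call sites simply Set the gauges, their values can be asserted directly in tests. A hypothetical check, not Grafana's test code, using the client_golang testutil helpers; the namespace and subsystem literals here are assumptions that mirror the metric name listed in the changelog:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	reg := prometheus.NewRegistry()
	schedulable := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "grafana",
		Subsystem: "alerting",
		Name:      "schedule_alert_rules",
		Help:      "The number of alert rules that could be considered for evaluation at the next tick.",
	})
	reg.MustRegister(schedulable)

	// Simulate what schedulePeriodic or DeleteAlertRule would do after
	// snapshotting the schedulable rules.
	schedulable.Set(3)

	// testutil.ToFloat64 reads the current gauge value back for assertions.
	fmt.Println(testutil.ToFloat64(schedulable) == 3) // true
}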