Alerting: Expose metrics for Alertmanager Alerts - grafana_alerting_alertmanager_alerts (#75802)

* Alerting: Expose metrics for Alertmanager Alerts

In Grafana, alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly.

I think it's important that we make a distinction between alerts produced at evaluation time and alerts received for delivery by the internal Alertmanager, as we have options to skip delivering these alerts to the internal Alertmanager altogether.
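For illustration only, a scrape of Grafana's `/metrics` endpoint would then expose both families side by side; the sample below is hand-written, and the label values (`org="1"`, the state names) are assumptions rather than output from this commit:

```
# HELP grafana_alerting_alerts How many alerts by state are in the scheduler.
grafana_alerting_alerts{state="alerting"} 2
# HELP grafana_alerting_alertmanager_alerts How many alerts by state are in Grafana's Alertmanager.
grafana_alerting_alertmanager_alerts{org="1",state="active"} 2
```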
gotjosh 2023-10-02 16:36:23 +01:00 committed by GitHub
parent ed7d29f2b9
commit e877174501
3 changed files with 14 additions and 3 deletions


@@ -94,6 +94,9 @@ type AlertmanagerAggregatedMetrics struct {
nflogQueryDuration *prometheus.Desc
nflogPropagatedMessagesTotal *prometheus.Desc
// exported metrics, gathered from the Alertmanager Alert Marker.
markerAlerts *prometheus.Desc
// exported metrics, gathered from Alertmanager Silences
silencesGCDuration *prometheus.Desc
silencesSnapshotDuration *prometheus.Desc
@@ -187,6 +190,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
"Number of received gossip messages that have been further gossiped.",
nil, nil),
markerAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
"How many alerts by state are in Grafana's Alertmanager.",
[]string{"org", "state"}, nil),
silencesGCDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
"Duration of the last silence garbage collection cycle.",
@@ -270,6 +278,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a.nflogQueryDuration
out <- a.nflogPropagatedMessagesTotal
out <- a.markerAlerts
out <- a.silencesGCDuration
out <- a.silencesSnapshotDuration
out <- a.silencesSnapshotSize
@@ -310,6 +320,8 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
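
As a rough sketch of what the per-tenant aggregation does conceptually (this is not the actual `SendSumOfGaugesPerTenantWithLabels` implementation; the registry layout and function names below are assumptions for illustration), re-exporting each org's `alertmanager_alerts` gauge with `org` and `state` labels could look like this in Go:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// markerAlertsDesc mirrors the Desc added above: one gauge per org and state.
var markerAlertsDesc = prometheus.NewDesc(
	"grafana_alerting_alertmanager_alerts",
	"How many alerts by state are in Grafana's Alertmanager.",
	[]string{"org", "state"}, nil,
)

// collectMarkerAlerts gathers "alertmanager_alerts" from each org-scoped
// registry and re-emits every sample with the org ID attached as a label.
func collectMarkerAlerts(out chan<- prometheus.Metric, regs map[string]prometheus.Gatherer) error {
	for org, reg := range regs {
		families, err := reg.Gather()
		if err != nil {
			return err
		}
		for _, mf := range families {
			if mf.GetName() != "alertmanager_alerts" {
				continue
			}
			for _, m := range mf.GetMetric() {
				var state string
				for _, lp := range m.GetLabel() {
					if lp.GetName() == "state" {
						state = lp.GetValue()
					}
				}
				out <- prometheus.MustNewConstMetric(
					markerAlertsDesc, prometheus.GaugeValue,
					m.GetGauge().GetValue(), org, state,
				)
			}
		}
	}
	return nil
}

func main() {
	// Hypothetical org-scoped registry holding the upstream Alertmanager gauge.
	reg := prometheus.NewRegistry()
	alerts := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "alertmanager_alerts",
		Help: "How many alerts by state.",
	}, []string{"state"})
	reg.MustRegister(alerts)
	alerts.WithLabelValues("active").Set(3)

	out := make(chan prometheus.Metric, 8)
	if err := collectMarkerAlerts(out, map[string]prometheus.Gatherer{"1": reg}); err != nil {
		panic(err)
	}
	close(out)
	for m := range out {
		fmt.Println(m.Desc())
	}
}
```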


@@ -41,7 +41,7 @@ func (c *cache) RegisterMetrics(r prometheus.Registerer) {
Namespace: metrics.Namespace,
Subsystem: metrics.Subsystem,
Name: "alerts",
Help: "How many alerts by state.",
Help: "How many alerts by state are in the scheduler.",
ConstLabels: prometheus.Labels{"state": strings.ToLower(state.String())},
}, func() float64 {
return c.countAlertsBy(state)
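
For context, the Help string changed above belongs to the scheduler-side `grafana_alerting_alerts` gauge, which is registered once per state with the state baked in as a constant label. A condensed sketch of that pattern (simplified, assumed types; not Grafana's actual cache or state set):

```go
package main

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

type evalState string

// A reduced state set for illustration; Grafana defines more states.
var allStates = []evalState{"Normal", "Pending", "Alerting"}

type cache struct {
	counts map[evalState]float64 // stand-in for the cache's per-state alert counts
}

func (c *cache) countAlertsBy(s evalState) float64 { return c.counts[s] }

// RegisterMetrics registers one grafana_alerting_alerts gauge per state,
// each reporting the current count for that state via a closure.
func (c *cache) RegisterMetrics(r prometheus.Registerer) {
	for _, s := range allStates {
		s := s // capture the loop variable for the closure below
		r.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Namespace:   "grafana",
			Subsystem:   "alerting",
			Name:        "alerts",
			Help:        "How many alerts by state are in the scheduler.",
			ConstLabels: prometheus.Labels{"state": strings.ToLower(string(s))},
		}, func() float64 { return c.countAlertsBy(s) }))
	}
}

func main() {
	c := &cache{counts: map[evalState]float64{"Alerting": 2}}
	c.RegisterMetrics(prometheus.NewRegistry())
}
```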


@@ -20,7 +20,6 @@ import (
var (
ResendDelay = 30 * time.Second
MetricsScrapeInterval = 15 * time.Second // TODO: parameterize? // Setting to a reasonable default scrape interval for Prometheus.
)
// AlertInstanceManager defines the interface for querying the current alert instances.