From e8771745012ab0257e827adbd41a1eaa7edd6996 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Mon, 2 Oct 2023 16:36:23 +0100 Subject: [PATCH] Alerting: Expose metrics for Alertmanager Alerts - `grafana_alerting_alertmanager_alerts` (#75802) * Alerting: Expose metrics for Alertmanager Alerts In Grafana, the alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted to not map the alertmanager alerts metric directly. I think it's important that we make a distinction between alerts that happen at evaluation vs alerts that are received for delivery by the internal Alertmanager as we have options to skip the delivery of these alerts to the internal Alertmanager altogether. --- .../ngalert/metrics/multi_org_alertmanager.go | 12 ++++++++++++ pkg/services/ngalert/state/cache.go | 2 +- pkg/services/ngalert/state/manager.go | 3 +-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pkg/services/ngalert/metrics/multi_org_alertmanager.go b/pkg/services/ngalert/metrics/multi_org_alertmanager.go index 65999dbb48b..150019f774d 100644 --- a/pkg/services/ngalert/metrics/multi_org_alertmanager.go +++ b/pkg/services/ngalert/metrics/multi_org_alertmanager.go @@ -94,6 +94,9 @@ type AlertmanagerAggregatedMetrics struct { nflogQueryDuration *prometheus.Desc nflogPropagatedMessagesTotal *prometheus.Desc + // exporter metrics, gathered from the Alertmanager Alert Marker. 
+ markerAlerts *prometheus.Desc + // exported metrics, gathered from Alertmanager Silences silencesGCDuration *prometheus.Desc silencesSnapshotDuration *prometheus.Desc @@ -187,6 +190,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale "Number of received gossip messages that have been further gossiped.", nil, nil), + markerAlerts: prometheus.NewDesc( + fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem), + "How many alerts by state are in Grafana's Alertmanager.", + []string{"org", "state"}, nil), + silencesGCDuration: prometheus.NewDesc( fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem), "Duration of the last silence garbage collection cycle.", @@ -270,6 +278,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) { out <- a.nflogQueryDuration out <- a.nflogPropagatedMessagesTotal + out <- a.markerAlerts + out <- a.silencesGCDuration out <- a.silencesSnapshotDuration out <- a.silencesSnapshotSize @@ -310,6 +320,8 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds") data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total") + data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state") + data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds") data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds") data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes") diff --git a/pkg/services/ngalert/state/cache.go b/pkg/services/ngalert/state/cache.go index 008eb889278..7f7e7f510f5 100644 --- a/pkg/services/ngalert/state/cache.go +++ b/pkg/services/ngalert/state/cache.go @@ -41,7 +41,7 @@ func (c *cache) RegisterMetrics(r prometheus.Registerer) 
{ Namespace: metrics.Namespace, Subsystem: metrics.Subsystem, Name: "alerts", - Help: "How many alerts by state.", + Help: "How many alerts by state are in the scheduler.", ConstLabels: prometheus.Labels{"state": strings.ToLower(state.String())}, }, func() float64 { return c.countAlertsBy(state) diff --git a/pkg/services/ngalert/state/manager.go b/pkg/services/ngalert/state/manager.go index df722d3d10b..004fee270b7 100644 --- a/pkg/services/ngalert/state/manager.go +++ b/pkg/services/ngalert/state/manager.go @@ -19,8 +19,7 @@ import ( ) var ( - ResendDelay = 30 * time.Second - MetricsScrapeInterval = 15 * time.Second // TODO: parameterize? // Setting to a reasonable default scrape interval for Prometheus. + ResendDelay = 30 * time.Second ) // AlertInstanceManager defines the interface for querying the current alert instances.