Alerting: Expose metrics for Alertmanager Alerts - grafana_alerting_alertmanager_alerts (#75802)

* Alerting: Expose metrics for Alertmanager Alerts

In Grafana, alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly.

I think it's important that we distinguish between alerts produced at evaluation time and alerts received for delivery by the internal Alertmanager, since we have options to skip delivering these alerts to the internal Alertmanager altogether.
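
To make the distinction concrete, below is a minimal, self-contained sketch using plain client_golang (not Grafana's actual wiring); the `main` function, registry setup, and sample values are purely illustrative, while the metric names and help strings mirror the ones touched in this change:

```go
package main

import (
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	reg := prometheus.NewRegistry()

	// Alerts as counted by the evaluation side (the scheduler's state cache).
	evalAlerts := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana", Subsystem: "alerting", Name: "alerts",
		Help: "How many alerts by state are in the scheduler.",
	}, []string{"state"})

	// Alerts held by the embedded Alertmanager for delivery.
	amAlerts := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "grafana", Subsystem: "alerting", Name: "alertmanager_alerts",
		Help: "How many alerts by state are in Grafana's Alertmanager.",
	}, []string{"org", "state"})

	reg.MustRegister(evalAlerts, amAlerts)

	// Illustrative values only: with delivery routed to an external
	// Alertmanager, the second family can sit at zero while the first
	// still reports firing alerts.
	evalAlerts.WithLabelValues("alerting").Set(3)
	amAlerts.WithLabelValues("1", "active").Set(0)

	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
	_ = http.ListenAndServe(":9090", nil)
}
```

Keeping the two families under distinct names is what lets a dashboard tell "alerts firing at evaluation" apart from "alerts actually handed to the internal Alertmanager".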
gotjosh 2023-10-02 16:36:23 +01:00 committed by GitHub
parent ed7d29f2b9
commit e877174501
3 changed files with 14 additions and 3 deletions


@@ -94,6 +94,9 @@ type AlertmanagerAggregatedMetrics struct {
 	nflogQueryDuration           *prometheus.Desc
 	nflogPropagatedMessagesTotal *prometheus.Desc
+
+	// exported metrics, gathered from the Alertmanager Alert Marker.
+	markerAlerts *prometheus.Desc
 	// exported metrics, gathered from Alertmanager Silences
 	silencesGCDuration       *prometheus.Desc
 	silencesSnapshotDuration *prometheus.Desc
@@ -187,6 +190,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
 			"Number of received gossip messages that have been further gossiped.",
 			nil, nil),
+
+		markerAlerts: prometheus.NewDesc(
+			fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
+			"How many alerts by state are in Grafana's Alertmanager.",
+			[]string{"org", "state"}, nil),
 		silencesGCDuration: prometheus.NewDesc(
 			fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
 			"Duration of the last silence garbage collection cycle.",
@@ -270,6 +278,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
 	out <- a.nflogQueryDuration
 	out <- a.nflogPropagatedMessagesTotal
+
+	out <- a.markerAlerts
 	out <- a.silencesGCDuration
 	out <- a.silencesSnapshotDuration
 	out <- a.silencesSnapshotSize
@@ -310,6 +320,8 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
 	data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
+
+	data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
 	data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
 	data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
 	data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
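
The `markerAlerts` descriptor added above is filled in from per-tenant registries by Grafana's aggregation helpers (`SendSumOfGaugesPerTenantWithLabels`). As a rough sketch of the same Describe/Collect pattern, simplified and not Grafana's code (the `markerAlertsCollector` type and `countsFn` hook are hypothetical), it boils down to emitting one const metric per org/state pair:

```go
package metricsketch

import "github.com/prometheus/client_golang/prometheus"

// markerAlertsCollector re-exports per-org alert counts under a single Desc
// with "org" and "state" labels, mirroring the shape of the metric above.
type markerAlertsCollector struct {
	desc *prometheus.Desc
	// countsFn is a hypothetical hook returning alert counts keyed by org, then state.
	countsFn func() map[string]map[string]float64
}

func NewMarkerAlertsCollector(countsFn func() map[string]map[string]float64) prometheus.Collector {
	return &markerAlertsCollector{
		desc: prometheus.NewDesc(
			"grafana_alerting_alertmanager_alerts",
			"How many alerts by state are in Grafana's Alertmanager.",
			[]string{"org", "state"}, nil),
		countsFn: countsFn,
	}
}

func (c *markerAlertsCollector) Describe(out chan<- *prometheus.Desc) {
	out <- c.desc
}

func (c *markerAlertsCollector) Collect(out chan<- prometheus.Metric) {
	for org, byState := range c.countsFn() {
		for state, n := range byState {
			out <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, n, org, state)
		}
	}
}
```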


@@ -41,7 +41,7 @@ func (c *cache) RegisterMetrics(r prometheus.Registerer) {
 			Namespace:   metrics.Namespace,
 			Subsystem:   metrics.Subsystem,
 			Name:        "alerts",
-			Help:        "How many alerts by state.",
+			Help:        "How many alerts by state are in the scheduler.",
 			ConstLabels: prometheus.Labels{"state": strings.ToLower(state.String())},
 		}, func() float64 {
 			return c.countAlertsBy(state)
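
For the evaluation side, the help text now spells out that this family is scoped to the scheduler. The registration pattern it belongs to is one GaugeFunc per state, with the state baked in as a const label and the value computed at scrape time; here is a minimal sketch of that pattern (the `states` slice and `count` callback are stand-ins for Grafana's internal state cache, not its real API):

```go
package cachesketch

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

// registerAlertGauges registers one gauge per state under the same metric
// name, distinguished by a "state" const label, each evaluated lazily at
// scrape time via the count callback.
func registerAlertGauges(r prometheus.Registerer, states []string, count func(state string) float64) {
	for _, state := range states {
		state := state // capture the loop variable for the closure below
		r.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Namespace:   "grafana",
			Subsystem:   "alerting",
			Name:        "alerts",
			Help:        "How many alerts by state are in the scheduler.",
			ConstLabels: prometheus.Labels{"state": strings.ToLower(state)},
		}, func() float64 {
			return count(state)
		}))
	}
}
```

Registering several gauges under one name works here because each carries a different `state` const label, so the registry treats them as distinct series.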


@@ -19,8 +19,7 @@ import (
 )

 var (
 	ResendDelay = 30 * time.Second
-	MetricsScrapeInterval = 15 * time.Second // TODO: parameterize? // Setting to a reasonable default scrape interval for Prometheus.
 )

 // AlertInstanceManager defines the interface for querying the current alert instances.