Alerting: Expose metrics for Alertmanager Alerts - grafana_alerting_alertmanager_alerts (#75802)

* Alerting: Expose metrics for Alertmanager Alerts

In Grafana, alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly.

I think it's important that we make a distinction between alerts produced at evaluation time and alerts received for delivery by the internal Alertmanager, as we have options to skip delivering these alerts to the internal Alertmanager altogether.
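For illustration only, a scrape of Grafana's `/metrics` endpoint would then expose both families side by side; the sample below is hand-written, and the label values (`org="1"`, the state names) are assumptions rather than output from this commit:

```
# HELP grafana_alerting_alerts How many alerts by state are in the scheduler.
grafana_alerting_alerts{state="alerting"} 2
# HELP grafana_alerting_alertmanager_alerts How many alerts by state are in Grafana's Alertmanager.
grafana_alerting_alertmanager_alerts{org="1",state="active"} 2
```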
gotjosh 2023-10-02 16:36:23 +01:00 committed by GitHub
parent ed7d29f2b9
commit e877174501
3 changed files with 14 additions and 3 deletions


@@ -94,6 +94,9 @@ type AlertmanagerAggregatedMetrics struct {
nflogQueryDuration *prometheus.Desc
nflogPropagatedMessagesTotal *prometheus.Desc
// exported metrics, gathered from the Alertmanager Alert Marker.
markerAlerts *prometheus.Desc
// exported metrics, gathered from Alertmanager Silences
silencesGCDuration *prometheus.Desc
silencesSnapshotDuration *prometheus.Desc
@@ -187,6 +190,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
"Number of received gossip messages that have been further gossiped.",
nil, nil),
markerAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
"How many alerts by state are in Grafana's Alertmanager.",
[]string{"org", "state"}, nil),
silencesGCDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
"Duration of the last silence garbage collection cycle.",
@@ -270,6 +278,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a.nflogQueryDuration
out <- a.nflogPropagatedMessagesTotal
out <- a.markerAlerts
out <- a.silencesGCDuration
out <- a.silencesSnapshotDuration
out <- a.silencesSnapshotSize
@@ -310,6 +320,8 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
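
As a rough sketch of what the per-tenant aggregation does conceptually (this is not the actual `SendSumOfGaugesPerTenantWithLabels` implementation; the registry layout and function names below are assumptions for illustration), re-exporting each org's `alertmanager_alerts` gauge with `org` and `state` labels could look like this in Go:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// markerAlertsDesc mirrors the Desc added above: one gauge per org and state.
var markerAlertsDesc = prometheus.NewDesc(
	"grafana_alerting_alertmanager_alerts",
	"How many alerts by state are in Grafana's Alertmanager.",
	[]string{"org", "state"}, nil,
)

// collectMarkerAlerts gathers "alertmanager_alerts" from each org-scoped
// registry and re-emits every sample with the org ID attached as a label.
func collectMarkerAlerts(out chan<- prometheus.Metric, regs map[string]prometheus.Gatherer) error {
	for org, reg := range regs {
		families, err := reg.Gather()
		if err != nil {
			return err
		}
		for _, mf := range families {
			if mf.GetName() != "alertmanager_alerts" {
				continue
			}
			for _, m := range mf.GetMetric() {
				var state string
				for _, lp := range m.GetLabel() {
					if lp.GetName() == "state" {
						state = lp.GetValue()
					}
				}
				out <- prometheus.MustNewConstMetric(
					markerAlertsDesc, prometheus.GaugeValue,
					m.GetGauge().GetValue(), org, state,
				)
			}
		}
	}
	return nil
}

func main() {
	// Hypothetical org-scoped registry holding the upstream Alertmanager gauge.
	reg := prometheus.NewRegistry()
	alerts := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: "alertmanager_alerts",
		Help: "How many alerts by state.",
	}, []string{"state"})
	reg.MustRegister(alerts)
	alerts.WithLabelValues("active").Set(3)

	out := make(chan prometheus.Metric, 8)
	if err := collectMarkerAlerts(out, map[string]prometheus.Gatherer{"1": reg}); err != nil {
		panic(err)
	}
	close(out)
	for m := range out {
		fmt.Println(m.Desc())
	}
}
```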


@@ -41,7 +41,7 @@ func (c *cache) RegisterMetrics(r prometheus.Registerer) {
Namespace: metrics.Namespace,
Subsystem: metrics.Subsystem,
Name: "alerts",
Help: "How many alerts by state.",
Help: "How many alerts by state are in the scheduler.",
ConstLabels: prometheus.Labels{"state": strings.ToLower(state.String())},
}, func() float64 {
return c.countAlertsBy(state)
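
For context, the Help string changed above belongs to the scheduler-side `grafana_alerting_alerts` gauge, which is registered once per state with the state baked in as a constant label. A condensed sketch of that pattern (simplified, assumed types; not Grafana's actual cache or state set):

```go
package main

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

type evalState string

// A reduced state set for illustration; Grafana defines more states.
var allStates = []evalState{"Normal", "Pending", "Alerting"}

type cache struct {
	counts map[evalState]float64 // stand-in for the cache's per-state alert counts
}

func (c *cache) countAlertsBy(s evalState) float64 { return c.counts[s] }

// RegisterMetrics registers one grafana_alerting_alerts gauge per state,
// each reporting the current count for that state via a closure.
func (c *cache) RegisterMetrics(r prometheus.Registerer) {
	for _, s := range allStates {
		s := s // capture the loop variable for the closure below
		r.MustRegister(prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Namespace:   "grafana",
			Subsystem:   "alerting",
			Name:        "alerts",
			Help:        "How many alerts by state are in the scheduler.",
			ConstLabels: prometheus.Labels{"state": strings.ToLower(string(s))},
		}, func() float64 { return c.countAlertsBy(s) }))
	}
}

func main() {
	c := &cache{counts: map[evalState]float64{"Alerting": 2}}
	c.RegisterMetrics(prometheus.NewRegistry())
}
```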


@@ -20,7 +20,6 @@ import (
var (
ResendDelay = 30 * time.Second
MetricsScrapeInterval = 15 * time.Second // TODO: parameterize? // Setting to a reasonable default scrape interval for Prometheus.
)
// AlertInstanceManager defines the interface for querying the current alert instances.