Alerting: Expose metrics for Alertmanager Alerts - grafana_alerting_alertmanager_alerts (#75802)
* Alerting: Expose metrics for Alertmanager Alerts

In Grafana, alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly. I think it's important that we make a distinction between alerts produced at evaluation time and alerts received for delivery by the internal Alertmanager, since we have options to skip delivering these alerts to the internal Alertmanager altogether.
parent ed7d29f2b9
commit e877174501
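For illustration, here is a minimal, self-contained sketch of the same idea using plain client_golang: a collector that reports an alerts-by-state gauge per org. The collector name, the alertCounts type, and the sample numbers are hypothetical; the actual change (diff below) reads these counts from the Alertmanager alert marker and aggregates them across orgs with Grafana's TenantRegistries helpers.

// Hypothetical sketch, not Grafana code: expose an alerts-by-state gauge per org.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// alertCounts stands in for the Alertmanager alert marker: number of alerts
// per state (e.g. "active", "suppressed") for a single org.
type alertCounts map[string]float64

type alertmanagerAlertsCollector struct {
	desc        *prometheus.Desc
	countsByOrg map[string]alertCounts // example data; Grafana reads this from each org's Alertmanager
}

func newAlertmanagerAlertsCollector(countsByOrg map[string]alertCounts) *alertmanagerAlertsCollector {
	return &alertmanagerAlertsCollector{
		desc: prometheus.NewDesc(
			"grafana_alerting_alertmanager_alerts",
			"How many alerts by state are in Grafana's Alertmanager.",
			[]string{"org", "state"}, nil,
		),
		countsByOrg: countsByOrg,
	}
}

func (c *alertmanagerAlertsCollector) Describe(out chan<- *prometheus.Desc) {
	out <- c.desc
}

func (c *alertmanagerAlertsCollector) Collect(out chan<- prometheus.Metric) {
	for org, counts := range c.countsByOrg {
		for state, n := range counts {
			out <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, n, org, state)
		}
	}
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(newAlertmanagerAlertsCollector(map[string]alertCounts{
		"1": {"active": 3, "suppressed": 1},
	}))

	// Gather once and print each series with its labels and value.
	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), m.GetLabel(), m.GetGauge().GetValue())
		}
	}
}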
@@ -94,6 +94,9 @@ type AlertmanagerAggregatedMetrics struct {
 	nflogQueryDuration           *prometheus.Desc
 	nflogPropagatedMessagesTotal *prometheus.Desc
 
+	// exporter metrics, gathered from the Alertmanager Alert Marker.
+	markerAlerts *prometheus.Desc
+
 	// exported metrics, gathered from Alertmanager Silences
 	silencesGCDuration       *prometheus.Desc
 	silencesSnapshotDuration *prometheus.Desc
@@ -187,6 +190,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
 			"Number of received gossip messages that have been further gossiped.",
 			nil, nil),
 
+		markerAlerts: prometheus.NewDesc(
+			fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
+			"How many alerts by state are in Grafana's Alertmanager.",
+			[]string{"org", "state"}, nil),
+
 		silencesGCDuration: prometheus.NewDesc(
 			fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
 			"Duration of the last silence garbage collection cycle.",
@@ -270,6 +278,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
 	out <- a.nflogQueryDuration
 	out <- a.nflogPropagatedMessagesTotal
 
+	out <- a.markerAlerts
+
 	out <- a.silencesGCDuration
 	out <- a.silencesSnapshotDuration
 	out <- a.silencesSnapshotSize
@@ -310,6 +320,8 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
 	data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
 
+	data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
+
 	data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
 	data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
 	data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
@@ -41,7 +41,7 @@ func (c *cache) RegisterMetrics(r prometheus.Registerer) {
 		Namespace: metrics.Namespace,
 		Subsystem: metrics.Subsystem,
 		Name: "alerts",
-		Help: "How many alerts by state.",
+		Help: "How many alerts by state are in the scheduler.",
 		ConstLabels: prometheus.Labels{"state": strings.ToLower(state.String())},
 	}, func() float64 {
 		return c.countAlertsBy(state)
@@ -20,7 +20,6 @@ import (
 
 var (
 	ResendDelay = 30 * time.Second
-	MetricsScrapeInterval = 15 * time.Second // TODO: parameterize? // Setting to a reasonable default scrape interval for Prometheus.
 )
 
 // AlertInstanceManager defines the interface for querying the current alert instances.