mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
Alerting: Add metrics around notification delivery. (#62778)
This change exposes more metrics from the embedded Alertmanager, which are valuable for troubleshooting Alertmanager operation, particularly in HA setups.

```
grafana_alerting_notifications_total
grafana_alerting_notifications_failed_total
grafana_alerting_notification_requests_total
grafana_alerting_notification_requests_failed_total
grafana_alerting_notification_latency_seconds
grafana_alerting_nflog_gc_duration_seconds
grafana_alerting_nflog_snapshot_duration_seconds
grafana_alerting_nflog_snapshot_size_bytes
grafana_alerting_nflog_queries_total
grafana_alerting_nflog_query_errors_total
grafana_alerting_nflog_query_duration_seconds
grafana_alerting_nflog_gossip_messages_propagated_total
grafana_alerting_dispatcher_aggregation_groups
grafana_alerting_dispatcher_alert_processing_duration_seconds
```

Note that `alertmanager_dispatcher_aggregation_group_limit_reached_total` is explicitly not exposed, as the group limit metrics are not enabled.
This commit is contained in:
@@ -72,6 +72,22 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg
|
|||||||
type AlertmanagerAggregatedMetrics struct {
|
type AlertmanagerAggregatedMetrics struct {
|
||||||
registries *metrics.TenantRegistries
|
registries *metrics.TenantRegistries
|
||||||
|
|
||||||
|
// exported metrics, gathered from Alertmanager PipelineBuilder
|
||||||
|
numNotifications *prometheus.Desc
|
||||||
|
numFailedNotifications *prometheus.Desc
|
||||||
|
numNotificationRequestsTotal *prometheus.Desc
|
||||||
|
numNotificationRequestsFailedTotal *prometheus.Desc
|
||||||
|
notificationLatencySeconds *prometheus.Desc
|
||||||
|
|
||||||
|
// exported metrics, gathered from Alertmanager nflog
|
||||||
|
nflogGCDuration *prometheus.Desc
|
||||||
|
nflogSnapshotDuration *prometheus.Desc
|
||||||
|
nflogSnapshotSize *prometheus.Desc
|
||||||
|
nflogQueriesTotal *prometheus.Desc
|
||||||
|
nflogQueryErrorsTotal *prometheus.Desc
|
||||||
|
nflogQueryDuration *prometheus.Desc
|
||||||
|
nflogPropagatedMessagesTotal *prometheus.Desc
|
||||||
|
|
||||||
// exported metrics, gathered from Alertmanager Silences
|
// exported metrics, gathered from Alertmanager Silences
|
||||||
silencesGCDuration *prometheus.Desc
|
silencesGCDuration *prometheus.Desc
|
||||||
silencesSnapshotDuration *prometheus.Desc
|
silencesSnapshotDuration *prometheus.Desc
|
||||||
@@ -81,12 +97,66 @@ type AlertmanagerAggregatedMetrics struct {
|
|||||||
silencesQueryDuration *prometheus.Desc
|
silencesQueryDuration *prometheus.Desc
|
||||||
silences *prometheus.Desc
|
silences *prometheus.Desc
|
||||||
silencesPropagatedMessagesTotal *prometheus.Desc
|
silencesPropagatedMessagesTotal *prometheus.Desc
|
||||||
|
|
||||||
|
// exported metrics, gathered from Alertmanager Dispatcher
|
||||||
|
dispatchAggrGroups *prometheus.Desc
|
||||||
|
dispatchProcessingDuration *prometheus.Desc
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
|
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
|
||||||
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
|
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
|
||||||
registries: registries,
|
registries: registries,
|
||||||
|
|
||||||
|
numNotifications: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem),
|
||||||
|
"The total number of attempted notifications.",
|
||||||
|
[]string{"org", "integration"}, nil),
|
||||||
|
numFailedNotifications: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_notifications_failed_total", Namespace, Subsystem),
|
||||||
|
"The total number of failed notifications.",
|
||||||
|
[]string{"org", "integration"}, nil),
|
||||||
|
numNotificationRequestsTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_notification_requests_total", Namespace, Subsystem),
|
||||||
|
"The total number of attempted notification requests.",
|
||||||
|
[]string{"org", "integration"}, nil),
|
||||||
|
numNotificationRequestsFailedTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_notification_requests_failed_total", Namespace, Subsystem),
|
||||||
|
"The total number of failed notification requests.",
|
||||||
|
[]string{"org", "integration"}, nil),
|
||||||
|
notificationLatencySeconds: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_notification_latency_seconds", Namespace, Subsystem),
|
||||||
|
"The latency of notifications in seconds.",
|
||||||
|
nil, nil),
|
||||||
|
|
||||||
|
nflogGCDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_gc_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of the last notification log garbage collection cycle.",
|
||||||
|
nil, nil),
|
||||||
|
nflogSnapshotDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_snapshot_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of the last notification log snapshot.",
|
||||||
|
nil, nil),
|
||||||
|
nflogSnapshotSize: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_snapshot_size_bytes", Namespace, Subsystem),
|
||||||
|
"Size of the last notification log snapshot in bytes.",
|
||||||
|
nil, nil),
|
||||||
|
nflogQueriesTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_queries_total", Namespace, Subsystem),
|
||||||
|
"Number of notification log queries were received.",
|
||||||
|
nil, nil),
|
||||||
|
nflogQueryErrorsTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_query_errors_total", Namespace, Subsystem),
|
||||||
|
"Number notification log received queries that failed.",
|
||||||
|
nil, nil),
|
||||||
|
nflogQueryDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_query_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Duration of notification log query evaluation.",
|
||||||
|
nil, nil),
|
||||||
|
nflogPropagatedMessagesTotal: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_nflog_gossip_messages_propagated_total", Namespace, Subsystem),
|
||||||
|
"Number of received gossip messages that have been further gossiped.",
|
||||||
|
nil, nil),
|
||||||
|
|
||||||
silencesGCDuration: prometheus.NewDesc(
|
silencesGCDuration: prometheus.NewDesc(
|
||||||
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
|
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
|
||||||
"Duration of the last silence garbage collection cycle.",
|
"Duration of the last silence garbage collection cycle.",
|
||||||
@@ -119,12 +189,35 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale
|
|||||||
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
|
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
|
||||||
"How many silences by state.",
|
"How many silences by state.",
|
||||||
[]string{"org", "state"}, nil),
|
[]string{"org", "state"}, nil),
|
||||||
|
|
||||||
|
dispatchAggrGroups: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_dispatcher_aggregation_groups", Namespace, Subsystem),
|
||||||
|
"Number of active aggregation groups",
|
||||||
|
nil, nil),
|
||||||
|
dispatchProcessingDuration: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_%s_dispatcher_alert_processing_duration_seconds", Namespace, Subsystem),
|
||||||
|
"Summary of latencies for the processing of alerts.",
|
||||||
|
nil, nil),
|
||||||
}
|
}
|
||||||
|
|
||||||
return aggregatedMetrics
|
return aggregatedMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
||||||
|
out <- a.numNotifications
|
||||||
|
out <- a.numFailedNotifications
|
||||||
|
out <- a.numNotificationRequestsTotal
|
||||||
|
out <- a.numNotificationRequestsFailedTotal
|
||||||
|
out <- a.notificationLatencySeconds
|
||||||
|
|
||||||
|
out <- a.nflogGCDuration
|
||||||
|
out <- a.nflogSnapshotDuration
|
||||||
|
out <- a.nflogSnapshotSize
|
||||||
|
out <- a.nflogQueriesTotal
|
||||||
|
out <- a.nflogQueryErrorsTotal
|
||||||
|
out <- a.nflogQueryDuration
|
||||||
|
out <- a.nflogPropagatedMessagesTotal
|
||||||
|
|
||||||
out <- a.silencesGCDuration
|
out <- a.silencesGCDuration
|
||||||
out <- a.silencesSnapshotDuration
|
out <- a.silencesSnapshotDuration
|
||||||
out <- a.silencesSnapshotSize
|
out <- a.silencesSnapshotSize
|
||||||
@@ -133,11 +226,28 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
|||||||
out <- a.silencesQueryDuration
|
out <- a.silencesQueryDuration
|
||||||
out <- a.silencesPropagatedMessagesTotal
|
out <- a.silencesPropagatedMessagesTotal
|
||||||
out <- a.silences
|
out <- a.silences
|
||||||
|
|
||||||
|
out <- a.dispatchAggrGroups
|
||||||
|
out <- a.dispatchProcessingDuration
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
||||||
data := a.registries.BuildMetricFamiliesPerTenant()
|
data := a.registries.BuildMetricFamiliesPerTenant()
|
||||||
|
|
||||||
|
data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
||||||
|
data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
||||||
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
||||||
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
||||||
|
data.SendSumOfHistograms(out, a.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
|
||||||
|
|
||||||
|
data.SendSumOfSummaries(out, a.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
|
||||||
|
data.SendSumOfSummaries(out, a.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
|
||||||
|
data.SendSumOfGauges(out, a.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
|
||||||
|
data.SendSumOfCounters(out, a.nflogQueriesTotal, "alertmanager_nflog_queries_total")
|
||||||
|
data.SendSumOfCounters(out, a.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
|
||||||
|
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
|
||||||
|
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
|
||||||
|
|
||||||
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
|
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
|
||||||
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
|
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
|
||||||
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
|
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
|
||||||
@@ -146,4 +256,7 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
|||||||
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
|
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
|
||||||
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
|
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
|
||||||
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
|
||||||
|
|
||||||
|
data.SendSumOfGauges(out, a.dispatchAggrGroups, "alertmanager_dispatcher_aggregation_groups")
|
||||||
|
data.SendSumOfSummaries(out, a.dispatchProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds")
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user