mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
* Alerting: Expose metrics for Alertmanager Alerts. In Grafana, the alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly. I think it's important that we make a distinction between alerts that happen at evaluation and alerts that are received for delivery by the internal Alertmanager, as we have options to skip the delivery of these alerts to the internal Alertmanager altogether.
342 lines
15 KiB
Go
342 lines
15 KiB
Go
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
|
|
"github.com/grafana/grafana/pkg/infra/log"
|
|
|
|
"github.com/grafana/dskit/metrics"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
type MultiOrgAlertmanager struct {
|
|
Registerer prometheus.Registerer
|
|
registries *metrics.TenantRegistries
|
|
|
|
ActiveConfigurations prometheus.Gauge
|
|
DiscoveredConfigurations prometheus.Gauge
|
|
|
|
aggregatedMetrics *AlertmanagerAggregatedMetrics
|
|
}
|
|
|
|
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
|
registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not.
|
|
moa := &MultiOrgAlertmanager{
|
|
Registerer: r,
|
|
registries: registries,
|
|
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: Namespace,
|
|
Subsystem: Subsystem,
|
|
Name: "discovered_configurations",
|
|
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
|
|
}),
|
|
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: Namespace,
|
|
Subsystem: Subsystem,
|
|
Name: "active_configurations",
|
|
Help: "The number of active Alertmanager configurations.",
|
|
}),
|
|
aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries),
|
|
}
|
|
|
|
// These metrics use a different registration method as the struct itself represents a custom collector.
|
|
// There's no way to "auto-register" a collector.
|
|
r.MustRegister(moa.aggregatedMetrics)
|
|
|
|
return moa
|
|
}
|
|
|
|
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
|
moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false)
|
|
}
|
|
|
|
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
|
sid := strconv.FormatInt(id, 10)
|
|
reg := moa.registries.GetRegistryForTenant(sid)
|
|
if reg != nil {
|
|
return reg
|
|
}
|
|
|
|
result := prometheus.NewRegistry()
|
|
moa.registries.AddTenantRegistry(sid, result)
|
|
|
|
return result
|
|
}
|
|
|
|
// AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
|
|
// Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection.
|
|
type AlertmanagerAggregatedMetrics struct {
|
|
registries *metrics.TenantRegistries
|
|
|
|
// metrics gather from the in-house "Alertmanager" directly.
|
|
numReceivedAlerts *prometheus.Desc
|
|
numInvalidAlerts *prometheus.Desc
|
|
configuredReceivers *prometheus.Desc
|
|
configuredIntegrations *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager PipelineBuilder
|
|
numNotifications *prometheus.Desc
|
|
numFailedNotifications *prometheus.Desc
|
|
numNotificationRequestsTotal *prometheus.Desc
|
|
numNotificationRequestsFailedTotal *prometheus.Desc
|
|
notificationLatencySeconds *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager nflog
|
|
nflogGCDuration *prometheus.Desc
|
|
nflogSnapshotDuration *prometheus.Desc
|
|
nflogSnapshotSize *prometheus.Desc
|
|
nflogQueriesTotal *prometheus.Desc
|
|
nflogQueryErrorsTotal *prometheus.Desc
|
|
nflogQueryDuration *prometheus.Desc
|
|
nflogPropagatedMessagesTotal *prometheus.Desc
|
|
|
|
// exporter metrics, gathered from the Alertmanager Alert Marker.
|
|
markerAlerts *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager Silences
|
|
silencesGCDuration *prometheus.Desc
|
|
silencesSnapshotDuration *prometheus.Desc
|
|
silencesSnapshotSize *prometheus.Desc
|
|
silencesQueriesTotal *prometheus.Desc
|
|
silencesQueryErrorsTotal *prometheus.Desc
|
|
silencesQueryDuration *prometheus.Desc
|
|
silences *prometheus.Desc
|
|
silencesPropagatedMessagesTotal *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager Dispatcher
|
|
dispatchAggrGroups *prometheus.Desc
|
|
dispatchProcessingDuration *prometheus.Desc
|
|
|
|
// added to measure usage of matchers, match_re, match and
|
|
// object_matchers
|
|
matchers *prometheus.Desc
|
|
matchRE *prometheus.Desc
|
|
match *prometheus.Desc
|
|
objectMatchers *prometheus.Desc
|
|
}
|
|
|
|
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
|
|
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
|
|
registries: registries,
|
|
|
|
numReceivedAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alerts_received_total", Namespace, Subsystem),
|
|
"The total number of received alerts.",
|
|
[]string{"org", "status"}, nil),
|
|
numInvalidAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alerts_invalid_total", Namespace, Subsystem),
|
|
"The total number of received alerts that were invalid.",
|
|
[]string{"org"}, nil),
|
|
configuredReceivers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_receivers", Namespace, Subsystem),
|
|
"Number of configured receivers by state. It is considered active if used within a route.",
|
|
[]string{"org", "state"}, nil),
|
|
configuredIntegrations: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_integrations", Namespace, Subsystem),
|
|
"Number of configured receivers.",
|
|
[]string{"org", "type"}, nil),
|
|
|
|
numNotifications: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem),
|
|
"The total number of attempted notifications.",
|
|
[]string{"org", "integration"}, nil),
|
|
numFailedNotifications: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notifications_failed_total", Namespace, Subsystem),
|
|
"The total number of failed notifications.",
|
|
[]string{"org", "integration"}, nil),
|
|
numNotificationRequestsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_requests_total", Namespace, Subsystem),
|
|
"The total number of attempted notification requests.",
|
|
[]string{"org", "integration"}, nil),
|
|
numNotificationRequestsFailedTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_requests_failed_total", Namespace, Subsystem),
|
|
"The total number of failed notification requests.",
|
|
[]string{"org", "integration"}, nil),
|
|
notificationLatencySeconds: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_latency_seconds", Namespace, Subsystem),
|
|
"The latency of notifications in seconds.",
|
|
nil, nil),
|
|
|
|
nflogGCDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_gc_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last notification log garbage collection cycle.",
|
|
nil, nil),
|
|
nflogSnapshotDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_snapshot_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last notification log snapshot.",
|
|
nil, nil),
|
|
nflogSnapshotSize: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_snapshot_size_bytes", Namespace, Subsystem),
|
|
"Size of the last notification log snapshot in bytes.",
|
|
nil, nil),
|
|
nflogQueriesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_queries_total", Namespace, Subsystem),
|
|
"Number of notification log queries were received.",
|
|
nil, nil),
|
|
nflogQueryErrorsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_query_errors_total", Namespace, Subsystem),
|
|
"Number notification log received queries that failed.",
|
|
nil, nil),
|
|
nflogQueryDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_query_duration_seconds", Namespace, Subsystem),
|
|
"Duration of notification log query evaluation.",
|
|
nil, nil),
|
|
nflogPropagatedMessagesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_gossip_messages_propagated_total", Namespace, Subsystem),
|
|
"Number of received gossip messages that have been further gossiped.",
|
|
nil, nil),
|
|
|
|
markerAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
|
|
"How many alerts by state are in Grafana's Alertmanager.",
|
|
[]string{"org", "state"}, nil),
|
|
|
|
silencesGCDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last silence garbage collection cycle.",
|
|
nil, nil),
|
|
silencesSnapshotDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last silence snapshot.",
|
|
nil, nil),
|
|
silencesSnapshotSize: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem),
|
|
"Size of the last silence snapshot in bytes.",
|
|
nil, nil),
|
|
silencesQueriesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem),
|
|
"How many silence queries were received.",
|
|
nil, nil),
|
|
silencesQueryErrorsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem),
|
|
"How many silence received queries did not succeed.",
|
|
nil, nil),
|
|
silencesQueryDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem),
|
|
"Duration of silence query evaluation.",
|
|
nil, nil),
|
|
silencesPropagatedMessagesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem),
|
|
"Number of received gossip messages that have been further gossiped.",
|
|
nil, nil),
|
|
silences: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
|
|
"How many silences by state.",
|
|
[]string{"org", "state"}, nil),
|
|
|
|
dispatchAggrGroups: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_dispatcher_aggregation_groups", Namespace, Subsystem),
|
|
"Number of active aggregation groups",
|
|
nil, nil),
|
|
dispatchProcessingDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_dispatcher_alert_processing_duration_seconds", Namespace, Subsystem),
|
|
"Summary of latencies for the processing of alerts.",
|
|
nil, nil),
|
|
|
|
matchers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_matchers", Namespace, Subsystem),
|
|
"The total number of matchers",
|
|
nil, nil),
|
|
matchRE: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_match_re", Namespace, Subsystem),
|
|
"The total number of matchRE",
|
|
nil, nil),
|
|
match: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_match", Namespace, Subsystem),
|
|
"The total number of match",
|
|
nil, nil),
|
|
objectMatchers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_object_matchers", Namespace, Subsystem),
|
|
"The total number of object_matchers",
|
|
nil, nil),
|
|
}
|
|
|
|
return aggregatedMetrics
|
|
}
|
|
|
|
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
|
out <- a.numReceivedAlerts
|
|
out <- a.numInvalidAlerts
|
|
out <- a.configuredReceivers
|
|
out <- a.configuredIntegrations
|
|
|
|
out <- a.numNotifications
|
|
out <- a.numFailedNotifications
|
|
out <- a.numNotificationRequestsTotal
|
|
out <- a.numNotificationRequestsFailedTotal
|
|
out <- a.notificationLatencySeconds
|
|
|
|
out <- a.nflogGCDuration
|
|
out <- a.nflogSnapshotDuration
|
|
out <- a.nflogSnapshotSize
|
|
out <- a.nflogQueriesTotal
|
|
out <- a.nflogQueryErrorsTotal
|
|
out <- a.nflogQueryDuration
|
|
out <- a.nflogPropagatedMessagesTotal
|
|
|
|
out <- a.markerAlerts
|
|
|
|
out <- a.silencesGCDuration
|
|
out <- a.silencesSnapshotDuration
|
|
out <- a.silencesSnapshotSize
|
|
out <- a.silencesQueriesTotal
|
|
out <- a.silencesQueryErrorsTotal
|
|
out <- a.silencesQueryDuration
|
|
out <- a.silencesPropagatedMessagesTotal
|
|
out <- a.silences
|
|
|
|
out <- a.dispatchAggrGroups
|
|
out <- a.dispatchProcessingDuration
|
|
|
|
out <- a.matchers
|
|
out <- a.matchRE
|
|
out <- a.match
|
|
out <- a.objectMatchers
|
|
}
|
|
|
|
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
|
data := a.registries.BuildMetricFamiliesPerTenant()
|
|
|
|
data.SendSumOfCountersPerTenant(out, a.numReceivedAlerts, "alertmanager_alerts_received_total", metrics.WithLabels("status"))
|
|
data.SendSumOfCountersPerTenant(out, a.numInvalidAlerts, "alertmanager_alerts_invalid_total")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredReceivers, "grafana_alerting_alertmanager_receivers", "state")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredIntegrations, "grafana_alerting_alertmanager_integrations", "type")
|
|
|
|
data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfHistograms(out, a.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
|
|
|
|
data.SendSumOfSummaries(out, a.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
|
|
data.SendSumOfSummaries(out, a.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
|
|
data.SendSumOfGauges(out, a.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
|
|
data.SendSumOfCounters(out, a.nflogQueriesTotal, "alertmanager_nflog_queries_total")
|
|
data.SendSumOfCounters(out, a.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
|
|
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
|
|
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
|
|
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
|
|
|
|
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
|
|
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
|
|
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
|
|
data.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total")
|
|
data.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
|
|
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
|
|
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
|
|
|
|
data.SendSumOfGauges(out, a.dispatchAggrGroups, "alertmanager_dispatcher_aggregation_groups")
|
|
data.SendSumOfSummaries(out, a.dispatchProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds")
|
|
|
|
data.SendSumOfGauges(out, a.matchers, "alertmanager_config_matchers")
|
|
data.SendSumOfGauges(out, a.matchRE, "alertmanager_config_match_re")
|
|
data.SendSumOfGauges(out, a.match, "alertmanager_config_match")
|
|
data.SendSumOfGauges(out, a.objectMatchers, "alertmanager_config_object_matchers")
|
|
}
|