grafana/pkg/services/ngalert/metrics/multi_org_alertmanager.go
gotjosh e877174501
Alerting: Expose metrics for Alertmanager Alerts - grafana_alerting_alertmanager_alerts ()
* Alerting: Expose metrics for Alertmanager Alerts

In Grafana, the alert evaluation and alert delivery are combined. We've always used a metric named `grafana_alerting_alerts` to get a sense of which alerts are currently firing (these come from the evaluation side) and opted not to map the Alertmanager alerts metric directly.

I think it's important that we make a distinction between alerts produced at evaluation and alerts received for delivery by the internal Alertmanager, as we have options to skip delivering these alerts to the internal Alertmanager altogether.
2023-10-02 16:36:23 +01:00

package metrics

import (
"fmt"
"strconv"

"github.com/grafana/dskit/metrics"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/grafana/grafana/pkg/infra/log"
)
type MultiOrgAlertmanager struct {
Registerer prometheus.Registerer
registries *metrics.TenantRegistries
ActiveConfigurations prometheus.Gauge
DiscoveredConfigurations prometheus.Gauge
aggregatedMetrics *AlertmanagerAggregatedMetrics
}
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not.
moa := &MultiOrgAlertmanager{
Registerer: r,
registries: registries,
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "discovered_configurations",
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
}),
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: Namespace,
Subsystem: Subsystem,
Name: "active_configurations",
Help: "The number of active Alertmanager configurations.",
}),
aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries),
}
// These metrics use a different registration method as the struct itself represents a custom collector.
// There's no way to "auto-register" a collector.
r.MustRegister(moa.aggregatedMetrics)
return moa
}
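
// Illustrative wiring sketch (not part of the upstream file): how a caller
// might construct the multi-org metrics and drive the discovery gauges. The
// variable names orgIDs and activeCount are hypothetical.
//
//	reg := prometheus.NewRegistry()
//	moa := NewMultiOrgAlertmanagerMetrics(reg)
//	moa.DiscoveredConfigurations.Set(float64(len(orgIDs)))
//	moa.ActiveConfigurations.Set(float64(activeCount))
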
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false)
}
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
sid := strconv.FormatInt(id, 10)
reg := moa.registries.GetRegistryForTenant(sid)
if reg != nil {
return reg
}
result := prometheus.NewRegistry()
moa.registries.AddTenantRegistry(sid, result)
return result
}
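
// Illustrative per-org registration sketch (assumption, not in the upstream
// file): given a *MultiOrgAlertmanager moa, an organization's Alertmanager
// registers its collectors against the registry returned above, so the
// aggregated collector below can sum them per tenant. The metric name matches
// the one consumed by Collect further down; orgID is hypothetical.
//
//	orgReg := moa.GetOrCreateOrgRegistry(orgID)
//	received := promauto.With(orgReg).NewCounterVec(prometheus.CounterOpts{
//		Name: "alertmanager_alerts_received_total",
//		Help: "The total number of received alerts.",
//	}, []string{"status"})
//	received.WithLabelValues("firing").Inc()
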
// AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
// Unlike metrics.Alertmanager, they are not updated within this codebase, hence the need for direct collection.
type AlertmanagerAggregatedMetrics struct {
registries *metrics.TenantRegistries
// metrics gathered from the in-house "Alertmanager" directly.
numReceivedAlerts *prometheus.Desc
numInvalidAlerts *prometheus.Desc
configuredReceivers *prometheus.Desc
configuredIntegrations *prometheus.Desc
// exported metrics, gathered from Alertmanager PipelineBuilder
numNotifications *prometheus.Desc
numFailedNotifications *prometheus.Desc
numNotificationRequestsTotal *prometheus.Desc
numNotificationRequestsFailedTotal *prometheus.Desc
notificationLatencySeconds *prometheus.Desc
// exported metrics, gathered from Alertmanager nflog
nflogGCDuration *prometheus.Desc
nflogSnapshotDuration *prometheus.Desc
nflogSnapshotSize *prometheus.Desc
nflogQueriesTotal *prometheus.Desc
nflogQueryErrorsTotal *prometheus.Desc
nflogQueryDuration *prometheus.Desc
nflogPropagatedMessagesTotal *prometheus.Desc
// exported metrics, gathered from the Alertmanager Alert Marker.
markerAlerts *prometheus.Desc
// exported metrics, gathered from Alertmanager Silences
silencesGCDuration *prometheus.Desc
silencesSnapshotDuration *prometheus.Desc
silencesSnapshotSize *prometheus.Desc
silencesQueriesTotal *prometheus.Desc
silencesQueryErrorsTotal *prometheus.Desc
silencesQueryDuration *prometheus.Desc
silences *prometheus.Desc
silencesPropagatedMessagesTotal *prometheus.Desc
// exported metrics, gathered from Alertmanager Dispatcher
dispatchAggrGroups *prometheus.Desc
dispatchProcessingDuration *prometheus.Desc
// added to measure usage of matchers, match_re, match and
// object_matchers
matchers *prometheus.Desc
matchRE *prometheus.Desc
match *prometheus.Desc
objectMatchers *prometheus.Desc
}
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
registries: registries,
numReceivedAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alerts_received_total", Namespace, Subsystem),
"The total number of received alerts.",
[]string{"org", "status"}, nil),
numInvalidAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alerts_invalid_total", Namespace, Subsystem),
"The total number of received alerts that were invalid.",
[]string{"org"}, nil),
configuredReceivers: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_receivers", Namespace, Subsystem),
"Number of configured receivers by state. It is considered active if used within a route.",
[]string{"org", "state"}, nil),
configuredIntegrations: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_integrations", Namespace, Subsystem),
"Number of configured receivers.",
[]string{"org", "type"}, nil),
numNotifications: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem),
"The total number of attempted notifications.",
[]string{"org", "integration"}, nil),
numFailedNotifications: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notifications_failed_total", Namespace, Subsystem),
"The total number of failed notifications.",
[]string{"org", "integration"}, nil),
numNotificationRequestsTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notification_requests_total", Namespace, Subsystem),
"The total number of attempted notification requests.",
[]string{"org", "integration"}, nil),
numNotificationRequestsFailedTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notification_requests_failed_total", Namespace, Subsystem),
"The total number of failed notification requests.",
[]string{"org", "integration"}, nil),
notificationLatencySeconds: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notification_latency_seconds", Namespace, Subsystem),
"The latency of notifications in seconds.",
nil, nil),
nflogGCDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_gc_duration_seconds", Namespace, Subsystem),
"Duration of the last notification log garbage collection cycle.",
nil, nil),
nflogSnapshotDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_snapshot_duration_seconds", Namespace, Subsystem),
"Duration of the last notification log snapshot.",
nil, nil),
nflogSnapshotSize: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_snapshot_size_bytes", Namespace, Subsystem),
"Size of the last notification log snapshot in bytes.",
nil, nil),
nflogQueriesTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_queries_total", Namespace, Subsystem),
"Number of notification log queries were received.",
nil, nil),
nflogQueryErrorsTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_query_errors_total", Namespace, Subsystem),
"Number notification log received queries that failed.",
nil, nil),
nflogQueryDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_query_duration_seconds", Namespace, Subsystem),
"Duration of notification log query evaluation.",
nil, nil),
nflogPropagatedMessagesTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_nflog_gossip_messages_propagated_total", Namespace, Subsystem),
"Number of received gossip messages that have been further gossiped.",
nil, nil),
markerAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
"How many alerts by state are in Grafana's Alertmanager.",
[]string{"org", "state"}, nil),
silencesGCDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
"Duration of the last silence garbage collection cycle.",
nil, nil),
silencesSnapshotDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem),
"Duration of the last silence snapshot.",
nil, nil),
silencesSnapshotSize: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem),
"Size of the last silence snapshot in bytes.",
nil, nil),
silencesQueriesTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem),
"How many silence queries were received.",
nil, nil),
silencesQueryErrorsTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem),
"How many silence received queries did not succeed.",
nil, nil),
silencesQueryDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem),
"Duration of silence query evaluation.",
nil, nil),
silencesPropagatedMessagesTotal: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem),
"Number of received gossip messages that have been further gossiped.",
nil, nil),
silences: prometheus.NewDesc(
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
"How many silences by state.",
[]string{"org", "state"}, nil),
dispatchAggrGroups: prometheus.NewDesc(
fmt.Sprintf("%s_%s_dispatcher_aggregation_groups", Namespace, Subsystem),
"Number of active aggregation groups",
nil, nil),
dispatchProcessingDuration: prometheus.NewDesc(
fmt.Sprintf("%s_%s_dispatcher_alert_processing_duration_seconds", Namespace, Subsystem),
"Summary of latencies for the processing of alerts.",
nil, nil),
matchers: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_config_matchers", Namespace, Subsystem),
"The total number of matchers",
nil, nil),
matchRE: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_config_match_re", Namespace, Subsystem),
"The total number of matchRE",
nil, nil),
match: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_config_match", Namespace, Subsystem),
"The total number of match",
nil, nil),
objectMatchers: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_config_object_matchers", Namespace, Subsystem),
"The total number of object_matchers",
nil, nil),
}
return aggregatedMetrics
}
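
// Illustrative sketch (assumption): each *prometheus.Desc declared above is
// later paired with a Send* call in Collect, and the variable labels passed
// to prometheus.NewDesc must line up with the label values supplied when a
// const metric is emitted for it. The metric name below is hypothetical.
//
//	d := prometheus.NewDesc(
//		fmt.Sprintf("%s_%s_example_total", Namespace, Subsystem),
//		"An example counter.",
//		[]string{"org"}, nil)
//	m := prometheus.MustNewConstMetric(d, prometheus.CounterValue, 1, "1")
//	_ = m
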
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a.numReceivedAlerts
out <- a.numInvalidAlerts
out <- a.configuredReceivers
out <- a.configuredIntegrations
out <- a.numNotifications
out <- a.numFailedNotifications
out <- a.numNotificationRequestsTotal
out <- a.numNotificationRequestsFailedTotal
out <- a.notificationLatencySeconds
out <- a.nflogGCDuration
out <- a.nflogSnapshotDuration
out <- a.nflogSnapshotSize
out <- a.nflogQueriesTotal
out <- a.nflogQueryErrorsTotal
out <- a.nflogQueryDuration
out <- a.nflogPropagatedMessagesTotal
out <- a.markerAlerts
out <- a.silencesGCDuration
out <- a.silencesSnapshotDuration
out <- a.silencesSnapshotSize
out <- a.silencesQueriesTotal
out <- a.silencesQueryErrorsTotal
out <- a.silencesQueryDuration
out <- a.silencesPropagatedMessagesTotal
out <- a.silences
out <- a.dispatchAggrGroups
out <- a.dispatchProcessingDuration
out <- a.matchers
out <- a.matchRE
out <- a.match
out <- a.objectMatchers
}
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data := a.registries.BuildMetricFamiliesPerTenant()
data.SendSumOfCountersPerTenant(out, a.numReceivedAlerts, "alertmanager_alerts_received_total", metrics.WithLabels("status"))
data.SendSumOfCountersPerTenant(out, a.numInvalidAlerts, "alertmanager_alerts_invalid_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredReceivers, "grafana_alerting_alertmanager_receivers", "state")
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredIntegrations, "grafana_alerting_alertmanager_integrations", "type")
data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfHistograms(out, a.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
data.SendSumOfSummaries(out, a.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
data.SendSumOfSummaries(out, a.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
data.SendSumOfGauges(out, a.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
data.SendSumOfCounters(out, a.nflogQueriesTotal, "alertmanager_nflog_queries_total")
data.SendSumOfCounters(out, a.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
data.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total")
data.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
data.SendSumOfGauges(out, a.dispatchAggrGroups, "alertmanager_dispatcher_aggregation_groups")
data.SendSumOfSummaries(out, a.dispatchProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds")
data.SendSumOfGauges(out, a.matchers, "alertmanager_config_matchers")
data.SendSumOfGauges(out, a.matchRE, "alertmanager_config_match_re")
data.SendSumOfGauges(out, a.match, "alertmanager_config_match")
data.SendSumOfGauges(out, a.objectMatchers, "alertmanager_config_object_matchers")
}
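
// Conceptual sketch (assumption, not the dskit implementation): the Send*
// helpers above roughly reduce each tenant's scraped metric families to const
// metrics carrying the "org" label, along these lines:
//
//	for org, value := range perOrgInvalidAlerts { // hypothetical map[string]float64
//		out <- prometheus.MustNewConstMetric(a.numInvalidAlerts, prometheus.CounterValue, value, org)
//	}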