mirror of
https://github.com/grafana/grafana.git
synced 2024-12-01 21:19:28 -06:00
3afd94185c
* Alerting: Add metric to check for default AM configurations * Use a gauge for the config hash * don't go out of bounds when converting uint64 to float64 * expose metric for config hash * update metrics after applying config
353 lines
15 KiB
Go
353 lines
15 KiB
Go
package metrics
|
|
|
|
import (
|
|
"fmt"
|
|
"strconv"
|
|
|
|
"github.com/grafana/grafana/pkg/infra/log"
|
|
|
|
"github.com/grafana/dskit/metrics"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
)
|
|
|
|
type MultiOrgAlertmanager struct {
|
|
Registerer prometheus.Registerer
|
|
registries *metrics.TenantRegistries
|
|
|
|
ActiveConfigurations prometheus.Gauge
|
|
DiscoveredConfigurations prometheus.Gauge
|
|
|
|
aggregatedMetrics *AlertmanagerAggregatedMetrics
|
|
}
|
|
|
|
func NewMultiOrgAlertmanagerMetrics(r prometheus.Registerer) *MultiOrgAlertmanager {
|
|
registries := metrics.NewTenantRegistries(log.New("ngalert.multiorg.alertmanager.metrics")) //TODO: Should this be here? Probably not.
|
|
moa := &MultiOrgAlertmanager{
|
|
Registerer: r,
|
|
registries: registries,
|
|
DiscoveredConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: Namespace,
|
|
Subsystem: Subsystem,
|
|
Name: "discovered_configurations",
|
|
Help: "The number of organizations we've discovered that require an Alertmanager configuration.",
|
|
}),
|
|
ActiveConfigurations: promauto.With(r).NewGauge(prometheus.GaugeOpts{
|
|
Namespace: Namespace,
|
|
Subsystem: Subsystem,
|
|
Name: "active_configurations",
|
|
Help: "The number of active Alertmanager configurations.",
|
|
}),
|
|
aggregatedMetrics: NewAlertmanagerAggregatedMetrics(registries),
|
|
}
|
|
|
|
// These metrics use a different registration method as the struct itself represents a custom collector.
|
|
// There's no way to "auto-register" a collector.
|
|
r.MustRegister(moa.aggregatedMetrics)
|
|
|
|
return moa
|
|
}
|
|
|
|
// RemoveOrgRegistry removes the *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
func (moa *MultiOrgAlertmanager) RemoveOrgRegistry(id int64) {
|
|
moa.registries.RemoveTenantRegistry(strconv.FormatInt(id, 10), false)
|
|
}
|
|
|
|
// GetOrCreateOrgRegistry gets or creates a *prometheus.Registry for the specified org. It is safe to call concurrently.
|
|
func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Registerer {
|
|
sid := strconv.FormatInt(id, 10)
|
|
reg := moa.registries.GetRegistryForTenant(sid)
|
|
if reg != nil {
|
|
return reg
|
|
}
|
|
|
|
result := prometheus.NewRegistry()
|
|
moa.registries.AddTenantRegistry(sid, result)
|
|
|
|
return result
|
|
}
|
|
|
|
// AlertmanagerAggregatedMetrics are metrics collected directly from the registry.
|
|
// Unlike metrics.Alertmanager they are not called within this codebase hence the need for direct collection.
|
|
type AlertmanagerAggregatedMetrics struct {
|
|
registries *metrics.TenantRegistries
|
|
|
|
// metrics gather from the in-house "Alertmanager" directly.
|
|
numReceivedAlerts *prometheus.Desc
|
|
numInvalidAlerts *prometheus.Desc
|
|
configuredReceivers *prometheus.Desc
|
|
configuredIntegrations *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager PipelineBuilder
|
|
numNotifications *prometheus.Desc
|
|
numFailedNotifications *prometheus.Desc
|
|
numNotificationRequestsTotal *prometheus.Desc
|
|
numNotificationRequestsFailedTotal *prometheus.Desc
|
|
notificationLatencySeconds *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager nflog
|
|
nflogGCDuration *prometheus.Desc
|
|
nflogSnapshotDuration *prometheus.Desc
|
|
nflogSnapshotSize *prometheus.Desc
|
|
nflogQueriesTotal *prometheus.Desc
|
|
nflogQueryErrorsTotal *prometheus.Desc
|
|
nflogQueryDuration *prometheus.Desc
|
|
nflogPropagatedMessagesTotal *prometheus.Desc
|
|
|
|
// exporter metrics, gathered from the Alertmanager Alert Marker.
|
|
markerAlerts *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager Silences
|
|
silencesGCDuration *prometheus.Desc
|
|
silencesSnapshotDuration *prometheus.Desc
|
|
silencesSnapshotSize *prometheus.Desc
|
|
silencesQueriesTotal *prometheus.Desc
|
|
silencesQueryErrorsTotal *prometheus.Desc
|
|
silencesQueryDuration *prometheus.Desc
|
|
silences *prometheus.Desc
|
|
silencesPropagatedMessagesTotal *prometheus.Desc
|
|
|
|
// exported metrics, gathered from Alertmanager Dispatcher
|
|
dispatchAggrGroups *prometheus.Desc
|
|
dispatchProcessingDuration *prometheus.Desc
|
|
|
|
// added to measure usage of matchers, match_re, match and
|
|
// object_matchers
|
|
matchers *prometheus.Desc
|
|
matchRE *prometheus.Desc
|
|
match *prometheus.Desc
|
|
objectMatchers *prometheus.Desc
|
|
|
|
configHash *prometheus.Desc
|
|
}
|
|
|
|
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
|
|
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
|
|
registries: registries,
|
|
|
|
numReceivedAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alerts_received_total", Namespace, Subsystem),
|
|
"The total number of received alerts.",
|
|
[]string{"org", "status"}, nil),
|
|
numInvalidAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alerts_invalid_total", Namespace, Subsystem),
|
|
"The total number of received alerts that were invalid.",
|
|
[]string{"org"}, nil),
|
|
configuredReceivers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_receivers", Namespace, Subsystem),
|
|
"Number of configured receivers by state. It is considered active if used within a route.",
|
|
[]string{"org", "state"}, nil),
|
|
configuredIntegrations: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_integrations", Namespace, Subsystem),
|
|
"Number of configured receivers.",
|
|
[]string{"org", "type"}, nil),
|
|
|
|
numNotifications: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem),
|
|
"The total number of attempted notifications.",
|
|
[]string{"org", "integration"}, nil),
|
|
numFailedNotifications: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notifications_failed_total", Namespace, Subsystem),
|
|
"The total number of failed notifications.",
|
|
[]string{"org", "integration"}, nil),
|
|
numNotificationRequestsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_requests_total", Namespace, Subsystem),
|
|
"The total number of attempted notification requests.",
|
|
[]string{"org", "integration"}, nil),
|
|
numNotificationRequestsFailedTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_requests_failed_total", Namespace, Subsystem),
|
|
"The total number of failed notification requests.",
|
|
[]string{"org", "integration"}, nil),
|
|
notificationLatencySeconds: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_notification_latency_seconds", Namespace, Subsystem),
|
|
"The latency of notifications in seconds.",
|
|
nil, nil),
|
|
|
|
nflogGCDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_gc_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last notification log garbage collection cycle.",
|
|
nil, nil),
|
|
nflogSnapshotDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_snapshot_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last notification log snapshot.",
|
|
nil, nil),
|
|
nflogSnapshotSize: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_snapshot_size_bytes", Namespace, Subsystem),
|
|
"Size of the last notification log snapshot in bytes.",
|
|
nil, nil),
|
|
nflogQueriesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_queries_total", Namespace, Subsystem),
|
|
"Number of notification log queries were received.",
|
|
nil, nil),
|
|
nflogQueryErrorsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_query_errors_total", Namespace, Subsystem),
|
|
"Number notification log received queries that failed.",
|
|
nil, nil),
|
|
nflogQueryDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_query_duration_seconds", Namespace, Subsystem),
|
|
"Duration of notification log query evaluation.",
|
|
nil, nil),
|
|
nflogPropagatedMessagesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_nflog_gossip_messages_propagated_total", Namespace, Subsystem),
|
|
"Number of received gossip messages that have been further gossiped.",
|
|
nil, nil),
|
|
|
|
markerAlerts: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_alerts", Namespace, Subsystem),
|
|
"How many alerts by state are in Grafana's Alertmanager.",
|
|
[]string{"org", "state"}, nil),
|
|
|
|
silencesGCDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_gc_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last silence garbage collection cycle.",
|
|
nil, nil),
|
|
silencesSnapshotDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_snapshot_duration_seconds", Namespace, Subsystem),
|
|
"Duration of the last silence snapshot.",
|
|
nil, nil),
|
|
silencesSnapshotSize: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_snapshot_size_bytes", Namespace, Subsystem),
|
|
"Size of the last silence snapshot in bytes.",
|
|
nil, nil),
|
|
silencesQueriesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_queries_total", Namespace, Subsystem),
|
|
"How many silence queries were received.",
|
|
nil, nil),
|
|
silencesQueryErrorsTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_query_errors_total", Namespace, Subsystem),
|
|
"How many silence received queries did not succeed.",
|
|
nil, nil),
|
|
silencesQueryDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_query_duration_seconds", Namespace, Subsystem),
|
|
"Duration of silence query evaluation.",
|
|
nil, nil),
|
|
silencesPropagatedMessagesTotal: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences_gossip_messages_propagated_total", Namespace, Subsystem),
|
|
"Number of received gossip messages that have been further gossiped.",
|
|
nil, nil),
|
|
silences: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_silences", Namespace, Subsystem),
|
|
"How many silences by state.",
|
|
[]string{"org", "state"}, nil),
|
|
|
|
dispatchAggrGroups: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_dispatcher_aggregation_groups", Namespace, Subsystem),
|
|
"Number of active aggregation groups",
|
|
nil, nil),
|
|
dispatchProcessingDuration: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_dispatcher_alert_processing_duration_seconds", Namespace, Subsystem),
|
|
"Summary of latencies for the processing of alerts.",
|
|
nil, nil),
|
|
|
|
matchers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_matchers", Namespace, Subsystem),
|
|
"The total number of matchers",
|
|
nil, nil),
|
|
matchRE: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_match_re", Namespace, Subsystem),
|
|
"The total number of matchRE",
|
|
nil, nil),
|
|
match: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_match", Namespace, Subsystem),
|
|
"The total number of match",
|
|
nil, nil),
|
|
objectMatchers: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_object_matchers", Namespace, Subsystem),
|
|
"The total number of object_matchers",
|
|
nil, nil),
|
|
|
|
configHash: prometheus.NewDesc(
|
|
fmt.Sprintf("%s_%s_alertmanager_config_hash", Namespace, Subsystem),
|
|
"The hash of the Alertmanager configuration.",
|
|
[]string{"org"}, nil),
|
|
}
|
|
|
|
return aggregatedMetrics
|
|
}
|
|
|
|
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
|
|
out <- a.numReceivedAlerts
|
|
out <- a.numInvalidAlerts
|
|
out <- a.configuredReceivers
|
|
out <- a.configuredIntegrations
|
|
|
|
out <- a.numNotifications
|
|
out <- a.numFailedNotifications
|
|
out <- a.numNotificationRequestsTotal
|
|
out <- a.numNotificationRequestsFailedTotal
|
|
out <- a.notificationLatencySeconds
|
|
|
|
out <- a.nflogGCDuration
|
|
out <- a.nflogSnapshotDuration
|
|
out <- a.nflogSnapshotSize
|
|
out <- a.nflogQueriesTotal
|
|
out <- a.nflogQueryErrorsTotal
|
|
out <- a.nflogQueryDuration
|
|
out <- a.nflogPropagatedMessagesTotal
|
|
|
|
out <- a.markerAlerts
|
|
|
|
out <- a.silencesGCDuration
|
|
out <- a.silencesSnapshotDuration
|
|
out <- a.silencesSnapshotSize
|
|
out <- a.silencesQueriesTotal
|
|
out <- a.silencesQueryErrorsTotal
|
|
out <- a.silencesQueryDuration
|
|
out <- a.silencesPropagatedMessagesTotal
|
|
out <- a.silences
|
|
|
|
out <- a.dispatchAggrGroups
|
|
out <- a.dispatchProcessingDuration
|
|
|
|
out <- a.matchers
|
|
out <- a.matchRE
|
|
out <- a.match
|
|
out <- a.objectMatchers
|
|
|
|
out <- a.configHash
|
|
}
|
|
|
|
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
|
|
data := a.registries.BuildMetricFamiliesPerTenant()
|
|
|
|
data.SendSumOfCountersPerTenant(out, a.numReceivedAlerts, "alertmanager_alerts_received_total", metrics.WithLabels("status"))
|
|
data.SendSumOfCountersPerTenant(out, a.numInvalidAlerts, "alertmanager_alerts_invalid_total")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredReceivers, "grafana_alerting_alertmanager_receivers", "state")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredIntegrations, "grafana_alerting_alertmanager_integrations", "type")
|
|
|
|
data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
|
|
data.SendSumOfHistograms(out, a.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
|
|
|
|
data.SendSumOfSummaries(out, a.nflogGCDuration, "alertmanager_nflog_gc_duration_seconds")
|
|
data.SendSumOfSummaries(out, a.nflogSnapshotDuration, "alertmanager_nflog_snapshot_duration_seconds")
|
|
data.SendSumOfGauges(out, a.nflogSnapshotSize, "alertmanager_nflog_snapshot_size_bytes")
|
|
data.SendSumOfCounters(out, a.nflogQueriesTotal, "alertmanager_nflog_queries_total")
|
|
data.SendSumOfCounters(out, a.nflogQueryErrorsTotal, "alertmanager_nflog_query_errors_total")
|
|
data.SendSumOfHistograms(out, a.nflogQueryDuration, "alertmanager_nflog_query_duration_seconds")
|
|
data.SendSumOfCounters(out, a.nflogPropagatedMessagesTotal, "alertmanager_nflog_gossip_messages_propagated_total")
|
|
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.markerAlerts, "alertmanager_alerts", "state")
|
|
|
|
data.SendSumOfSummaries(out, a.silencesGCDuration, "alertmanager_silences_gc_duration_seconds")
|
|
data.SendSumOfSummaries(out, a.silencesSnapshotDuration, "alertmanager_silences_snapshot_duration_seconds")
|
|
data.SendSumOfGauges(out, a.silencesSnapshotSize, "alertmanager_silences_snapshot_size_bytes")
|
|
data.SendSumOfCounters(out, a.silencesQueriesTotal, "alertmanager_silences_queries_total")
|
|
data.SendSumOfCounters(out, a.silencesQueryErrorsTotal, "alertmanager_silences_query_errors_total")
|
|
data.SendSumOfHistograms(out, a.silencesQueryDuration, "alertmanager_silences_query_duration_seconds")
|
|
data.SendSumOfCounters(out, a.silencesPropagatedMessagesTotal, "alertmanager_silences_gossip_messages_propagated_total")
|
|
data.SendSumOfGaugesPerTenantWithLabels(out, a.silences, "alertmanager_silences", "state")
|
|
|
|
data.SendSumOfGauges(out, a.dispatchAggrGroups, "alertmanager_dispatcher_aggregation_groups")
|
|
data.SendSumOfSummaries(out, a.dispatchProcessingDuration, "alertmanager_dispatcher_alert_processing_duration_seconds")
|
|
|
|
data.SendSumOfGauges(out, a.matchers, "alertmanager_config_matchers")
|
|
data.SendSumOfGauges(out, a.matchRE, "alertmanager_config_match_re")
|
|
data.SendSumOfGauges(out, a.match, "alertmanager_config_match")
|
|
data.SendSumOfGauges(out, a.objectMatchers, "alertmanager_config_object_matchers")
|
|
|
|
data.SendMaxOfGaugesPerTenant(out, a.configHash, "alertmanager_config_hash")
|
|
}
|