Alerting: Add metrics for active receiver and integrations (#64050)

* Alerting: Add metrics for active receiver and integrations

Introduces metrics that allow us to track the number of configured receivers and integrations in the Alertmanager for all orgs.

As a bonus, I realised that the alert reception metrics were neither being exported nor collected. This commit fixes that too.
This commit is contained in:
gotjosh
2023-03-06 16:37:07 +00:00
committed by GitHub
parent fd37ff29b5
commit 5422f7cf56
3 changed files with 37 additions and 4 deletions

View File

@@ -72,6 +72,12 @@ func (moa *MultiOrgAlertmanager) GetOrCreateOrgRegistry(id int64) prometheus.Reg
type AlertmanagerAggregatedMetrics struct {
registries *metrics.TenantRegistries
// metrics gather from the in-house "Alertmanager" directly.
numReceivedAlerts *prometheus.Desc
numInvalidAlerts *prometheus.Desc
configuredReceivers *prometheus.Desc
configuredIntegrations *prometheus.Desc
// exported metrics, gathered from Alertmanager PipelineBuilder
numNotifications *prometheus.Desc
numFailedNotifications *prometheus.Desc
@@ -107,6 +113,23 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale
aggregatedMetrics := &AlertmanagerAggregatedMetrics{
registries: registries,
numReceivedAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alerts_received_total", Namespace, Subsystem),
"The total number of received alerts.",
[]string{"org", "status"}, nil),
numInvalidAlerts: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alerts_invalid_total", Namespace, Subsystem),
"The total number of received alerts that were invalid.",
[]string{"org"}, nil),
configuredReceivers: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_receivers", Namespace, Subsystem),
"Number of configured receivers by state. It is considered active if used within a route.",
[]string{"org", "state"}, nil),
configuredIntegrations: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_integrations", Namespace, Subsystem),
"Number of configured receivers.",
[]string{"org", "type"}, nil),
numNotifications: prometheus.NewDesc(
fmt.Sprintf("%s_%s_notifications_total", Namespace, Subsystem),
"The total number of attempted notifications.",
@@ -204,6 +227,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale
}
func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a.numReceivedAlerts
out <- a.numInvalidAlerts
out <- a.configuredReceivers
out <- a.configuredIntegrations
out <- a.numNotifications
out <- a.numFailedNotifications
out <- a.numNotificationRequestsTotal
@@ -234,6 +262,11 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data := a.registries.BuildMetricFamiliesPerTenant()
data.SendSumOfCountersPerTenant(out, a.numReceivedAlerts, "alertmanager_alerts_received_total", metrics.WithLabels("status"))
data.SendSumOfCountersPerTenant(out, a.numInvalidAlerts, "alertmanager_alerts_invalid_total")
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredReceivers, "grafana_alerting_alertmanager_receivers", "state")
data.SendSumOfGaugesPerTenantWithLabels(out, a.configuredIntegrations, "grafana_alerting_alertmanager_integrations", "type")
data.SendSumOfCountersPerTenant(out, a.numNotifications, "alertmanager_notifications_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, a.numFailedNotifications, "alertmanager_notifications_failed_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, a.numNotificationRequestsTotal, "alertmanager_notification_requests_total", metrics.WithLabels("integration"), metrics.WithSkipZeroValueMetrics)