Alerting: Add metric to check for default AM configurations (#80225)

* Alerting: Add metric to check for default AM configurations

* Use a gauge for the config hash

* Keep only 48 bits of the hash so the uint64 to float64 conversion stays within the 53-bit mantissa

* expose metric for config hash

* update metrics after applying config
This commit is contained in:
Santiago 2024-01-16 17:12:24 +01:00 committed by GitHub
parent 06800e2d31
commit 3afd94185c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 33 additions and 3 deletions

View File

@ -24,6 +24,7 @@ func NewAlertmanagerMetrics(r prometheus.Registerer) *Alertmanager {
}
type AlertmanagerConfigMetrics struct {
ConfigHash *prometheus.GaugeVec
Matchers prometheus.Gauge
MatchRE prometheus.Gauge
Match prometheus.Gauge
@ -32,6 +33,10 @@ type AlertmanagerConfigMetrics struct {
func NewAlertmanagerConfigMetrics(r prometheus.Registerer) *AlertmanagerConfigMetrics {
m := &AlertmanagerConfigMetrics{
ConfigHash: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "alertmanager_config_hash",
Help: "The hash of the Alertmanager configuration.",
}, []string{"org"}),
Matchers: prometheus.NewGauge(prometheus.GaugeOpts{
Name: "alertmanager_config_matchers",
Help: "The total number of matchers",
@ -50,7 +55,7 @@ func NewAlertmanagerConfigMetrics(r prometheus.Registerer) *AlertmanagerConfigMe
}),
}
if r != nil {
r.MustRegister(m.Matchers, m.MatchRE, m.Match, m.ObjectMatchers)
r.MustRegister(m.ConfigHash, m.Matchers, m.MatchRE, m.Match, m.ObjectMatchers)
}
return m
}

View File

@ -117,6 +117,8 @@ type AlertmanagerAggregatedMetrics struct {
matchRE *prometheus.Desc
match *prometheus.Desc
objectMatchers *prometheus.Desc
configHash *prometheus.Desc
}
func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *AlertmanagerAggregatedMetrics {
@ -253,6 +255,11 @@ func NewAlertmanagerAggregatedMetrics(registries *metrics.TenantRegistries) *Ale
fmt.Sprintf("%s_%s_alertmanager_config_object_matchers", Namespace, Subsystem),
"The total number of object_matchers",
nil, nil),
configHash: prometheus.NewDesc(
fmt.Sprintf("%s_%s_alertmanager_config_hash", Namespace, Subsystem),
"The hash of the Alertmanager configuration.",
[]string{"org"}, nil),
}
return aggregatedMetrics
@ -296,6 +303,8 @@ func (a *AlertmanagerAggregatedMetrics) Describe(out chan<- *prometheus.Desc) {
out <- a.matchRE
out <- a.match
out <- a.objectMatchers
out <- a.configHash
}
func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
@ -338,4 +347,6 @@ func (a *AlertmanagerAggregatedMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfGauges(out, a.matchRE, "alertmanager_config_match_re")
data.SendSumOfGauges(out, a.match, "alertmanager_config_match")
data.SendSumOfGauges(out, a.objectMatchers, "alertmanager_config_object_matchers")
data.SendMaxOfGaugesPerTenant(out, a.configHash, "alertmanager_config_hash")
}

View File

@ -3,6 +3,7 @@ package notifier
import (
"context"
"crypto/md5"
"encoding/binary"
"encoding/json"
"fmt"
"path/filepath"
@ -255,6 +256,10 @@ func (am *alertmanager) updateConfigMetrics(cfg *apimodels.PostableUserConfig) {
am.ConfigMetrics.MatchRE.Set(float64(amu.MatchRE))
am.ConfigMetrics.Match.Set(float64(amu.Match))
am.ConfigMetrics.ObjectMatchers.Set(float64(amu.ObjectMatchers))
am.ConfigMetrics.ConfigHash.
WithLabelValues(strconv.FormatInt(am.orgID, 10)).
Set(hashAsMetricValue(am.Base.ConfigHash()))
}
func (am *alertmanager) aggregateRouteMatchers(r *apimodels.Route, amu *AggregateMatchersUsage) {
@ -315,8 +320,6 @@ func (am *alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
return false, nil
}
am.updateConfigMetrics(cfg)
err = am.Base.ApplyConfig(AlertingConfiguration{
rawAlertmanagerConfig: rawConfig,
alertmanagerConfig: cfg.AlertmanagerConfig,
@ -327,6 +330,7 @@ func (am *alertmanager) applyConfig(cfg *apimodels.PostableUserConfig, rawConfig
return false, err
}
am.updateConfigMetrics(cfg)
return true, nil
}
@ -421,3 +425,13 @@ func (e AlertValidationError) Error() string {
// nilLimits is an empty limits implementation whose only method reports zero.
type nilLimits struct{}

// MaxNumberOfAggregationGroups always returns 0.
func (nilLimits) MaxNumberOfAggregationGroups() int {
	return 0
}
// hashAsMetricValue converts a 16-byte digest into a float64 that can be used
// as a gauge value.
//
// Adapted from upstream Alertmanager, changed to accept a [16]byte rather than a []byte:
// https://github.com/prometheus/alertmanager/blob/30fa9cd44bc91c0d6adcc9985609bb08a09a127b/config/coordinator.go#L149-L156
func hashAsMetricValue(data [16]byte) float64 {
	// A float64 has a 53-bit mantissa, so only the first 48 bits (6 bytes) of
	// the digest are kept; the last two bytes of the buffer stay zero.
	var buf [8]byte
	copy(buf[:], data[:6])
	return float64(binary.LittleEndian.Uint64(buf[:]))
}