Alerting: Add metrics to the remote Alertmanager struct (#79835)

* Alerting: Add metrics to the remote Alertmanager struct

* rephrase http_requests_failed description

* make linter happy

* remove unnecessary metrics

* extract timed client to separate package

* use histogram collector from dskit

* remove weaveworks dependency

* capture metrics for all requests to the remote Alertmanager (both clients)

* use the timed client in the MimirAuthRoundTripper

* HTTPRequestsDuration -> HTTPRequestDuration, clean up mimir client factory function

* refactor

* less git diff

* gauge for last readiness check in seconds

* initialize LastReadinessCheck to 0, tweak metric names and descriptions

* add counters for sync attempts/errors

* last config sync and last state sync timestamps (gauges)

* change latency metric name

* metric for remote Alertmanager mode

* code review comments

* move label constants to metrics package
This commit is contained in:
Santiago
2024-01-10 11:18:24 +01:00
committed by GitHub
parent 1162c28a55
commit 9e78faa7ba
14 changed files with 171 additions and 37 deletions

View File

@@ -30,6 +30,7 @@ type NGAlert struct {
multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
apiMetrics *API
historianMetrics *Historian
remoteAlertmanagerMetrics *RemoteAlertmanager
}
// NewNGAlert manages the metrics of all the alerting components.
@@ -41,6 +42,7 @@ func NewNGAlert(r prometheus.Registerer) *NGAlert {
multiOrgAlertmanagerMetrics: NewMultiOrgAlertmanagerMetrics(r),
apiMetrics: NewAPIMetrics(r),
historianMetrics: NewHistorianMetrics(r, Subsystem),
remoteAlertmanagerMetrics: NewRemoteAlertmanagerMetrics(r),
}
}
@@ -63,3 +65,7 @@ func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
// GetHistorianMetrics returns the Historian metrics stored on ng.
func (ng *NGAlert) GetHistorianMetrics() *Historian {
	return ng.historianMetrics
}
// GetRemoteAlertmanagerMetrics returns the RemoteAlertmanager metrics stored
// on ng.
func (ng *NGAlert) GetRemoteAlertmanagerMetrics() *RemoteAlertmanager {
	return ng.remoteAlertmanagerMetrics
}

View File

@@ -0,0 +1,84 @@
package metrics
import (
"github.com/grafana/dskit/instrument"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// Names of the remote Alertmanager modes.
//
// NOTE(review): presumably these are the values reported in the "mode" label
// of the RemoteAlertmanager Info gauge; confirm against the callers.
const (
	ModeRemoteSecondary = "remote_secondary"
	ModeRemotePrimary   = "remote_primary"
	ModeRemoteOnly      = "remote_only"
)
// RemoteAlertmanager groups the metrics that describe the interaction with a
// remote Alertmanager. Instances are built by NewRemoteAlertmanagerMetrics.
type RemoteAlertmanager struct {
	// Info carries information about the remote Alertmanager; it has a single
	// "mode" label.
	Info *prometheus.GaugeVec

	// RequestLatency collects a histogram of request latencies to the remote
	// Alertmanager.
	RequestLatency *instrument.HistogramCollector

	// LastReadinessCheck is the timestamp, in seconds, of the last successful
	// readiness check to the remote Alertmanager.
	LastReadinessCheck prometheus.Gauge

	// ConfigSyncsTotal counts configuration syncs to the remote Alertmanager.
	ConfigSyncsTotal prometheus.Counter

	// ConfigSyncErrorsTotal counts failed attempts to sync configurations
	// between Alertmanagers.
	ConfigSyncErrorsTotal prometheus.Counter

	// LastConfigSync is the timestamp, in seconds, of the last successful
	// configuration sync to the remote Alertmanager.
	LastConfigSync prometheus.Gauge

	// StateSyncsTotal counts state syncs to the remote Alertmanager.
	StateSyncsTotal prometheus.Counter

	// StateSyncErrorsTotal counts failed attempts to sync state between
	// Alertmanagers.
	StateSyncErrorsTotal prometheus.Counter

	// LastStateSync is the timestamp, in seconds, of the last successful state
	// sync to the remote Alertmanager.
	LastStateSync prometheus.Gauge
}
// NewRemoteAlertmanagerMetrics builds the full set of remote Alertmanager
// metrics, creating every collector through promauto.With(r).
//
// All metrics share the package-level Namespace and Subsystem; only the name
// and help text differ between them. Collectors are created in the same order
// as the fields of RemoteAlertmanager.
func NewRemoteAlertmanagerMetrics(r prometheus.Registerer) *RemoteAlertmanager {
	factory := promauto.With(r)

	// gauge and counter stamp the shared namespace/subsystem onto a metric so
	// the declarations below only need to state a name and a help text.
	gauge := func(name, help string) prometheus.Gauge {
		return factory.NewGauge(prometheus.GaugeOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      name,
			Help:      help,
		})
	}
	counter := func(name, help string) prometheus.Counter {
		return factory.NewCounter(prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      name,
			Help:      help,
		})
	}

	info := factory.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: Namespace,
		Subsystem: Subsystem,
		Name:      "remote_alertmanager_info",
		Help:      "Information about the remote Alertmanager.",
	}, []string{"mode"})

	// The label names of the latency histogram come from dskit's
	// instrument.HistogramCollectorBuckets.
	latency := factory.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: Namespace,
		Subsystem: Subsystem,
		Name:      "remote_alertmanager_latency_seconds",
		Help:      "Histogram of request latencies to the remote Alertmanager.",
	}, instrument.HistogramCollectorBuckets)

	lastReadiness := gauge(
		"remote_alertmanager_last_readiness_check_timestamp_seconds",
		"Timestamp of the last successful readiness check to the remote Alertmanager in seconds.",
	)

	configSyncs := counter(
		"remote_alertmanager_configuration_syncs_total",
		"Total number of configuration syncs to the remote Alertmanager.",
	)
	configSyncErrors := counter(
		"remote_alertmanager_configuration_sync_failures_total",
		"Total number of failed attempts to sync configurations between Alertmanagers.",
	)
	lastConfigSync := gauge(
		"remote_alertmanager_last_configuration_sync_timestamp_seconds",
		"Timestamp of the last successful configuration sync to the remote Alertmanager in seconds.",
	)

	stateSyncs := counter(
		"remote_alertmanager_state_syncs_total",
		"Total number of state syncs to the remote Alertmanager.",
	)
	stateSyncErrors := counter(
		"remote_alertmanager_state_sync_failures_total",
		"Total number of failed attempts to sync state between Alertmanagers.",
	)
	lastStateSync := gauge(
		"remote_alertmanager_last_state_sync_timestamp_seconds",
		"Timestamp of the last successful state sync to the remote Alertmanager in seconds.",
	)

	return &RemoteAlertmanager{
		Info:                  info,
		RequestLatency:        instrument.NewHistogramCollector(latency),
		LastReadinessCheck:    lastReadiness,
		ConfigSyncsTotal:      configSyncs,
		ConfigSyncErrorsTotal: configSyncErrors,
		LastConfigSync:        lastConfigSync,
		StateSyncsTotal:       stateSyncs,
		StateSyncErrorsTotal:  stateSyncErrors,
		LastStateSync:         lastStateSync,
	}
}