Mirror of https://github.com/grafana/grafana.git, synced 2025-02-25 18:55:37 -06:00
Alerting: Add metrics to the remote Alertmanager struct (#79835)
* Alerting: Add metrics to the remote Alertmanager struct
* rephrase http_requests_failed description
* make linter happy
* remove unnecessary metrics
* extract timed client to separate package
* use histogram collector from dskit
* remove weaveworks dependency
* capture metrics for all requests to the remote Alertmanager (both clients)
* use the timed client in the MimirAuthRoundTripper
* HTTPRequestsDuration -> HTTPRequestDuration, clean up mimir client factory function
* refactor
* less git diff
* gauge for last readiness check in seconds
* initialize LastReadinessCheck to 0, tweak metric names and descriptions
* add counters for sync attempts/errors
* last config sync and last state sync timestamps (gauges)
* change latency metric name
* metric for remote Alertmanager mode
* code review comments
* move label constants to metrics package
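The commit message above mentions timing every request to the remote Alertmanager through dskit's histogram collector. As a rough illustration of how such a collector is typically driven (this is not code from this commit; the package name, function name, and URL below are placeholders), a readiness check could be wrapped and recorded like this:

package remote

import (
	"context"
	"net/http"

	"github.com/grafana/dskit/instrument"
	"github.com/prometheus/client_golang/prometheus"
)

// doReadinessCheck is a hypothetical helper showing how a request to the remote
// Alertmanager could be timed with an instrument.HistogramCollector and how the
// last-readiness-check gauge could be updated on success.
func doReadinessCheck(ctx context.Context, latency *instrument.HistogramCollector, lastCheck prometheus.Gauge) error {
	err := instrument.CollectedRequest(ctx, "GET /-/ready", latency, instrument.ErrorCode, func(ctx context.Context) error {
		// Placeholder URL; the real client targets the configured remote Alertmanager.
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://alertmanager.example/-/ready", nil)
		if err != nil {
			return err
		}
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return err
		}
		defer resp.Body.Close()
		return nil
	})
	if err == nil {
		// Unix timestamp (seconds) of the last successful readiness check.
		lastCheck.SetToCurrentTime()
	}
	return err
}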
@@ -30,6 +30,7 @@ type NGAlert struct {
 	multiOrgAlertmanagerMetrics *MultiOrgAlertmanager
 	apiMetrics                  *API
 	historianMetrics            *Historian
+	remoteAlertmanagerMetrics   *RemoteAlertmanager
 }
 
 // NewNGAlert manages the metrics of all the alerting components.
@@ -41,6 +42,7 @@ func NewNGAlert(r prometheus.Registerer) *NGAlert {
 		multiOrgAlertmanagerMetrics: NewMultiOrgAlertmanagerMetrics(r),
 		apiMetrics:                  NewAPIMetrics(r),
 		historianMetrics:            NewHistorianMetrics(r, Subsystem),
+		remoteAlertmanagerMetrics:   NewRemoteAlertmanagerMetrics(r),
 	}
 }
 
@@ -63,3 +65,7 @@ func (ng *NGAlert) GetMultiOrgAlertmanagerMetrics() *MultiOrgAlertmanager {
 func (ng *NGAlert) GetHistorianMetrics() *Historian {
 	return ng.historianMetrics
 }
+
+func (ng *NGAlert) GetRemoteAlertmanagerMetrics() *RemoteAlertmanager {
+	return ng.remoteAlertmanagerMetrics
+}
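With the getter added above, other alerting components can pull the remote Alertmanager metrics from the shared NGAlert struct. A minimal wiring sketch (illustrative only; the use of a fresh registry and the variable names are assumptions, not part of this diff):

package main

import (
	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Build the top-level NGAlert metrics against a registry and fetch the
	// remote Alertmanager metrics through the new getter.
	ngMetrics := metrics.NewNGAlert(prometheus.NewRegistry())
	remoteAMMetrics := ngMetrics.GetRemoteAlertmanagerMetrics()

	// Report the configured mode via the info gauge; the label value comes from
	// the constants defined in remote_alertmanager.go.
	remoteAMMetrics.Info.WithLabelValues(metrics.ModeRemoteSecondary).Set(1)
}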
pkg/services/ngalert/metrics/remote_alertmanager.go (new file, 84 lines)
@@ -0,0 +1,84 @@
+package metrics
+
+import (
+	"github.com/grafana/dskit/instrument"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+const (
+	ModeRemoteSecondary = "remote_secondary"
+	ModeRemotePrimary   = "remote_primary"
+	ModeRemoteOnly      = "remote_only"
+)
+
+type RemoteAlertmanager struct {
+	Info                  *prometheus.GaugeVec
+	RequestLatency        *instrument.HistogramCollector
+	LastReadinessCheck    prometheus.Gauge
+	ConfigSyncsTotal      prometheus.Counter
+	ConfigSyncErrorsTotal prometheus.Counter
+	LastConfigSync        prometheus.Gauge
+	StateSyncsTotal       prometheus.Counter
+	StateSyncErrorsTotal  prometheus.Counter
+	LastStateSync         prometheus.Gauge
+}
+
+func NewRemoteAlertmanagerMetrics(r prometheus.Registerer) *RemoteAlertmanager {
+	return &RemoteAlertmanager{
+		Info: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_info",
+			Help:      "Information about the remote Alertmanager.",
+		}, []string{"mode"}),
+		RequestLatency: instrument.NewHistogramCollector(promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_latency_seconds",
+			Help:      "Histogram of request latencies to the remote Alertmanager.",
+		}, instrument.HistogramCollectorBuckets)),
+		LastReadinessCheck: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_last_readiness_check_timestamp_seconds",
+			Help:      "Timestamp of the last successful readiness check to the remote Alertmanager in seconds.",
+		}),
+		ConfigSyncsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_configuration_syncs_total",
+			Help:      "Total number of configuration syncs to the remote Alertmanager.",
+		}),
+		ConfigSyncErrorsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_configuration_sync_failures_total",
+			Help:      "Total number of failed attempts to sync configurations between Alertmanagers.",
+		}),
+		LastConfigSync: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_last_configuration_sync_timestamp_seconds",
+			Help:      "Timestamp of the last successful configuration sync to the remote Alertmanager in seconds.",
+		}),
+		StateSyncsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_state_syncs_total",
+			Help:      "Total number of state syncs to the remote Alertmanager.",
+		}),
+		StateSyncErrorsTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_state_sync_failures_total",
+			Help:      "Total number of failed attempts to sync state between Alertmanagers.",
+		}),
+		LastStateSync: promauto.With(r).NewGauge(prometheus.GaugeOpts{
+			Namespace: Namespace,
+			Subsystem: Subsystem,
+			Name:      "remote_alertmanager_last_state_sync_timestamp_seconds",
+			Help:      "Timestamp of the last successful state sync to the remote Alertmanager in seconds.",
+		}),
+	}
+}
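The counters and timestamp gauges defined above follow a common attempt/failure/last-success pattern. A minimal sketch of the intended bookkeeping around a configuration sync (a hypothetical wrapper, not code from this commit; sendConfig stands in for the real client call):

package remote

import (
	"context"

	"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)

// syncConfig counts every sync attempt, counts failures separately, and records
// the time of the last successful sync as a Unix timestamp in seconds.
func syncConfig(ctx context.Context, m *metrics.RemoteAlertmanager, sendConfig func(context.Context) error) error {
	m.ConfigSyncsTotal.Inc()
	if err := sendConfig(ctx); err != nil {
		m.ConfigSyncErrorsTotal.Inc()
		return err
	}
	m.LastConfigSync.SetToCurrentTime()
	return nil
}

The state-sync metrics (StateSyncsTotal, StateSyncErrorsTotal, LastStateSync) are intended to be updated the same way around state synchronization.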