Alerting: Add metrics to the remote Alertmanager struct (#79835)

* Alerting: Add metrics to the remote Alertmanager struct

* rephrase http_requests_failed description

* make linter happy

* remove unnecessary metrics

* extract timed client to separate package

* use histogram collector from dskit

* remove weaveworks dependency

* capture metrics for all requests to the remote Alertmanager (both clients)

* use the timed client in the MimirAuthRoundTripper

* HTTPRequestsDuration -> HTTPRequestDuration, clean up mimir client factory function

* refactor

* less git diff

* gauge for last readiness check in seconds

* initialize LastReadinessCheck to 0, tweak metric names and descriptions

* add counters for sync attempts/errors

* last config sync and last state sync timestamps (gauges)

* change latency metric name

* metric for remote Alertmanager mode

* code review comments

* move label constants to metrics package
This commit is contained in:
Santiago
2024-01-10 11:18:24 +01:00
committed by GitHub
parent 1162c28a55
commit 9e78faa7ba
14 changed files with 171 additions and 37 deletions

View File

@@ -10,6 +10,7 @@ import (
"github.com/go-openapi/strfmt"
"github.com/grafana/grafana/pkg/infra/log"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
remoteClient "github.com/grafana/grafana/pkg/services/ngalert/remote/client"
@@ -26,6 +27,7 @@ type stateStore interface {
type Alertmanager struct {
log log.Logger
metrics *metrics.RemoteAlertmanager
orgID int64
ready bool
sender *sender.ExternalAlertmanager
@@ -59,7 +61,7 @@ func (cfg *AlertmanagerConfig) Validate() error {
return nil
}
func NewAlertmanager(cfg AlertmanagerConfig, store stateStore) (*Alertmanager, error) {
func NewAlertmanager(cfg AlertmanagerConfig, store stateStore, metrics *metrics.RemoteAlertmanager) (*Alertmanager, error) {
if err := cfg.Validate(); err != nil {
return nil, err
}
@@ -76,7 +78,7 @@ func NewAlertmanager(cfg AlertmanagerConfig, store stateStore) (*Alertmanager, e
Password: cfg.BasicAuthPassword,
Logger: logger,
}
mc, err := remoteClient.New(mcCfg)
mc, err := remoteClient.New(mcCfg, metrics)
if err != nil {
return nil, err
}
@@ -87,7 +89,7 @@ func NewAlertmanager(cfg AlertmanagerConfig, store stateStore) (*Alertmanager, e
Password: cfg.BasicAuthPassword,
Logger: logger,
}
amc, err := remoteClient.NewAlertmanager(amcCfg)
amc, err := remoteClient.NewAlertmanager(amcCfg, metrics)
if err != nil {
return nil, err
}
@@ -104,13 +106,17 @@ func NewAlertmanager(cfg AlertmanagerConfig, store stateStore) (*Alertmanager, e
return nil, err
}
// Initialize LastReadinessCheck so it's present even if the check fails.
metrics.LastReadinessCheck.Set(0)
return &Alertmanager{
log: logger,
mimirClient: mc,
state: store,
amClient: amc,
sender: s,
log: logger,
metrics: metrics,
mimirClient: mc,
orgID: cfg.OrgID,
state: store,
sender: s,
tenantID: cfg.TenantID,
url: cfg.URL,
}, nil
@@ -159,6 +165,7 @@ func (am *Alertmanager) checkReadiness(ctx context.Context) error {
if ready {
am.log.Debug("Alertmanager readiness check successful")
am.metrics.LastReadinessCheck.SetToCurrentTime()
am.ready = true
return nil
}
@@ -170,6 +177,7 @@ func (am *Alertmanager) checkReadiness(ctx context.Context) error {
// If not, it sends the configuration to the remote Alertmanager.
func (am *Alertmanager) CompareAndSendConfiguration(ctx context.Context, config *models.AlertConfiguration) error {
if am.shouldSendConfig(ctx, config) {
am.metrics.ConfigSyncsTotal.Inc()
if err := am.mimirClient.CreateGrafanaAlertmanagerConfig(
ctx,
config.AlertmanagerConfiguration,
@@ -178,8 +186,10 @@ func (am *Alertmanager) CompareAndSendConfiguration(ctx context.Context, config
config.CreatedAt,
config.Default,
); err != nil {
am.metrics.ConfigSyncErrorsTotal.Inc()
return err
}
am.metrics.LastConfigSync.SetToCurrentTime()
}
return nil
}
@@ -193,9 +203,12 @@ func (am *Alertmanager) CompareAndSendState(ctx context.Context) error {
}
if am.shouldSendState(ctx, state) {
	// Count the attempt before issuing the request so that failed uploads
	// are included in the total number of state syncs.
	am.metrics.StateSyncsTotal.Inc()
	if err := am.mimirClient.CreateGrafanaAlertmanagerState(ctx, state); err != nil {
		// A failed state upload belongs on the state-sync error counter;
		// incrementing ConfigSyncErrorsTotal here would misattribute the
		// failure to configuration syncs.
		am.metrics.StateSyncErrorsTotal.Inc()
		return err
	}
	// Record the time of the last successful state upload.
	am.metrics.LastStateSync.SetToCurrentTime()
}
return nil
}

View File

@@ -14,12 +14,14 @@ import (
"github.com/go-openapi/strfmt"
apimodels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/tests/fakes"
"github.com/grafana/grafana/pkg/util"
amv2 "github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/cluster/clusterpb"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/require"
)
@@ -68,7 +70,8 @@ func TestNewAlertmanager(t *testing.T) {
TenantID: test.tenantID,
BasicAuthPassword: test.password,
}
am, err := NewAlertmanager(cfg, nil)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, m)
if test.expErr != "" {
require.EqualError(tt, err, test.expErr)
return
@@ -106,7 +109,8 @@ func TestApplyConfig(t *testing.T) {
require.NoError(t, store.Set(ctx, cfg.OrgID, "alertmanager", notifier.SilencesFilename, "test"))
require.NoError(t, store.Set(ctx, cfg.OrgID, "alertmanager", notifier.NotificationLogFilename, "test"))
am, err := NewAlertmanager(cfg, fstore)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, fstore, m)
require.NoError(t, err)
config := &ngmodels.AlertConfiguration{}
@@ -175,7 +179,8 @@ func TestIntegrationRemoteAlertmanagerApplyConfigOnlyUploadsOnce(t *testing.T) {
require.NoError(t, err)
encodedFullState := base64.StdEncoding.EncodeToString(fullState)
am, err := NewAlertmanager(cfg, fstore)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, fstore, m)
require.NoError(t, err)
// We should have no configuration or state at first.
@@ -259,7 +264,8 @@ func TestIntegrationRemoteAlertmanagerSilences(t *testing.T) {
TenantID: tenantID,
BasicAuthPassword: password,
}
am, err := NewAlertmanager(cfg, nil)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, m)
require.NoError(t, err)
// We should have no silences at first.
@@ -339,7 +345,8 @@ func TestIntegrationRemoteAlertmanagerAlerts(t *testing.T) {
TenantID: tenantID,
BasicAuthPassword: password,
}
am, err := NewAlertmanager(cfg, nil)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, m)
require.NoError(t, err)
// Wait until the Alertmanager is ready to send alerts.
@@ -405,7 +412,8 @@ func TestIntegrationRemoteAlertmanagerReceivers(t *testing.T) {
BasicAuthPassword: password,
}
am, err := NewAlertmanager(cfg, nil)
m := metrics.NewRemoteAlertmanagerMetrics(prometheus.NewRegistry())
am, err := NewAlertmanager(cfg, nil, m)
require.NoError(t, err)
// We should start with the default config.

View File

@@ -9,6 +9,8 @@ import (
httptransport "github.com/go-openapi/runtime/client"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/client"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
amclient "github.com/prometheus/alertmanager/api/v2/client"
)
@@ -24,12 +26,12 @@ type AlertmanagerConfig struct {
type Alertmanager struct {
*amclient.AlertmanagerAPI
httpClient *http.Client
httpClient client.Requester
url *url.URL
logger log.Logger
}
func NewAlertmanager(cfg *AlertmanagerConfig) (*Alertmanager, error) {
func NewAlertmanager(cfg *AlertmanagerConfig, metrics *metrics.RemoteAlertmanager) (*Alertmanager, error) {
// First, add the authentication middleware.
c := &http.Client{Transport: &MimirAuthRoundTripper{
TenantID: cfg.TenantID,
@@ -37,23 +39,27 @@ func NewAlertmanager(cfg *AlertmanagerConfig) (*Alertmanager, error) {
Next: http.DefaultTransport,
}}
tc := client.NewTimedClient(c, metrics.RequestLatency)
apiEndpoint := *cfg.URL
// Next, make sure you set the right path.
u := apiEndpoint.JoinPath(alertmanagerAPIMountPath, amclient.DefaultBasePath)
transport := httptransport.NewWithClient(u.Host, u.Path, []string{u.Scheme}, c)
// Create an Alertmanager client using the timed client as the transport.
r := httptransport.New(u.Host, u.Path, []string{u.Scheme})
r.Transport = tc
return &Alertmanager{
logger: cfg.Logger,
url: cfg.URL,
AlertmanagerAPI: amclient.New(transport, nil),
httpClient: c,
AlertmanagerAPI: amclient.New(r, nil),
httpClient: tc,
}, nil
}
// GetAuthedClient returns a *http.Client that includes a configured MimirAuthRoundTripper.
// GetAuthedClient returns a client.Requester that includes a configured MimirAuthRoundTripper.
// Requests using this client are fully authenticated.
func (am *Alertmanager) GetAuthedClient() *http.Client {
func (am *Alertmanager) GetAuthedClient() client.Requester {
return am.httpClient
}

View File

@@ -12,6 +12,8 @@ import (
"strings"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/ngalert/client"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
)
// MimirClient contains all the methods to query the migration-critical endpoints of a Mimir instance. It's an interface to allow multiple implementations.
@@ -26,9 +28,10 @@ type MimirClient interface {
}
type Mimir struct {
client client.Requester
endpoint *url.URL
client http.Client
logger log.Logger
metrics *metrics.RemoteAlertmanager
}
type Config struct {
@@ -60,21 +63,22 @@ func (e *errorResponse) Error() string {
return e.Error2
}
func New(cfg *Config) (*Mimir, error) {
func New(cfg *Config, metrics *metrics.RemoteAlertmanager) (*Mimir, error) {
rt := &MimirAuthRoundTripper{
TenantID: cfg.TenantID,
Password: cfg.Password,
Next: http.DefaultTransport,
}
c := http.Client{
c := &http.Client{
Transport: rt,
}
return &Mimir{
endpoint: cfg.URL,
client: c,
client: client.NewTimedClient(c, metrics.RequestLatency),
logger: cfg.Logger,
metrics: metrics,
}, nil
}