Alerting: Implement Prometheus remote write for recording rules (#89189)

* Fix timestamp recorded by rule

* Implement Prometheus remote write

* Create HTTP client instead of transport

* Address PR comments

* Remove status code label
This commit is contained in:
William Wernert 2024-06-25 10:23:42 -04:00 committed by GitHub
parent e4b9f356bc
commit fcfa89f864
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 403 additions and 27 deletions

View File

@ -31,6 +31,7 @@ type NGAlert struct {
apiMetrics *API
historianMetrics *Historian
remoteAlertmanagerMetrics *RemoteAlertmanager
remoteWriterMetrics *RemoteWriter
}
// NewNGAlert manages the metrics of all the alerting components.
@ -43,6 +44,7 @@ func NewNGAlert(r prometheus.Registerer) *NGAlert {
apiMetrics: NewAPIMetrics(r),
historianMetrics: NewHistorianMetrics(r, Subsystem),
remoteAlertmanagerMetrics: NewRemoteAlertmanagerMetrics(r),
remoteWriterMetrics: NewRemoteWriterMetrics(r),
}
}
@ -69,3 +71,7 @@ func (ng *NGAlert) GetHistorianMetrics() *Historian {
// GetRemoteAlertmanagerMetrics returns the remote Alertmanager metrics held by ng.
func (ng *NGAlert) GetRemoteAlertmanagerMetrics() *RemoteAlertmanager {
return ng.remoteAlertmanagerMetrics
}
// GetRemoteWriterMetrics returns the remote writer metrics held by ng.
func (ng *NGAlert) GetRemoteWriterMetrics() *RemoteWriter {
return ng.remoteWriterMetrics
}

View File

@ -0,0 +1,30 @@
package metrics
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// RemoteWriter groups the metrics recorded for remote writes.
type RemoteWriter struct {
// WritesTotal counts attempted remote writes, labelled by org, backend and status_code.
WritesTotal *prometheus.CounterVec
// WriteDuration is a histogram of remote write durations in seconds, labelled by org and backend.
WriteDuration *prometheus.HistogramVec
}
// NewRemoteWriterMetrics builds the remote writer metrics through a promauto
// factory bound to r: a counter of attempted writes and a histogram of write
// durations.
func NewRemoteWriterMetrics(r prometheus.Registerer) *RemoteWriter {
	factory := promauto.With(r)

	// The counter is created first, then the histogram, as in the struct order.
	writesTotal := factory.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "remote_writer_writes_total",
			Help:      "The total number of remote writes attempted.",
		},
		[]string{"org", "backend", "status_code"},
	)

	writeDuration := factory.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: Namespace,
			Subsystem: Subsystem,
			Name:      "remote_writer_write_duration_seconds",
			Help:      "Histogram of remote write durations.",
			Buckets:   prometheus.DefBuckets,
		},
		[]string{"org", "backend"},
	)

	return &RemoteWriter{
		WritesTotal:   writesTotal,
		WriteDuration: writeDuration,
	}
}

View File

@ -16,6 +16,7 @@ import (
"github.com/grafana/grafana/pkg/events"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
@ -74,6 +75,7 @@ func ProvideService(
pluginsStore pluginstore.Store,
tracer tracing.Tracer,
ruleStore *store.DBstore,
httpClientProvider httpclient.Provider,
) (*AlertNG, error) {
ng := &AlertNG{
Cfg: cfg,
@ -100,6 +102,7 @@ func ProvideService(
pluginsStore: pluginsStore,
tracer: tracer,
store: ruleStore,
httpClientProvider: httpClientProvider,
}
if ng.IsDisabled() {
@ -136,6 +139,7 @@ type AlertNG struct {
folderService folder.Service
dashboardService dashboards.DashboardService
Api *api.API
httpClientProvider httpclient.Provider
// Alerting notification services
MultiOrgAlertmanager *notifier.MultiOrgAlertmanager
@ -333,9 +337,9 @@ func (ng *AlertNG) init() error {
evalFactory := eval.NewEvaluatorFactory(ng.Cfg.UnifiedAlerting, ng.DataSourceCache, ng.ExpressionService, ng.pluginsStore)
recordingWriter, err := createRecordingWriter(ng.FeatureToggles, ng.Cfg.UnifiedAlerting.RecordingRules)
recordingWriter, err := createRecordingWriter(ng.FeatureToggles, ng.Cfg.UnifiedAlerting.RecordingRules, ng.httpClientProvider, ng.tracer, ng.Metrics.GetRemoteWriterMetrics())
if err != nil {
return err
return fmt.Errorf("failed to initialize recording writer: %w", err)
}
schedCfg := schedule.SchedulerCfg{
@ -632,11 +636,11 @@ func createRemoteAlertmanager(cfg remote.AlertmanagerConfig, kvstore kvstore.KVS
return remote.NewAlertmanager(cfg, notifier.NewFileStore(cfg.OrgID, kvstore), decryptFn, autogenFn, m, tracer)
}
func createRecordingWriter(featureToggles featuremgmt.FeatureToggles, settings setting.RecordingRuleSettings) (schedule.RecordingWriter, error) {
func createRecordingWriter(featureToggles featuremgmt.FeatureToggles, settings setting.RecordingRuleSettings, httpClientProvider httpclient.Provider, tracer tracing.Tracer, m *metrics.RemoteWriter) (schedule.RecordingWriter, error) {
logger := log.New("ngalert.writer")
if featureToggles.IsEnabledGlobally(featuremgmt.FlagGrafanaManagedRecordingRules) {
return writer.NewPrometheusWriter(settings, logger)
return writer.NewPrometheusWriter(settings, httpClientProvider, tracer, logger, m)
}
return writer.NoopWriter{}, nil

View File

@ -215,13 +215,13 @@ func (r *recordingRule) tryEvaluation(ctx context.Context, ev *Evaluation, logge
}
writeStart := r.clock.Now()
err = r.writer.Write(ctx, ev.rule.Record.Metric, writeStart, frames, ev.rule.Labels)
err = r.writer.Write(ctx, ev.rule.Record.Metric, ev.scheduledAt, frames, ev.rule.Labels)
writeDur := r.clock.Now().Sub(writeStart)
if err != nil {
span.SetStatus(codes.Error, "failed to write metrics")
span.RecordError(err)
return fmt.Errorf("metric remote write failed: %w", err)
return fmt.Errorf("remote write failed: %w", err)
}
logger.Debug("Metrics written", "duration", writeDur)

View File

@ -15,6 +15,7 @@ import (
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/infra/appcontext"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
@ -68,7 +69,7 @@ func SetupTestEnv(tb testing.TB, baseInterval time.Duration) (*ngalert.AlertNG,
ng, err := ngalert.ProvideService(
cfg, features, nil, nil, routing.NewRouteRegister(), sqlStore, kvstore.NewFakeKVStore(), nil, nil, quotatest.New(false, nil),
secretsService, nil, m, folderService, ac, &dashboards.FakeDashboardService{}, nil, bus, ac,
annotationstest.NewFakeAnnotationsRepo(), &pluginstore.FakePluginStore{}, tracer, ruleStore,
annotationstest.NewFakeAnnotationsRepo(), &pluginstore.FakePluginStore{}, tracer, ruleStore, httpclient.NewProvider(),
)
require.NoError(tb, err)
return ng, &store.DBstore{

View File

@ -3,18 +3,42 @@ package writer
import (
"context"
"fmt"
"math"
"net/http"
"net/url"
"strings"
"time"
"github.com/grafana/dataplane/sdata/numeric"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/setting"
"github.com/m3db/prometheus_remote_client_golang/promremote"
"github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
"github.com/grafana/grafana-plugin-sdk-go/data"
)
// backendType is the value used for the "backend" metric label by this writer.
const backendType = "prometheus"
// Substrings looked for in the error text of a 400 response to recognise a
// duplicate-timestamp rejection (see checkWriteError).
const (
// Fixed error messages
MimirDuplicateTimestampError = "err-mimir-sample-duplicate-timestamp"
// Best effort error messages
PrometheusDuplicateTimestampError = "duplicate sample for timestamp"
)
// DuplicateTimestampErrors lists every substring that marks a write error as a
// duplicate-timestamp error to be ignored.
var DuplicateTimestampErrors = [...]string{
MimirDuplicateTimestampError,
PrometheusDuplicateTimestampError,
}
// Metric represents a Prometheus time series metric.
type Metric struct {
T int64
T time.Time
V float64
}
@ -38,17 +62,14 @@ func PointsFromFrames(name string, t time.Time, frames data.Frames, extraLabels
points := make([]Point, 0, len(col.Refs))
for _, ref := range col.Refs {
var f float64
if fp, empty, err := ref.NullableFloat64Value(); !empty && fp != nil {
// Use a default value of NaN if the value is empty or nil.
f := math.NaN()
if fp, empty, _ := ref.NullableFloat64Value(); !empty && fp != nil {
f = *fp
} else if err != nil {
return nil, fmt.Errorf("unable to get float64 value: %w", err)
} else {
return nil, fmt.Errorf("unable to get metric value")
}
metric := Metric{
T: t.Unix(),
T: t,
V: f,
}
@ -71,30 +92,166 @@ func PointsFromFrames(name string, t time.Time, frames data.Frames, extraLabels
return points, nil
}
// HttpClientProvider is the part of an HTTP client provider that
// NewPrometheusWriter needs: building an *http.Client from a set of options.
type HttpClientProvider interface {
// New returns an HTTP client configured with the given options.
New(options ...httpclient.Options) (*http.Client, error)
}
type PrometheusWriter struct {
logger log.Logger
client promremote.Client
logger log.Logger
metrics *metrics.RemoteWriter
}
// NewPrometheusWriter validates settings and builds a PrometheusWriter that
// sends samples to settings.URL.
//
// The HTTP client is obtained from httpClientProvider with a tracing
// middleware, the configured custom headers, and basic auth when a username is
// set. It returns an error if the settings are invalid or if either the HTTP
// client or the remote write client cannot be created.
func NewPrometheusWriter(
	settings setting.RecordingRuleSettings,
	httpClientProvider HttpClientProvider,
	tracer tracing.Tracer,
	l log.Logger,
	metrics *metrics.RemoteWriter,
) (*PrometheusWriter, error) {
	if err := validateSettings(settings); err != nil {
		return nil, err
	}

	headers := make(http.Header)
	for k, v := range settings.CustomHeaders {
		headers.Add(k, v)
	}

	middlewares := []httpclient.Middleware{
		httpclient.TracingMiddleware(tracer),
	}

	cl, err := httpClientProvider.New(httpclient.Options{
		Middlewares: middlewares,
		BasicAuth:   createAuthOpts(settings.BasicAuthUsername, settings.BasicAuthPassword),
		Header:      headers,
	})
	if err != nil {
		return nil, err
	}

	// NOTE(review): the timeout is handed to promremote together with a
	// ready-made HTTP client; confirm promremote applies the timeout to a
	// caller-supplied client and not only to one it constructs itself.
	clientCfg := promremote.NewConfig(
		promremote.UserAgent("grafana-recording-rule"),
		promremote.WriteURLOption(settings.URL),
		promremote.HTTPClientTimeoutOption(settings.Timeout),
		promremote.HTTPClientOption(cl),
	)

	client, err := promremote.NewClient(clientCfg)
	if err != nil {
		return nil, err
	}

	return &PrometheusWriter{
		client:  client,
		logger:  l,
		metrics: metrics,
	}, nil
}
// validateSettings reports the first problem found in the recording rule
// settings, checked in this order: a basic auth username without a password,
// a URL that url.Parse rejects, and a non-positive timeout. It returns nil
// when all three checks pass.
//
// NOTE(review): url.Parse is lenient and also accepts relative references, so
// the URL check may let through strings that are not usable endpoints —
// confirm whether a scheme/host requirement is wanted here.
func validateSettings(settings setting.RecordingRuleSettings) error {
	usernameWithoutPassword := settings.BasicAuthUsername != "" && settings.BasicAuthPassword == ""
	if usernameWithoutPassword {
		return fmt.Errorf("basic auth password is required if username is set")
	}

	_, parseErr := url.Parse(settings.URL)
	switch {
	case parseErr != nil:
		return fmt.Errorf("invalid URL: %w", parseErr)
	case settings.Timeout <= 0:
		return fmt.Errorf("timeout must be greater than 0")
	}

	return nil
}
// createAuthOpts builds basic auth options from the given credentials.
// An empty username disables basic auth: the result is nil and the password
// is ignored.
func createAuthOpts(username, password string) *httpclient.BasicAuthOptions {
	var opts *httpclient.BasicAuthOptions
	if username != "" {
		opts = &httpclient.BasicAuthOptions{
			User:     username,
			Password: password,
		}
	}
	return opts
}
// Write writes the given frames to the Prometheus remote write endpoint.
//
// The frames are converted to points named name, stamped with t and carrying
// extraLabels, and sent as a single remote write request. The write duration
// and the write count (labelled with the response status code) are recorded
// for every attempt, whether it succeeded or not.
//
// It returns an error when ctx carries no rule key, when the frames cannot be
// converted to points, or when the remote write fails with an error that
// checkWriteError does not mark as ignored.
func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error {
	l := w.logger.FromContext(ctx)
	ruleKey, found := models.RuleKeyFromContext(ctx)
	if !found {
		// sanity check, this should never happen
		return fmt.Errorf("rule key not found in context")
	}
	lvs := []string{fmt.Sprint(ruleKey.OrgID), backendType}

	points, err := PointsFromFrames(name, t, frames, extraLabels)
	if err != nil {
		return err
	}

	series := make([]promremote.TimeSeries, 0, len(points))
	for _, p := range points {
		series = append(series, promremote.TimeSeries{
			Labels: promremoteLabelsFromPoint(p),
			Datapoint: promremote.Datapoint{
				Timestamp: p.Metric.T,
				Value:     p.Metric.V,
			},
		})
	}

	l.Debug("Writing metric", "name", name)
	writeStart := time.Now()
	res, writeErr := w.client.WriteTimeSeries(ctx, series, promremote.WriteOptions{})
	w.metrics.WriteDuration.WithLabelValues(lvs...).Observe(time.Since(writeStart).Seconds())

	// The counter has one more label than the histogram: the status code.
	lvs = append(lvs, fmt.Sprint(res.StatusCode))
	w.metrics.WritesTotal.WithLabelValues(lvs...).Inc()

	werr, ignored := checkWriteError(writeErr)
	if werr != nil {
		return fmt.Errorf("failed to write time series: %w", werr)
	}
	if ignored {
		// Log the original client error: werr is nil whenever the error was ignored.
		l.Debug("Ignored write error", "error", writeErr, "status_code", res.StatusCode)
	}

	return nil
}
// promremoteLabelsFromPoint converts a point into remote write labels: the
// point name as the "__name__" label followed by every entry of point.Labels.
//
// NOTE(review): the labels after "__name__" follow map iteration order, so
// their order is not deterministic — confirm the receiving backend does not
// depend on sorted label names.
func promremoteLabelsFromPoint(point Point) []promremote.Label {
	// One extra slot for the "__name__" label so the slice never has to grow.
	labels := make([]promremote.Label, 0, len(point.Labels)+1)
	labels = append(labels, promremote.Label{
		Name:  "__name__",
		Value: point.Name,
	})
	for k, v := range point.Labels {
		labels = append(labels, promremote.Label{
			Name:  k,
			Value: v,
		})
	}
	return labels
}
// checkWriteError decides whether a remote write error should be surfaced.
//
// It returns (nil, false) when there was no error, (nil, true) when the error
// is a 400 response whose message contains one of DuplicateTimestampErrors,
// and (writeErr, false) for every other error.
func checkWriteError(writeErr promremote.WriteError) (err error, ignored bool) {
	if writeErr == nil {
		return nil, false
	}

	// Only 400 responses get special treatment.
	if writeErr.StatusCode() != http.StatusBadRequest {
		return writeErr, false
	}

	// HA may potentially write different values for the same timestamp, so we ignore this error
	// TODO: this may not be needed, further testing needed
	text := writeErr.Error()
	for i := range DuplicateTimestampErrors {
		if strings.Contains(text, DuplicateTimestampErrors[i]) {
			return nil, true
		}
	}

	return writeErr, false
}

View File

@ -1,19 +1,86 @@
package writer
import (
"context"
"math"
"math/rand/v2"
"net/http"
"reflect"
"slices"
"testing"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/setting"
"github.com/m3db/prometheus_remote_client_golang/promremote"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/prometheus/prompb"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
)
func TestPrometheusWriter_Write(t *testing.T) {
t.Skip("TODO: implement")
// TestValidateSettings checks each validateSettings rule in isolation: every
// failing case is valid apart from the one field under test, so it cannot
// pass because of an unrelated check.
func TestValidateSettings(t *testing.T) {
	for _, tc := range []struct {
		name     string
		settings setting.RecordingRuleSettings
		err      bool
	}{
		{
			name: "invalid url",
			settings: setting.RecordingRuleSettings{
				// url.Parse accepts a plain "invalid url" as a relative
				// reference; a leading colon with no scheme is rejected.
				URL:     "://invalid url",
				Timeout: 10,
			},
			err: true,
		},
		{
			name: "missing password",
			settings: setting.RecordingRuleSettings{
				URL:               "http://localhost:9090",
				BasicAuthUsername: "user",
				Timeout:           10,
			},
			err: true,
		},
		{
			name: "timeout is 0",
			settings: setting.RecordingRuleSettings{
				URL:               "http://localhost:9090",
				BasicAuthUsername: "user",
				BasicAuthPassword: "password",
				Timeout:           0,
			},
			err: true,
		},
		{
			name: "valid settings w/ auth",
			settings: setting.RecordingRuleSettings{
				URL:               "http://localhost:9090",
				BasicAuthUsername: "user",
				BasicAuthPassword: "password",
				Timeout:           10,
			},
			err: false,
		},
		{
			name: "valid settings w/o auth",
			settings: setting.RecordingRuleSettings{
				URL:     "http://localhost:9090",
				Timeout: 10,
			},
			err: false,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			err := validateSettings(tc.settings)
			if tc.err {
				require.Error(t, err)
			} else {
				require.NoError(t, err)
			}
		})
	}
}
func TestPointsFromFrames(t *testing.T) {
@ -61,7 +128,7 @@ func TestPointsFromFrames(t *testing.T) {
}
require.Equal(t, expectedLabels, point.Labels)
require.Equal(t, "test", point.Name)
require.Equal(t, now.Unix(), point.Metric.T)
require.Equal(t, now, point.Metric.T)
require.Equal(t, v, point.Metric.V)
}
})
@ -69,6 +136,74 @@ func TestPointsFromFrames(t *testing.T) {
})
}
// TestPrometheusWriter_Write exercises PrometheusWriter.Write against a stub
// client. The subtests share one stub and replace its writeSeriesFunc as
// needed, so they are meant to run sequentially in the order written.
func TestPrometheusWriter_Write(t *testing.T) {
client := &testClient{}
writer := &PrometheusWriter{
client: client,
logger: log.New("test"),
metrics: metrics.NewRemoteWriterMetrics(prometheus.NewRegistry()),
}
now := time.Now()
series := []map[string]string{{"foo": "1"}, {"foo": "2"}, {"foo": "3"}, {"foo": "4"}}
frames := frameGenFromLabels(t, data.FrameTypeNumericWide, series)
emptyFrames := data.Frames{data.NewFrame("test")}
// Write requires a rule key in the context and fails without one.
ctx := ngmodels.WithRuleKey(context.Background(), ngmodels.GenerateRuleKey(1))
t.Run("error when frames are empty", func(t *testing.T) {
err := writer.Write(ctx, "test", now, emptyFrames, map[string]string{})
require.Error(t, err)
})
t.Run("include client error when client fails", func(t *testing.T) {
clientErr := testClientWriteError{statusCode: http.StatusInternalServerError}
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, clientErr
}
err := writer.Write(ctx, "test", now, frames, map[string]string{})
require.Error(t, err)
// The returned error must wrap the client error.
require.ErrorIs(t, err, clientErr)
})
t.Run("writes expected points", func(t *testing.T) {
// The stub inspects the series handed to the client: one per input label set,
// each carrying the metric name, the extra label and the original label.
client.writeSeriesFunc = func(ctx context.Context, tslist promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
require.Len(t, tslist, len(series))
for i, ts := range tslist {
expectedLabels := []promremote.Label{
{Name: "__name__", Value: "test"},
{Name: "extra", Value: "label"},
{Name: "foo", Value: series[i]["foo"]},
}
// Label order is not asserted, only the set of labels.
require.ElementsMatch(t, expectedLabels, ts.Labels)
require.Equal(t, now, ts.Datapoint.Timestamp)
require.Equal(t, extractValue(t, frames, series[i], data.FrameTypeNumericWide), ts.Datapoint.Value)
}
return promremote.WriteResult{}, nil
}
err := writer.Write(ctx, "test", now, frames, map[string]string{"extra": "label"})
require.NoError(t, err)
})
t.Run("ignores client error when status code is 400 and message contains duplicate timestamp error", func(t *testing.T) {
// One nested subtest per known duplicate-timestamp message.
for _, msg := range DuplicateTimestampErrors {
t.Run(msg, func(t *testing.T) {
clientErr := testClientWriteError{
statusCode: http.StatusBadRequest,
msg: &msg,
}
client.writeSeriesFunc = func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, clientErr
}
err := writer.Write(ctx, "test", now, frames, map[string]string{"extra": "label"})
require.NoError(t, err)
})
}
})
}
func extractValue(t *testing.T, frames data.Frames, labels map[string]string, frameType data.FrameType) float64 {
t.Helper()
@ -155,7 +290,7 @@ func frameGenFromLabels(t *testing.T, frameType data.FrameType, labelSet []map[s
func frameGenWide(t *testing.T, labelMaps []map[string]string) data.Frames {
t.Helper()
frame := data.NewFrame("test", fieldGenWide(time.Now(), labelMaps)...)
frame := data.NewFrame("test", fieldGenWide(t, time.Now(), labelMaps)...)
frame.SetMeta(&data.FrameMeta{
Type: data.FrameTypeNumericWide,
TypeVersion: data.FrameTypeVersion{0, 1},
@ -163,9 +298,11 @@ func frameGenWide(t *testing.T, labelMaps []map[string]string) data.Frames {
return data.Frames{frame}
}
func fieldGenWide(t time.Time, labelSet []map[string]string) []*data.Field {
func fieldGenWide(t *testing.T, tt time.Time, labelSet []map[string]string) []*data.Field {
t.Helper()
fields := make([]*data.Field, 1, len(labelSet)+1)
fields[0] = data.NewField("T", nil, []time.Time{t})
fields[0] = data.NewField("T", nil, []time.Time{tt})
for _, labels := range labelSet {
field := data.NewField("value", data.Labels(labels), []float64{rand.Float64() * (100 - 0)}) // arbitrary range
fields = append(fields, field)
@ -228,3 +365,43 @@ func frameGenMulti(t *testing.T, labelSet []map[string]string) data.Frames {
return frames
}
// testClient is a stub remote write client for tests.
type testClient struct {
// writeSeriesFunc, when set, handles WriteTimeSeries calls.
writeSeriesFunc func(ctx context.Context, ts promremote.TSList, opts promremote.WriteOptions) (promremote.WriteResult, promremote.WriteError)
}
// WriteProto is a no-op stub: it ignores its arguments and always returns an
// empty result and a nil error.
func (c *testClient) WriteProto(
ctx context.Context,
req *prompb.WriteRequest,
opts promremote.WriteOptions,
) (promremote.WriteResult, promremote.WriteError) {
return promremote.WriteResult{}, nil
}
// WriteTimeSeries forwards the call to writeSeriesFunc when one is installed;
// otherwise it reports success with an empty result.
func (c *testClient) WriteTimeSeries(
	ctx context.Context,
	ts promremote.TSList,
	opts promremote.WriteOptions,
) (promremote.WriteResult, promremote.WriteError) {
	handler := c.writeSeriesFunc
	if handler == nil {
		return promremote.WriteResult{}, nil
	}
	return handler(ctx, ts, opts)
}
// testClientWriteError is a stub write error with a configurable status code
// and message.
type testClientWriteError struct {
// statusCode is the value returned by StatusCode.
statusCode int
// msg is the error text; when nil, Error returns a default message.
msg *string
}
// StatusCode returns the configured status code.
func (e testClientWriteError) StatusCode() int {
return e.statusCode
}
// Error returns the configured message, or "test error" when none was set.
func (e testClientWriteError) Error() string {
	if e.msg != nil {
		return *e.msg
	}
	return "test error"
}

View File

@ -11,6 +11,7 @@ import (
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/httpclient"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
pluginfakes "github.com/grafana/grafana/pkg/plugins/manager/fakes"
@ -499,7 +500,7 @@ func setupEnv(t *testing.T, sqlStore db.DB, cfg *setting.Cfg, b bus.Bus, quotaSe
_, err = ngalert.ProvideService(
cfg, featuremgmt.WithFeatures(), nil, nil, routing.NewRouteRegister(), sqlStore, ngalertfakes.NewFakeKVStore(t), nil, nil, quotaService,
secretsService, nil, m, &foldertest.FakeService{}, &acmock.Mock{}, &dashboards.FakeDashboardService{}, nil, b, &acmock.Mock{},
annotationstest.NewFakeAnnotationsRepo(), &pluginstore.FakePluginStore{}, tracer, ruleStore,
annotationstest.NewFakeAnnotationsRepo(), &pluginstore.FakePluginStore{}, tracer, ruleStore, httpclient.NewProvider(),
)
require.NoError(t, err)
_, err = storesrv.ProvideService(sqlStore, featuremgmt.WithFeatures(), cfg, quotaService, storesrv.ProvideSystemUsersService())