From c62cc25513e84d5c4b1136b5d5f3727e127153d8 Mon Sep 17 00:00:00 2001 From: William Wernert Date: Wed, 12 Jun 2024 16:04:46 -0400 Subject: [PATCH] Alerting: Configure recording rule writer from config.ini (#89056) --- conf/defaults.ini | 17 ++++++ conf/sample.ini | 18 +++++++ pkg/services/ngalert/ngalert.go | 18 ++++++- pkg/services/ngalert/schedule/alert_rule.go | 3 +- .../ngalert/schedule/recording_rule.go | 5 +- pkg/services/ngalert/schedule/schedule.go | 10 ++-- pkg/services/ngalert/writer/noop.go | 14 +++++ pkg/services/ngalert/writer/prom.go | 54 ++++++++++--------- pkg/services/ngalert/writer/writer.go | 12 ----- pkg/setting/setting_unified_alerting.go | 34 ++++++++++-- 10 files changed, 136 insertions(+), 49 deletions(-) create mode 100644 pkg/services/ngalert/writer/noop.go delete mode 100644 pkg/services/ngalert/writer/writer.go diff --git a/conf/defaults.ini b/conf/defaults.ini index 966edbf4afc..898b0261fa2 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1399,6 +1399,23 @@ max_age = # Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations. max_annotations_to_keep = +[recording_rules] +# Target URL (including write path) for recording rules. +url = + +# Optional username for basic authentication on recording rule write requests. Can be left blank to disable basic auth +basic_auth_username = + +# Optional password for basic authentication on recording rule write requests. Can be left blank. +basic_auth_password = + +# Request timeout for recording rule writes. +timeout = 10s + +# Optional custom headers to include in recording rule write requests. +[recording_rules.custom_headers] +# exampleHeader = exampleValue + # NOTE: this configuration options are not used yet. 
[remote.alertmanager] diff --git a/conf/sample.ini b/conf/sample.ini index 40a3279a378..b1d61c44187 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1389,6 +1389,24 @@ max_age = # Configures max number of alert annotations that Grafana stores. Default value is 0, which keeps all alert annotations. max_annotations_to_keep = +#################################### Recording Rules ##################### +[recording_rules] +# Target URL (including write path) for recording rules. +url = + +# Optional username for basic authentication on recording rule write requests. Can be left blank to disable basic auth +basic_auth_username = + +# Optional password for basic authentication on recording rule write requests. Can be left blank. +basic_auth_password = + +# Request timeout for recording rule writes. +timeout = 30s + +# Optional custom headers to include in recording rule write requests. +[recording_rules.custom_headers] +# exampleHeader = exampleValue + #################################### Annotations ######################### [annotations] # Configures the batch size for the annotation clean-up job. This setting is used for dashboard, API, and alert annotations. 
diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go index 21c38405734..5f87cafc251 100644 --- a/pkg/services/ngalert/ngalert.go +++ b/pkg/services/ngalert/ngalert.go @@ -332,6 +332,12 @@ func (ng *AlertNG) init() error { ng.AlertsRouter = alertsRouter evalFactory := eval.NewEvaluatorFactory(ng.Cfg.UnifiedAlerting, ng.DataSourceCache, ng.ExpressionService, ng.pluginsStore) + + recordingWriter, err := createRecordingWriter(ng.FeatureToggles, ng.Cfg.UnifiedAlerting.RecordingRules) + if err != nil { + return err + } + schedCfg := schedule.SchedulerCfg{ MaxAttempts: ng.Cfg.UnifiedAlerting.MaxAttempts, C: clk, @@ -347,7 +353,7 @@ func (ng *AlertNG) init() error { AlertSender: alertsRouter, Tracer: ng.tracer, Log: log.New("ngalert.scheduler"), - RecordingWriter: writer.NewPrometheusWriter(log.New("ngalert.recording.writer")), + RecordingWriter: recordingWriter, } // There are a set of feature toggles available that act as short-circuits for common configurations. @@ -624,3 +630,13 @@ func ApplyStateHistoryFeatureToggles(cfg *setting.UnifiedAlertingStateHistorySet func createRemoteAlertmanager(cfg remote.AlertmanagerConfig, kvstore kvstore.KVStore, decryptFn remote.DecryptFn, autogenFn remote.AutogenFn, m *metrics.RemoteAlertmanager) (*remote.Alertmanager, error) { return remote.NewAlertmanager(cfg, notifier.NewFileStore(cfg.OrgID, kvstore), decryptFn, autogenFn, m) } + +func createRecordingWriter(featureToggles featuremgmt.FeatureToggles, settings setting.RecordingRuleSettings) (schedule.RecordingWriter, error) { + logger := log.New("ngalert.writer") + + if featureToggles.IsEnabledGlobally(featuremgmt.FlagGrafanaManagedRecordingRules) { + return writer.NewPrometheusWriter(settings, logger) + } + + return writer.NoopWriter{}, nil +} diff --git a/pkg/services/ngalert/schedule/alert_rule.go b/pkg/services/ngalert/schedule/alert_rule.go index 99757cc222e..6c3d58ec3e8 100644 --- a/pkg/services/ngalert/schedule/alert_rule.go +++ 
b/pkg/services/ngalert/schedule/alert_rule.go @@ -16,7 +16,6 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/state" - "github.com/grafana/grafana/pkg/services/ngalert/writer" "github.com/grafana/grafana/pkg/services/org" "github.com/grafana/grafana/pkg/services/user" "github.com/grafana/grafana/pkg/util" @@ -57,7 +56,7 @@ func newRuleFactory( met *metrics.Scheduler, logger log.Logger, tracer tracing.Tracer, - recordingWriter writer.Writer, + recordingWriter RecordingWriter, evalAppliedHook evalAppliedFunc, stopAppliedHook stopAppliedFunc, ) ruleFactoryFunc { diff --git a/pkg/services/ngalert/schedule/recording_rule.go b/pkg/services/ngalert/schedule/recording_rule.go index b84a98b6314..a30bef263d8 100644 --- a/pkg/services/ngalert/schedule/recording_rule.go +++ b/pkg/services/ngalert/schedule/recording_rule.go @@ -14,7 +14,6 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/eval" "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" - "github.com/grafana/grafana/pkg/services/ngalert/writer" "github.com/grafana/grafana/pkg/util" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/codes" @@ -39,10 +38,10 @@ type recordingRule struct { metrics *metrics.Scheduler tracer tracing.Tracer - writer writer.Writer + writer RecordingWriter } -func newRecordingRule(parent context.Context, maxAttempts int64, clock clock.Clock, evalFactory eval.EvaluatorFactory, ft featuremgmt.FeatureToggles, logger log.Logger, metrics *metrics.Scheduler, tracer tracing.Tracer, writer writer.Writer) *recordingRule { +func newRecordingRule(parent context.Context, maxAttempts int64, clock clock.Clock, evalFactory eval.EvaluatorFactory, ft featuremgmt.FeatureToggles, logger log.Logger, metrics *metrics.Scheduler, tracer tracing.Tracer, writer RecordingWriter) *recordingRule 
{ ctx, stop := util.WithCancelCause(parent) return &recordingRule{ ctx: ctx, diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go index 5574e52d715..c62c26ecea3 100644 --- a/pkg/services/ngalert/schedule/schedule.go +++ b/pkg/services/ngalert/schedule/schedule.go @@ -11,6 +11,7 @@ import ( "github.com/benbjohnson/clock" "golang.org/x/sync/errgroup" + "github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/services/featuremgmt" @@ -19,7 +20,6 @@ import ( "github.com/grafana/grafana/pkg/services/ngalert/metrics" ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/state" - "github.com/grafana/grafana/pkg/services/ngalert/writer" "github.com/grafana/grafana/pkg/util/ticker" ) @@ -47,6 +47,10 @@ type RulesStore interface { GetAlertRulesForScheduling(ctx context.Context, query *ngmodels.GetAlertRulesForSchedulingQuery) error } +type RecordingWriter interface { + Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error +} + type schedule struct { // base tick rate (fastest possible configured check) baseInterval time.Duration @@ -94,7 +98,7 @@ type schedule struct { tracer tracing.Tracer - recordingWriter writer.Writer + recordingWriter RecordingWriter } // SchedulerCfg is the scheduler configuration. @@ -113,7 +117,7 @@ type SchedulerCfg struct { AlertSender AlertsSender Tracer tracing.Tracer Log log.Logger - RecordingWriter writer.Writer + RecordingWriter RecordingWriter } // NewScheduler returns a new scheduler. 
diff --git a/pkg/services/ngalert/writer/noop.go b/pkg/services/ngalert/writer/noop.go new file mode 100644 index 00000000000..3e434fdf76e --- /dev/null +++ b/pkg/services/ngalert/writer/noop.go @@ -0,0 +1,14 @@ +package writer + +import ( + "context" + "time" + + "github.com/grafana/grafana-plugin-sdk-go/data" +) + +type NoopWriter struct{} + +func (w NoopWriter) Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error { + return nil +} diff --git a/pkg/services/ngalert/writer/prom.go b/pkg/services/ngalert/writer/prom.go index 752d93cd94a..c41d8b4e40b 100644 --- a/pkg/services/ngalert/writer/prom.go +++ b/pkg/services/ngalert/writer/prom.go @@ -7,14 +7,11 @@ import ( "github.com/grafana/dataplane/sdata/numeric" "github.com/grafana/grafana/pkg/infra/log" + "github.com/grafana/grafana/pkg/setting" "github.com/grafana/grafana-plugin-sdk-go/data" ) -type PrometheusWriter struct { - logger log.Logger -} - // Metric represents a Prometheus time series metric. type Metric struct { T int64 @@ -28,27 +25,6 @@ type Point struct { Metric Metric } -func NewPrometheusWriter(l log.Logger) *PrometheusWriter { - return &PrometheusWriter{ - logger: l, - } -} - -// Write writes the given frames to the Prometheus remote write endpoint. -// TODO: stub implementation, does not make any remote write calls. 
-func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error { - l := w.logger.FromContext(ctx) - - points, err := PointsFromFrames(name, t, frames, extraLabels) - if err != nil { - return err - } - - // TODO: placeholder for actual remote write call - l.Debug("writing points", "points", points) - return nil -} - func PointsFromFrames(name string, t time.Time, frames data.Frames, extraLabels map[string]string) ([]Point, error) { cr, err := numeric.CollectionReaderFromFrames(frames) if err != nil { @@ -94,3 +70,31 @@ func PointsFromFrames(name string, t time.Time, frames data.Frames, extraLabels return points, nil } + +type PrometheusWriter struct { + logger log.Logger +} + +func NewPrometheusWriter( + settings setting.RecordingRuleSettings, + l log.Logger, +) (*PrometheusWriter, error) { + return &PrometheusWriter{ + logger: l, + }, nil +} + +// Write writes the given frames to the Prometheus remote write endpoint. +// TODO: stub implementation, does not make any remote write calls. 
+func (w PrometheusWriter) Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error { + l := w.logger.FromContext(ctx) + + points, err := PointsFromFrames(name, t, frames, extraLabels) + if err != nil { + return err + } + + // TODO: placeholder for actual remote write call + l.Debug("writing points", "points", points) + return nil +} diff --git a/pkg/services/ngalert/writer/writer.go b/pkg/services/ngalert/writer/writer.go deleted file mode 100644 index 8c52603dd12..00000000000 --- a/pkg/services/ngalert/writer/writer.go +++ /dev/null @@ -1,12 +0,0 @@ -package writer - -import ( - "context" - "time" - - "github.com/grafana/grafana-plugin-sdk-go/data" -) - -type Writer interface { - Write(ctx context.Context, name string, t time.Time, frames data.Frames, extraLabels map[string]string) error -} diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index 6536bd442f8..4cd4e9dde85 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -59,9 +59,10 @@ const ( // with intervals that are not exactly divided by this number not to be evaluated SchedulerBaseInterval = 10 * time.Second // DefaultRuleEvaluationInterval indicates a default interval of for how long a rule should be evaluated to change state from Pending to Alerting - DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds - stateHistoryDefaultEnabled = true - lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki + DefaultRuleEvaluationInterval = SchedulerBaseInterval * 6 // == 60 seconds + stateHistoryDefaultEnabled = true + lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki + defaultRecordingRequestTimeout = 10 * time.Second ) type UnifiedAlertingSettings struct { @@ -103,6 +104,8 @@ type UnifiedAlertingSettings struct { SkipClustering bool StateHistory 
UnifiedAlertingStateHistorySettings RemoteAlertmanager RemoteAlertmanagerSettings + RecordingRules RecordingRuleSettings + // MaxStateSaveConcurrency controls the number of goroutines (per rule) that can save alert state in parallel. MaxStateSaveConcurrency int StatePeriodicSaveInterval time.Duration @@ -112,6 +115,14 @@ type UnifiedAlertingSettings struct { NotificationLogRetention time.Duration } +type RecordingRuleSettings struct { + URL string + BasicAuthUsername string + BasicAuthPassword string + CustomHeaders map[string]string + Timeout time.Duration +} + // RemoteAlertmanagerSettings contains the configuration needed // to disable the internal Alertmanager and use an external one instead. type RemoteAlertmanagerSettings struct { @@ -395,6 +406,23 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error { } uaCfg.StateHistory = uaCfgStateHistory + rr := iniFile.Section("recording_rules") + uaCfgRecordingRules := RecordingRuleSettings{ + URL: rr.Key("url").MustString(""), + BasicAuthUsername: rr.Key("basic_auth_username").MustString(""), + BasicAuthPassword: rr.Key("basic_auth_password").MustString(""), + Timeout: rr.Key("timeout").MustDuration(defaultRecordingRequestTimeout), + } + + rrHeaders := iniFile.Section("recording_rules.custom_headers") + rrHeadersKeys := rrHeaders.Keys() + uaCfgRecordingRules.CustomHeaders = make(map[string]string, len(rrHeadersKeys)) + for _, key := range rrHeadersKeys { + uaCfgRecordingRules.CustomHeaders[key.Name()] = key.Value() + } + + uaCfg.RecordingRules = uaCfgRecordingRules + uaCfg.MaxStateSaveConcurrency = ua.Key("max_state_save_concurrency").MustInt(1) uaCfg.StatePeriodicSaveInterval, err = gtime.ParseDuration(valueAsString(ua, "state_periodic_save_interval", (time.Minute * 5).String()))