grafana/pkg/services/ngalert/ngalert.go

553 lines
23 KiB
Go

package ngalert
import (
"context"
"fmt"
"net/url"
"time"
"github.com/benbjohnson/clock"
"github.com/prometheus/alertmanager/featurecontrol"
"github.com/prometheus/alertmanager/matchers/compat"
"golang.org/x/sync/errgroup"
"github.com/grafana/grafana/pkg/api/routing"
"github.com/grafana/grafana/pkg/bus"
"github.com/grafana/grafana/pkg/events"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/infra/db"
"github.com/grafana/grafana/pkg/infra/kvstore"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/accesscontrol"
"github.com/grafana/grafana/pkg/services/annotations"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/services/datasourceproxy"
"github.com/grafana/grafana/pkg/services/datasources"
"github.com/grafana/grafana/pkg/services/featuremgmt"
"github.com/grafana/grafana/pkg/services/folder"
ac "github.com/grafana/grafana/pkg/services/ngalert/accesscontrol"
"github.com/grafana/grafana/pkg/services/ngalert/api"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
"github.com/grafana/grafana/pkg/services/ngalert/image"
"github.com/grafana/grafana/pkg/services/ngalert/metrics"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/notifier"
"github.com/grafana/grafana/pkg/services/ngalert/provisioning"
"github.com/grafana/grafana/pkg/services/ngalert/remote"
"github.com/grafana/grafana/pkg/services/ngalert/schedule"
"github.com/grafana/grafana/pkg/services/ngalert/sender"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/state/historian"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/services/notifications"
"github.com/grafana/grafana/pkg/services/pluginsintegration/pluginstore"
"github.com/grafana/grafana/pkg/services/quota"
"github.com/grafana/grafana/pkg/services/rendering"
"github.com/grafana/grafana/pkg/services/secrets"
"github.com/grafana/grafana/pkg/setting"
)
func ProvideService(
cfg *setting.Cfg,
featureToggles featuremgmt.FeatureToggles,
dataSourceCache datasources.CacheService,
dataSourceService datasources.DataSourceService,
routeRegister routing.RouteRegister,
sqlStore db.DB,
kvStore kvstore.KVStore,
expressionService *expr.Service,
dataProxy *datasourceproxy.DataSourceProxyService,
quotaService quota.Service,
secretsService secrets.Service,
notificationService notifications.Service,
m *metrics.NGAlert,
folderService folder.Service,
ac accesscontrol.AccessControl,
dashboardService dashboards.DashboardService,
renderService rendering.Service,
bus bus.Bus,
accesscontrolService accesscontrol.Service,
annotationsRepo annotations.Repository,
pluginsStore pluginstore.Store,
tracer tracing.Tracer,
ruleStore *store.DBstore,
) (*AlertNG, error) {
ng := &AlertNG{
Cfg: cfg,
FeatureToggles: featureToggles,
DataSourceCache: dataSourceCache,
DataSourceService: dataSourceService,
RouteRegister: routeRegister,
SQLStore: sqlStore,
KVStore: kvStore,
ExpressionService: expressionService,
DataProxy: dataProxy,
QuotaService: quotaService,
SecretsService: secretsService,
Metrics: m,
Log: log.New("ngalert"),
NotificationService: notificationService,
folderService: folderService,
accesscontrol: ac,
dashboardService: dashboardService,
renderService: renderService,
bus: bus,
accesscontrolService: accesscontrolService,
annotationsRepo: annotationsRepo,
pluginsStore: pluginsStore,
tracer: tracer,
store: ruleStore,
}
if ng.IsDisabled() {
return ng, nil
}
if err := ng.init(); err != nil {
return nil, err
}
return ng, nil
}
// AlertNG is the service for evaluating the condition of an alert definition.
type AlertNG struct {
Cfg *setting.Cfg
FeatureToggles featuremgmt.FeatureToggles
DataSourceCache datasources.CacheService
DataSourceService datasources.DataSourceService
RouteRegister routing.RouteRegister
SQLStore db.DB
KVStore kvstore.KVStore
ExpressionService *expr.Service
DataProxy *datasourceproxy.DataSourceProxyService
QuotaService quota.Service
SecretsService secrets.Service
Metrics *metrics.NGAlert
NotificationService notifications.Service
Log log.Logger
renderService rendering.Service
ImageService image.ImageService
schedule schedule.ScheduleService
stateManager *state.Manager
folderService folder.Service
dashboardService dashboards.DashboardService
api *api.API
// Alerting notification services
MultiOrgAlertmanager *notifier.MultiOrgAlertmanager
AlertsRouter *sender.AlertsRouter
accesscontrol accesscontrol.AccessControl
accesscontrolService accesscontrol.Service
annotationsRepo annotations.Repository
store *store.DBstore
bus bus.Bus
pluginsStore pluginstore.Store
tracer tracing.Tracer
}
func (ng *AlertNG) init() error {
// AlertNG should be initialized before the cancellation deadline of initCtx
initCtx, cancelFunc := context.WithTimeout(context.Background(), 30*time.Second)
defer cancelFunc()
ng.store.Logger = ng.Log
// This initializes the compat package in fallback mode with logging. It parses first
// using the UTF-8 parser and then fallsback to the classic parser on error.
// UTF-8 is permitted in label names. This should be removed when the compat package
// is removed from Alertmanager.
compat.InitFromFlags(ng.Log, featurecontrol.NoopFlags{})
// If enabled, configure the remote Alertmanager.
// - If several toggles are enabled, the order of precedence is RemoteOnly, RemotePrimary, RemoteSecondary
// - If no toggles are enabled, we default to using only the internal Alertmanager
// We currently support only remote secondary mode, so in case other toggles are enabled we fall back to remote secondary.
var overrides []notifier.Option
moaLogger := log.New("ngalert.multiorg.alertmanager")
remoteOnly := ng.FeatureToggles.IsEnabled(initCtx, featuremgmt.FlagAlertmanagerRemoteOnly)
remotePrimary := ng.FeatureToggles.IsEnabled(initCtx, featuremgmt.FlagAlertmanagerRemotePrimary)
remoteSecondary := ng.FeatureToggles.IsEnabled(initCtx, featuremgmt.FlagAlertmanagerRemoteSecondary)
if ng.Cfg.UnifiedAlerting.RemoteAlertmanager.Enable {
switch {
case remoteOnly, remotePrimary:
ng.Log.Warn("Only remote secondary mode is supported at the moment, falling back to remote secondary")
fallthrough
case remoteSecondary:
ng.Log.Debug("Starting Grafana with remote secondary mode enabled")
m := ng.Metrics.GetRemoteAlertmanagerMetrics()
m.Info.WithLabelValues(metrics.ModeRemoteSecondary).Set(1)
// This function will be used by the MOA to create new Alertmanagers.
override := notifier.WithAlertmanagerOverride(func(factoryFn notifier.OrgAlertmanagerFactory) notifier.OrgAlertmanagerFactory {
return func(ctx context.Context, orgID int64) (notifier.Alertmanager, error) {
// Create internal Alertmanager.
internalAM, err := factoryFn(ctx, orgID)
if err != nil {
return nil, err
}
// Create remote Alertmanager.
cfg := remote.AlertmanagerConfig{
BasicAuthPassword: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.Password,
DefaultConfig: ng.Cfg.UnifiedAlerting.DefaultConfiguration,
OrgID: orgID,
TenantID: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.TenantID,
URL: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.URL,
}
remoteAM, err := createRemoteAlertmanager(cfg, ng.KVStore, ng.SecretsService.Decrypt, m)
if err != nil {
moaLogger.Error("Failed to create remote Alertmanager, falling back to using only the internal one", "err", err)
return internalAM, nil
}
// Use both Alertmanager implementations in the forked Alertmanager.
rsCfg := remote.RemoteSecondaryConfig{
Logger: log.New("ngalert.forked-alertmanager.remote-secondary"),
OrgID: orgID,
Store: ng.store,
SyncInterval: ng.Cfg.UnifiedAlerting.RemoteAlertmanager.SyncInterval,
}
return remote.NewRemoteSecondaryForkedAlertmanager(rsCfg, internalAM, remoteAM)
}
})
overrides = append(overrides, override)
default:
ng.Log.Error("A mode should be selected when enabling the remote Alertmanager, falling back to using only the internal Alertmanager")
}
}
decryptFn := ng.SecretsService.GetDecryptedValue
multiOrgMetrics := ng.Metrics.GetMultiOrgAlertmanagerMetrics()
moa, err := notifier.NewMultiOrgAlertmanager(ng.Cfg, ng.store, ng.store, ng.KVStore, ng.store, decryptFn, multiOrgMetrics, ng.NotificationService, moaLogger, ng.SecretsService, ng.FeatureToggles, overrides...)
if err != nil {
return err
}
ng.MultiOrgAlertmanager = moa
imageService, err := image.NewScreenshotImageServiceFromCfg(ng.Cfg, ng.store, ng.dashboardService, ng.renderService, ng.Metrics.Registerer)
if err != nil {
return err
}
ng.ImageService = imageService
// Let's make sure we're able to complete an initial sync of Alertmanagers before we start the alerting components.
if err := ng.MultiOrgAlertmanager.LoadAndSyncAlertmanagersForOrgs(initCtx); err != nil {
return fmt.Errorf("failed to initialize alerting because multiorg alertmanager manager failed to warm up: %w", err)
}
appUrl, err := url.Parse(ng.Cfg.AppURL)
if err != nil {
ng.Log.Error("Failed to parse application URL. Continue without it.", "error", err)
appUrl = nil
}
clk := clock.New()
alertsRouter := sender.NewAlertsRouter(ng.MultiOrgAlertmanager, ng.store, clk, appUrl, ng.Cfg.UnifiedAlerting.DisabledOrgs,
ng.Cfg.UnifiedAlerting.AdminConfigPollInterval, ng.DataSourceService, ng.SecretsService)
// Make sure we sync at least once as Grafana starts to get the router up and running before we start sending any alerts.
if err := alertsRouter.SyncAndApplyConfigFromDatabase(); err != nil {
return fmt.Errorf("failed to initialize alerting because alert notifications router failed to warm up: %w", err)
}
ng.AlertsRouter = alertsRouter
evalFactory := eval.NewEvaluatorFactory(ng.Cfg.UnifiedAlerting, ng.DataSourceCache, ng.ExpressionService, ng.pluginsStore)
schedCfg := schedule.SchedulerCfg{
MaxAttempts: ng.Cfg.UnifiedAlerting.MaxAttempts,
C: clk,
BaseInterval: ng.Cfg.UnifiedAlerting.BaseInterval,
MinRuleInterval: ng.Cfg.UnifiedAlerting.MinInterval,
DisableGrafanaFolder: ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel),
JitterEvaluations: schedule.JitterStrategyFrom(ng.Cfg.UnifiedAlerting, ng.FeatureToggles),
AppURL: appUrl,
EvaluatorFactory: evalFactory,
RuleStore: ng.store,
Metrics: ng.Metrics.GetSchedulerMetrics(),
AlertSender: alertsRouter,
Tracer: ng.tracer,
Log: log.New("ngalert.scheduler"),
}
// There are a set of feature toggles available that act as short-circuits for common configurations.
// If any are set, override the config accordingly.
ApplyStateHistoryFeatureToggles(&ng.Cfg.UnifiedAlerting.StateHistory, ng.FeatureToggles, ng.Log)
history, err := configureHistorianBackend(initCtx, ng.Cfg.UnifiedAlerting.StateHistory, ng.annotationsRepo, ng.dashboardService, ng.store, ng.Metrics.GetHistorianMetrics(), ng.Log)
if err != nil {
return err
}
cfg := state.ManagerCfg{
Metrics: ng.Metrics.GetStateMetrics(),
ExternalURL: appUrl,
DisableExecution: !ng.Cfg.UnifiedAlerting.ExecuteAlerts,
InstanceStore: ng.store,
Images: ng.ImageService,
Clock: clk,
Historian: history,
DoNotSaveNormalState: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoNormalState),
ApplyNoDataAndErrorToAllStates: ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingNoDataErrorExecution),
MaxStateSaveConcurrency: ng.Cfg.UnifiedAlerting.MaxStateSaveConcurrency,
RulesPerRuleGroupLimit: ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit,
Tracer: ng.tracer,
Log: log.New("ngalert.state.manager"),
}
logger := log.New("ngalert.state.manager.persist")
statePersister := state.NewSyncStatePersisiter(logger, cfg)
if ng.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertingSaveStatePeriodic) {
ticker := clock.New().Ticker(ng.Cfg.UnifiedAlerting.StatePeriodicSaveInterval)
statePersister = state.NewAsyncStatePersister(logger, ticker, cfg)
}
stateManager := state.NewManager(cfg, statePersister)
scheduler := schedule.NewScheduler(schedCfg, stateManager)
// if it is required to include folder title to the alerts, we need to subscribe to changes of alert title
if !ng.Cfg.UnifiedAlerting.ReservedLabels.IsReservedLabelDisabled(models.FolderTitleLabel) {
subscribeToFolderChanges(ng.Log, ng.bus, ng.store)
}
ng.stateManager = stateManager
ng.schedule = scheduler
receiverService := notifier.NewReceiverService(ng.accesscontrol, ng.store, ng.store, ng.SecretsService, ng.store, ng.Log)
// Provisioning
policyService := provisioning.NewNotificationPolicyService(ng.store, ng.store, ng.store, ng.Cfg.UnifiedAlerting, ng.Log)
contactPointService := provisioning.NewContactPointService(ng.store, ng.SecretsService, ng.store, ng.store, receiverService, ng.Log, ng.store)
templateService := provisioning.NewTemplateService(ng.store, ng.store, ng.store, ng.Log)
muteTimingService := provisioning.NewMuteTimingService(ng.store, ng.store, ng.store, ng.Log)
alertRuleService := provisioning.NewAlertRuleService(ng.store, ng.store, ng.folderService, ng.dashboardService, ng.QuotaService, ng.store,
int64(ng.Cfg.UnifiedAlerting.DefaultRuleEvaluationInterval.Seconds()),
int64(ng.Cfg.UnifiedAlerting.BaseInterval.Seconds()),
ng.Cfg.UnifiedAlerting.RulesPerRuleGroupLimit, ng.Log, notifier.NewNotificationSettingsValidationService(ng.store),
ac.NewRuleService(ng.accesscontrol))
ng.api = &api.API{
Cfg: ng.Cfg,
DatasourceCache: ng.DataSourceCache,
DatasourceService: ng.DataSourceService,
RouteRegister: ng.RouteRegister,
DataProxy: ng.DataProxy,
QuotaService: ng.QuotaService,
TransactionManager: ng.store,
RuleStore: ng.store,
AlertingStore: ng.store,
AdminConfigStore: ng.store,
ProvenanceStore: ng.store,
MultiOrgAlertmanager: ng.MultiOrgAlertmanager,
StateManager: ng.stateManager,
AccessControl: ng.accesscontrol,
Policies: policyService,
ReceiverService: receiverService,
ContactPointService: contactPointService,
Templates: templateService,
MuteTimings: muteTimingService,
AlertRules: alertRuleService,
AlertsRouter: alertsRouter,
EvaluatorFactory: evalFactory,
FeatureManager: ng.FeatureToggles,
AppUrl: appUrl,
Historian: history,
Hooks: api.NewHooks(ng.Log),
Tracer: ng.tracer,
}
ng.api.RegisterAPIEndpoints(ng.Metrics.GetAPIMetrics())
if err := RegisterQuotas(ng.Cfg, ng.QuotaService, ng.store); err != nil {
return err
}
log.RegisterContextualLogProvider(func(ctx context.Context) ([]interface{}, bool) {
key, ok := models.RuleKeyFromContext(ctx)
if !ok {
return nil, false
}
return key.LogContext(), true
})
return DeclareFixedRoles(ng.accesscontrolService)
}
func subscribeToFolderChanges(logger log.Logger, bus bus.Bus, dbStore api.RuleStore) {
// if folder title is changed, we update all alert rules in that folder to make sure that all peers (in HA mode) will update folder title and
// clean up the current state
bus.AddEventListener(func(ctx context.Context, evt *events.FolderTitleUpdated) error {
logger.Info("Got folder title updated event. updating rules in the folder", "folderUID", evt.UID)
_, err := dbStore.IncreaseVersionForAllRulesInNamespace(ctx, evt.OrgID, evt.UID)
if err != nil {
logger.Error("Failed to update alert rules in the folder after its title was changed", "error", err, "folderUID", evt.UID, "folder", evt.Title)
return err
}
return nil
})
}
// Run starts the scheduler and Alertmanager.
func (ng *AlertNG) Run(ctx context.Context) error {
ng.Log.Debug("Starting", "execute_alerts", ng.Cfg.UnifiedAlerting.ExecuteAlerts)
children, subCtx := errgroup.WithContext(ctx)
children.Go(func() error {
return ng.MultiOrgAlertmanager.Run(subCtx)
})
children.Go(func() error {
return ng.AlertsRouter.Run(subCtx)
})
if ng.Cfg.UnifiedAlerting.ExecuteAlerts {
// Only Warm() the state manager if we are actually executing alerts.
// Doing so when we are not executing alerts is wasteful and could lead
// to misleading rule status queries, as the status returned will be
// always based on the state loaded from the database at startup, and
// not the most recent evaluation state.
//
// Also note that this runs synchronously to ensure state is loaded
// before rule evaluation begins, hence we use ctx and not subCtx.
//
ng.stateManager.Warm(ctx, ng.store)
children.Go(func() error {
return ng.schedule.Run(subCtx)
})
children.Go(func() error {
return ng.stateManager.Run(subCtx)
})
}
return children.Wait()
}
// IsDisabled returns true if the alerting service is disabled for this instance.
func (ng *AlertNG) IsDisabled() bool {
if ng.Cfg == nil {
return true
}
return !ng.Cfg.UnifiedAlerting.IsEnabled()
}
// GetHooks returns a facility for replacing handlers for paths. The handler hook for a path
// is invoked after all other middleware is invoked (authentication, instrumentation).
func (ng *AlertNG) GetHooks() *api.Hooks {
return ng.api.Hooks
}
type Historian interface {
api.Historian
state.Historian
}
func configureHistorianBackend(ctx context.Context, cfg setting.UnifiedAlertingStateHistorySettings, ar annotations.Repository, ds dashboards.DashboardService, rs historian.RuleStore, met *metrics.Historian, l log.Logger) (Historian, error) {
if !cfg.Enabled {
met.Info.WithLabelValues("noop").Set(0)
return historian.NewNopHistorian(), nil
}
backend, err := historian.ParseBackendType(cfg.Backend)
if err != nil {
return nil, err
}
met.Info.WithLabelValues(backend.String()).Set(1)
if backend == historian.BackendTypeMultiple {
primaryCfg := cfg
primaryCfg.Backend = cfg.MultiPrimary
primary, err := configureHistorianBackend(ctx, primaryCfg, ar, ds, rs, met, l)
if err != nil {
return nil, fmt.Errorf("multi-backend target \"%s\" was misconfigured: %w", cfg.MultiPrimary, err)
}
var secondaries []historian.Backend
for _, b := range cfg.MultiSecondaries {
secCfg := cfg
secCfg.Backend = b
sec, err := configureHistorianBackend(ctx, secCfg, ar, ds, rs, met, l)
if err != nil {
return nil, fmt.Errorf("multi-backend target \"%s\" was miconfigured: %w", b, err)
}
secondaries = append(secondaries, sec)
}
l.Info("State history is operating in multi-backend mode", "primary", cfg.MultiPrimary, "secondaries", cfg.MultiSecondaries)
return historian.NewMultipleBackend(primary, secondaries...), nil
}
if backend == historian.BackendTypeAnnotations {
store := historian.NewAnnotationStore(ar, ds, met)
annotationBackendLogger := log.New("ngalert.state.historian", "backend", "annotations")
return historian.NewAnnotationBackend(annotationBackendLogger, store, rs, met), nil
}
if backend == historian.BackendTypeLoki {
lcfg, err := historian.NewLokiConfig(cfg)
if err != nil {
return nil, fmt.Errorf("invalid remote loki configuration: %w", err)
}
req := historian.NewRequester()
lokiBackendLogger := log.New("ngalert.state.historian", "backend", "loki")
backend := historian.NewRemoteLokiBackend(lokiBackendLogger, lcfg, req, met)
testConnCtx, cancelFunc := context.WithTimeout(ctx, 10*time.Second)
defer cancelFunc()
if err := backend.TestConnection(testConnCtx); err != nil {
l.Error("Failed to communicate with configured remote Loki backend, state history may not be persisted", "error", err)
}
return backend, nil
}
return nil, fmt.Errorf("unrecognized state history backend: %s", backend)
}
// ApplyStateHistoryFeatureToggles edits state history configuration to comply with currently active feature toggles.
func ApplyStateHistoryFeatureToggles(cfg *setting.UnifiedAlertingStateHistorySettings, ft featuremgmt.FeatureToggles, logger log.Logger) {
backend, _ := historian.ParseBackendType(cfg.Backend)
// These feature toggles represent specific, common backend configurations.
// If all toggles are enabled, we listen to the state history config as written.
// If any of them are disabled, we ignore the configured backend and treat the toggles as an override.
// If multiple toggles are disabled, we go with the most "restrictive" one.
if !ft.IsEnabledGlobally(featuremgmt.FlagAlertStateHistoryLokiSecondary) {
// If we cannot even treat Loki as a secondary, we must use annotations only.
if backend == historian.BackendTypeMultiple || backend == historian.BackendTypeLoki {
logger.Info("Forcing Annotation backend due to state history feature toggles")
cfg.Backend = historian.BackendTypeAnnotations.String()
cfg.MultiPrimary = ""
cfg.MultiSecondaries = make([]string, 0)
}
return
}
if !ft.IsEnabledGlobally(featuremgmt.FlagAlertStateHistoryLokiPrimary) {
// If we're using multiple backends, Loki must be the secondary.
if backend == historian.BackendTypeMultiple {
logger.Info("Coercing Loki to a secondary backend due to state history feature toggles")
cfg.MultiPrimary = historian.BackendTypeAnnotations.String()
cfg.MultiSecondaries = []string{historian.BackendTypeLoki.String()}
}
// If we're using loki, we are only allowed to use it as a secondary. Dual write to it, plus annotations.
if backend == historian.BackendTypeLoki {
logger.Info("Coercing Loki to dual writes with a secondary backend due to state history feature toggles")
cfg.Backend = historian.BackendTypeMultiple.String()
cfg.MultiPrimary = historian.BackendTypeAnnotations.String()
cfg.MultiSecondaries = []string{historian.BackendTypeLoki.String()}
}
return
}
if !ft.IsEnabledGlobally(featuremgmt.FlagAlertStateHistoryLokiOnly) {
// If we're not allowed to use Loki only, make it the primary but keep the annotation writes.
if backend == historian.BackendTypeLoki {
logger.Info("Forcing dual writes to Loki and Annotations due to state history feature toggles")
cfg.Backend = historian.BackendTypeMultiple.String()
cfg.MultiPrimary = historian.BackendTypeLoki.String()
cfg.MultiSecondaries = []string{historian.BackendTypeAnnotations.String()}
}
return
}
}
func createRemoteAlertmanager(cfg remote.AlertmanagerConfig, kvstore kvstore.KVStore, decryptFn remote.DecryptFn, m *metrics.RemoteAlertmanager) (*remote.Alertmanager, error) {
return remote.NewAlertmanager(cfg, notifier.NewFileStore(cfg.OrgID, kvstore), decryptFn, m)
}