grafana/pkg/services/ngalert/migration/persist.go
Matthew Jacobson 46a77c0074
Alerting: Validate upgraded receivers early to display in preview (#82956)
Previously receivers were only validated before saving the alertmanager
configuration. This is a suboptimal experience for those upgrading with preview
as the failed channel upgrade will return an API error instead of being
summarized in the table.
2024-02-16 15:17:07 -05:00

597 lines
21 KiB
Go

package migration
import (
"context"
"encoding/json"
"errors"
"fmt"
"sort"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/services/accesscontrol"
legacymodels "github.com/grafana/grafana/pkg/services/alerting/models"
"github.com/grafana/grafana/pkg/services/folder"
apiModels "github.com/grafana/grafana/pkg/services/ngalert/api/tooling/definitions"
migmodels "github.com/grafana/grafana/pkg/services/ngalert/migration/models"
migrationStore "github.com/grafana/grafana/pkg/services/ngalert/migration/store"
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/store"
)
// sync is a helper struct for persisting migration changes to the database.
type sync struct {
log log.Logger
orgID int64
migrationStore migrationStore.Store
getDecryptedValue func(ctx context.Context, sjd map[string][]byte, key, fallback string) string
channelCache *ChannelCache
// Caches used during custom folder creation.
permissionsMap map[int64]map[permissionHash]*folder.Folder // Parent Folder ID -> unique dashboard permission -> custom folder.
folderCache map[int64]*folder.Folder // Folder ID -> Folder.
folderPermissionCache map[string][]accesscontrol.ResourcePermission // Folder UID -> Folder Permissions.
generalAlertingFolder *folder.Folder
}
// newSync creates a new migrationService for the given orgID.
func (ms *migrationService) newSync(orgID int64) *sync {
return &sync{
orgID: orgID,
log: ms.log.New("orgID", orgID),
migrationStore: ms.migrationStore,
getDecryptedValue: func(ctx context.Context, sjd map[string][]byte, key, fallback string) string {
return ms.encryptionService.GetDecryptedValue(ctx, sjd, key, fallback)
},
channelCache: ms.newChannelCache(orgID),
permissionsMap: make(map[int64]map[permissionHash]*folder.Folder),
folderCache: make(map[int64]*folder.Folder),
folderPermissionCache: make(map[string][]accesscontrol.ResourcePermission),
}
}
// syncAndSaveState persists the given dashboardUpgrades and contactPairs to the database.
func (sync *sync) syncAndSaveState(
ctx context.Context,
dashboardUpgrades []*migmodels.DashboardUpgrade,
contactPairs []*migmodels.ContactPair,
skipExisting bool,
) (apiModels.OrgMigrationSummary, error) {
oldState, err := sync.migrationStore.GetOrgMigrationState(ctx, sync.orgID)
if err != nil {
return apiModels.OrgMigrationSummary{}, fmt.Errorf("get state: %w", err)
}
delta := createDelta(oldState, dashboardUpgrades, contactPairs, skipExisting)
summary, err := sync.syncDelta(ctx, oldState, delta)
if err != nil {
return apiModels.OrgMigrationSummary{}, fmt.Errorf("sync state: %w", err)
}
err = sync.migrationStore.SetOrgMigrationState(ctx, sync.orgID, oldState)
if err != nil {
return apiModels.OrgMigrationSummary{}, fmt.Errorf("save state: %w", err)
}
return summary, nil
}
// StateDelta contains the changes to be made to the database based on the difference between
// existing migration state and new migration state.
type StateDelta struct {
AlertsToAdd []*migmodels.AlertPair
AlertsToDelete []*migmodels.AlertPair
DashboardsToAdd []*migmodels.DashboardUpgrade
DashboardsToDelete []*migrationStore.DashboardUpgrade
ChannelsToAdd []*migmodels.ContactPair
ChannelsToDelete []*migrationStore.ContactPair
}
// createDelta creates a StateDelta based on the difference between oldState and the new dashboardUpgrades and contactPairs.
// If skipExisting is true, existing alerts in each dashboard as well as existing channels will be not be deleted.
// If skipExisting is false, each given dashboard will be entirely replaced with the new state and all channels will be replaced.
func createDelta(
oldState *migrationStore.OrgMigrationState,
dashboardUpgrades []*migmodels.DashboardUpgrade,
contactPairs []*migmodels.ContactPair,
skipExisting bool,
) StateDelta {
delta := StateDelta{}
for _, du := range dashboardUpgrades {
oldDu, ok := oldState.MigratedDashboards[du.ID]
if !ok {
// Old state doesn't contain this dashboard, so add all alerts.
delta.DashboardsToAdd = append(delta.DashboardsToAdd, du)
continue
}
if !skipExisting {
delta.DashboardsToDelete = append(delta.DashboardsToDelete, oldDu)
delta.DashboardsToAdd = append(delta.DashboardsToAdd, du)
continue
}
for _, pair := range du.MigratedAlerts {
if _, ok := oldDu.MigratedAlerts[pair.LegacyRule.PanelID]; !ok {
// Only add alerts that don't already exist.
delta.AlertsToAdd = append(delta.AlertsToAdd, pair)
}
}
}
for _, newPair := range contactPairs {
oldPair, ok := oldState.MigratedChannels[newPair.Channel.ID]
if !ok || oldPair.NewReceiverUID == "" {
// Old state doesn't contain this channel, or this channel is a stub. We create a new contact point and route for it.
delta.ChannelsToAdd = append(delta.ChannelsToAdd, newPair)
continue
}
// Old state contains this channel, so it's either a re-migrated channel or a deleted channel.
if skipExisting {
// We're only migrating new channels, so we skip this one.
continue
}
delta.ChannelsToDelete = append(delta.ChannelsToDelete, oldPair)
delta.ChannelsToAdd = append(delta.ChannelsToAdd, newPair)
}
return delta
}
func summaryFromDelta(delta StateDelta) apiModels.OrgMigrationSummary {
summary := apiModels.OrgMigrationSummary{
NewDashboards: len(delta.DashboardsToAdd),
NewChannels: len(delta.ChannelsToAdd),
HasErrors: hasErrors(delta),
}
for _, du := range delta.DashboardsToAdd {
summary.NewAlerts += len(du.MigratedAlerts)
}
return summary
}
func hasErrors(delta StateDelta) bool {
for _, du := range delta.DashboardsToAdd {
for _, pair := range du.MigratedAlerts {
if pair.Error != nil {
return true
}
}
}
for _, pair := range delta.AlertsToAdd {
if pair.Error != nil {
return true
}
}
for _, pair := range delta.ChannelsToAdd {
if pair.Error != nil {
return true
}
}
return false
}
// syncDelta persists the given delta to the state and database.
func (sync *sync) syncDelta(ctx context.Context, state *migrationStore.OrgMigrationState, delta StateDelta) (apiModels.OrgMigrationSummary, error) {
amConfig, err := sync.handleAlertmanager(ctx, state, delta)
if err != nil {
return apiModels.OrgMigrationSummary{}, err
}
err = sync.handleDeleteRules(ctx, state, delta)
if err != nil {
return apiModels.OrgMigrationSummary{}, err
}
err = sync.handleAddRules(ctx, state, delta, amConfig)
if err != nil {
return apiModels.OrgMigrationSummary{}, err
}
return summaryFromDelta(delta), nil
}
// handleAlertmanager persists the given channel delta to the state and database.
func (sync *sync) handleAlertmanager(ctx context.Context, state *migrationStore.OrgMigrationState, delta StateDelta) (*migmodels.Alertmanager, error) {
cfg, err := sync.migrationStore.GetAlertmanagerConfig(ctx, sync.orgID)
if err != nil {
return nil, fmt.Errorf("get alertmanager config: %w", err)
}
amConfig := migmodels.FromPostableUserConfig(cfg)
if len(delta.ChannelsToDelete) == 0 && len(delta.ChannelsToAdd) == 0 {
return amConfig, nil
}
// Get the reverse relationship between rules and channels, so we can update labels on rules that reference modified channels.
rulesWithChannels := make(map[int64][]string)
for _, du := range state.MigratedDashboards {
for _, pair := range du.MigratedAlerts {
if pair.NewRuleUID != "" {
for _, id := range pair.ChannelIDs {
rulesWithChannels[id] = append(rulesWithChannels[id], pair.NewRuleUID)
}
}
}
}
// Information tracked to facilitate alert rule contact point label updates.
ruleToAddLabels := make(map[string][]string)
ruleToRemoveLabels := make(map[string][]string)
for _, pair := range delta.ChannelsToDelete {
delete(state.MigratedChannels, pair.LegacyID)
if pair.NewReceiverUID != "" {
label := amConfig.GetContactLabel(pair.NewReceiverUID)
for _, uid := range rulesWithChannels[pair.LegacyID] {
ruleToRemoveLabels[uid] = append(ruleToRemoveLabels[uid], label)
}
// Remove receivers and routes for channels that are being replaced.
amConfig.RemoveContactPointsAndRoutes(pair.NewReceiverUID)
}
}
for _, pair := range delta.ChannelsToAdd {
state.MigratedChannels[pair.Channel.ID] = newContactPair(pair)
amConfig.AddReceiver(pair.ContactPoint)
amConfig.AddRoute(pair.Route)
if pair.ContactPoint != nil {
for _, uid := range rulesWithChannels[pair.Channel.ID] {
ruleToAddLabels[uid] = append(ruleToAddLabels[uid], contactLabel(pair.ContactPoint.Name))
}
}
}
config := migmodels.CleanAlertmanager(amConfig)
// Validate the alertmanager configuration produced, this gives a chance to catch bad configuration at migration time.
// Validation between legacy and unified alerting can be different (e.g. due to bug fixes) so this would fail the migration in that case.
if err := sync.validateAlertmanagerConfig(config); err != nil {
return nil, fmt.Errorf("validate AlertmanagerConfig: %w", err)
}
sync.log.FromContext(ctx).Info("Writing alertmanager config", "receivers", len(config.AlertmanagerConfig.Receivers), "routes", len(config.AlertmanagerConfig.Route.Routes))
if err := sync.migrationStore.SaveAlertmanagerConfiguration(ctx, sync.orgID, config); err != nil {
return nil, fmt.Errorf("write AlertmanagerConfig: %w", err)
}
// For the channels that have been changed, we need to update the labels on the alert rules that reference them.
err = sync.replaceLabels(ctx, ruleToAddLabels, ruleToRemoveLabels)
if err != nil {
return nil, fmt.Errorf("replace labels: %w", err)
}
return amConfig, nil
}
// replaceLabels replaces labels for the given alert rule UIDs.
func (sync *sync) replaceLabels(ctx context.Context, ruleToAddLabels map[string][]string, ruleToRemoveLabels map[string][]string) error {
var ruleUIDs []string
for uid := range ruleToAddLabels {
ruleUIDs = append(ruleUIDs, uid)
}
for uid := range ruleToRemoveLabels {
if _, ok := ruleToAddLabels[uid]; !ok {
ruleUIDs = append(ruleUIDs, uid)
}
}
ruleLabels, err := sync.migrationStore.GetRuleLabels(ctx, sync.orgID, ruleUIDs)
if err != nil {
return fmt.Errorf("get rule labels: %w", err)
}
for key, labels := range ruleLabels {
if labelsToRemove, ok := ruleToRemoveLabels[key.UID]; ok {
for _, label := range labelsToRemove {
delete(labels, label)
}
}
if labelsToAdd, ok := ruleToAddLabels[key.UID]; ok {
for _, label := range labelsToAdd {
labels[label] = "true"
}
}
err := sync.migrationStore.UpdateRuleLabels(ctx, key, labels)
if err != nil {
return fmt.Errorf("update rule labels: %w", err)
}
}
return nil
}
// handleDeleteRules persists the given delete rule delta to the state and database.
func (sync *sync) handleDeleteRules(ctx context.Context, state *migrationStore.OrgMigrationState, delta StateDelta) error {
if len(delta.AlertsToDelete) == 0 && len(delta.DashboardsToDelete) == 0 {
return nil
}
// First we delete alerts so that empty folders can be deleted.
uids := make([]string, 0, len(delta.AlertsToDelete))
for _, pair := range delta.AlertsToDelete {
du, ok := state.MigratedDashboards[pair.LegacyRule.DashboardID]
if !ok {
return fmt.Errorf("dashboard '%d' not found in state", pair.LegacyRule.DashboardID)
}
delete(du.MigratedAlerts, pair.LegacyRule.PanelID)
if pair.Rule != nil {
uids = append(uids, pair.Rule.UID)
}
}
for _, du := range delta.DashboardsToDelete {
delete(state.MigratedDashboards, du.DashboardID)
for _, pair := range du.MigratedAlerts {
if pair.NewRuleUID != "" {
uids = append(uids, pair.NewRuleUID)
}
}
}
if len(uids) > 0 {
err := sync.migrationStore.DeleteAlertRules(ctx, sync.orgID, uids...)
if err != nil {
return fmt.Errorf("delete alert rules: %w", err)
}
// Attempt to delete folders that might be empty.
if len(delta.DashboardsToDelete) > 0 {
createdbyMigration := make(map[string]struct{}, len(state.CreatedFolders))
for _, uid := range state.CreatedFolders {
createdbyMigration[uid] = struct{}{}
}
for _, du := range delta.DashboardsToDelete {
if _, ok := createdbyMigration[du.AlertFolderUID]; ok {
err := sync.migrationStore.DeleteFolders(ctx, sync.orgID, du.AlertFolderUID)
if err != nil {
if !errors.Is(err, migrationStore.ErrFolderNotDeleted) {
return fmt.Errorf("delete folder '%s': %w", du.AlertFolderUID, err)
}
sync.log.FromContext(ctx).Info("Failed to delete folder during cleanup", "error", err)
} else {
delete(createdbyMigration, du.AlertFolderUID)
}
}
}
state.CreatedFolders = make([]string, 0, len(createdbyMigration))
for uid := range createdbyMigration {
state.CreatedFolders = append(state.CreatedFolders, uid)
}
}
}
return nil
}
// handleAddRules persists the given add rule delta to the state and database.
func (sync *sync) handleAddRules(ctx context.Context, state *migrationStore.OrgMigrationState, delta StateDelta, amConfig *migmodels.Alertmanager) error {
pairs := make([]*migmodels.AlertPair, 0, len(delta.AlertsToAdd))
for _, pair := range delta.AlertsToAdd {
du, ok := state.MigratedDashboards[pair.LegacyRule.DashboardID]
if !ok {
return fmt.Errorf("dashboard '%d' not found in state", pair.LegacyRule.DashboardID)
}
if pair.Rule != nil {
// These individually added alerts are created in the same folder as the existing ones.
pair.Rule.NamespaceUID = du.AlertFolderUID
pairs = append(pairs, pair)
}
du.MigratedAlerts[pair.LegacyRule.PanelID] = newAlertPair(pair)
}
if len(delta.DashboardsToAdd) > 0 {
createdFolderUIDs := make(map[string]struct{}, len(state.CreatedFolders))
for _, uid := range state.CreatedFolders {
createdFolderUIDs[uid] = struct{}{}
}
for _, duToAdd := range delta.DashboardsToAdd {
du := &migrationStore.DashboardUpgrade{
DashboardID: duToAdd.ID,
MigratedAlerts: make(map[int64]*migrationStore.AlertPair, len(duToAdd.MigratedAlerts)),
}
pairsWithRules := make([]*migmodels.AlertPair, 0, len(duToAdd.MigratedAlerts))
for _, pair := range duToAdd.MigratedAlerts {
du.MigratedAlerts[pair.LegacyRule.PanelID] = newAlertPair(pair)
if pair.Rule != nil {
pairsWithRules = append(pairsWithRules, pair)
}
}
if len(pairsWithRules) > 0 {
l := sync.log.FromContext(ctx).New("dashboardTitle", duToAdd.Title, "dashboardUid", duToAdd.UID)
migratedFolder, err := sync.migratedFolder(ctx, l, duToAdd.UID, duToAdd.FolderID)
if err != nil {
return err
}
du.AlertFolderUID = migratedFolder.uid
du.Warning = migratedFolder.warning
// Keep track of folders created by the migration.
if _, exists := createdFolderUIDs[migratedFolder.uid]; migratedFolder.created && !exists {
createdFolderUIDs[migratedFolder.uid] = struct{}{}
state.CreatedFolders = append(state.CreatedFolders, migratedFolder.uid)
}
for _, pair := range pairsWithRules {
pair.Rule.NamespaceUID = migratedFolder.uid
pairs = append(pairs, pair)
}
}
state.MigratedDashboards[du.DashboardID] = du
}
}
if len(pairs) > 0 {
sync.log.FromContext(ctx).Debug("Inserting migrated alert rules", "count", len(pairs))
// We ensure consistency in title deduplication as well as insertions by sorting pairs first.
sort.SliceStable(pairs, func(i, j int) bool {
return pairs[i].LegacyRule.ID < pairs[j].LegacyRule.ID
})
err := sync.deduplicateTitles(ctx, pairs)
if err != nil {
return fmt.Errorf("deduplicate titles: %w", err)
}
rules, err := sync.attachContactPointLabels(ctx, state, pairs, amConfig)
if err != nil {
return fmt.Errorf("attach contact point labels: %w", err)
}
err = sync.migrationStore.InsertAlertRules(ctx, rules...)
if err != nil {
return fmt.Errorf("insert alert rules: %w", err)
}
}
return nil
}
// deduplicateTitles ensures that the alert rule titles are unique within the folder.
func (sync *sync) deduplicateTitles(ctx context.Context, pairs []*migmodels.AlertPair) error {
// First pass to find namespaces.
seen := make(map[string]struct{})
namespaces := make([]string, 0)
for _, pair := range pairs {
if _, ok := seen[pair.Rule.NamespaceUID]; !ok {
namespaces = append(namespaces, pair.Rule.NamespaceUID)
seen[pair.Rule.NamespaceUID] = struct{}{}
}
}
// Populate deduplicators from database.
titles, err := sync.migrationStore.GetAlertRuleTitles(ctx, sync.orgID, namespaces...)
if err != nil {
sync.log.FromContext(ctx).Warn("Failed to get alert rule titles for title deduplication", "error", err)
}
titleDedups := make(map[string]*migmodels.Deduplicator, len(namespaces))
for _, ns := range namespaces {
titleDedups[ns] = migmodels.NewDeduplicator(sync.migrationStore.CaseInsensitive(), store.AlertDefinitionMaxTitleLength, titles[ns]...)
}
for _, pair := range pairs {
l := sync.log.FromContext(ctx).New("legacyRuleId", pair.LegacyRule.ID, "ruleUid", pair.Rule.UID)
// Here we ensure that the alert rule title is unique within the folder.
titleDeduplicator := titleDedups[pair.Rule.NamespaceUID]
name, err := titleDeduplicator.Deduplicate(pair.Rule.Title)
if err != nil {
return err
}
if name != pair.Rule.Title {
l.Info("Alert rule title modified to be unique within folder", "old", pair.Rule.Title, "new", name)
pair.Rule.Title = name
}
}
return nil
}
// attachContactPointLabels attaches contact point labels to the given alert rules.
func (sync *sync) attachContactPointLabels(ctx context.Context, state *migrationStore.OrgMigrationState, pairs []*migmodels.AlertPair, amConfig *migmodels.Alertmanager) ([]models.AlertRule, error) {
rules := make([]models.AlertRule, 0, len(pairs))
for _, pair := range pairs {
l := sync.log.FromContext(ctx).New("legacyRuleId", pair.LegacyRule.ID, "ruleUid", pair.Rule.UID)
alertChannels, err := sync.extractChannels(ctx, pair.LegacyRule)
if err != nil {
return nil, fmt.Errorf("extract channel IDs: %w", err)
}
statePair := state.MigratedDashboards[pair.LegacyRule.DashboardID].MigratedAlerts[pair.LegacyRule.PanelID]
statePair.ChannelIDs = make([]int64, 0, len(alertChannels))
for _, c := range alertChannels {
statePair.ChannelIDs = append(statePair.ChannelIDs, c.ID)
channelPair, ok := state.MigratedChannels[c.ID]
if ok {
label := amConfig.GetContactLabel(channelPair.NewReceiverUID)
if label != "" {
pair.Rule.Labels[label] = "true"
}
} else {
l.Warn("Failed to find migrated channel", "channel", c.Name)
// Creating stub so that when we eventually migrate the channel, we can update the labels on this rule.
state.MigratedChannels[c.ID] = &migrationStore.ContactPair{LegacyID: c.ID, Error: "channel not upgraded"}
}
}
rules = append(rules, *pair.Rule)
}
return rules, nil
}
// extractChannels extracts notification channels from the given legacy dashboard alert parsed settings.
func (sync *sync) extractChannels(ctx context.Context, alert *legacymodels.Alert) ([]*legacymodels.AlertNotification, error) {
l := sync.log.FromContext(ctx).New("ruleId", alert.ID, "ruleName", alert.Name)
rawSettings, err := json.Marshal(alert.Settings)
if err != nil {
return nil, fmt.Errorf("get settings: %w", err)
}
var parsedSettings dashAlertSettings
err = json.Unmarshal(rawSettings, &parsedSettings)
if err != nil {
return nil, fmt.Errorf("parse settings: %w", err)
}
// Extracting channels.
channels := make([]*legacymodels.AlertNotification, 0, len(parsedSettings.Notifications))
for _, key := range parsedSettings.Notifications {
// Either id or uid can be defined in the dashboard alert notification settings. See alerting.NewRuleFromDBAlert.
if key.ID == 0 && key.UID == "" {
l.Warn("Alert notification has no ID or UID, skipping", "notificationKey", key)
continue
}
if c, err := sync.channelCache.Get(ctx, key); err != nil {
return nil, fmt.Errorf("get alert notification: %w", err)
} else if c != nil {
channels = append(channels, c)
continue
}
l.Warn("Failed to get alert notification, skipping", "notificationKey", key)
}
return channels, nil
}
// validateAlertmanagerConfig validates the alertmanager configuration produced by the migration against the receivers.
func (sync *sync) validateAlertmanagerConfig(config *apiModels.PostableUserConfig) error {
for _, r := range config.AlertmanagerConfig.Receivers {
for _, gr := range r.GrafanaManagedReceivers {
err := validateReceiver(gr, sync.getDecryptedValue)
if err != nil {
return err
}
}
}
return nil
}
func newContactPair(pair *migmodels.ContactPair) *migrationStore.ContactPair {
p := &migrationStore.ContactPair{
LegacyID: pair.Channel.ID,
}
if pair.Error != nil {
p.Error = pair.Error.Error()
}
if pair.ContactPoint != nil {
p.NewReceiverUID = pair.ContactPoint.UID
}
return p
}
func newAlertPair(a *migmodels.AlertPair) *migrationStore.AlertPair {
pair := &migrationStore.AlertPair{}
if a.Error != nil {
pair.Error = a.Error.Error()
}
if a.LegacyRule != nil {
pair.LegacyID = a.LegacyRule.ID
pair.PanelID = a.LegacyRule.PanelID
}
if a.Rule != nil {
pair.NewRuleUID = a.Rule.UID
}
return pair
}