grafana/pkg/services/ngalert/migration/alert_rule.go

291 lines
10 KiB
Go
Raw Normal View History

package migration
import (
"context"
"encoding/json"
"fmt"
"time"
"github.com/prometheus/common/model"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/infra/log"
legacymodels "github.com/grafana/grafana/pkg/services/alerting/models"
"github.com/grafana/grafana/pkg/services/dashboards"
"github.com/grafana/grafana/pkg/services/datasources"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/tsdb/graphite"
"github.com/grafana/grafana/pkg/util"
)
// addLabelsAndAnnotations builds the label and annotation sets for a migrated alert rule.
//
// Labels are seeded from the legacy alert's tags (as returned by GetTagsFromSettings) and then
// the MigratedUseLegacyChannelsLabel routing label is set to "true"; a tag with that same key
// is overwritten. Annotations record the dashboard UID, the panel ID, the legacy alert ID and
// the alert message after it has been passed through MigrateTmpl.
//
// The first return value holds the labels, the second the annotations.
func addLabelsAndAnnotations(l log.Logger, alert *legacymodels.Alert, dashboardUID string) (data.Labels, data.Labels) {
	legacyTags := alert.GetTagsFromSettings()

	// One extra slot for the routing label added below.
	labels := make(data.Labels, len(legacyTags)+1)
	for i := range legacyTags {
		labels[legacyTags[i].Key] = legacyTags[i].Value
	}
	// Add a label for routing.
	labels[ngmodels.MigratedUseLegacyChannelsLabel] = "true"

	annotations := data.Labels{
		ngmodels.DashboardUIDAnnotation:    dashboardUID,
		ngmodels.PanelIDAnnotation:         fmt.Sprint(alert.PanelID),
		ngmodels.MigratedAlertIdAnnotation: fmt.Sprint(alert.ID),
		ngmodels.MigratedMessageAnnotation: MigrateTmpl(l.New("field", "message"), alert.Message),
	}

	return labels, annotations
}
// migrateAlert migrates a single dashboard alert from legacy alerting to unified alerting.
func (om *OrgMigration) migrateAlert(ctx context.Context, l log.Logger, alert *legacymodels.Alert, dashboard *dashboards.Dashboard) (*ngmodels.AlertRule, error) {
l.Debug("Migrating alert rule to Unified Alerting")
rawSettings, err := json.Marshal(alert.Settings)
if err != nil {
return nil, fmt.Errorf("get settings: %w", err)
}
var parsedSettings dashAlertSettings
err = json.Unmarshal(rawSettings, &parsedSettings)
if err != nil {
return nil, fmt.Errorf("parse settings: %w", err)
}
cond, err := transConditions(ctx, l, parsedSettings, alert.OrgID, om.migrationStore)
if err != nil {
return nil, fmt.Errorf("transform conditions: %w", err)
}
lbls, annotations := addLabelsAndAnnotations(l, alert, dashboard.UID)
data, err := migrateAlertRuleQueries(l, cond.Data)
if err != nil {
return nil, fmt.Errorf("queries: %w", err)
}
isPaused := false
if alert.State == "paused" {
isPaused = true
}
dashUID := dashboard.UID
ar := &ngmodels.AlertRule{
OrgID: alert.OrgID,
Title: alert.Name, // Title will be deduplicated on persist.
UID: util.GenerateShortUID(),
Condition: cond.Condition,
Data: data,
IntervalSeconds: ruleAdjustInterval(alert.Frequency),
Version: 1,
NamespaceUID: "", // The folder for this alert is determined later.
Alerting: Handle custom dashboard permissions in migration service (#74504) * Fix migration of custom dashboard permissions Dashboard alert permissions were determined by both its dashboard and folder scoped permissions, while UA alert rules only have folder scoped permissions. This means, when migrating an alert, we'll need to decide if the parent folder is a correct location for the newly created alert rule so that users, teams, and org roles have the same access to it as they did in legacy. To do this, we translate both the folder and dashboard resource permissions to two sets of SetResourcePermissionCommands. Each of these encapsulates a mapping of all: OrgRoles -> Viewer/Editor/Admin Teams -> Viewer/Editor/Admin Users -> Viewer/Editor/Admin When the dashboard permissions (including those inherited from the parent folder) differ from the parent folder permissions alone, we need to create a new folder to represent the access-level of the legacy dashboard. Compromises: When determining the SetResourcePermissionCommands we only take into account managed and basic roles. Fixed and custom roles introduce significant complexity and synchronicity hurdles. Instead, we log a warning they had the potential to override the newly created folder permissions. Also, we don't attempt to reconcile datasource permissions that were not necessary in legacy alerting. Users without access to the necessary datasources to edit an alert rule will need to obtain said access separate from the migration.
2023-10-12 17:12:40 -05:00
DashboardUID: &dashUID,
PanelID: &alert.PanelID,
RuleGroup: groupName(ruleAdjustInterval(alert.Frequency), dashboard.Title),
For: alert.For,
Updated: time.Now().UTC(),
Annotations: annotations,
Labels: lbls,
RuleGroupIndex: 1, // Every rule is in its own group.
IsPaused: isPaused,
NoDataState: transNoData(l, parsedSettings.NoDataState),
ExecErrState: transExecErr(l, parsedSettings.ExecutionErrorState),
}
// Label for routing and silences.
n, v := getLabelForSilenceMatching(ar.UID)
ar.Labels[n] = v
if parsedSettings.ExecutionErrorState == string(legacymodels.ExecutionErrorKeepState) {
if err := om.addErrorSilence(ar); err != nil {
om.log.Error("Alert migration error: failed to create silence for Error", "rule_name", ar.Title, "err", err)
}
}
if parsedSettings.NoDataState == string(legacymodels.NoDataKeepState) {
if err := om.addNoDataSilence(ar); err != nil {
om.log.Error("Alert migration error: failed to create silence for NoData", "rule_name", ar.Title, "err", err)
}
}
// We do some validation and pre-save operations early in order to track these errors as part of the migration state.
if err := ar.ValidateAlertRule(om.cfg.UnifiedAlerting); err != nil {
return nil, err
}
if err := ar.PreSave(time.Now); err != nil {
return nil, err
}
return ar, nil
}
// migrateAlertRuleQueries attempts to fix alert rule queries so they can work in unified alerting.
// Queries of some data sources are not compatible with unified alerting.
//
// Expression queries (those whose DatasourceUID equals expressionDatasourceUID) are passed
// through untouched. For every other query the JSON model is decoded, the "hide" key is dropped,
// the Graphite and Prometheus fix-ups are applied, and the model is re-encoded. The input slice
// elements are copied, not modified in place. The first decode or encode error aborts the
// migration and is returned as-is.
func migrateAlertRuleQueries(l log.Logger, data []ngmodels.AlertQuery) ([]ngmodels.AlertQuery, error) {
	migrated := make([]ngmodels.AlertQuery, 0, len(data))

	for i := range data {
		query := data[i] // work on a copy of the element

		// Only data source queries need fixing; expressions are kept verbatim.
		if query.DatasourceUID != expressionDatasourceUID {
			var model map[string]json.RawMessage
			if err := json.Unmarshal(query.Model, &model); err != nil {
				return nil, err
			}

			// Remove the hidden tag from the query (if it exists).
			delete(model, "hide")

			model = fixGraphiteReferencedSubQueries(model)
			model = fixPrometheusBothTypeQuery(l, model)

			encoded, err := json.Marshal(model)
			if err != nil {
				return nil, err
			}
			query.Model = encoded
		}

		migrated = append(migrated, query)
	}

	return migrated, nil
}
// fixGraphiteReferencedSubQueries attempts to fix graphite referenced sub queries, given unified
// alerting does not support this.
//
// When the model carries the graphite.TargetFullModelField key, which holds the expanded version
// of the 'target' field, that key is removed and its value is stored under
// graphite.TargetModelField instead. Models without the key are returned unchanged. The map is
// modified in place and also returned.
func fixGraphiteReferencedSubQueries(queryData map[string]json.RawMessage) map[string]json.RawMessage {
	expanded, present := queryData[graphite.TargetFullModelField]
	if !present {
		return queryData
	}

	delete(queryData, graphite.TargetFullModelField)
	queryData[graphite.TargetModelField] = expanded
	return queryData
}
// fixPrometheusBothTypeQuery converts Prometheus 'Both' type queries to range queries.
//
// A query counts as 'Both' when its "instant" and "range" fields both decode to true. If either
// field is present but cannot be decoded as a boolean, the model is returned unchanged (and the
// failure is logged when the query is identifiable as Prometheus). Queries that are not 'Both',
// are not Prometheus, or whose data source cannot be determined are also returned unchanged.
// Otherwise "instant" is overwritten with false. The map is modified in place and also returned.
func fixPrometheusBothTypeQuery(l log.Logger, queryData map[string]json.RawMessage) map[string]json.RawMessage {
	// There is the possibility to support this functionality by:
	// - Splitting the query into two: one for instant and one for range.
	// - Splitting the condition into two: one for each query, separated by OR.
	// However, relying on a 'Both' query instead of multiple conditions to do this in legacy is likely
	// to be unintentional. In addition, this would require more robust operator precedence in classic conditions.
	// Given these reasons, we opt to convert them to range queries and log a warning.
	var isInstant, isRange bool

	if raw, found := queryData["instant"]; found {
		if err := json.Unmarshal(raw, &isInstant); err != nil {
			// The instant field is unparsable, so there is nothing to fix.
			if prom, _ := isPrometheusQuery(queryData); prom {
				l.Info("Failed to parse instant field on Prometheus query", "instant", string(raw), "err", err)
			}
			return queryData
		}
	}

	if raw, found := queryData["range"]; found {
		if err := json.Unmarshal(raw, &isRange); err != nil {
			// The range field is unparsable, so there is nothing to fix.
			if prom, _ := isPrometheusQuery(queryData); prom {
				l.Info("Failed to parse range field on Prometheus query", "range", string(raw), "err", err)
			}
			return queryData
		}
	}

	// Only apply this fix to 'Both' type queries.
	if !(isInstant && isRange) {
		return queryData
	}

	prom, err := isPrometheusQuery(queryData)
	switch {
	case err != nil:
		l.Info("Unable to convert alert rule that resembles a Prometheus 'Both' type query to 'Range'", "err", err)
	case prom:
		// Convert 'Both' type queries to `Range` queries by disabling the `Instant` portion.
		l.Warn("Prometheus 'Both' type queries are not supported in unified alerting. Converting to range query.")
		queryData["instant"] = json.RawMessage("false")
	}
	// Any other data source is left as it was: the fix only applies to Prometheus.
	return queryData
}
// isPrometheusQuery checks if the query is for Prometheus.
//
// It reads the "datasource" entry of the query model and compares its "type" field against
// datasources.DS_PROMETHEUS. An error is returned when the "datasource" entry is missing, cannot
// be decoded, or has an empty "type".
func isPrometheusQuery(queryData map[string]json.RawMessage) (bool, error) {
	rawDS, found := queryData["datasource"]
	if !found {
		return false, fmt.Errorf("missing datasource field")
	}

	// Only the type is needed; every other data source field is ignored.
	ref := struct {
		Type string `json:"type"`
	}{}
	if err := json.Unmarshal(rawDS, &ref); err != nil {
		return false, fmt.Errorf("parse datasource '%s': %w", string(rawDS), err)
	}

	if len(ref.Type) == 0 {
		return false, fmt.Errorf("missing type field '%s'", string(rawDS))
	}

	matches := ref.Type == datasources.DS_PROMETHEUS
	return matches, nil
}
// ruleAdjustInterval rounds a legacy alert frequency (in seconds) down to a multiple of the
// base interval of 10 seconds. Frequencies at or below the base interval, including zero and
// negative values, yield the base interval itself.
func ruleAdjustInterval(freq int64) int64 {
	// 10 corresponds to the SchedulerCfg, but TODO not worrying about fetching for now.
	const baseFreq int64 = 10

	if freq > baseFreq {
		// Integer division truncates, which drops the remainder.
		return freq / baseFreq * baseFreq
	}
	return baseFreq
}
// transNoData translates the legacy "no data" option s into its unified alerting NoDataState.
//
// An empty string, the legacy "no data" option and the legacy "keep state" option all map to
// NoData. Unrecognized values are logged as a warning and also map to NoData.
func transNoData(l log.Logger, s string) ngmodels.NoDataState {
	switch legacymodels.NoDataOption(s) {
	case legacymodels.NoDataSetAlerting:
		return ngmodels.Alerting
	case legacymodels.NoDataSetOK:
		return ngmodels.OK // values from ngalert/models/rule
	case "", legacymodels.NoDataSetNoData:
		return ngmodels.NoData
	case legacymodels.NoDataKeepState:
		// "keep last state" translates to no data because we now emit a special alert when the
		// state is "noData". The result is that the evaluation will not return firing and
		// instead we'll raise the special alert.
		return ngmodels.NoData
	}

	l.Warn("Unable to translate execution of NoData state. Using default execution", "old", s, "new", ngmodels.NoData)
	return ngmodels.NoData
}
// transExecErr translates the legacy execution error option s into its unified alerting
// ExecutionErrorState.
//
// An empty string and the legacy "alerting" option map to AlertingErrState, the legacy "ok"
// option maps to OkErrState, and the legacy "keep state" option maps to ErrorErrState.
// Unrecognized values are logged as a warning and map to ErrorErrState.
func transExecErr(l log.Logger, s string) ngmodels.ExecutionErrorState {
	legacy := legacymodels.ExecutionErrorOption(s)

	if legacy == "" || legacy == legacymodels.ExecutionErrorSetAlerting {
		return ngmodels.AlertingErrState
	}
	if legacy == legacymodels.ExecutionErrorSetOk {
		return ngmodels.OkErrState
	}

	// Keep last state is translated to error as we now emit a
	// DatasourceError alert when the state is error. Anything that is
	// not "keep state" at this point is unknown and gets the same result,
	// preceded by a warning.
	if legacy != legacymodels.ExecutionErrorKeepState {
		l.Warn("Unable to translate execution of Error state. Using default execution", "old", s, "new", ngmodels.ErrorErrState)
	}
	return ngmodels.ErrorErrState
}
// NOTE(review): the text below looks like a version-control blame annotation that was captured
// together with the source rather than something written in this file; confirm and drop if so.
//
// Alerting: Handle custom dashboard permissions in migration service (#74504)
//
// * Fix migration of custom dashboard permissions
//
// Dashboard alert permissions were determined by both its dashboard and folder scoped
// permissions, while UA alert rules only have folder scoped permissions. This means, when
// migrating an alert, we'll need to decide if the parent folder is a correct location for the
// newly created alert rule so that users, teams, and org roles have the same access to it as
// they did in legacy.
//
// To do this, we translate both the folder and dashboard resource permissions to two sets of
// SetResourcePermissionCommands. Each of these encapsulates a mapping of all:
//
//	OrgRoles -> Viewer/Editor/Admin
//	Teams    -> Viewer/Editor/Admin
//	Users    -> Viewer/Editor/Admin
//
// When the dashboard permissions (including those inherited from the parent folder) differ from
// the parent folder permissions alone, we need to create a new folder to represent the
// access-level of the legacy dashboard.
//
// Compromises: When determining the SetResourcePermissionCommands we only take into account
// managed and basic roles. Fixed and custom roles introduce significant complexity and
// synchronicity hurdles. Instead, we log a warning that they had the potential to override the
// newly created folder permissions. Also, we don't attempt to reconcile datasource permissions
// that were not necessary in legacy alerting. Users without access to the necessary datasources
// to edit an alert rule will need to obtain said access separately from the migration.
//
// 2023-10-12 17:12:40 -05:00

// truncate truncates the given name to at most length bytes.
//
// The cut never lands inside a multi-byte UTF-8 sequence: if byte offset length falls in the
// middle of an encoded rune, the cut is moved back to the start of that rune, so the result
// can be a few bytes shorter than length but a valid UTF-8 input stays valid. Names that
// already fit are returned unchanged. A length of zero or less yields the empty string
// instead of panicking on a negative slice bound.
func truncate(daName string, length int) string {
	if length <= 0 {
		return ""
	}
	if len(daName) <= length {
		return daName
	}
	// UTF-8 continuation bytes have the bit pattern 10xxxxxx. While the first dropped byte is
	// a continuation byte, the cut is inside a rune, so back up until it sits on a rune start.
	for length > 0 && daName[length]&0xC0 == 0x80 {
		length--
	}
	return daName[:length]
}
// groupName constructs a group name from the dashboard title and the interval. The interval
// (in seconds) is rendered as a humanized duration and appended to the title as " - <duration>".
// The dashboard title is truncated if necessary to ensure that the group name is not longer
// than store.AlertRuleMaxRuleGroupNameLength.
func groupName(interval int64, dashboardTitle string) string {
	humanized := model.Duration(time.Duration(interval) * time.Second).String()
	suffix := " - " + humanized

	// Reserve room for the suffix so that only the title is ever shortened.
	titleBudget := store.AlertRuleMaxRuleGroupNameLength - len(suffix)
	return truncate(dashboardTitle, titleBudget) + suffix
}