Alerting: Apply query optimization to eval endpoints (#78566)

* Alerting: Apply query optimization to eval endpoints

Previously, query optimization was applied to alert queries when scheduled but
not when run through `api/v1/eval` or `/api/v1/rule/test/grafana`. This could
lead to discrepancies between preview and scheduled alert results.
This commit is contained in:
Matthew Jacobson 2023-11-28 19:44:28 -05:00 committed by GitHub
parent a6c9a9db92
commit ce90a1f2be
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 167 additions and 140 deletions

View File

@ -27,6 +27,7 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
"github.com/grafana/grafana/pkg/services/ngalert/store"
"github.com/grafana/grafana/pkg/setting"
"github.com/grafana/grafana/pkg/util"
)
@ -68,6 +69,10 @@ func (srv TestingApiSrv) RouteTestGrafanaRuleConfig(c *contextmodel.ReqContext,
return errorToResponse(fmt.Errorf("%w to query one or many data sources used by the rule", accesscontrol.ErrAuthorization))
}
if _, err := store.OptimizeAlertQueries(rule.Data); err != nil {
return ErrResp(http.StatusInternalServerError, err, "Failed to optimize query")
}
evaluator, err := srv.evaluator.Create(eval.NewContext(c.Req.Context(), c.SignedInUser), rule.GetEvalCondition())
if err != nil {
return ErrResp(http.StatusBadRequest, err, "Failed to build evaluator for queries and expressions")
@ -159,6 +164,12 @@ func (srv TestingApiSrv) RouteEvalQueries(c *contextmodel.ReqContext, cmd apimod
if len(cmd.Data) > 0 {
cond.Condition = cmd.Data[0].RefID
}
_, err := store.OptimizeAlertQueries(cond.Data)
if err != nil {
return ErrResp(http.StatusInternalServerError, err, "Failed to optimize query")
}
evaluator, err := srv.evaluator.Create(eval.NewContext(c.Req.Context(), c.SignedInUser), cond)
if err != nil {

View File

@ -12,6 +12,7 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/expr"
"github.com/grafana/grafana/pkg/services/datasources"
"github.com/grafana/grafana/pkg/services/folder"
"github.com/grafana/grafana/pkg/util"
)
@ -448,6 +449,68 @@ func CreateClassicConditionExpression(refID string, inputRefID string, reducer s
}
}
// CreateReduceExpression returns an AlertQuery that applies a server-side
// "reduce" expression to the query identified by inputRefID, using the given
// reducer function (e.g. "last"). The query targets the built-in expression
// datasource.
func CreateReduceExpression(refID string, inputRefID string, reducer string) AlertQuery {
	// Build the raw model first so the struct literal stays compact.
	// %[n] indexes let each argument be referenced explicitly.
	model := fmt.Sprintf(`
		{
			"refId": "%[1]s",
			"hide": false,
			"type": "reduce",
			"expression": "%[2]s",
			"reducer": "%[3]s",
			"datasource": {
				"uid": "%[4]s",
				"type": "%[5]s"
			}
		}`, refID, inputRefID, reducer, expr.DatasourceUID, expr.DatasourceType)

	return AlertQuery{
		RefID:         refID,
		QueryType:     expr.DatasourceType,
		DatasourceUID: expr.DatasourceUID,
		Model:         json.RawMessage(model),
	}
}
// CreatePrometheusQuery returns an AlertQuery for a Prometheus datasource.
// isInstant selects an instant query; otherwise a range query is produced
// (the generated model always sets the two flags to opposite values).
// The query-expression parameter is named promExpr so it does not shadow the
// imported expr package.
func CreatePrometheusQuery(refID string, promExpr string, intervalMs int64, maxDataPoints int64, isInstant bool, datasourceUID string) AlertQuery {
	return AlertQuery{
		RefID:         refID,
		QueryType:     "",
		DatasourceUID: datasourceUID,
		Model: json.RawMessage(fmt.Sprintf(`
		{
			"refId": "%[1]s",
			"expr": "%[2]s",
			"intervalMs": %[3]d,
			"maxDataPoints": %[4]d,
			"exemplar": false,
			"instant": %[5]t,
			"range": %[6]t,
			"datasource": {
				"uid": "%[7]s",
				"type": "%[8]s"
			}
		}`, refID, promExpr, intervalMs, maxDataPoints, isInstant, !isInstant, datasourceUID, datasources.DS_PROMETHEUS)),
	}
}
// CreateLokiQuery returns an AlertQuery for a Loki datasource with the given
// queryType (e.g. "range" or "instant"). The query-expression parameter is
// named logExpr so it does not shadow the imported expr package.
func CreateLokiQuery(refID string, logExpr string, intervalMs int64, maxDataPoints int64, queryType string, datasourceUID string) AlertQuery {
	return AlertQuery{
		RefID:         refID,
		QueryType:     queryType,
		DatasourceUID: datasourceUID,
		Model: json.RawMessage(fmt.Sprintf(`
		{
			"refId": "%[1]s",
			"expr": "%[2]s",
			"intervalMs": %[3]d,
			"maxDataPoints": %[4]d,
			"queryType": "%[5]s",
			"datasource": {
				"uid": "%[6]s",
				"type": "%[7]s"
			}
		}`, refID, logExpr, intervalMs, maxDataPoints, queryType, datasourceUID, datasources.DS_LOKI)),
	}
}
type AlertInstanceMutator func(*AlertInstance)
// AlertInstanceGen provides a factory function that generates a random AlertInstance.

View File

@ -552,16 +552,10 @@ func (st DBstore) GetAlertRulesForScheduling(ctx context.Context, query *ngmodel
st.Logger.Error("Invalid rule found in DB store, ignoring it", "func", "GetAlertRulesForScheduling", "error", err)
continue
}
// This was added to mitigate the high load that could be created by loki range queries.
// In previous versions of Grafana, Loki datasources would default to range queries
// instead of instant queries, sometimes creating unnecessary load. This is only
// done for Grafana Cloud.
if optimizations, migratable := canBeInstant(rule); migratable {
if err := migrateToInstant(rule, optimizations); err != nil {
st.Logger.Error("Could not migrate rule from range to instant query", "rule", rule.UID, "err", err)
} else {
st.Logger.Info("Migrated rule from range to instant query", "rule", rule.UID, "migrated_queries", len(optimizations))
}
if optimizations, err := OptimizeAlertQueries(rule.Data); err != nil {
st.Logger.Error("Could not migrate rule from range to instant query", "rule", rule.UID, "err", err)
} else if len(optimizations) > 0 {
st.Logger.Info("Migrated rule from range to instant query", "rule", rule.UID, "migrated_queries", len(optimizations))
}
rules = append(rules, rule)
}

View File

@ -14,6 +14,21 @@ const (
grafanaCloudUsage = "grafanacloud-usage"
)
// OptimizeAlertQueries was added to mitigate the high load that could be
// created by loki range queries. In previous versions of Grafana, Loki
// datasources would default to range queries instead of instant queries,
// sometimes creating unnecessary load. This is only done for Grafana Cloud.
// It returns the list of applied optimizations (nil when nothing was
// migratable) and an error if the in-place migration failed.
func OptimizeAlertQueries(queries []models.AlertQuery) ([]Optimization, error) {
	optimizations, migratable := canBeInstant(queries)
	if !migratable {
		// Nothing to do: no range query qualifies for the instant rewrite.
		return nil, nil
	}
	if err := migrateToInstant(queries, optimizations); err != nil {
		return nil, err
	}
	return optimizations, nil
}
// DSType can be used to check the datasource type if it's set in the model.
type dsType struct {
DS struct {
@ -22,7 +37,9 @@ type dsType struct {
Range bool `json:"range"`
}
type optimization struct {
type Optimization struct {
// RefID of the query that can be optimized
RefID string
// Index of the query that can be optimized
i int
// Type of the query that can be optimized (loki, prometheus)
@ -31,19 +48,19 @@ type optimization struct {
// canBeInstant checks if any of the query nodes that are loki or prometheus range queries can be migrated to instant queries.
// If any are migratable, those indices are returned.
func canBeInstant(r *models.AlertRule) ([]optimization, bool) {
if len(r.Data) < 2 {
func canBeInstant(queries []models.AlertQuery) ([]Optimization, bool) {
if len(queries) < 2 {
return nil, false
}
var (
optimizableIndices []optimization
optimizableIndices []Optimization
canBeOptimized = false
)
// Loop over query nodes to find all range queries.
for i := range r.Data {
for i := range queries {
var t dsType
// We can ignore the error here, the query just won't be optimized.
_ = json.Unmarshal(r.Data[i].Model, &t)
_ = json.Unmarshal(queries[i].Model, &t)
switch t.DS.Type {
case datasources.DS_PROMETHEUS:
@ -51,13 +68,13 @@ func canBeInstant(r *models.AlertRule) ([]optimization, bool) {
continue
}
case datasources.DS_LOKI:
if r.Data[i].QueryType != "range" {
if queries[i].QueryType != "range" {
continue
}
default:
// The default datasource is not saved as datasource, this is why we need to check for the datasource name.
// Here we check the well-known grafana cloud datasources.
if r.Data[i].DatasourceUID != grafanaCloudProm && r.Data[i].DatasourceUID != grafanaCloudUsage {
if queries[i].DatasourceUID != grafanaCloudProm && queries[i].DatasourceUID != grafanaCloudUsage {
continue
}
if !t.Range {
@ -68,17 +85,17 @@ func canBeInstant(r *models.AlertRule) ([]optimization, bool) {
var validReducers bool
// Loop over all query nodes to find the reduce node.
for ii := range r.Data {
for ii := range queries {
// Second query part should be an expression.
if !expr.IsDataSource(r.Data[ii].DatasourceUID) {
if !expr.IsDataSource(queries[ii].DatasourceUID) {
continue
}
exprRaw := make(map[string]any)
if err := json.Unmarshal(r.Data[ii].Model, &exprRaw); err != nil {
if err := json.Unmarshal(queries[ii].Model, &exprRaw); err != nil {
continue
}
// Second query part should use first query part as expression.
if ref, ok := exprRaw["expression"].(string); !ok || ref != r.Data[i].RefID {
if ref, ok := exprRaw["expression"].(string); !ok || ref != queries[i].RefID {
continue
}
// Second query part should be "last()"
@ -91,9 +108,10 @@ func canBeInstant(r *models.AlertRule) ([]optimization, bool) {
// If we found a reduce node that uses last, we can add the query to the optimizations.
if validReducers {
canBeOptimized = true
optimizableIndices = append(optimizableIndices, optimization{
i: i,
t: t.DS.Type,
optimizableIndices = append(optimizableIndices, Optimization{
RefID: queries[i].RefID,
i: i,
t: t.DS.Type,
})
}
}
@ -101,10 +119,10 @@ func canBeInstant(r *models.AlertRule) ([]optimization, bool) {
}
// migrateToInstant will move the provided indices from a range-query to an instant query.
func migrateToInstant(r *models.AlertRule, optimizations []optimization) error {
func migrateToInstant(queries []models.AlertQuery, optimizations []Optimization) error {
for _, opti := range optimizations {
modelRaw := make(map[string]any)
if err := json.Unmarshal(r.Data[opti.i].Model, &modelRaw); err != nil {
if err := json.Unmarshal(queries[opti.i].Model, &modelRaw); err != nil {
return err
}
switch opti.t {
@ -115,15 +133,15 @@ func migrateToInstant(r *models.AlertRule, optimizations []optimization) error {
if err != nil {
return err
}
r.Data[opti.i].Model = model
queries[opti.i].Model = model
case datasources.DS_LOKI:
modelRaw["queryType"] = "instant"
model, err := json.Marshal(modelRaw)
if err != nil {
return err
}
r.Data[opti.i].Model = model
r.Data[opti.i].QueryType = "instant"
queries[opti.i].Model = model
queries[opti.i].QueryType = "instant"
default:
return fmt.Errorf("optimization for datasource of type %s not possible", opti.t)
}

View File

@ -21,25 +21,25 @@ func TestCanBeInstant(t *testing.T) {
tcs := []struct {
name string
expected bool
expectedOptimizations []optimization
expectedOptimizations []Optimization
rule *models.AlertRule
}{
{
name: "valid loki rule that can be migrated from range to instant",
expected: true,
expectedOptimizations: []optimization{{i: 0, t: datasources.DS_LOKI}},
expectedOptimizations: []Optimization{{i: 0, t: datasources.DS_LOKI, RefID: "A"}},
rule: createMigrateableLokiRule(t),
},
{
name: "valid prom rule that can be migrated from range to instant",
expected: true,
expectedOptimizations: []optimization{{i: 0, t: datasources.DS_PROMETHEUS}},
expectedOptimizations: []Optimization{{i: 0, t: datasources.DS_PROMETHEUS, RefID: "A"}},
rule: createMigratablePromRule(t),
},
{
name: "valid loki rule with external loki datasource",
expected: true,
expectedOptimizations: []optimization{{i: 0, t: datasources.DS_LOKI}},
expectedOptimizations: []Optimization{{i: 0, t: datasources.DS_LOKI, RefID: "A"}},
rule: createMigrateableLokiRule(t, func(r *models.AlertRule) {
r.Data[0].DatasourceUID = "something-external"
}),
@ -47,7 +47,7 @@ func TestCanBeInstant(t *testing.T) {
{
name: "valid prom rule with external prometheus datasource",
expected: true,
expectedOptimizations: []optimization{{i: 0, t: datasources.DS_PROMETHEUS}},
expectedOptimizations: []Optimization{{i: 0, t: datasources.DS_PROMETHEUS, RefID: "A"}},
rule: createMigratablePromRule(t, func(r *models.AlertRule) {
r.Data[0].DatasourceUID = "something-external"
}),
@ -55,7 +55,7 @@ func TestCanBeInstant(t *testing.T) {
{
name: "valid prom rule with missing datasource",
expected: true,
expectedOptimizations: []optimization{{i: 0, t: datasources.DS_PROMETHEUS}},
expectedOptimizations: []Optimization{{i: 0, t: datasources.DS_PROMETHEUS, RefID: "A"}},
rule: createMigratablePromRuleWithDefaultDS(t),
},
{
@ -73,18 +73,18 @@ func TestCanBeInstant(t *testing.T) {
{
name: "valid loki multi query rule with loki datasources",
expected: true,
expectedOptimizations: []optimization{
{i: 0, t: datasources.DS_LOKI},
{i: 1, t: datasources.DS_LOKI},
expectedOptimizations: []Optimization{
{i: 0, t: datasources.DS_LOKI, RefID: "TotalRequests"},
{i: 1, t: datasources.DS_LOKI, RefID: "TotalErrors"},
},
rule: createMultiQueryMigratableLokiRule(t),
},
{
name: "valid prom multi query rule with prom datasources",
expected: true,
expectedOptimizations: []optimization{
{i: 0, t: datasources.DS_PROMETHEUS},
{i: 1, t: datasources.DS_PROMETHEUS},
expectedOptimizations: []Optimization{
{i: 0, t: datasources.DS_PROMETHEUS, RefID: "TotalRequests"},
{i: 1, t: datasources.DS_PROMETHEUS, RefID: "TotalErrors"},
},
rule: createMultiQueryMigratablePromRule(t),
},
@ -138,7 +138,7 @@ func TestCanBeInstant(t *testing.T) {
}
for _, tc := range tcs {
t.Run(tc.name, func(t *testing.T) {
optimizations, canBe := canBeInstant(tc.rule)
optimizations, canBe := canBeInstant(tc.rule.Data)
require.Equal(t, tc.expected, canBe)
require.Equal(t, tc.expectedOptimizations, optimizations)
})
@ -147,106 +147,116 @@ func TestCanBeInstant(t *testing.T) {
func TestMigrateLokiQueryToInstant(t *testing.T) {
original := createMigrateableLokiRule(t)
mirgrated := createMigrateableLokiRule(t, func(r *models.AlertRule) {
migrated := createMigrateableLokiRule(t, func(r *models.AlertRule) {
r.Data[0] = lokiQuery(t, "A", "instant", "grafanacloud-logs")
})
optimizableIndices, canBeOptimized := canBeInstant(original)
optimizableIndices, canBeOptimized := canBeInstant(original.Data)
require.True(t, canBeOptimized)
require.NoError(t, migrateToInstant(original, optimizableIndices))
require.NoError(t, migrateToInstant(original.Data, optimizableIndices))
require.Equal(t, mirgrated.Data[0].QueryType, original.Data[0].QueryType)
require.Equal(t, migrated.Data[0].QueryType, original.Data[0].QueryType)
originalModel := make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[0].Model, &originalModel))
migratedModel := make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[0].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[0].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
_, canBeOptimized = canBeInstant(original)
_, canBeOptimized = canBeInstant(original.Data)
require.False(t, canBeOptimized)
}
func TestMigrateMultiLokiQueryToInstant(t *testing.T) {
original := createMultiQueryMigratableLokiRule(t)
mirgrated := createMultiQueryMigratableLokiRule(t, func(r *models.AlertRule) {
migrated := createMultiQueryMigratableLokiRule(t, func(r *models.AlertRule) {
r.Data[0] = lokiQuery(t, "TotalRequests", "instant", "grafanacloud-logs")
r.Data[1] = lokiQuery(t, "TotalErrors", "instant", "grafanacloud-logs")
})
optimizableIndices, canBeOptimized := canBeInstant(original)
_, canBeOptimized := canBeInstant(original.Data)
require.True(t, canBeOptimized)
require.NoError(t, migrateToInstant(original, optimizableIndices))
require.Equal(t, mirgrated.Data[0].QueryType, original.Data[0].QueryType)
require.Equal(t, mirgrated.Data[1].QueryType, original.Data[1].QueryType)
optimizations, err := OptimizeAlertQueries(original.Data)
require.NoError(t, err)
require.Equal(t, optimizations[0].RefID, original.Data[0].RefID)
require.Equal(t, optimizations[1].RefID, original.Data[1].RefID)
require.Equal(t, migrated.Data[0].QueryType, original.Data[0].QueryType)
require.Equal(t, migrated.Data[1].QueryType, original.Data[1].QueryType)
originalModel := make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[0].Model, &originalModel))
migratedModel := make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[0].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[0].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
originalModel = make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[1].Model, &originalModel))
migratedModel = make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[1].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[1].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
_, canBeOptimized = canBeInstant(original)
_, canBeOptimized = canBeInstant(original.Data)
require.False(t, canBeOptimized)
}
func TestMigratePromQueryToInstant(t *testing.T) {
original := createMigratablePromRule(t)
mirgrated := createMigratablePromRule(t, func(r *models.AlertRule) {
migrated := createMigratablePromRule(t, func(r *models.AlertRule) {
r.Data[0] = prometheusQuery(t, "A", promExternalDS, promIsInstant)
})
optimizableIndices, canBeOptimized := canBeInstant(original)
optimizableIndices, canBeOptimized := canBeInstant(original.Data)
require.True(t, canBeOptimized)
require.NoError(t, migrateToInstant(original, optimizableIndices))
require.NoError(t, migrateToInstant(original.Data, optimizableIndices))
originalModel := make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[0].Model, &originalModel))
migratedModel := make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[0].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[0].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
_, canBeOptimized = canBeInstant(original)
_, canBeOptimized = canBeInstant(original.Data)
require.False(t, canBeOptimized)
}
func TestMigrateMultiPromQueryToInstant(t *testing.T) {
original := createMultiQueryMigratablePromRule(t)
mirgrated := createMultiQueryMigratablePromRule(t, func(r *models.AlertRule) {
migrated := createMultiQueryMigratablePromRule(t, func(r *models.AlertRule) {
r.Data[0] = prometheusQuery(t, "TotalRequests", promExternalDS, promIsInstant)
r.Data[1] = prometheusQuery(t, "TotalErrors", promExternalDS, promIsInstant)
})
optimizableIndices, canBeOptimized := canBeInstant(original)
_, canBeOptimized := canBeInstant(original.Data)
require.True(t, canBeOptimized)
require.NoError(t, migrateToInstant(original, optimizableIndices))
optimizations, err := OptimizeAlertQueries(original.Data)
require.NoError(t, err)
require.Equal(t, optimizations[0].RefID, original.Data[0].RefID)
require.Equal(t, optimizations[1].RefID, original.Data[1].RefID)
originalModel := make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[0].Model, &originalModel))
migratedModel := make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[0].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[0].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
originalModel = make(map[string]any)
require.NoError(t, json.Unmarshal(original.Data[1].Model, &originalModel))
migratedModel = make(map[string]any)
require.NoError(t, json.Unmarshal(mirgrated.Data[1].Model, &migratedModel))
require.NoError(t, json.Unmarshal(migrated.Data[1].Model, &migratedModel))
require.Equal(t, migratedModel, originalModel)
_, canBeOptimized = canBeInstant(original)
_, canBeOptimized = canBeInstant(original.Data)
require.False(t, canBeOptimized)
}
@ -326,44 +336,12 @@ func createMultiQueryMigratablePromRule(t *testing.T, muts ...func(*models.Alert
func lokiQuery(t *testing.T, refID, queryType, datasourceUID string) models.AlertQuery {
t.Helper()
return models.AlertQuery{
RefID: refID,
QueryType: queryType,
DatasourceUID: datasourceUID,
Model: []byte(fmt.Sprintf(`{
"datasource": {
"type": "loki",
"uid": "%s"
},
"editorMode": "code",
"expr": "1",
"intervalMs": 1000,
"maxDataPoints": 43200,
"queryType": "%s",
"refId": "%s"
}`, datasourceUID, queryType, refID)),
}
return models.CreateLokiQuery(refID, "1", 1000, 43200, queryType, datasourceUID)
}
func prometheusQuery(t *testing.T, refID, datasourceUID string, isInstant bool) models.AlertQuery {
t.Helper()
return models.AlertQuery{
RefID: refID,
DatasourceUID: datasourceUID,
Model: []byte(fmt.Sprintf(`{
"datasource": {
"type": "prometheus",
"uid": "%s"
},
"instant": %t,
"range": %t,
"editorMode": "code",
"expr": "1",
"intervalMs": 1000,
"maxDataPoints": 43200,
"refId": "%s"
}`, datasourceUID, isInstant, !isInstant, refID)),
}
return models.CreatePrometheusQuery(refID, "1", 1000, 43200, isInstant, datasourceUID)
}
func prometheusQueryWithoutDS(t *testing.T, refID, datasourceUID string, isInstant bool) models.AlertQuery {
@ -385,42 +363,5 @@ func prometheusQueryWithoutDS(t *testing.T, refID, datasourceUID string, isInsta
func reducer(t *testing.T, refID, exp, op string) models.AlertQuery {
t.Helper()
return models.AlertQuery{
RefID: refID,
DatasourceUID: "__expr__",
Model: []byte(fmt.Sprintf(`{
"conditions": [
{
"evaluator": {
"params": [],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": [
"B"
]
},
"reducer": {
"params": [],
"type": "%s"
},
"type": "query"
}
],
"datasource": {
"type": "__expr__",
"uid": "__expr__"
},
"expression": "%s",
"hide": false,
"intervalMs": 1000,
"maxDataPoints": 43200,
"reducer": "%s",
"refId": "%s",
"type": "reduce"
}`, op, exp, op, refID)),
}
return models.CreateReduceExpression(refID, exp, op)
}