From 68691c9386996ccf64007c84a8998901c932eaa7 Mon Sep 17 00:00:00 2001
From: Alexander Akhmetov <me@alx.cx>
Date: Thu, 27 Jun 2024 09:45:15 +0200
Subject: [PATCH] Alerting: Add setting for maximum allowed rule evaluation
 results (#89468)

* Alerting: Add setting for maximum allowed rule evaluation results

Added a new configuration setting `quota.alerting_rule_evaluation_results` to set the maximum number of alert rule evaluation results per rule. If the limit is exceeded, the evaluation will result in an error.
---
 conf/defaults.ini                             |   5 +
 conf/sample.ini                               |   5 +
 .../setup-grafana/configure-grafana/_index.md |   4 +
 pkg/services/ngalert/eval/eval.go             |  36 +++--
 pkg/services/ngalert/eval/eval_test.go        | 140 ++++++++++++++++++
 pkg/setting/setting_unified_alerting.go       |   2 +
 6 files changed, 183 insertions(+), 9 deletions(-)
diff --git a/conf/defaults.ini b/conf/defaults.ini
index aa746872133..931b9268897 100644
--- a/conf/defaults.ini
+++ b/conf/defaults.ini
@@ -1171,6 +1171,11 @@ global_correlations = -1
 # This is not strictly enforced yet, but will be enforced over time.
 alerting_rule_group_rules = 100
 
+# Limit the number of query evaluation results per alert rule.
+# If the condition query of an alert rule produces more results than this limit,
+# the evaluation results in an error.
+alerting_rule_evaluation_results = -1
+
 #################################### Unified Alerting ####################
 [unified_alerting]
 # Enable the Alerting sub-system and interface.
diff --git a/conf/sample.ini b/conf/sample.ini
index e261822eb57..c9885cda40f 100644
--- a/conf/sample.ini
+++ b/conf/sample.ini
@@ -1157,6 +1157,11 @@
 # This is not strictly enforced yet, but will be enforced over time.
 ;alerting_rule_group_rules = 100
 
+# Limit the number of query evaluation results per alert rule.
+# If the condition query of an alert rule produces more results than this limit,
+# the evaluation results in an error.
+;alerting_rule_evaluation_results = -1
+
 #################################### Unified Alerting ####################
 [unified_alerting]
 #Enable the Unified Alerting sub-system and interface. When enabled we'll migrate all of your alert rules and notification channels to the new system. New alert rules will be created and your notification channels will be converted into an Alertmanager configuration. Previous data is preserved to enable backwards compatibility but new data is removed.```
diff --git a/docs/sources/setup-grafana/configure-grafana/_index.md b/docs/sources/setup-grafana/configure-grafana/_index.md
index bf95f825a07..8c2b089a81e 100644
--- a/docs/sources/setup-grafana/configure-grafana/_index.md
+++ b/docs/sources/setup-grafana/configure-grafana/_index.md
@@ -1554,6 +1554,10 @@ Sets a global limit on number of alert rules that can be created. Default is -1
 
 Sets a global limit on number of correlations that can be created. Default is -1 (unlimited).
 
+### alerting_rule_evaluation_results
+
+Limit the number of query evaluation results per alert rule. If the condition query of an alert rule produces more results than this limit, the evaluation results in an error. Default is -1 (unlimited).
+
 <hr>
 
 ## [unified_alerting]
diff --git a/pkg/services/ngalert/eval/eval.go b/pkg/services/ngalert/eval/eval.go
index c3ff8393c6a..300113fa868 100644
--- a/pkg/services/ngalert/eval/eval.go
+++ b/pkg/services/ngalert/eval/eval.go
@@ -52,6 +52,7 @@ type conditionEvaluator struct {
 	expressionService expressionService
 	condition         models.Condition
 	evalTimeout       time.Duration
+	evalResultLimit   int
 }
 
 func (r *conditionEvaluator) EvaluateRaw(ctx context.Context, now time.Time) (resp *backend.QueryDataResponse, err error) {
@@ -74,7 +75,21 @@ func (r *conditionEvaluator) EvaluateRaw(ctx context.Context, now time.Time) (re
 		execCtx = timeoutCtx
 	}
 	logger.FromContext(ctx).Debug("Executing pipeline", "commands", strings.Join(r.pipeline.GetCommandTypes(), ","), "datasources", strings.Join(r.pipeline.GetDatasourceTypes(), ","))
-	return r.expressionService.ExecutePipeline(execCtx, now, r.pipeline)
+	result, err := r.expressionService.ExecutePipeline(execCtx, now, r.pipeline)
+
+	// Check if the result of the condition evaluation is too large
+	if err == nil && result != nil && r.evalResultLimit > 0 {
+		conditionResultLength := 0
+		if conditionResponse, ok := result.Responses[r.condition.Condition]; ok {
+			conditionResultLength = len(conditionResponse.Frames)
+		}
+		if conditionResultLength > r.evalResultLimit {
+			logger.FromContext(ctx).Error("Query evaluation returned too many results", "limit", r.evalResultLimit, "actual", conditionResultLength)
+			return nil, fmt.Errorf("query evaluation returned too many results: %d (limit: %d)", conditionResultLength, r.evalResultLimit)
+		}
+	}
+
+	return result, err
 }
 
 // Evaluate evaluates the condition and converts the response to Results
@@ -87,10 +102,11 @@ func (r *conditionEvaluator) Evaluate(ctx context.Context, now time.Time) (Resul
 }
 
 type evaluatorImpl struct {
-	evaluationTimeout time.Duration
-	dataSourceCache   datasources.CacheService
-	expressionService *expr.Service
-	pluginsStore      pluginstore.Store
+	evaluationTimeout     time.Duration
+	evaluationResultLimit int
+	dataSourceCache       datasources.CacheService
+	expressionService     *expr.Service
+	pluginsStore          pluginstore.Store
 }
 
 func NewEvaluatorFactory(
@@ -100,10 +116,11 @@ func NewEvaluatorFactory(
 	pluginsStore pluginstore.Store,
 ) EvaluatorFactory {
 	return &evaluatorImpl{
-		evaluationTimeout: cfg.EvaluationTimeout,
-		dataSourceCache:   datasourceCache,
-		expressionService: expressionService,
-		pluginsStore:      pluginsStore,
+		evaluationTimeout:     cfg.EvaluationTimeout,
+		evaluationResultLimit: cfg.EvaluationResultLimit,
+		dataSourceCache:       datasourceCache,
+		expressionService:     expressionService,
+		pluginsStore:          pluginsStore,
 	}
 }
 
@@ -849,6 +866,7 @@ func (e *evaluatorImpl) create(condition models.Condition, req *expr.Request) (C
 				expressionService: e.expressionService,
 				condition:         condition,
 				evalTimeout:       e.evaluationTimeout,
+				evalResultLimit:   e.evaluationResultLimit,
 			}, nil
 		}
 		conditions = append(conditions, node.RefID())
diff --git a/pkg/services/ngalert/eval/eval_test.go b/pkg/services/ngalert/eval/eval_test.go
index 8f0c0e473ad..54b02340d6a 100644
--- a/pkg/services/ngalert/eval/eval_test.go
+++ b/pkg/services/ngalert/eval/eval_test.go
@@ -1029,6 +1029,146 @@ func TestEvaluateRaw(t *testing.T) {
 	})
 }
 
+func TestEvaluateRawLimit(t *testing.T) {
+	t.Run("should apply the limit to the successful query evaluation", func(t *testing.T) {
+		resp := backend.QueryDataResponse{
+			Responses: backend.Responses{
+				"A": {
+					Frames: []*data.Frame{{
+						RefID: "A",
+						Fields: []*data.Field{
+							data.NewField(
+								"Value",
+								data.Labels{"foo": "bar"},
+								[]*float64{util.Pointer(10.0)},
+							),
+						},
+					}},
+				},
+				"B": {
+					Frames: []*data.Frame{
+						{
+							RefID: "B",
+							Fields: []*data.Field{
+								data.NewField(
+									"Value",
+									data.Labels{"foo": "bar"},
+									[]*float64{util.Pointer(10.0)},
+								),
+							},
+						},
+						{
+							RefID: "B",
+							Fields: []*data.Field{
+								data.NewField(
+									"Value",
+									data.Labels{"foo": "baz"},
+									[]*float64{util.Pointer(10.0)},
+								),
+							},
+						},
+					},
+				},
+			},
+		}
+
+		cases := []struct {
+			desc            string
+			cond            models.Condition
+			evalResultLimit int
+			error           string
+		}{
+			{
+				desc:            "too many results from the condition query results in an error",
+				cond:            models.Condition{Condition: "B"},
+				evalResultLimit: 1,
+				error:           "query evaluation returned too many results: 2 (limit: 1)",
+			},
+			{
+				desc:            "if the limit equals to the number of condition query frames, no error is returned",
+				cond:            models.Condition{Condition: "B"},
+				evalResultLimit: len(resp.Responses["B"].Frames),
+			},
+			{
+				desc:            "if the limit is 0, no error is returned",
+				cond:            models.Condition{Condition: "B"},
+				evalResultLimit: 0,
+			},
+			{
+				desc:            "if the limit is -1, no error is returned",
+				cond:            models.Condition{Condition: "B"},
+				evalResultLimit: -1,
+			},
+		}
+
+		for _, tc := range cases {
+			t.Run(tc.desc, func(t *testing.T) {
+				e := conditionEvaluator{
+					pipeline: nil,
+					expressionService: &fakeExpressionService{
+						hook: func(ctx context.Context, now time.Time, pipeline expr.DataPipeline) (*backend.QueryDataResponse, error) {
+							return &resp, nil
+						},
+					},
+					condition:       tc.cond,
+					evalResultLimit: tc.evalResultLimit,
+				}
+
+				result, err := e.EvaluateRaw(context.Background(), time.Now())
+
+				if tc.error != "" {
+					require.Error(t, err)
+					require.EqualError(t, err, tc.error)
+				} else {
+					require.NoError(t, err)
+					require.NotNil(t, result)
+				}
+			})
+		}
+	})
+
+	t.Run("should return the original error if the evaluation did not succeed", func(t *testing.T) {
+		cases := []struct {
+			desc            string
+			queryEvalResult *backend.QueryDataResponse
+			queryEvalError  error
+			evalResultLimit int
+		}{
+			{
+				desc:            "the original query evaluation result is preserved",
+				queryEvalResult: &backend.QueryDataResponse{},
+				queryEvalError:  errors.New("some query error"),
+				evalResultLimit: 1,
+			},
+			{
+				desc:            "the original query evaluation result is preserved (no evaluation result)",
+				queryEvalResult: nil,
+				queryEvalError:  errors.New("some query error"),
+				evalResultLimit: 1,
+			},
+		}
+
+		for _, tc := range cases {
+			t.Run(tc.desc, func(t *testing.T) {
+				e := conditionEvaluator{
+					pipeline: nil,
+					expressionService: &fakeExpressionService{
+						hook: func(ctx context.Context, now time.Time, pipeline expr.DataPipeline) (*backend.QueryDataResponse, error) {
+							return tc.queryEvalResult, tc.queryEvalError
+						},
+					},
+					evalResultLimit: tc.evalResultLimit,
+				}
+
+				result, err := e.EvaluateRaw(context.Background(), time.Now())
+				require.Error(t, err)
+				require.Equal(t, err, tc.queryEvalError)
+				require.Equal(t, result, tc.queryEvalResult)
+			})
+		}
+	})
+}
+
 func TestResults_HasNonRetryableErrors(t *testing.T) {
 	tc := []struct {
 		name     string
diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go
index 0f6a199bf5f..5d848e5f705 100644
--- a/pkg/setting/setting_unified_alerting.go
+++ b/pkg/setting/setting_unified_alerting.go
@@ -91,6 +91,7 @@ type UnifiedAlertingSettings struct {
 	MaxAttempts                    int64
 	MinInterval                    time.Duration
 	EvaluationTimeout              time.Duration
+	EvaluationResultLimit          int
 	DisableJitter                  bool
 	ExecuteAlerts                  bool
 	DefaultConfiguration           string
@@ -355,6 +356,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
 
 	quotas := iniFile.Section("quota")
 	uaCfg.RulesPerRuleGroupLimit = quotas.Key("alerting_rule_group_rules").MustInt64(100)
+	uaCfg.EvaluationResultLimit = quotas.Key("alerting_rule_evaluation_results").MustInt(-1)
 
 	remoteAlertmanager := iniFile.Section("remote.alertmanager")
 	uaCfgRemoteAM := RemoteAlertmanagerSettings{