Alerting: Add max limit for Loki query size in state history API (#89646)

* add setting for query limit

* update BuildLogQuery to return error if limit is exceeded

* move tests for BuildLogQuery to separate suite
This commit is contained in:
Yuri Tseretyan 2024-06-25 09:20:38 -04:00 committed by GitHub
parent 9a3477dd11
commit 4a5aab54a5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 159 additions and 70 deletions

View File

@ -1387,6 +1387,12 @@ loki_basic_auth_password =
# Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value. # Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value.
loki_max_query_length = 721h loki_max_query_length = 721h
# For "loki" only.
# Maximum size in bytes for queries sent to Loki. This limit is applied to user provided filters as well as system defined ones, e.g. applied by access control.
# If filter exceeds the limit, API returns error with code "alerting.state-history.loki.requestTooLong".
# Default is 64kb
loki_max_query_size = 65536
[unified_alerting.state_history.external_labels] [unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams. # Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided. # Any number of label key-value-pairs can be provided.

View File

@ -1379,6 +1379,12 @@ disabled_labels =
# Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value. # Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value.
; loki_max_query_length = 360h ; loki_max_query_length = 360h
# For "loki" only.
# Maximum size in bytes for queries sent to Loki. This limit is applied to user provided filters as well as system defined ones, e.g. applied by access control.
# If filter exceeds the limit, API returns error with code "alerting.state-history.loki.requestTooLong".
# Default is 64kb
;loki_max_query_size = 65536
[unified_alerting.state_history.external_labels] [unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams. # Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided. # Any number of label key-value-pairs can be provided.

View File

@ -45,6 +45,7 @@ var (
type lokiQueryClient interface { type lokiQueryClient interface {
RangeQuery(ctx context.Context, query string, start, end, limit int64) (historian.QueryRes, error) RangeQuery(ctx context.Context, query string, start, end, limit int64) (historian.QueryRes, error)
MaxQuerySize() int
} }
// LokiHistorianStore is a read store that queries Loki for alert state history. // LokiHistorianStore is a read store that queries Loki for alert state history.
@ -98,8 +99,12 @@ func (r *LokiHistorianStore) Get(ctx context.Context, query *annotations.ItemQue
} }
} }
logQL, err := historian.BuildLogQuery(buildHistoryQuery(query, accessResources.Dashboards, rule.UID)) logQL, err := historian.BuildLogQuery(buildHistoryQuery(query, accessResources.Dashboards, rule.UID), r.client.MaxQuerySize())
if err != nil { if err != nil {
grafanaErr := errutil.Error{}
if errors.As(err, &grafanaErr) {
return make([]*annotations.ItemDTO, 0), err
}
return make([]*annotations.ItemDTO, 0), ErrLokiStoreInternal.Errorf("failed to build loki query: %w", err) return make([]*annotations.ItemDTO, 0), ErrLokiStoreInternal.Errorf("failed to build loki query: %w", err)
} }

View File

@ -778,6 +778,7 @@ func NewFakeLokiClient() *FakeLokiClient {
ReadPathURL: url, ReadPathURL: url,
Encoder: historian.JsonEncoder{}, Encoder: historian.JsonEncoder{},
MaxQueryLength: 721 * time.Hour, MaxQueryLength: 721 * time.Hour,
MaxQuerySize: 65536,
}, },
metrics: metrics, metrics: metrics,
log: log.New("ngalert.state.historian", "backend", "loki"), log: log.New("ngalert.state.historian", "backend", "loki"),
@ -812,6 +813,10 @@ func (c *FakeLokiClient) RangeQuery(ctx context.Context, query string, from, to,
return res, nil return res, nil
} }
func (c *FakeLokiClient) MaxQuerySize() int {
return c.cfg.MaxQuerySize
}
func TestUseStore(t *testing.T) { func TestUseStore(t *testing.T) {
t.Run("false if state history disabled", func(t *testing.T) { t.Run("false if state history disabled", func(t *testing.T) {
cfg := setting.UnifiedAlertingStateHistorySettings{ cfg := setting.UnifiedAlertingStateHistorySettings{

View File

@ -12,6 +12,7 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana-plugin-sdk-go/data"
"go.opentelemetry.io/otel/trace" "go.opentelemetry.io/otel/trace"
"github.com/grafana/grafana/pkg/apimachinery/errutil"
"github.com/grafana/grafana/pkg/components/simplejson" "github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/infra/tracing"
@ -41,10 +42,30 @@ const (
const defaultQueryRange = 6 * time.Hour const defaultQueryRange = 6 * time.Hour
var (
ErrLokiQueryTooLong = errutil.BadRequest("loki.requestTooLong").MustTemplate(
"Request to Loki exceeded ({{.Public.QuerySize}} bytes) configured maximum size of {{.Public.MaxLimit}} bytes",
errutil.WithPublic("Query for Loki exceeded the configured limit of {{.Public.MaxLimit}} bytes. Remove some filters and try again."),
)
)
func NewErrLokiQueryTooLong(query string, maxLimit int) error {
return ErrLokiQueryTooLong.Build(errutil.TemplateData{
Private: map[string]any{
"query": query,
},
Public: map[string]any{
"MaxLimit": maxLimit,
"QuerySize": len(query),
},
})
}
type remoteLokiClient interface { type remoteLokiClient interface {
Ping(context.Context) error Ping(context.Context) error
Push(context.Context, []Stream) error Push(context.Context, []Stream) error
RangeQuery(ctx context.Context, logQL string, start, end, limit int64) (QueryRes, error) RangeQuery(ctx context.Context, logQL string, start, end, limit int64) (QueryRes, error)
MaxQuerySize() int
} }
// RemoteLokibackend is a state.Historian that records state history to an external Loki instance. // RemoteLokibackend is a state.Historian that records state history to an external Loki instance.
@ -112,7 +133,7 @@ func (h *RemoteLokiBackend) Record(ctx context.Context, rule history_model.RuleM
// Query retrieves state history entries from an external Loki instance and formats the results into a dataframe. // Query retrieves state history entries from an external Loki instance and formats the results into a dataframe.
func (h *RemoteLokiBackend) Query(ctx context.Context, query models.HistoryQuery) (*data.Frame, error) { func (h *RemoteLokiBackend) Query(ctx context.Context, query models.HistoryQuery) (*data.Frame, error) {
logQL, err := BuildLogQuery(query) logQL, err := BuildLogQuery(query, h.client.MaxQuerySize())
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -366,7 +387,9 @@ func isValidOperator(op string) bool {
return false return false
} }
func BuildLogQuery(query models.HistoryQuery) (string, error) { // BuildLogQuery converts models.HistoryQuery and a list of folder UIDs to a Loki query.
// Returns a Loki query or error if log query cannot be constructed. If user-defined query exceeds maximum allowed size returns ErrLokiQueryTooLong
func BuildLogQuery(query models.HistoryQuery, maxQuerySize int) (string, error) {
selectors, err := buildSelectors(query) selectors, err := buildSelectors(query)
if err != nil { if err != nil {
return "", fmt.Errorf("failed to build the provided selectors: %w", err) return "", fmt.Errorf("failed to build the provided selectors: %w", err)
@ -400,6 +423,10 @@ func BuildLogQuery(query models.HistoryQuery) (string, error) {
} }
logQL += labelFilters logQL += labelFilters
if len(logQL) > maxQuerySize {
// if the query is too long even without filter by folders, then fail
return "", NewErrLokiQueryTooLong(logQL, maxQuerySize)
}
return logQL, nil return logQL, nil
} }

View File

@ -42,6 +42,7 @@ type LokiConfig struct {
ExternalLabels map[string]string ExternalLabels map[string]string
Encoder encoder Encoder encoder
MaxQueryLength time.Duration MaxQueryLength time.Duration
MaxQuerySize int
} }
func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig, error) { func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig, error) {
@ -77,6 +78,7 @@ func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig,
TenantID: cfg.LokiTenantID, TenantID: cfg.LokiTenantID,
ExternalLabels: cfg.ExternalLabels, ExternalLabels: cfg.ExternalLabels,
MaxQueryLength: cfg.LokiMaxQueryLength, MaxQueryLength: cfg.LokiMaxQueryLength,
MaxQuerySize: cfg.LokiMaxQuerySize,
// Snappy-compressed protobuf is the default, same goes for Promtail. // Snappy-compressed protobuf is the default, same goes for Promtail.
Encoder: SnappyProtoEncoder{}, Encoder: SnappyProtoEncoder{},
}, nil }, nil
@ -281,6 +283,10 @@ func (c *HttpLokiClient) RangeQuery(ctx context.Context, logQL string, start, en
return result, nil return result, nil
} }
func (c *HttpLokiClient) MaxQuerySize() int {
return c.cfg.MaxQuerySize
}
type QueryRes struct { type QueryRes struct {
Data QueryData `json:"data"` Data QueryData `json:"data"`
} }

View File

@ -8,10 +8,15 @@ import (
"io" "io"
"net/http" "net/http"
"net/url" "net/url"
"strings"
"testing" "testing"
"time" "time"
"github.com/grafana/grafana-plugin-sdk-go/data" "github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/infra/log" "github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing" "github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/client" "github.com/grafana/grafana/pkg/services/ngalert/client"
@ -20,9 +25,6 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models" "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state" "github.com/grafana/grafana/pkg/services/ngalert/state"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model" history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
) )
func TestRemoteLokiBackend(t *testing.T) { func TestRemoteLokiBackend(t *testing.T) {
@ -214,81 +216,110 @@ func TestRemoteLokiBackend(t *testing.T) {
selector, err = NewSelector("label", "invalid", "value") selector, err = NewSelector("label", "invalid", "value")
require.Error(t, err) require.Error(t, err)
}) })
}
t.Run("buildLogQuery", func(t *testing.T) { func TestBuildLogQuery(t *testing.T) {
cases := []struct { maxQuerySize := 110
name string cases := []struct {
query models.HistoryQuery name string
exp string query models.HistoryQuery
}{ exp string
{ expErr error
name: "default includes state history label and orgID label", }{
query: models.HistoryQuery{}, {
exp: `{orgID="0",from="state-history"}`, name: "default includes state history label and orgID label",
query: models.HistoryQuery{},
exp: `{orgID="0",from="state-history"}`,
},
{
name: "adds stream label filter for orgID",
query: models.HistoryQuery{
OrgID: 123,
}, },
{ exp: `{orgID="123",from="state-history"}`,
name: "adds stream label filter for orgID", },
query: models.HistoryQuery{ {
OrgID: 123, name: "filters ruleUID in log line",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
},
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid"`,
},
{
name: "filters dashboardUID in log line",
query: models.HistoryQuery{
OrgID: 123,
DashboardUID: "dash-uid",
},
exp: `{orgID="123",from="state-history"} | json | dashboardUID="dash-uid"`,
},
{
name: "filters panelID in log line",
query: models.HistoryQuery{
OrgID: 123,
PanelID: 456,
},
exp: `{orgID="123",from="state-history"} | json | panelID=456`,
},
{
name: "filters instance labels in log line",
query: models.HistoryQuery{
OrgID: 123,
Labels: map[string]string{
"customlabel": "customvalue",
"labeltwo": "labelvaluetwo",
}, },
exp: `{orgID="123",from="state-history"}`,
}, },
{ exp: `{orgID="123",from="state-history"} | json | labels_customlabel="customvalue" | labels_labeltwo="labelvaluetwo"`,
name: "filters ruleUID in log line", },
query: models.HistoryQuery{ {
OrgID: 123, name: "filters both instance labels + ruleUID",
RuleUID: "rule-uid", query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": "customvalue",
}, },
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid"`,
}, },
{ exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="customvalue"`,
name: "filters dashboardUID in log line", },
query: models.HistoryQuery{ {
OrgID: 123, name: "should return if query does not exceed max limit",
DashboardUID: "dash-uid", query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": strings.Repeat("!", 24),
}, },
exp: `{orgID="123",from="state-history"} | json | dashboardUID="dash-uid"`,
}, },
{ exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="!!!!!!!!!!!!!!!!!!!!!!!!"`,
name: "filters panelID in log line", },
query: models.HistoryQuery{ {
OrgID: 123, name: "should return error if query is too long",
PanelID: 456, query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": strings.Repeat("!", 25),
}, },
exp: `{orgID="123",from="state-history"} | json | panelID=456`,
}, },
{ expErr: ErrLokiQueryTooLong,
name: "filters instance labels in log line", },
query: models.HistoryQuery{ }
OrgID: 123,
Labels: map[string]string{
"customlabel": "customvalue",
"labeltwo": "labelvaluetwo",
},
},
exp: `{orgID="123",from="state-history"} | json | labels_customlabel="customvalue" | labels_labeltwo="labelvaluetwo"`,
},
{
name: "filters both instance labels + ruleUID",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": "customvalue",
},
},
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="customvalue"`,
},
}
for _, tc := range cases { for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
res, err := BuildLogQuery(tc.query) res, err := BuildLogQuery(tc.query, maxQuerySize)
require.NoError(t, err) if tc.expErr != nil {
require.Equal(t, tc.exp, res) require.ErrorIs(t, err, tc.expErr)
}) return
} }
}) require.LessOrEqual(t, len(res), maxQuerySize)
require.NoError(t, err)
require.Equal(t, tc.exp, res)
})
}
} }
func TestMerge(t *testing.T) { func TestMerge(t *testing.T) {

View File

@ -64,6 +64,7 @@ const (
stateHistoryDefaultEnabled = true stateHistoryDefaultEnabled = true
lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki
defaultRecordingRequestTimeout = 10 * time.Second defaultRecordingRequestTimeout = 10 * time.Second
lokiDefaultMaxQuerySize = 65536 // 64kb
) )
type UnifiedAlertingSettings struct { type UnifiedAlertingSettings struct {
@ -160,6 +161,7 @@ type UnifiedAlertingStateHistorySettings struct {
LokiBasicAuthPassword string LokiBasicAuthPassword string
LokiBasicAuthUsername string LokiBasicAuthUsername string
LokiMaxQueryLength time.Duration LokiMaxQueryLength time.Duration
LokiMaxQuerySize int
MultiPrimary string MultiPrimary string
MultiSecondaries []string MultiSecondaries []string
ExternalLabels map[string]string ExternalLabels map[string]string
@ -404,6 +406,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""), LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""),
LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""), LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""),
LokiMaxQueryLength: stateHistory.Key("loki_max_query_length").MustDuration(lokiDefaultMaxQueryLength), LokiMaxQueryLength: stateHistory.Key("loki_max_query_length").MustDuration(lokiDefaultMaxQueryLength),
LokiMaxQuerySize: stateHistory.Key("loki_max_query_size").MustInt(lokiDefaultMaxQuerySize),
MultiPrimary: stateHistory.Key("primary").MustString(""), MultiPrimary: stateHistory.Key("primary").MustString(""),
MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","), MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
ExternalLabels: stateHistoryLabels.KeysHash(), ExternalLabels: stateHistoryLabels.KeysHash(),