Alerting: Add max limit for Loki query size in state history API (#89646)

* add setting for query limit

* update BuildLogQuery to return error if limit is exceeded

* move tests for BuildLogQuery to separate suite
This commit is contained in:
Yuri Tseretyan 2024-06-25 09:20:38 -04:00 committed by GitHub
parent 9a3477dd11
commit 4a5aab54a5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 159 additions and 70 deletions

View File

@ -1387,6 +1387,12 @@ loki_basic_auth_password =
# Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value.
loki_max_query_length = 721h
# For "loki" only.
# Maximum size in bytes for queries sent to Loki. This limit is applied to user provided filters as well as system defined ones, e.g. applied by access control.
# If filter exceeds the limit, API returns error with code "alerting.state-history.loki.requestTooLong".
# Default is 64kb
loki_max_query_size = 65536
[unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided.

View File

@ -1379,6 +1379,12 @@ disabled_labels =
# Optional max query length for queries sent to Loki. Default is 721h which matches the default Loki value.
; loki_max_query_length = 360h
# For "loki" only.
# Maximum size in bytes for queries sent to Loki. This limit is applied to user provided filters as well as system defined ones, e.g. applied by access control.
# If filter exceeds the limit, API returns error with code "alerting.state-history.loki.requestTooLong".
# Default is 64kb
;loki_max_query_size = 65536
[unified_alerting.state_history.external_labels]
# Optional extra labels to attach to outbound state history records or log streams.
# Any number of label key-value-pairs can be provided.

View File

@ -45,6 +45,7 @@ var (
type lokiQueryClient interface {
RangeQuery(ctx context.Context, query string, start, end, limit int64) (historian.QueryRes, error)
MaxQuerySize() int
}
// LokiHistorianStore is a read store that queries Loki for alert state history.
@ -98,8 +99,12 @@ func (r *LokiHistorianStore) Get(ctx context.Context, query *annotations.ItemQue
}
}
logQL, err := historian.BuildLogQuery(buildHistoryQuery(query, accessResources.Dashboards, rule.UID))
logQL, err := historian.BuildLogQuery(buildHistoryQuery(query, accessResources.Dashboards, rule.UID), r.client.MaxQuerySize())
if err != nil {
grafanaErr := errutil.Error{}
if errors.As(err, &grafanaErr) {
return make([]*annotations.ItemDTO, 0), err
}
return make([]*annotations.ItemDTO, 0), ErrLokiStoreInternal.Errorf("failed to build loki query: %w", err)
}

View File

@ -778,6 +778,7 @@ func NewFakeLokiClient() *FakeLokiClient {
ReadPathURL: url,
Encoder: historian.JsonEncoder{},
MaxQueryLength: 721 * time.Hour,
MaxQuerySize: 65536,
},
metrics: metrics,
log: log.New("ngalert.state.historian", "backend", "loki"),
@ -812,6 +813,10 @@ func (c *FakeLokiClient) RangeQuery(ctx context.Context, query string, from, to,
return res, nil
}
func (c *FakeLokiClient) MaxQuerySize() int {
return c.cfg.MaxQuerySize
}
func TestUseStore(t *testing.T) {
t.Run("false if state history disabled", func(t *testing.T) {
cfg := setting.UnifiedAlertingStateHistorySettings{

View File

@ -12,6 +12,7 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/data"
"go.opentelemetry.io/otel/trace"
"github.com/grafana/grafana/pkg/apimachinery/errutil"
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
@ -41,10 +42,30 @@ const (
const defaultQueryRange = 6 * time.Hour
var (
ErrLokiQueryTooLong = errutil.BadRequest("loki.requestTooLong").MustTemplate(
"Request to Loki exceeded ({{.Public.QuerySize}} bytes) configured maximum size of {{.Public.MaxLimit}} bytes",
errutil.WithPublic("Query for Loki exceeded the configured limit of {{.Public.MaxLimit}} bytes. Remove some filters and try again."),
)
)
func NewErrLokiQueryTooLong(query string, maxLimit int) error {
return ErrLokiQueryTooLong.Build(errutil.TemplateData{
Private: map[string]any{
"query": query,
},
Public: map[string]any{
"MaxLimit": maxLimit,
"QuerySize": len(query),
},
})
}
type remoteLokiClient interface {
Ping(context.Context) error
Push(context.Context, []Stream) error
RangeQuery(ctx context.Context, logQL string, start, end, limit int64) (QueryRes, error)
MaxQuerySize() int
}
// RemoteLokibackend is a state.Historian that records state history to an external Loki instance.
@ -112,7 +133,7 @@ func (h *RemoteLokiBackend) Record(ctx context.Context, rule history_model.RuleM
// Query retrieves state history entries from an external Loki instance and formats the results into a dataframe.
func (h *RemoteLokiBackend) Query(ctx context.Context, query models.HistoryQuery) (*data.Frame, error) {
logQL, err := BuildLogQuery(query)
logQL, err := BuildLogQuery(query, h.client.MaxQuerySize())
if err != nil {
return nil, err
}
@ -366,7 +387,9 @@ func isValidOperator(op string) bool {
return false
}
func BuildLogQuery(query models.HistoryQuery) (string, error) {
// BuildLogQuery converts models.HistoryQuery and a list of folder UIDs to a Loki query.
// Returns a Loki query or error if log query cannot be constructed. If user-defined query exceeds maximum allowed size returns ErrLokiQueryTooLong
func BuildLogQuery(query models.HistoryQuery, maxQuerySize int) (string, error) {
selectors, err := buildSelectors(query)
if err != nil {
return "", fmt.Errorf("failed to build the provided selectors: %w", err)
@ -400,6 +423,10 @@ func BuildLogQuery(query models.HistoryQuery) (string, error) {
}
logQL += labelFilters
if len(logQL) > maxQuerySize {
// if the query is too long even without filter by folders, then fail
return "", NewErrLokiQueryTooLong(logQL, maxQuerySize)
}
return logQL, nil
}

View File

@ -42,6 +42,7 @@ type LokiConfig struct {
ExternalLabels map[string]string
Encoder encoder
MaxQueryLength time.Duration
MaxQuerySize int
}
func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig, error) {
@ -77,6 +78,7 @@ func NewLokiConfig(cfg setting.UnifiedAlertingStateHistorySettings) (LokiConfig,
TenantID: cfg.LokiTenantID,
ExternalLabels: cfg.ExternalLabels,
MaxQueryLength: cfg.LokiMaxQueryLength,
MaxQuerySize: cfg.LokiMaxQuerySize,
// Snappy-compressed protobuf is the default, same goes for Promtail.
Encoder: SnappyProtoEncoder{},
}, nil
@ -281,6 +283,10 @@ func (c *HttpLokiClient) RangeQuery(ctx context.Context, logQL string, start, en
return result, nil
}
func (c *HttpLokiClient) MaxQuerySize() int {
return c.cfg.MaxQuerySize
}
type QueryRes struct {
Data QueryData `json:"data"`
}

View File

@ -8,10 +8,15 @@ import (
"io"
"net/http"
"net/url"
"strings"
"testing"
"time"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
"github.com/grafana/grafana/pkg/infra/log"
"github.com/grafana/grafana/pkg/infra/tracing"
"github.com/grafana/grafana/pkg/services/ngalert/client"
@ -20,9 +25,6 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/state"
history_model "github.com/grafana/grafana/pkg/services/ngalert/state/historian/model"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/require"
)
func TestRemoteLokiBackend(t *testing.T) {
@ -214,81 +216,110 @@ func TestRemoteLokiBackend(t *testing.T) {
selector, err = NewSelector("label", "invalid", "value")
require.Error(t, err)
})
}
t.Run("buildLogQuery", func(t *testing.T) {
cases := []struct {
name string
query models.HistoryQuery
exp string
}{
{
name: "default includes state history label and orgID label",
query: models.HistoryQuery{},
exp: `{orgID="0",from="state-history"}`,
func TestBuildLogQuery(t *testing.T) {
maxQuerySize := 110
cases := []struct {
name string
query models.HistoryQuery
exp string
expErr error
}{
{
name: "default includes state history label and orgID label",
query: models.HistoryQuery{},
exp: `{orgID="0",from="state-history"}`,
},
{
name: "adds stream label filter for orgID",
query: models.HistoryQuery{
OrgID: 123,
},
{
name: "adds stream label filter for orgID",
query: models.HistoryQuery{
OrgID: 123,
exp: `{orgID="123",from="state-history"}`,
},
{
name: "filters ruleUID in log line",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
},
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid"`,
},
{
name: "filters dashboardUID in log line",
query: models.HistoryQuery{
OrgID: 123,
DashboardUID: "dash-uid",
},
exp: `{orgID="123",from="state-history"} | json | dashboardUID="dash-uid"`,
},
{
name: "filters panelID in log line",
query: models.HistoryQuery{
OrgID: 123,
PanelID: 456,
},
exp: `{orgID="123",from="state-history"} | json | panelID=456`,
},
{
name: "filters instance labels in log line",
query: models.HistoryQuery{
OrgID: 123,
Labels: map[string]string{
"customlabel": "customvalue",
"labeltwo": "labelvaluetwo",
},
exp: `{orgID="123",from="state-history"}`,
},
{
name: "filters ruleUID in log line",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
exp: `{orgID="123",from="state-history"} | json | labels_customlabel="customvalue" | labels_labeltwo="labelvaluetwo"`,
},
{
name: "filters both instance labels + ruleUID",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": "customvalue",
},
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid"`,
},
{
name: "filters dashboardUID in log line",
query: models.HistoryQuery{
OrgID: 123,
DashboardUID: "dash-uid",
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="customvalue"`,
},
{
name: "should return if query does not exceed max limit",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": strings.Repeat("!", 24),
},
exp: `{orgID="123",from="state-history"} | json | dashboardUID="dash-uid"`,
},
{
name: "filters panelID in log line",
query: models.HistoryQuery{
OrgID: 123,
PanelID: 456,
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="!!!!!!!!!!!!!!!!!!!!!!!!"`,
},
{
name: "should return error if query is too long",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": strings.Repeat("!", 25),
},
exp: `{orgID="123",from="state-history"} | json | panelID=456`,
},
{
name: "filters instance labels in log line",
query: models.HistoryQuery{
OrgID: 123,
Labels: map[string]string{
"customlabel": "customvalue",
"labeltwo": "labelvaluetwo",
},
},
exp: `{orgID="123",from="state-history"} | json | labels_customlabel="customvalue" | labels_labeltwo="labelvaluetwo"`,
},
{
name: "filters both instance labels + ruleUID",
query: models.HistoryQuery{
OrgID: 123,
RuleUID: "rule-uid",
Labels: map[string]string{
"customlabel": "customvalue",
},
},
exp: `{orgID="123",from="state-history"} | json | ruleUID="rule-uid" | labels_customlabel="customvalue"`,
},
}
expErr: ErrLokiQueryTooLong,
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
res, err := BuildLogQuery(tc.query)
require.NoError(t, err)
require.Equal(t, tc.exp, res)
})
}
})
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
res, err := BuildLogQuery(tc.query, maxQuerySize)
if tc.expErr != nil {
require.ErrorIs(t, err, tc.expErr)
return
}
require.LessOrEqual(t, len(res), maxQuerySize)
require.NoError(t, err)
require.Equal(t, tc.exp, res)
})
}
}
func TestMerge(t *testing.T) {

View File

@ -64,6 +64,7 @@ const (
stateHistoryDefaultEnabled = true
lokiDefaultMaxQueryLength = 721 * time.Hour // 30d1h, matches the default value in Loki
defaultRecordingRequestTimeout = 10 * time.Second
lokiDefaultMaxQuerySize = 65536 // 64kb
)
type UnifiedAlertingSettings struct {
@ -160,6 +161,7 @@ type UnifiedAlertingStateHistorySettings struct {
LokiBasicAuthPassword string
LokiBasicAuthUsername string
LokiMaxQueryLength time.Duration
LokiMaxQuerySize int
MultiPrimary string
MultiSecondaries []string
ExternalLabels map[string]string
@ -404,6 +406,7 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
LokiBasicAuthUsername: stateHistory.Key("loki_basic_auth_username").MustString(""),
LokiBasicAuthPassword: stateHistory.Key("loki_basic_auth_password").MustString(""),
LokiMaxQueryLength: stateHistory.Key("loki_max_query_length").MustDuration(lokiDefaultMaxQueryLength),
LokiMaxQuerySize: stateHistory.Key("loki_max_query_size").MustInt(lokiDefaultMaxQuerySize),
MultiPrimary: stateHistory.Key("primary").MustString(""),
MultiSecondaries: splitTrim(stateHistory.Key("secondaries").MustString(""), ","),
ExternalLabels: stateHistoryLabels.KeysHash(),