CloudWatch Logs: Add retry strategy for hitting max concurrent queries (#39290)

* Add error passing and retry strategy

* Change generic error to specific one

* Make the error more generic

* Refactor retry strategy

* Add retry that handles multiple queries

* Rollback some backend changes

* Remove simple retry strategy

* Add comments

* Add tests

* Small test fixes

* Add log timeout config

* Fix tests

* Fix tests

* Add validation

* Remove commented code and add comment

* Fix snapshots

* Remove unnecessary cast
This commit is contained in:
Andrej Ocenas
2021-11-17 21:46:13 +01:00
committed by GitHub
parent 3dd73387fa
commit e237ff20a9
10 changed files with 659 additions and 58 deletions

View File

@@ -14,9 +14,22 @@ import (
"github.com/grafana/grafana-plugin-sdk-go/backend"
"github.com/grafana/grafana-plugin-sdk-go/data"
"github.com/grafana/grafana/pkg/components/simplejson"
"github.com/grafana/grafana/pkg/util/errutil"
"golang.org/x/sync/errgroup"
)
var LimitExceededException = "LimitExceededException"
type AWSError struct {
Code string
Message string
Payload map[string]string
}
func (e *AWSError) Error() string {
return fmt.Sprintf("%s: %s", e.Code, e.Message)
}
func (e *cloudWatchExecutor) executeLogActions(ctx context.Context, req *backend.QueryDataRequest) (*backend.QueryDataResponse, error) {
resp := backend.NewQueryDataResponse()
@@ -33,6 +46,13 @@ func (e *cloudWatchExecutor) executeLogActions(ctx context.Context, req *backend
eg.Go(func() error {
dataframe, err := e.executeLogAction(ectx, model, query, req.PluginContext)
if err != nil {
var AWSError *AWSError
if errors.As(err, &AWSError) {
resultChan <- backend.Responses{
query.RefID: backend.DataResponse{Frames: data.Frames{}, Error: AWSError},
}
return nil
}
return err
}
@@ -56,6 +76,7 @@ func (e *cloudWatchExecutor) executeLogActions(ctx context.Context, req *backend
for refID, response := range result {
respD := resp.Responses[refID]
respD.Frames = response.Frames
respD.Error = response.Error
resp.Responses[refID] = respD
}
}
@@ -96,7 +117,7 @@ func (e *cloudWatchExecutor) executeLogAction(ctx context.Context, model *simple
data, err = e.handleGetLogEvents(ctx, logsClient, model)
}
if err != nil {
return nil, err
return nil, errutil.Wrapf(err, "failed to execute log action with subtype: %s", subType)
}
return data, nil
@@ -224,6 +245,11 @@ func (e *cloudWatchExecutor) handleStartQuery(ctx context.Context, logsClient cl
model *simplejson.Json, timeRange backend.TimeRange, refID string) (*data.Frame, error) {
startQueryResponse, err := e.executeStartQuery(ctx, logsClient, model, timeRange)
if err != nil {
var awsErr awserr.Error
if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" {
plog.Debug("executeStartQuery limit exceeded", "err", awsErr)
return nil, &AWSError{Code: LimitExceededException, Message: err.Error()}
}
return nil, err
}