diff --git a/pkg/tsdb/cloudwatch/log_actions.go b/pkg/tsdb/cloudwatch/log_actions.go index d4097f4fa85..28616531d5b 100644 --- a/pkg/tsdb/cloudwatch/log_actions.go +++ b/pkg/tsdb/cloudwatch/log_actions.go @@ -25,6 +25,7 @@ import ( const ( limitExceededException = "LimitExceededException" + throttlingException = "ThrottlingException" defaultEventLimit = int64(10) defaultLogGroupLimit = int64(50) logIdentifierInternal = "__log__grafana_internal__" @@ -233,6 +234,9 @@ func (e *cloudWatchExecutor) executeStartQuery(ctx context.Context, logsClient c if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" { e.logger.FromContext(ctx).Debug("ExecuteStartQuery limit exceeded", "err", awsErr) err = &AWSError{Code: limitExceededException, Message: err.Error()} + } else if errors.As(err, &awsErr) && awsErr.Code() == "ThrottlingException" { + e.logger.FromContext(ctx).Debug("ExecuteStartQuery rate exceeded", "err", awsErr) + err = &AWSError{Code: throttlingException, Message: err.Error()} } err = errorsource.DownstreamError(err, false) } diff --git a/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchLogsQueryRunner.test.ts b/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchLogsQueryRunner.test.ts index 21a964561e0..c809ef0f794 100644 --- a/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchLogsQueryRunner.test.ts +++ b/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchLogsQueryRunner.test.ts @@ -208,6 +208,53 @@ describe('CloudWatchLogsQueryRunner', () => { }); }); + it('should call getQueryResults until the query returns even if it the startQuery gets a throttling error from aws', async () => { + const { runner } = setupMockedLogsQueryRunner(); + + const options: DataQueryRequest = { + ...LogsRequestMock, + targets: rawLogQueriesStub, + }; + + const queryFn = jest + .fn() + .mockReturnValueOnce(of(startQueryErrorWhenThrottlingResponseStub)) + .mockReturnValueOnce(of(startQuerySuccessResponseStub)) + .mockReturnValueOnce(of(getQuerySuccessResponseStub)); + + const response = runner.handleLogQueries(rawLogQueriesStub, options, queryFn); + const results = await lastValueFrom(response); + expect(queryFn).toHaveBeenCalledTimes(3); + + // first call + expect(queryFn).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ + targets: expect.arrayContaining([expect.objectContaining({ subtype: 'StartQuery' })]), + }) + ); + // we retry because the first call failed with the rate limiting error + expect(queryFn).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ + targets: expect.arrayContaining([expect.objectContaining({ subtype: 'StartQuery' })]), + }) + ); + // we get results because second call was successful + expect(queryFn).toHaveBeenNthCalledWith( + 3, + expect.objectContaining({ + targets: expect.arrayContaining([expect.objectContaining({ subtype: 'GetQueryResults' })]), + }) + ); + + expect(results).toEqual({ + ...getQuerySuccessResponseStub, + errors: [], + key: 'test-key', + }); + }); + it('should return an error if it timesout before the start queries can get past a rate limiting error', async () => { const { runner } = setupMockedLogsQueryRunner(); // first time timeout is called it will not be timed out, second time it will be timed out @@ -469,6 +516,18 @@ const startQueryErrorWhenRateLimitedResponseStub = { ], }; +const startQueryErrorWhenThrottlingResponseStub = { + data: [], + errors: [ + { + refId: 'A', + message: + 'failed to execute log action with subtype: StartQuery: ThrottlingException: ThrottlingException: Rate exceeded', + status: 500, + }, + ], +}; + const startQueryErrorWhenBadSyntaxResponseStub = { data: [], state: 'Error', diff --git a/public/app/plugins/datasource/cloudwatch/utils/logsRetry.ts b/public/app/plugins/datasource/cloudwatch/utils/logsRetry.ts index fa7d3e51582..deae2449531 100644 --- a/public/app/plugins/datasource/cloudwatch/utils/logsRetry.ts +++ b/public/app/plugins/datasource/cloudwatch/utils/logsRetry.ts @@ -94,7 +94,10 @@ function splitErrorsData(errors: DataQueryError[]) { const refIdsForRequestsToRetry: string[] = []; const errorsNotToRetry: DataQueryError[] = []; errors.map((err) => { - if (err?.message?.includes('LimitExceededException') && err.refId) { + if ( + err?.refId && + (err.message?.includes('LimitExceededException') || err.message?.includes('ThrottlingException')) + ) { refIdsForRequestsToRetry.push(err.refId); } else { errorsNotToRetry.push(err);