CloudWatch: Logs queries should retry on throttling errors (#92535)

CloudWatch: logs queries should retry on throttling errors
This commit is contained in:
Isabella Siu 2024-08-30 13:55:00 -04:00 committed by GitHub
parent d6f871490e
commit 2d10068714
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 67 additions and 1 deletion

View File

@ -25,6 +25,7 @@ import (
const ( const (
limitExceededException = "LimitExceededException" limitExceededException = "LimitExceededException"
throttlingException = "ThrottlingException"
defaultEventLimit = int64(10) defaultEventLimit = int64(10)
defaultLogGroupLimit = int64(50) defaultLogGroupLimit = int64(50)
logIdentifierInternal = "__log__grafana_internal__" logIdentifierInternal = "__log__grafana_internal__"
@ -233,6 +234,9 @@ func (e *cloudWatchExecutor) executeStartQuery(ctx context.Context, logsClient c
if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" { if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" {
e.logger.FromContext(ctx).Debug("ExecuteStartQuery limit exceeded", "err", awsErr) e.logger.FromContext(ctx).Debug("ExecuteStartQuery limit exceeded", "err", awsErr)
err = &AWSError{Code: limitExceededException, Message: err.Error()} err = &AWSError{Code: limitExceededException, Message: err.Error()}
} else if errors.As(err, &awsErr) && awsErr.Code() == "ThrottlingException" {
e.logger.FromContext(ctx).Debug("ExecuteStartQuery rate exceeded", "err", awsErr)
err = &AWSError{Code: throttlingException, Message: err.Error()}
} }
err = errorsource.DownstreamError(err, false) err = errorsource.DownstreamError(err, false)
} }

View File

@ -208,6 +208,53 @@ describe('CloudWatchLogsQueryRunner', () => {
}); });
}); });
// Verifies that a StartQuery throttling error is retried and the query still resolves with results.
it('should call getQueryResults until the query returns even if it the startQuery gets a throttling error from aws', async () => {
  // Builds a matcher for a request containing at least one target with the given subtype.
  const requestWithSubtype = (subtype: string) =>
    expect.objectContaining({
      targets: expect.arrayContaining([expect.objectContaining({ subtype })]),
    });

  const { runner } = setupMockedLogsQueryRunner();
  const request: DataQueryRequest<CloudWatchLogsQuery> = { ...LogsRequestMock, targets: rawLogQueriesStub };

  // Scripted responses, in call order: throttled StartQuery, successful StartQuery, query results.
  const queryMock = jest.fn();
  queryMock.mockReturnValueOnce(of(startQueryErrorWhenThrottlingResponseStub));
  queryMock.mockReturnValueOnce(of(startQuerySuccessResponseStub));
  queryMock.mockReturnValueOnce(of(getQuerySuccessResponseStub));

  const results = await lastValueFrom(runner.handleLogQueries(rawLogQueriesStub, request, queryMock));

  expect(queryMock).toHaveBeenCalledTimes(3);

  // Call 1 is the initial StartQuery (throttled), call 2 is the StartQuery retry,
  // and call 3 fetches the results once the retry has succeeded.
  const expectedSubtypes = ['StartQuery', 'StartQuery', 'GetQueryResults'];
  expectedSubtypes.forEach((subtype, index) => {
    expect(queryMock).toHaveBeenNthCalledWith(index + 1, requestWithSubtype(subtype));
  });

  // The throttling error must not leak into the final response.
  expect(results).toEqual({ ...getQuerySuccessResponseStub, errors: [], key: 'test-key' });
});
it('should return an error if it timesout before the start queries can get past a rate limiting error', async () => { it('should return an error if it timesout before the start queries can get past a rate limiting error', async () => {
const { runner } = setupMockedLogsQueryRunner(); const { runner } = setupMockedLogsQueryRunner();
// first time timeout is called it will not be timed out, second time it will be timed out // first time timeout is called it will not be timed out, second time it will be timed out
@ -469,6 +516,18 @@ const startQueryErrorWhenRateLimitedResponseStub = {
], ],
}; };
// Stubbed query response for a StartQuery call that failed with an AWS ThrottlingException:
// no data frames and a single error entry for refId 'A'. The message contains the
// 'ThrottlingException' substring.
const startQueryErrorWhenThrottlingResponseStub = {
data: [],
errors: [
{
refId: 'A',
message:
'failed to execute log action with subtype: StartQuery: ThrottlingException: ThrottlingException: Rate exceeded',
status: 500,
},
],
};
const startQueryErrorWhenBadSyntaxResponseStub = { const startQueryErrorWhenBadSyntaxResponseStub = {
data: [], data: [],
state: 'Error', state: 'Error',

View File

@ -94,7 +94,10 @@ function splitErrorsData(errors: DataQueryError[]) {
const refIdsForRequestsToRetry: string[] = []; const refIdsForRequestsToRetry: string[] = [];
const errorsNotToRetry: DataQueryError[] = []; const errorsNotToRetry: DataQueryError[] = [];
errors.map((err) => { errors.map((err) => {
if (err?.message?.includes('LimitExceededException') && err.refId) { if (
err?.refId &&
(err.message?.includes('LimitExceededException') || err.message?.includes('ThrottlingException'))
) {
refIdsForRequestsToRetry.push(err.refId); refIdsForRequestsToRetry.push(err.refId);
} else { } else {
errorsNotToRetry.push(err); errorsNotToRetry.push(err);