mirror of
https://github.com/grafana/grafana.git
synced 2025-02-25 18:55:37 -06:00
CloudWatch: Logs queries should retry on throttling errors (#92535)
CloudWatch: logs queries should retry on throttling errors
This commit is contained in:
parent
d6f871490e
commit
2d10068714
@ -25,6 +25,7 @@ import (
|
|||||||
|
|
||||||
const (
|
const (
|
||||||
limitExceededException = "LimitExceededException"
|
limitExceededException = "LimitExceededException"
|
||||||
|
throttlingException = "ThrottlingException"
|
||||||
defaultEventLimit = int64(10)
|
defaultEventLimit = int64(10)
|
||||||
defaultLogGroupLimit = int64(50)
|
defaultLogGroupLimit = int64(50)
|
||||||
logIdentifierInternal = "__log__grafana_internal__"
|
logIdentifierInternal = "__log__grafana_internal__"
|
||||||
@ -233,6 +234,9 @@ func (e *cloudWatchExecutor) executeStartQuery(ctx context.Context, logsClient c
|
|||||||
if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" {
|
if errors.As(err, &awsErr) && awsErr.Code() == "LimitExceededException" {
|
||||||
e.logger.FromContext(ctx).Debug("ExecuteStartQuery limit exceeded", "err", awsErr)
|
e.logger.FromContext(ctx).Debug("ExecuteStartQuery limit exceeded", "err", awsErr)
|
||||||
err = &AWSError{Code: limitExceededException, Message: err.Error()}
|
err = &AWSError{Code: limitExceededException, Message: err.Error()}
|
||||||
|
} else if errors.As(err, &awsErr) && awsErr.Code() == "ThrottlingException" {
|
||||||
|
e.logger.FromContext(ctx).Debug("ExecuteStartQuery rate exceeded", "err", awsErr)
|
||||||
|
err = &AWSError{Code: throttlingException, Message: err.Error()}
|
||||||
}
|
}
|
||||||
err = errorsource.DownstreamError(err, false)
|
err = errorsource.DownstreamError(err, false)
|
||||||
}
|
}
|
||||||
|
@ -208,6 +208,53 @@ describe('CloudWatchLogsQueryRunner', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Checks that a StartQuery answered with an AWS throttling error is issued again,
// and that GetQueryResults is requested once the retried StartQuery succeeds.
it('should call getQueryResults until the query returns even if it the startQuery gets a throttling error from aws', async () => {
  const { runner } = setupMockedLogsQueryRunner();
  const options: DataQueryRequest<CloudWatchLogsQuery> = { ...LogsRequestMock, targets: rawLogQueriesStub };

  // Mocked responses are handed out in call order:
  // throttled StartQuery -> successful StartQuery -> successful GetQueryResults.
  const queryFn = jest.fn();
  [startQueryErrorWhenThrottlingResponseStub, startQuerySuccessResponseStub, getQuerySuccessResponseStub].forEach(
    (stub) => {
      queryFn.mockReturnValueOnce(of(stub));
    }
  );

  const results = await lastValueFrom(runner.handleLogQueries(rawLogQueriesStub, options, queryFn));

  // Matcher for a request that has at least one target with the given subtype.
  const requestWithSubtype = (subtype: string) =>
    expect.objectContaining({
      targets: expect.arrayContaining([expect.objectContaining({ subtype })]),
    });

  // 1st call: the initial StartQuery.
  // 2nd call: the retry, because the first call failed with the rate limiting error.
  // 3rd call: fetching results, because the second StartQuery was successful.
  const expectedSubtypes = ['StartQuery', 'StartQuery', 'GetQueryResults'];

  expect(queryFn).toHaveBeenCalledTimes(expectedSubtypes.length);
  expectedSubtypes.forEach((subtype, index) => {
    expect(queryFn).toHaveBeenNthCalledWith(index + 1, requestWithSubtype(subtype));
  });

  expect(results).toEqual({
    ...getQuerySuccessResponseStub,
    errors: [],
    key: 'test-key',
  });
});
|
||||||
|
|
||||||
it('should return an error if it timesout before the start queries can get past a rate limiting error', async () => {
|
it('should return an error if it timesout before the start queries can get past a rate limiting error', async () => {
|
||||||
const { runner } = setupMockedLogsQueryRunner();
|
const { runner } = setupMockedLogsQueryRunner();
|
||||||
// first time timeout is called it will not be timed out, second time it will be timed out
|
// first time timeout is called it will not be timed out, second time it will be timed out
|
||||||
@ -469,6 +516,18 @@ const startQueryErrorWhenRateLimitedResponseStub = {
|
|||||||
],
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Stubbed StartQuery response: no data frames and a single error for refId 'A'
// whose message carries the AWS "ThrottlingException" marker.
const startQueryErrorWhenThrottlingResponseStub = {
  data: [],
  errors: [
    {
      refId: 'A',
      message: 'failed to execute log action with subtype: StartQuery: ThrottlingException: ThrottlingException: Rate exceeded',
      status: 500,
    },
  ],
};
|
||||||
|
|
||||||
const startQueryErrorWhenBadSyntaxResponseStub = {
|
const startQueryErrorWhenBadSyntaxResponseStub = {
|
||||||
data: [],
|
data: [],
|
||||||
state: 'Error',
|
state: 'Error',
|
||||||
|
@ -94,7 +94,10 @@ function splitErrorsData(errors: DataQueryError[]) {
|
|||||||
const refIdsForRequestsToRetry: string[] = [];
|
const refIdsForRequestsToRetry: string[] = [];
|
||||||
const errorsNotToRetry: DataQueryError[] = [];
|
const errorsNotToRetry: DataQueryError[] = [];
|
||||||
errors.map((err) => {
|
errors.map((err) => {
|
||||||
if (err?.message?.includes('LimitExceededException') && err.refId) {
|
if (
|
||||||
|
err?.refId &&
|
||||||
|
(err.message?.includes('LimitExceededException') || err.message?.includes('ThrottlingException'))
|
||||||
|
) {
|
||||||
refIdsForRequestsToRetry.push(err.refId);
|
refIdsForRequestsToRetry.push(err.refId);
|
||||||
} else {
|
} else {
|
||||||
errorsNotToRetry.push(err);
|
errorsNotToRetry.push(err);
|
||||||
|
Loading…
Reference in New Issue
Block a user