Cloudwatch Metrics: Adjust error handling (#79911)

This commit is contained in:
Ida Štambuk 2024-01-15 17:19:26 +01:00 committed by GitHub
parent 406ace27c0
commit d3a89a28fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 263 additions and 198 deletions

View File

@ -3,6 +3,7 @@ package cloudwatch
import (
"context"
"fmt"
"regexp"
"github.com/grafana/grafana-plugin-sdk-go/backend"
"golang.org/x/sync/errgroup"
@ -121,6 +122,7 @@ func (e *cloudWatchExecutor) executeTimeSeriesQuery(ctx context.Context, logger
Error: fmt.Errorf("metric request error: %q", err),
}
resultChan <- &responseWrapper{
RefId: getQueryRefIdFromErrorString(err.Error(), requestQueries),
DataResponse: &dataResponse,
}
}
@ -132,3 +134,17 @@ func (e *cloudWatchExecutor) executeTimeSeriesQuery(ctx context.Context, logger
return resp, nil
}
func getQueryRefIdFromErrorString(err string, queries []*models.CloudWatchQuery) string {
// error can be in format "Error in expression 'test': Invalid syntax"
// so we can find the query id or ref id between the quotations
erroredRefId := ""
for _, query := range queries {
if regexp.MustCompile(`'`+query.RefId+`':`).MatchString(err) || regexp.MustCompile(`'`+query.Id+`':`).MatchString(err) {
erroredRefId = query.RefId
}
}
// if errorRefId is empty, it means the error concerns all queries (error metric limit exceeded, for example)
return erroredRefId
}

View File

@ -1,7 +1,6 @@
import { of, throwError } from 'rxjs';
import { of } from 'rxjs';
import { CustomVariableModel, DataQueryError, DataQueryRequest, DataSourceInstanceSettings } from '@grafana/data';
import { BackendDataSourceResponse, toDataQueryResponse } from '@grafana/runtime';
import { CustomVariableModel, DataQueryRequest, DataQueryResponse, DataSourceInstanceSettings } from '@grafana/data';
import { CloudWatchMetricsQueryRunner } from '../query-runner/CloudWatchMetricsQueryRunner';
import { CloudWatchJsonData, CloudWatchQuery } from '../types';
@ -10,23 +9,17 @@ import { CloudWatchSettings, setupMockedTemplateService } from './CloudWatchData
import { TimeRangeMock } from './timeRange';
export function setupMockedMetricsQueryRunner({
data = {
results: {},
},
response = { data: [] },
variables,
errorResponse,
instanceSettings = CloudWatchSettings,
}: {
data?: BackendDataSourceResponse;
response?: DataQueryResponse;
variables?: CustomVariableModel[];
errorResponse?: DataQueryError;
instanceSettings?: DataSourceInstanceSettings<CloudWatchJsonData>;
} = {}) {
const templateService = setupMockedTemplateService(variables);
const queryMock = errorResponse
? jest.fn().mockImplementation(() => throwError(errorResponse))
: jest.fn().mockReturnValue(of(toDataQueryResponse({ data })));
const queryMock = jest.fn().mockImplementation(() => of(response));
const runner = new CloudWatchMetricsQueryRunner(instanceSettings, templateService);
const request: DataQueryRequest<CloudWatchQuery> = {

View File

@ -2,7 +2,7 @@ import { of } from 'rxjs';
import { CustomVariableModel, getFrameDisplayName, VariableHide } from '@grafana/data';
import { dateTime } from '@grafana/data/src/datetime/moment_wrapper';
import { BackendDataSourceResponse } from '@grafana/runtime';
import { toDataQueryResponse } from '@grafana/runtime';
import * as redux from 'app/store/store';
import {
@ -17,18 +17,28 @@ import {
import { initialVariableModelState } from '../__mocks__/CloudWatchVariables';
import { setupMockedMetricsQueryRunner } from '../__mocks__/MetricsQueryRunner';
import { validMetricSearchBuilderQuery, validMetricSearchCodeQuery } from '../__mocks__/queries';
import { MetricQueryType, MetricEditorMode, CloudWatchMetricsQuery, DataQueryError } from '../types';
import { MetricQueryType, MetricEditorMode, CloudWatchMetricsQuery } from '../types';
describe('CloudWatchMetricsQueryRunner', () => {
describe('performTimeSeriesQuery', () => {
it('should return the same length of data as result', async () => {
const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
const resultsFromBEQuery = {
data: {
results: {
a: { refId: 'a', series: [{ target: 'cpu', datapoints: [[1, 1]] }] },
b: { refId: 'b', series: [{ target: 'memory', datapoints: [[2, 2]] }] },
a: {
refId: 'a',
series: [{ target: 'cpu', datapoints: [[1, 2]], meta: { custom: { period: 60 } } }],
},
b: {
refId: 'b',
series: [{ target: 'cpu', datapoints: [[1, 2]], meta: { custom: { period: 120 } } }],
},
},
},
};
const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(resultsFromBEQuery),
});
const observable = runner.performTimeSeriesQuery(
@ -47,7 +57,7 @@ describe('CloudWatchMetricsQueryRunner', () => {
});
it('sets fields.config.interval based on period', async () => {
const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
const resultsFromBEQuery = {
data: {
results: {
a: {
@ -60,6 +70,10 @@ describe('CloudWatchMetricsQueryRunner', () => {
},
},
},
};
const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(resultsFromBEQuery),
});
const observable = runner.performTimeSeriesQuery(
@ -98,31 +112,36 @@ describe('CloudWatchMetricsQueryRunner', () => {
},
];
const data: BackendDataSourceResponse = {
results: {
A: {
tables: [],
error: '',
refId: 'A',
series: [
{
target: 'CPUUtilization_Average',
datapoints: [
[1, 1483228800000],
[2, 1483229100000],
[5, 1483229700000],
],
tags: {
InstanceId: 'i-12345678',
const resultsFromBEQuery = {
data: {
results: {
A: {
tables: [],
error: '',
refId: 'A',
series: [
{
target: 'CPUUtilization_Average',
datapoints: [
[1, 1483228800000],
[2, 1483229100000],
[5, 1483229700000],
],
tags: {
InstanceId: 'i-12345678',
},
},
},
],
],
},
},
},
};
it('should generate the correct query', async () => {
const { runner, queryMock, request } = setupMockedMetricsQueryRunner({ data });
const { runner, queryMock, request } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(resultsFromBEQuery),
});
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith(() => {
expect(queryMock.mock.calls[0][0].targets).toMatchObject(
@ -159,7 +178,8 @@ describe('CloudWatchMetricsQueryRunner', () => {
];
const { runner, queryMock, request } = setupMockedMetricsQueryRunner({
data,
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(resultsFromBEQuery),
variables: [periodIntervalVariable],
});
@ -169,110 +189,111 @@ describe('CloudWatchMetricsQueryRunner', () => {
});
it('should return series list', async () => {
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ data });
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(resultsFromBEQuery),
});
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
const result = received[0];
expect(getFrameDisplayName(result.data[0])).toBe(
data.results.A.series?.length && data.results.A.series[0].target
);
expect(result.data[0].fields[1].values[0]).toBe(
data.results.A.series?.length && data.results.A.series[0].datapoints[0][0]
);
expect(getFrameDisplayName(result.data[0])).toBe('CPUUtilization_Average');
expect(result.data[0].fields[1].values[0]).toBe(1);
});
});
});
describe('and throttling exception is thrown', () => {
const partialQuery: CloudWatchMetricsQuery = {
metricQueryType: MetricQueryType.Search,
metricEditorMode: MetricEditorMode.Builder,
queryMode: 'Metrics',
namespace: 'AWS/EC2',
metricName: 'CPUUtilization',
dimensions: {
InstanceId: 'i-12345678',
},
statistic: 'Average',
period: '300',
expression: '',
id: '',
region: '',
refId: '',
};
const queries: CloudWatchMetricsQuery[] = [
{ ...partialQuery, refId: 'A', region: 'us-east-1' },
{ ...partialQuery, refId: 'B', region: 'us-east-2' },
{ ...partialQuery, refId: 'C', region: 'us-east-1' },
{ ...partialQuery, refId: 'D', region: 'us-east-2' },
{ ...partialQuery, refId: 'E', region: 'eu-north-1' },
];
const dataWithThrottlingError = {
data: {
message: 'Throttling: exception',
results: {
A: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'A',
meta: {},
},
B: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'B',
meta: {},
},
C: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'C',
meta: {},
},
D: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'D',
meta: {},
},
E: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'E',
meta: {},
},
},
},
};
beforeEach(() => {
redux.setStore({
...redux.store,
dispatch: jest.fn(),
});
});
describe('and throttling exception is thrown', () => {
const partialQuery: CloudWatchMetricsQuery = {
metricQueryType: MetricQueryType.Search,
metricEditorMode: MetricEditorMode.Builder,
queryMode: 'Metrics',
namespace: 'AWS/EC2',
metricName: 'CPUUtilization',
dimensions: {
InstanceId: 'i-12345678',
},
statistic: 'Average',
period: '300',
expression: '',
id: '',
region: '',
refId: '',
};
const queries: CloudWatchMetricsQuery[] = [
{ ...partialQuery, refId: 'A', region: 'us-east-1' },
{ ...partialQuery, refId: 'B', region: 'us-east-2' },
{ ...partialQuery, refId: 'C', region: 'us-east-1' },
{ ...partialQuery, refId: 'D', region: 'us-east-2' },
{ ...partialQuery, refId: 'E', region: 'eu-north-1' },
];
const backendErrorResponse: DataQueryError<CloudWatchMetricsQuery> = {
data: {
message: 'Throttling: exception',
results: {
A: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'A',
meta: {},
},
B: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'B',
meta: {},
},
C: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'C',
meta: {},
},
D: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'D',
meta: {},
},
E: {
frames: [],
series: [],
tables: [],
error: 'Throttling: exception',
refId: 'E',
meta: {},
},
},
},
};
beforeEach(() => {
redux.setStore({
...redux.store,
dispatch: jest.fn(),
});
it('should display one alert error message per region+datasource combination', async () => {
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
response: toDataQueryResponse(dataWithThrottlingError),
});
const memoizedDebounceSpy = jest.spyOn(runner, 'debouncedThrottlingAlert');
it('should display one alert error message per region+datasource combination', async () => {
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ errorResponse: backendErrorResponse });
const memoizedDebounceSpy = jest.spyOn(runner, 'debouncedAlert');
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith(() => {
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-1');
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-2');
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'eu-north-1');
expect(memoizedDebounceSpy).toBeCalledTimes(3);
});
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
expect(received[0].errors).toHaveLength(5);
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-1');
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-2');
expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'eu-north-1');
expect(memoizedDebounceSpy).toBeCalledTimes(3);
});
});
});
@ -298,43 +319,80 @@ describe('CloudWatchMetricsQueryRunner', () => {
},
];
const data: BackendDataSourceResponse = {
results: {
A: {
tables: [],
error: '',
refId: 'A',
series: [
{
target: 'TargetResponseTime_p90.00',
datapoints: [
[1, 1483228800000],
[2, 1483229100000],
[5, 1483229700000],
],
tags: {
LoadBalancer: 'lb',
TargetGroup: 'tg',
const responseFromBEQuery = {
data: {
results: {
A: {
tables: [],
error: '',
refId: 'A',
series: [
{
target: 'TargetResponseTime_p90.00',
datapoints: [
[1, 1483228800000],
[2, 1483229100000],
[5, 1483229700000],
],
tags: {
LoadBalancer: 'lb',
TargetGroup: 'tg',
},
},
},
],
],
},
},
},
};
it('should return series list', async () => {
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ data });
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(responseFromBEQuery),
});
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
const result = received[0];
expect(getFrameDisplayName(result.data[0])).toBe(
data.results.A.series?.length && data.results.A.series[0].target
responseFromBEQuery.data.results.A.series?.length && responseFromBEQuery.data.results.A.series[0].target
);
expect(result.data[0].fields[1].values[0]).toBe(
data.results.A.series?.length && data.results.A.series[0].datapoints[0][0]
responseFromBEQuery.data.results.A.series?.length &&
responseFromBEQuery.data.results.A.series[0].datapoints[0][0]
);
});
});
it('should pass the error list from DatasourceWithBackend srv', async () => {
const dataWithError = {
data: {
results: {
A: {
error:
"metric request error: \"ValidationError: Error in expression 'query': Invalid syntax\\n\\tstatus code: 400",
status: 500,
},
},
},
status: 500,
statusText: 'Internal Server Error',
};
const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
// DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
response: toDataQueryResponse(dataWithError),
});
await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
const result = received[0];
expect(result.data).toEqual([]);
expect(result.errors).toEqual([
{
message:
"metric request error: \"ValidationError: Error in expression 'query': Invalid syntax\\n\\tstatus code: 400",
status: 500,
refId: 'A',
},
]);
});
});
});
describe('template variable interpolation', () => {

View File

@ -1,9 +1,10 @@
import { findLast, isEmpty } from 'lodash';
import { isEmpty } from 'lodash';
import React from 'react';
import { catchError, map, Observable, of, throwError } from 'rxjs';
import { catchError, map, Observable, of } from 'rxjs';
import {
DataFrame,
DataQueryError,
DataQueryRequest,
DataQueryResponse,
DataSourceInstanceSettings,
@ -21,7 +22,7 @@ import { AppNotificationTimeout } from 'app/types';
import { ThrottlingErrorMessage } from '../components/Errors/ThrottlingErrorMessage';
import memoizedDebounce from '../memoizedDebounce';
import { migrateMetricQuery } from '../migrations/metricQueryMigrations';
import { CloudWatchJsonData, CloudWatchMetricsQuery, CloudWatchQuery, DataQueryError } from '../types';
import { CloudWatchJsonData, CloudWatchMetricsQuery, CloudWatchQuery } from '../types';
import { filterMetricsQuery } from '../utils/utils';
import { CloudWatchRequest } from './CloudWatchRequest';
@ -37,9 +38,10 @@ const displayAlert = (datasourceName: string, region: string) =>
)
)
);
// This class handles execution of CloudWatch metrics query data queries
export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
debouncedAlert: (datasourceName: string, region: string) => void = memoizedDebounce(
debouncedThrottlingAlert: (datasourceName: string, region: string) => void = memoizedDebounce(
displayAlert,
AppNotificationTimeout.Error
);
@ -109,12 +111,7 @@ export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
): Observable<DataQueryResponse> {
return queryFn(request).pipe(
map((res) => {
const dataframes: DataFrame[] = res.data;
if (!dataframes || dataframes.length <= 0) {
return { data: [] };
}
const lastError = findLast(res.data, (v) => !!v.error);
const dataframes: DataFrame[] = res.data || [];
dataframes.forEach((frame) => {
frame.fields.forEach((field) => {
@ -125,47 +122,48 @@ export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
});
});
if (res.errors?.length) {
this.alertOnErrors(res.errors, request);
}
return {
data: dataframes,
error: lastError ? { message: lastError.error } : undefined,
// DataSourceWithBackend will not throw an error, instead it will return "errors" field along with the response
errors: res.errors,
};
}),
catchError((err: DataQueryError<CloudWatchMetricsQuery>) => {
const isFrameError = err.data?.results;
// Error is not frame specific
if (!isFrameError && err.data && err.data.message === 'Metric request error' && err.data.error) {
err.message = err.data.error;
return throwError(() => err);
catchError((err: unknown) => {
if (Array.isArray(err)) {
return of({ data: [], errors: err });
} else {
return of({ data: [], errors: [{ message: err }] });
}
// The error is either for a specific frame or for all the frames
const results: Array<{ error?: string }> = Object.values(err.data?.results ?? {});
const firstErrorResult = results.find((r) => r.error);
if (firstErrorResult) {
err.message = firstErrorResult.error;
}
if (results.some((r) => r.error && /^Throttling:.*/.test(r.error))) {
const failedRedIds = Object.keys(err.data?.results ?? {});
const regionsAffected = Object.values(request.targets).reduce(
(res: string[], { refId, region }) =>
(refId && !failedRedIds.includes(refId)) || res.includes(region) ? res : [...res, region],
[]
);
regionsAffected.forEach((region) => {
const actualRegion = this.getActualRegion(region);
if (actualRegion) {
this.debouncedAlert(this.instanceSettings.name, actualRegion);
}
});
}
return throwError(() => err);
})
);
}
alertOnErrors(errors: DataQueryError[], request: DataQueryRequest<CloudWatchQuery>) {
const hasThrottlingError = errors.some(
(err) => err.message && (/^Throttling:.*/.test(err.message) || /^Rate exceeded.*/.test(err.message))
);
if (hasThrottlingError) {
const failedRefIds = errors.map((error) => error.refId).filter((refId) => refId);
if (failedRefIds.length > 0) {
const regionsAffected = Object.values(request.targets).reduce(
(res: string[], { refId, region }) =>
(refId && !failedRefIds.includes(refId)) || res.includes(region) ? res : [...res, region],
[]
);
regionsAffected.forEach((region) => {
const actualRegion = this.getActualRegion(region);
if (actualRegion) {
this.debouncedThrottlingAlert(this.instanceSettings.name, actualRegion);
}
});
}
}
}
filterMetricQuery(query: CloudWatchMetricsQuery): boolean {
return filterMetricsQuery(query);
}