Cloudwatch Metrics: Adjust error handling (#79911)

2025-02-25 18:55:37 -06:00 · 2024-01-15 17:19:26 +01:00 · 2024-01-15 17:19:26 +01:00 · d3a89a28fa
commit d3a89a28fa
parent 406ace27c0
4 changed files with 263 additions and 198 deletions
--- a/pkg/tsdb/cloudwatch/time_series_query.go
+++ b/pkg/tsdb/cloudwatch/time_series_query.go
@ -3,6 +3,7 @@ package cloudwatch
 import (
 	"context"
 	"fmt"
+	"regexp"

 	"github.com/grafana/grafana-plugin-sdk-go/backend"
 	"golang.org/x/sync/errgroup"
@ -121,6 +122,7 @@ func (e *cloudWatchExecutor) executeTimeSeriesQuery(ctx context.Context, logger
 			Error: fmt.Errorf("metric request error: %q", err),
 		}
 		resultChan <- &responseWrapper{
+			RefId:        getQueryRefIdFromErrorString(err.Error(), requestQueries),
 			DataResponse: &dataResponse,
 		}
 	}
@ -132,3 +134,17 @@ func (e *cloudWatchExecutor) executeTimeSeriesQuery(ctx context.Context, logger

 	return resp, nil
 }
+
+func getQueryRefIdFromErrorString(err string, queries []*models.CloudWatchQuery) string {
+	// error can be in format "Error in expression 'test': Invalid syntax"
+	// so we can find the query id or ref id between the quotations
+	erroredRefId := ""
+
+	for _, query := range queries {
+		if regexp.MustCompile(`'`+query.RefId+`':`).MatchString(err) || regexp.MustCompile(`'`+query.Id+`':`).MatchString(err) {
+			erroredRefId = query.RefId
+		}
+	}
+	// if errorRefId is empty, it means the error concerns all queries (error metric limit exceeded, for example)
+	return erroredRefId
+}
--- a/public/app/plugins/datasource/cloudwatch/mocks/MetricsQueryRunner.ts
+++ b/public/app/plugins/datasource/cloudwatch/mocks/MetricsQueryRunner.ts
@ -1,7 +1,6 @@
-import { of, throwError } from 'rxjs';
+import { of } from 'rxjs';

-import { CustomVariableModel, DataQueryError, DataQueryRequest, DataSourceInstanceSettings } from '@grafana/data';
-import { BackendDataSourceResponse, toDataQueryResponse } from '@grafana/runtime';
+import { CustomVariableModel, DataQueryRequest, DataQueryResponse, DataSourceInstanceSettings } from '@grafana/data';

 import { CloudWatchMetricsQueryRunner } from '../query-runner/CloudWatchMetricsQueryRunner';
 import { CloudWatchJsonData, CloudWatchQuery } from '../types';
@ -10,23 +9,17 @@ import { CloudWatchSettings, setupMockedTemplateService } from './CloudWatchData
 import { TimeRangeMock } from './timeRange';

 export function setupMockedMetricsQueryRunner({
-  data = {
-    results: {},
-  },
+  response = { data: [] },
  variables,
-  errorResponse,
  instanceSettings = CloudWatchSettings,
 }: {
-  data?: BackendDataSourceResponse;
+  response?: DataQueryResponse;
  variables?: CustomVariableModel[];
-  errorResponse?: DataQueryError;
  instanceSettings?: DataSourceInstanceSettings<CloudWatchJsonData>;
 } = {}) {
  const templateService = setupMockedTemplateService(variables);

-  const queryMock = errorResponse
-    ? jest.fn().mockImplementation(() => throwError(errorResponse))
-    : jest.fn().mockReturnValue(of(toDataQueryResponse({ data })));
+  const queryMock = jest.fn().mockImplementation(() => of(response));
  const runner = new CloudWatchMetricsQueryRunner(instanceSettings, templateService);

  const request: DataQueryRequest<CloudWatchQuery> = {
--- a/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchMetricsQueryRunner.test.ts
+++ b/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchMetricsQueryRunner.test.ts
@ -2,7 +2,7 @@ import { of } from 'rxjs';

 import { CustomVariableModel, getFrameDisplayName, VariableHide } from '@grafana/data';
 import { dateTime } from '@grafana/data/src/datetime/moment_wrapper';
-import { BackendDataSourceResponse } from '@grafana/runtime';
+import { toDataQueryResponse } from '@grafana/runtime';
 import * as redux from 'app/store/store';

 import {
@ -17,18 +17,28 @@ import {
 import { initialVariableModelState } from '../__mocks__/CloudWatchVariables';
 import { setupMockedMetricsQueryRunner } from '../__mocks__/MetricsQueryRunner';
 import { validMetricSearchBuilderQuery, validMetricSearchCodeQuery } from '../__mocks__/queries';
-import { MetricQueryType, MetricEditorMode, CloudWatchMetricsQuery, DataQueryError } from '../types';
+import { MetricQueryType, MetricEditorMode, CloudWatchMetricsQuery } from '../types';

 describe('CloudWatchMetricsQueryRunner', () => {
  describe('performTimeSeriesQuery', () => {
    it('should return the same length of data as result', async () => {
-      const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
+      const resultsFromBEQuery = {
        data: {
          results: {
-            a: { refId: 'a', series: [{ target: 'cpu', datapoints: [[1, 1]] }] },
-            b: { refId: 'b', series: [{ target: 'memory', datapoints: [[2, 2]] }] },
+            a: {
+              refId: 'a',
+              series: [{ target: 'cpu', datapoints: [[1, 2]], meta: { custom: { period: 60 } } }],
+            },
+            b: {
+              refId: 'b',
+              series: [{ target: 'cpu', datapoints: [[1, 2]], meta: { custom: { period: 120 } } }],
+            },
          },
        },
+      };
+      const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
+        // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+        response: toDataQueryResponse(resultsFromBEQuery),
      });

      const observable = runner.performTimeSeriesQuery(
@ -47,7 +57,7 @@ describe('CloudWatchMetricsQueryRunner', () => {
    });

    it('sets fields.config.interval based on period', async () => {
-      const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
+      const resultsFromBEQuery = {
        data: {
          results: {
            a: {
@ -60,6 +70,10 @@ describe('CloudWatchMetricsQueryRunner', () => {
            },
          },
        },
+      };
+      const { runner, timeRange, request, queryMock } = setupMockedMetricsQueryRunner({
+        // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+        response: toDataQueryResponse(resultsFromBEQuery),
      });

      const observable = runner.performTimeSeriesQuery(
@ -98,31 +112,36 @@ describe('CloudWatchMetricsQueryRunner', () => {
        },
      ];

-      const data: BackendDataSourceResponse = {
-        results: {
-          A: {
-            tables: [],
-            error: '',
-            refId: 'A',
-            series: [
-              {
-                target: 'CPUUtilization_Average',
-                datapoints: [
-                  [1, 1483228800000],
-                  [2, 1483229100000],
-                  [5, 1483229700000],
-                ],
-                tags: {
-                  InstanceId: 'i-12345678',
+      const resultsFromBEQuery = {
+        data: {
+          results: {
+            A: {
+              tables: [],
+              error: '',
+              refId: 'A',
+              series: [
+                {
+                  target: 'CPUUtilization_Average',
+                  datapoints: [
+                    [1, 1483228800000],
+                    [2, 1483229100000],
+                    [5, 1483229700000],
+                  ],
+                  tags: {
+                    InstanceId: 'i-12345678',
+                  },
                },
-              },
-            ],
+              ],
+            },
          },
        },
      };

      it('should generate the correct query', async () => {
-        const { runner, queryMock, request } = setupMockedMetricsQueryRunner({ data });
+        const { runner, queryMock, request } = setupMockedMetricsQueryRunner({
+          // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+          response: toDataQueryResponse(resultsFromBEQuery),
+        });

        await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith(() => {
          expect(queryMock.mock.calls[0][0].targets).toMatchObject(
@ -159,7 +178,8 @@ describe('CloudWatchMetricsQueryRunner', () => {
        ];

        const { runner, queryMock, request } = setupMockedMetricsQueryRunner({
-          data,
+          // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+          response: toDataQueryResponse(resultsFromBEQuery),
          variables: [periodIntervalVariable],
        });

@ -169,110 +189,111 @@ describe('CloudWatchMetricsQueryRunner', () => {
      });

      it('should return series list', async () => {
-        const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ data });
+        const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
+          // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+          response: toDataQueryResponse(resultsFromBEQuery),
+        });

        await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
          const result = received[0];
-          expect(getFrameDisplayName(result.data[0])).toBe(
-            data.results.A.series?.length && data.results.A.series[0].target
-          );
-          expect(result.data[0].fields[1].values[0]).toBe(
-            data.results.A.series?.length && data.results.A.series[0].datapoints[0][0]
-          );
+          expect(getFrameDisplayName(result.data[0])).toBe('CPUUtilization_Average');
+          expect(result.data[0].fields[1].values[0]).toBe(1);
+        });
+      });
+    });
+    describe('and throttling exception is thrown', () => {
+      const partialQuery: CloudWatchMetricsQuery = {
+        metricQueryType: MetricQueryType.Search,
+        metricEditorMode: MetricEditorMode.Builder,
+        queryMode: 'Metrics',
+        namespace: 'AWS/EC2',
+        metricName: 'CPUUtilization',
+        dimensions: {
+          InstanceId: 'i-12345678',
+        },
+        statistic: 'Average',
+        period: '300',
+        expression: '',
+        id: '',
+        region: '',
+        refId: '',
+      };
+
+      const queries: CloudWatchMetricsQuery[] = [
+        { ...partialQuery, refId: 'A', region: 'us-east-1' },
+        { ...partialQuery, refId: 'B', region: 'us-east-2' },
+        { ...partialQuery, refId: 'C', region: 'us-east-1' },
+        { ...partialQuery, refId: 'D', region: 'us-east-2' },
+        { ...partialQuery, refId: 'E', region: 'eu-north-1' },
+      ];
+
+      const dataWithThrottlingError = {
+        data: {
+          message: 'Throttling: exception',
+          results: {
+            A: {
+              frames: [],
+              series: [],
+              tables: [],
+              error: 'Throttling: exception',
+              refId: 'A',
+              meta: {},
+            },
+            B: {
+              frames: [],
+              series: [],
+              tables: [],
+              error: 'Throttling: exception',
+              refId: 'B',
+              meta: {},
+            },
+            C: {
+              frames: [],
+              series: [],
+              tables: [],
+              error: 'Throttling: exception',
+              refId: 'C',
+              meta: {},
+            },
+            D: {
+              frames: [],
+              series: [],
+              tables: [],
+              error: 'Throttling: exception',
+              refId: 'D',
+              meta: {},
+            },
+            E: {
+              frames: [],
+              series: [],
+              tables: [],
+              error: 'Throttling: exception',
+              refId: 'E',
+              meta: {},
+            },
+          },
+        },
+      };
+
+      beforeEach(() => {
+        redux.setStore({
+          ...redux.store,
+          dispatch: jest.fn(),
        });
      });

-      describe('and throttling exception is thrown', () => {
-        const partialQuery: CloudWatchMetricsQuery = {
-          metricQueryType: MetricQueryType.Search,
-          metricEditorMode: MetricEditorMode.Builder,
-          queryMode: 'Metrics',
-          namespace: 'AWS/EC2',
-          metricName: 'CPUUtilization',
-          dimensions: {
-            InstanceId: 'i-12345678',
-          },
-          statistic: 'Average',
-          period: '300',
-          expression: '',
-          id: '',
-          region: '',
-          refId: '',
-        };
-
-        const queries: CloudWatchMetricsQuery[] = [
-          { ...partialQuery, refId: 'A', region: 'us-east-1' },
-          { ...partialQuery, refId: 'B', region: 'us-east-2' },
-          { ...partialQuery, refId: 'C', region: 'us-east-1' },
-          { ...partialQuery, refId: 'D', region: 'us-east-2' },
-          { ...partialQuery, refId: 'E', region: 'eu-north-1' },
-        ];
-
-        const backendErrorResponse: DataQueryError<CloudWatchMetricsQuery> = {
-          data: {
-            message: 'Throttling: exception',
-            results: {
-              A: {
-                frames: [],
-                series: [],
-                tables: [],
-                error: 'Throttling: exception',
-                refId: 'A',
-                meta: {},
-              },
-              B: {
-                frames: [],
-                series: [],
-                tables: [],
-                error: 'Throttling: exception',
-                refId: 'B',
-                meta: {},
-              },
-              C: {
-                frames: [],
-                series: [],
-                tables: [],
-                error: 'Throttling: exception',
-                refId: 'C',
-                meta: {},
-              },
-              D: {
-                frames: [],
-                series: [],
-                tables: [],
-                error: 'Throttling: exception',
-                refId: 'D',
-                meta: {},
-              },
-              E: {
-                frames: [],
-                series: [],
-                tables: [],
-                error: 'Throttling: exception',
-                refId: 'E',
-                meta: {},
-              },
-            },
-          },
-        };
-
-        beforeEach(() => {
-          redux.setStore({
-            ...redux.store,
-            dispatch: jest.fn(),
-          });
+      it('should display one alert error message per region+datasource combination', async () => {
+        const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
+          response: toDataQueryResponse(dataWithThrottlingError),
        });
+        const memoizedDebounceSpy = jest.spyOn(runner, 'debouncedThrottlingAlert');

-        it('should display one alert error message per region+datasource combination', async () => {
-          const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ errorResponse: backendErrorResponse });
-          const memoizedDebounceSpy = jest.spyOn(runner, 'debouncedAlert');
-
-          await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith(() => {
-            expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-1');
-            expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-2');
-            expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'eu-north-1');
-            expect(memoizedDebounceSpy).toBeCalledTimes(3);
-          });
+        await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
+          expect(received[0].errors).toHaveLength(5);
+          expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-1');
+          expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'us-east-2');
+          expect(memoizedDebounceSpy).toHaveBeenCalledWith('CloudWatch Test Datasource', 'eu-north-1');
+          expect(memoizedDebounceSpy).toBeCalledTimes(3);
        });
      });
    });
@ -298,43 +319,80 @@ describe('CloudWatchMetricsQueryRunner', () => {
      },
    ];

-    const data: BackendDataSourceResponse = {
-      results: {
-        A: {
-          tables: [],
-          error: '',
-          refId: 'A',
-          series: [
-            {
-              target: 'TargetResponseTime_p90.00',
-              datapoints: [
-                [1, 1483228800000],
-                [2, 1483229100000],
-                [5, 1483229700000],
-              ],
-              tags: {
-                LoadBalancer: 'lb',
-                TargetGroup: 'tg',
+    const responseFromBEQuery = {
+      data: {
+        results: {
+          A: {
+            tables: [],
+            error: '',
+            refId: 'A',
+            series: [
+              {
+                target: 'TargetResponseTime_p90.00',
+                datapoints: [
+                  [1, 1483228800000],
+                  [2, 1483229100000],
+                  [5, 1483229700000],
+                ],
+                tags: {
+                  LoadBalancer: 'lb',
+                  TargetGroup: 'tg',
+                },
              },
-            },
-          ],
+            ],
+          },
        },
      },
    };

    it('should return series list', async () => {
-      const { runner, request, queryMock } = setupMockedMetricsQueryRunner({ data });
+      const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
+        // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+        response: toDataQueryResponse(responseFromBEQuery),
+      });

      await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
        const result = received[0];
        expect(getFrameDisplayName(result.data[0])).toBe(
-          data.results.A.series?.length && data.results.A.series[0].target
+          responseFromBEQuery.data.results.A.series?.length && responseFromBEQuery.data.results.A.series[0].target
        );
        expect(result.data[0].fields[1].values[0]).toBe(
-          data.results.A.series?.length && data.results.A.series[0].datapoints[0][0]
+          responseFromBEQuery.data.results.A.series?.length &&
+            responseFromBEQuery.data.results.A.series[0].datapoints[0][0]
        );
      });
    });
+    it('should pass the error list from DatasourceWithBackend srv', async () => {
+      const dataWithError = {
+        data: {
+          results: {
+            A: {
+              error:
+                "metric request error: \"ValidationError: Error in expression 'query': Invalid syntax\\n\\tstatus code: 400",
+              status: 500,
+            },
+          },
+        },
+        status: 500,
+        statusText: 'Internal Server Error',
+      };
+      const { runner, request, queryMock } = setupMockedMetricsQueryRunner({
+        // DataSourceWithBackend runs toDataQueryResponse({response from CW backend})
+        response: toDataQueryResponse(dataWithError),
+      });
+      await expect(runner.handleMetricQueries(queries, request, queryMock)).toEmitValuesWith((received) => {
+        const result = received[0];
+        expect(result.data).toEqual([]);
+        expect(result.errors).toEqual([
+          {
+            message:
+              "metric request error: \"ValidationError: Error in expression 'query': Invalid syntax\\n\\tstatus code: 400",
+            status: 500,
+            refId: 'A',
+          },
+        ]);
+      });
+    });
  });

  describe('template variable interpolation', () => {
--- a/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchMetricsQueryRunner.ts
+++ b/public/app/plugins/datasource/cloudwatch/query-runner/CloudWatchMetricsQueryRunner.ts
@ -1,9 +1,10 @@
-import { findLast, isEmpty } from 'lodash';
+import { isEmpty } from 'lodash';
 import React from 'react';
-import { catchError, map, Observable, of, throwError } from 'rxjs';
+import { catchError, map, Observable, of } from 'rxjs';

 import {
  DataFrame,
+  DataQueryError,
  DataQueryRequest,
  DataQueryResponse,
  DataSourceInstanceSettings,
@ -21,7 +22,7 @@ import { AppNotificationTimeout } from 'app/types';
 import { ThrottlingErrorMessage } from '../components/Errors/ThrottlingErrorMessage';
 import memoizedDebounce from '../memoizedDebounce';
 import { migrateMetricQuery } from '../migrations/metricQueryMigrations';
-import { CloudWatchJsonData, CloudWatchMetricsQuery, CloudWatchQuery, DataQueryError } from '../types';
+import { CloudWatchJsonData, CloudWatchMetricsQuery, CloudWatchQuery } from '../types';
 import { filterMetricsQuery } from '../utils/utils';

 import { CloudWatchRequest } from './CloudWatchRequest';
@ -37,9 +38,10 @@ const displayAlert = (datasourceName: string, region: string) =>
      )
    )
  );
+
 // This class handles execution of CloudWatch metrics query data queries
 export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
-  debouncedAlert: (datasourceName: string, region: string) => void = memoizedDebounce(
+  debouncedThrottlingAlert: (datasourceName: string, region: string) => void = memoizedDebounce(
    displayAlert,
    AppNotificationTimeout.Error
  );
@ -109,12 +111,7 @@ export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
  ): Observable<DataQueryResponse> {
    return queryFn(request).pipe(
      map((res) => {
-        const dataframes: DataFrame[] = res.data;
-        if (!dataframes || dataframes.length <= 0) {
-          return { data: [] };
-        }
-
-        const lastError = findLast(res.data, (v) => !!v.error);
+        const dataframes: DataFrame[] = res.data || [];

        dataframes.forEach((frame) => {
          frame.fields.forEach((field) => {
@ -125,47 +122,48 @@ export class CloudWatchMetricsQueryRunner extends CloudWatchRequest {
          });
        });

+        if (res.errors?.length) {
+          this.alertOnErrors(res.errors, request);
+        }
+
        return {
          data: dataframes,
-          error: lastError ? { message: lastError.error } : undefined,
+          // DataSourceWithBackend will not throw an error, instead it will return "errors" field along with the response
+          errors: res.errors,
        };
      }),
-      catchError((err: DataQueryError<CloudWatchMetricsQuery>) => {
-        const isFrameError = err.data?.results;
-
-        // Error is not frame specific
-        if (!isFrameError && err.data && err.data.message === 'Metric request error' && err.data.error) {
-          err.message = err.data.error;
-          return throwError(() => err);
+      catchError((err: unknown) => {
+        if (Array.isArray(err)) {
+          return of({ data: [], errors: err });
+        } else {
+          return of({ data: [], errors: [{ message: err }] });
        }
-
-        // The error is either for a specific frame or for all the frames
-        const results: Array<{ error?: string }> = Object.values(err.data?.results ?? {});
-        const firstErrorResult = results.find((r) => r.error);
-        if (firstErrorResult) {
-          err.message = firstErrorResult.error;
-        }
-
-        if (results.some((r) => r.error && /^Throttling:.*/.test(r.error))) {
-          const failedRedIds = Object.keys(err.data?.results ?? {});
-          const regionsAffected = Object.values(request.targets).reduce(
-            (res: string[], { refId, region }) =>
-              (refId && !failedRedIds.includes(refId)) || res.includes(region) ? res : [...res, region],
-            []
-          );
-          regionsAffected.forEach((region) => {
-            const actualRegion = this.getActualRegion(region);
-            if (actualRegion) {
-              this.debouncedAlert(this.instanceSettings.name, actualRegion);
-            }
-          });
-        }
-
-        return throwError(() => err);
      })
    );
  }

+  alertOnErrors(errors: DataQueryError[], request: DataQueryRequest<CloudWatchQuery>) {
+    const hasThrottlingError = errors.some(
+      (err) => err.message && (/^Throttling:.*/.test(err.message) || /^Rate exceeded.*/.test(err.message))
+    );
+    if (hasThrottlingError) {
+      const failedRefIds = errors.map((error) => error.refId).filter((refId) => refId);
+      if (failedRefIds.length > 0) {
+        const regionsAffected = Object.values(request.targets).reduce(
+          (res: string[], { refId, region }) =>
+            (refId && !failedRefIds.includes(refId)) || res.includes(region) ? res : [...res, region],
+          []
+        );
+        regionsAffected.forEach((region) => {
+          const actualRegion = this.getActualRegion(region);
+          if (actualRegion) {
+            this.debouncedThrottlingAlert(this.instanceSettings.name, actualRegion);
+          }
+        });
+      }
+    }
+  }
+
  filterMetricQuery(query: CloudWatchMetricsQuery): boolean {
    return filterMetricsQuery(query);
  }