diff --git a/public/app/plugins/datasource/tempo/datasource.test.ts b/public/app/plugins/datasource/tempo/datasource.test.ts index 444f5c44772..0e557d2b37b 100644 --- a/public/app/plugins/datasource/tempo/datasource.test.ts +++ b/public/app/plugins/datasource/tempo/datasource.test.ts @@ -209,7 +209,7 @@ const backendSrvWithPrometheus = { if (uid === 'prom') { return { query() { - return of({ data: [totalsPromMetric, secondsPromMetric] }); + return of({ data: [totalsPromMetric, secondsPromMetric, failedPromMetric] }); }, }; } @@ -280,6 +280,19 @@ const secondsPromMetric = new MutableDataFrame({ ], }); +const failedPromMetric = new MutableDataFrame({ + refId: 'traces_service_graph_request_failed_total', + fields: [ + { name: 'Time', values: [1628169788000, 1628169788000] }, + { name: 'client', values: ['app', 'lb'] }, + { name: 'instance', values: ['127.0.0.1:12345', '127.0.0.1:12345'] }, + { name: 'job', values: ['local_scrape', 'local_scrape'] }, + { name: 'server', values: ['db', 'app'] }, + { name: 'tempo_config', values: ['default', 'default'] }, + { name: 'Value #traces_service_graph_request_failed_total', values: [2, 15] }, + ], +}); + const mockInvalidJson = { batches: [ { diff --git a/public/app/plugins/datasource/tempo/datasource.ts b/public/app/plugins/datasource/tempo/datasource.ts index 20359b6443c..51069ad85a0 100644 --- a/public/app/plugins/datasource/tempo/datasource.ts +++ b/public/app/plugins/datasource/tempo/datasource.ts @@ -292,9 +292,6 @@ function serviceMapQuery(request: DataQueryRequest, datasourceUid: s data: mapPromMetricsToServiceMap(responses, request.range), state: LoadingState.Done, }; - }), - catchError((error) => { - return of({ error: { message: error.message }, data: [] }); }) ); } diff --git a/public/app/plugins/datasource/tempo/graphTransform.test.ts b/public/app/plugins/datasource/tempo/graphTransform.test.ts index 43f595b9471..639fcd22852 100644 --- a/public/app/plugins/datasource/tempo/graphTransform.test.ts +++ 
b/public/app/plugins/datasource/tempo/graphTransform.test.ts @@ -64,16 +64,21 @@ describe('mapPromMetricsToServiceMap', () => { from: dateTime('2000-01-01T00:00:00'), to: dateTime('2000-01-01T00:01:00'), }; - const [nodes, edges] = mapPromMetricsToServiceMap([{ data: [totalsPromMetric, secondsPromMetric] }], { - ...range, - raw: range, - }); + const [nodes, edges] = mapPromMetricsToServiceMap( + [{ data: [totalsPromMetric, secondsPromMetric, failedPromMetric] }], + { + ...range, + raw: range, + } + ); expect(nodes.fields).toMatchObject([ { name: 'id', values: new ArrayVector(['db', 'app', 'lb']) }, { name: 'title', values: new ArrayVector(['db', 'app', 'lb']) }, { name: 'mainStat', values: new ArrayVector([1000, 2000, NaN]) }, { name: 'secondaryStat', values: new ArrayVector([0.17, 0.33, NaN]) }, + { name: 'arc__success', values: new ArrayVector([0.8, 0.25, 1]) }, + { name: 'arc__failed', values: new ArrayVector([0.2, 0.75, 0]) }, ]); expect(edges.fields).toMatchObject([ { name: 'id', values: new ArrayVector(['app_db', 'lb_app']) }, @@ -134,3 +139,16 @@ const secondsPromMetric = new MutableDataFrame({ { name: 'Value #traces_service_graph_request_server_seconds_sum', values: [10, 40] }, ], }); + +const failedPromMetric = new MutableDataFrame({ + refId: 'traces_service_graph_request_failed_total', + fields: [ + { name: 'Time', values: [1628169788000, 1628169788000] }, + { name: 'client', values: ['app', 'lb'] }, + { name: 'instance', values: ['127.0.0.1:12345', '127.0.0.1:12345'] }, + { name: 'job', values: ['local_scrape', 'local_scrape'] }, + { name: 'server', values: ['db', 'app'] }, + { name: 'tempo_config', values: ['default', 'default'] }, + { name: 'Value #traces_service_graph_request_failed_total', values: [2, 15] }, + ], +}); diff --git a/public/app/plugins/datasource/tempo/graphTransform.ts b/public/app/plugins/datasource/tempo/graphTransform.ts index 3de895b2cf9..5ce34e5806a 100644 --- a/public/app/plugins/datasource/tempo/graphTransform.ts +++ 
b/public/app/plugins/datasource/tempo/graphTransform.ts @@ -1,8 +1,8 @@ -import { groupBy } from 'lodash'; import { DataFrame, DataFrameView, DataQueryResponse, + FieldColorModeId, FieldDTO, MutableDataFrame, NodeGraphDataFrameFieldNames as Fields, @@ -10,6 +10,9 @@ import { } from '@grafana/data'; import { getNonOverlappingDuration, getStats, makeFrames, makeSpanMap } from '../../../core/utils/tracing'; +/** + * Row in a trace dataFrame + */ interface Row { traceID: string; spanID: string; @@ -129,10 +132,12 @@ function findTraceDuration(view: DataFrameView): number { const secondsMetric = 'traces_service_graph_request_server_seconds_sum'; const totalsMetric = 'traces_service_graph_request_total'; +const failedMetric = 'traces_service_graph_request_failed_total'; export const serviceMapMetrics = [ secondsMetric, totalsMetric, + failedMetric, // We don't show histogram in node graph at the moment but we could later add that into a node context menu. // 'traces_service_graph_request_seconds_bucket', // 'traces_service_graph_request_seconds_count', @@ -147,14 +152,15 @@ export const serviceMapMetrics = [ * @param range */ export function mapPromMetricsToServiceMap(responses: DataQueryResponse[], range: TimeRange): [DataFrame, DataFrame] { - const [totalsDFView, secondsDFView] = getMetricFrames(responses); + const frames = getMetricFrames(responses); // First just collect data from the metrics into a map with nodes and edges as keys - const nodesMap: Record = {}; - const edgesMap: Record = {}; + const nodesMap: Record = {}; + const edgesMap: Record = {}; - // At this moment we don't have any error/success or other counts so we just use these 2 - collectMetricData(totalsDFView, 'total', totalsMetric, nodesMap, edgesMap); - collectMetricData(secondsDFView, 'seconds', secondsMetric, nodesMap, edgesMap); + // Collect the request totals, server seconds and failed request counts into the node and edge maps + collectMetricData(frames[totalsMetric], 'total', totalsMetric, nodesMap, edgesMap); + collectMetricData(frames[secondsMetric], 'seconds', secondsMetric, nodesMap,
edgesMap); + collectMetricData(frames[failedMetric], 'failed', failedMetric, nodesMap, edgesMap); return convertToDataFrames(nodesMap, edgesMap, range); } @@ -172,6 +178,14 @@ function createServiceMapDataFrames() { name: Fields.secondaryStat, config: { unit: 'r/sec', displayName: 'Requests per second' }, }, + { + name: Fields.arc + 'success', + config: { displayName: 'Success', color: { fixedColor: 'green', mode: FieldColorModeId.Fixed } }, + }, + { + name: Fields.arc + 'failed', + config: { displayName: 'Failed', color: { fixedColor: 'red', mode: FieldColorModeId.Fixed } }, + }, ]); const edges = createDF('Edges', [ { name: Fields.id }, @@ -184,13 +198,29 @@ function createServiceMapDataFrames() { return [nodes, edges]; } -function getMetricFrames(responses: DataQueryResponse[]) { - const responsesMap = groupBy(responses[0].data, (data) => data.refId); - const totalsDFView = new DataFrameView(responsesMap[totalsMetric][0]); - const secondsDFView = new DataFrameView(responsesMap[secondsMetric][0]); - return [totalsDFView, secondsDFView]; +/** + * Group frames from response based on ref id which is set the same as the metric name so we know which metric is where + * and also put it into DataFrameView so it's easier to work with. + * @param responses + */ +function getMetricFrames(responses: DataQueryResponse[]): Record { + return responses[0].data.reduce>((acc, frame) => { + acc[frame.refId] = new DataFrameView(frame); + return acc; + }, {}); } +type ServiceMapStatistics = { + total?: number; + seconds?: number; + failed?: number; +}; + +type EdgeObject = ServiceMapStatistics & { + source: string; + target: string; +}; + /** * Collect data from a metric into a map of nodes and edges. The metric data is modeled as counts of metric per edge * which is a pair of client-server nodes. 
This means we convert each row of the metric 1-1 to edges and than we assign @@ -203,12 +233,16 @@ function getMetricFrames(responses: DataQueryResponse[]) { * @param edgesMap */ function collectMetricData( - frame: DataFrameView, - stat: 'total' | 'seconds', + frame: DataFrameView | undefined, + stat: keyof ServiceMapStatistics, metric: string, - nodesMap: Record, - edgesMap: Record + nodesMap: Record, + edgesMap: Record ) { + if (!frame) { + return; + } + // The name of the value column is in this format // TODO figure out if it can be changed const valueName = `Value #${metric}`; @@ -218,24 +252,32 @@ function collectMetricData( const edgeId = `${row.client}_${row.server}`; if (!edgesMap[edgeId]) { + // Create edge as it does not exist yet edgesMap[edgeId] = { target: row.server, source: row.client, [stat]: row[valueName], }; } else { + // Add stat to edge + // We add the values if it exists but that should not happen in general as there should be a single row for + // an edge. edgesMap[edgeId][stat] = (edgesMap[edgeId][stat] || 0) + row[valueName]; } if (!nodesMap[row.server]) { + // Create node for server nodesMap[row.server] = { [stat]: row[valueName], }; } else { + // Add stat to server node. Sum up values if there are multiple edges targeting this server node. nodesMap[row.server][stat] = (nodesMap[row.server][stat] || 0) + row[valueName]; } if (!nodesMap[row.client]) { + // Create the client node but don't add the stat as edge stats are attributed to the server node. This means for + // example that the number of requests in a node shows how many requests it handled not how many it generated.
nodesMap[row.client] = { [stat]: 0, }; @@ -244,8 +286,8 @@ function collectMetricData( } function convertToDataFrames( - nodesMap: Record, - edgesMap: Record, + nodesMap: Record, + edgesMap: Record, range: TimeRange ): [DataFrame, DataFrame] { const rangeMs = range.to.valueOf() - range.from.valueOf(); @@ -253,22 +295,24 @@ function convertToDataFrames( for (const nodeId of Object.keys(nodesMap)) { const node = nodesMap[nodeId]; nodes.add({ - id: nodeId, - title: nodeId, + [Fields.id]: nodeId, + [Fields.title]: nodeId, // NaN will not be shown in the node graph. This happens for a root client node which did not process // any requests itself. - mainStat: node.total ? (node.seconds / node.total) * 1000 : Number.NaN, // Average response time - secondaryStat: node.total ? Math.round((node.total / (rangeMs / 1000)) * 100) / 100 : Number.NaN, // Request per second (to 2 decimals) + [Fields.mainStat]: node.total ? (node.seconds! / node.total) * 1000 : Number.NaN, // Average response time + [Fields.secondaryStat]: node.total ? Math.round((node.total / (rangeMs / 1000)) * 100) / 100 : Number.NaN, // Request per second (to 2 decimals) + [Fields.arc + 'success']: node.total ? (node.total - (node.failed || 0)) / node.total : 1, + [Fields.arc + 'failed']: node.total ? (node.failed || 0) / node.total : 0, }); } for (const edgeId of Object.keys(edgesMap)) { const edge = edgesMap[edgeId]; edges.add({ - id: edgeId, - source: edge.source, - target: edge.target, - mainStat: edge.total, // Requests - secondaryStat: edge.total ? (edge.seconds / edge.total) * 1000 : Number.NaN, // Average response time + [Fields.id]: edgeId, + [Fields.source]: edge.source, + [Fields.target]: edge.target, + [Fields.mainStat]: edge.total, // Requests + [Fields.secondaryStat]: edge.total ? (edge.seconds! / edge.total) * 1000 : Number.NaN, // Average response time }); }