mirror of
https://github.com/grafana/grafana.git
synced 2025-02-11 16:15:42 -06:00
Tempo: Add error metric to service graph (#40774)
* Add typings * Query error metric for service graph
This commit is contained in:
parent
ca4b35e596
commit
73e66d97a5
@ -209,7 +209,7 @@ const backendSrvWithPrometheus = {
|
||||
if (uid === 'prom') {
|
||||
return {
|
||||
query() {
|
||||
return of({ data: [totalsPromMetric, secondsPromMetric] });
|
||||
return of({ data: [totalsPromMetric, secondsPromMetric, failedPromMetric] });
|
||||
},
|
||||
};
|
||||
}
|
||||
@ -280,6 +280,19 @@ const secondsPromMetric = new MutableDataFrame({
|
||||
],
|
||||
});
|
||||
|
||||
// Mock frame for the failed-requests metric, returned by the mocked 'prom' data source query alongside
// totalsPromMetric and secondsPromMetric. Each row describes one client -> server edge.
const failedPromMetric = new MutableDataFrame({
|
||||
// The refId is the metric name itself.
refId: 'traces_service_graph_request_failed_total',
|
||||
fields: [
|
||||
{ name: 'Time', values: [1628169788000, 1628169788000] },
|
||||
// Row 0 is the edge app -> db, row 1 is the edge lb -> app (see the 'server' field below).
{ name: 'client', values: ['app', 'lb'] },
|
||||
{ name: 'instance', values: ['127.0.0.1:12345', '127.0.0.1:12345'] },
|
||||
{ name: 'job', values: ['local_scrape', 'local_scrape'] },
|
||||
{ name: 'server', values: ['db', 'app'] },
|
||||
{ name: 'tempo_config', values: ['default', 'default'] },
|
||||
// Value column is named `Value #<metric name>`: 2 failed requests for app -> db, 15 for lb -> app.
{ name: 'Value #traces_service_graph_request_failed_total', values: [2, 15] },
|
||||
],
|
||||
});
|
||||
|
||||
const mockInvalidJson = {
|
||||
batches: [
|
||||
{
|
||||
|
@ -292,9 +292,6 @@ function serviceMapQuery(request: DataQueryRequest<TempoQuery>, datasourceUid: s
|
||||
data: mapPromMetricsToServiceMap(responses, request.range),
|
||||
state: LoadingState.Done,
|
||||
};
|
||||
}),
|
||||
catchError((error) => {
|
||||
return of({ error: { message: error.message }, data: [] });
|
||||
})
|
||||
);
|
||||
}
|
||||
|
@ -64,16 +64,21 @@ describe('mapPromMetricsToServiceMap', () => {
|
||||
from: dateTime('2000-01-01T00:00:00'),
|
||||
to: dateTime('2000-01-01T00:01:00'),
|
||||
};
|
||||
const [nodes, edges] = mapPromMetricsToServiceMap([{ data: [totalsPromMetric, secondsPromMetric] }], {
|
||||
...range,
|
||||
raw: range,
|
||||
});
|
||||
const [nodes, edges] = mapPromMetricsToServiceMap(
|
||||
[{ data: [totalsPromMetric, secondsPromMetric, failedPromMetric] }],
|
||||
{
|
||||
...range,
|
||||
raw: range,
|
||||
}
|
||||
);
|
||||
|
||||
expect(nodes.fields).toMatchObject([
|
||||
{ name: 'id', values: new ArrayVector(['db', 'app', 'lb']) },
|
||||
{ name: 'title', values: new ArrayVector(['db', 'app', 'lb']) },
|
||||
{ name: 'mainStat', values: new ArrayVector([1000, 2000, NaN]) },
|
||||
{ name: 'secondaryStat', values: new ArrayVector([0.17, 0.33, NaN]) },
|
||||
{ name: 'arc__success', values: new ArrayVector([0.8, 0.25, 1]) },
|
||||
{ name: 'arc__failed', values: new ArrayVector([0.2, 0.75, 0]) },
|
||||
]);
|
||||
expect(edges.fields).toMatchObject([
|
||||
{ name: 'id', values: new ArrayVector(['app_db', 'lb_app']) },
|
||||
@ -134,3 +139,16 @@ const secondsPromMetric = new MutableDataFrame({
|
||||
{ name: 'Value #traces_service_graph_request_server_seconds_sum', values: [10, 40] },
|
||||
],
|
||||
});
|
||||
|
||||
// Fixture frame for the failed-requests metric, passed to mapPromMetricsToServiceMap in the test together with
// totalsPromMetric and secondsPromMetric. Each row describes one client -> server edge.
const failedPromMetric = new MutableDataFrame({
|
||||
// The refId is the metric name itself.
refId: 'traces_service_graph_request_failed_total',
|
||||
fields: [
|
||||
{ name: 'Time', values: [1628169788000, 1628169788000] },
|
||||
// Row 0 is the edge app -> db, row 1 is the edge lb -> app (see the 'server' field below).
{ name: 'client', values: ['app', 'lb'] },
|
||||
{ name: 'instance', values: ['127.0.0.1:12345', '127.0.0.1:12345'] },
|
||||
{ name: 'job', values: ['local_scrape', 'local_scrape'] },
|
||||
{ name: 'server', values: ['db', 'app'] },
|
||||
{ name: 'tempo_config', values: ['default', 'default'] },
|
||||
// Value column is named `Value #<metric name>`: 2 failed requests for app -> db, 15 for lb -> app.
{ name: 'Value #traces_service_graph_request_failed_total', values: [2, 15] },
|
||||
],
|
||||
});
|
||||
|
@ -1,8 +1,8 @@
|
||||
import { groupBy } from 'lodash';
|
||||
import {
|
||||
DataFrame,
|
||||
DataFrameView,
|
||||
DataQueryResponse,
|
||||
FieldColorModeId,
|
||||
FieldDTO,
|
||||
MutableDataFrame,
|
||||
NodeGraphDataFrameFieldNames as Fields,
|
||||
@ -10,6 +10,9 @@ import {
|
||||
} from '@grafana/data';
|
||||
import { getNonOverlappingDuration, getStats, makeFrames, makeSpanMap } from '../../../core/utils/tracing';
|
||||
|
||||
/**
|
||||
* Row in a trace dataFrame
|
||||
*/
|
||||
interface Row {
|
||||
traceID: string;
|
||||
spanID: string;
|
||||
@ -129,10 +132,12 @@ function findTraceDuration(view: DataFrameView<Row>): number {
|
||||
|
||||
// Names of the metrics used to build the service map. Each name is also the refId of the frame carrying that
// metric and appears in the name of its value column (`Value #<metric name>`).

// Summed request seconds; divided by the request total to get the average response time.
const secondsMetric = 'traces_service_graph_request_server_seconds_sum';
|
||||
// Request totals; shown as requests on edges and used for the requests-per-second stat on nodes.
const totalsMetric = 'traces_service_graph_request_total';
|
||||
// Failed request totals; used to compute the success/failed arc fractions of a node.
const failedMetric = 'traces_service_graph_request_failed_total';
|
||||
|
||||
export const serviceMapMetrics = [
|
||||
secondsMetric,
|
||||
totalsMetric,
|
||||
failedMetric,
|
||||
// We don't show histogram in node graph at the moment but we could later add that into a node context menu.
|
||||
// 'traces_service_graph_request_seconds_bucket',
|
||||
// 'traces_service_graph_request_seconds_count',
|
||||
@ -147,14 +152,15 @@ export const serviceMapMetrics = [
|
||||
* @param range
|
||||
*/
|
||||
export function mapPromMetricsToServiceMap(responses: DataQueryResponse[], range: TimeRange): [DataFrame, DataFrame] {
|
||||
const [totalsDFView, secondsDFView] = getMetricFrames(responses);
|
||||
const frames = getMetricFrames(responses);
|
||||
|
||||
// First just collect data from the metrics into a map with nodes and edges as keys
|
||||
const nodesMap: Record<string, any> = {};
|
||||
const edgesMap: Record<string, any> = {};
|
||||
const nodesMap: Record<string, ServiceMapStatistics> = {};
|
||||
const edgesMap: Record<string, EdgeObject> = {};
|
||||
// We collect request totals, summed response seconds and failed counts. There is no separate success count; it is derived from total and failed.
|
||||
collectMetricData(totalsDFView, 'total', totalsMetric, nodesMap, edgesMap);
|
||||
collectMetricData(secondsDFView, 'seconds', secondsMetric, nodesMap, edgesMap);
|
||||
collectMetricData(frames[totalsMetric], 'total', totalsMetric, nodesMap, edgesMap);
|
||||
collectMetricData(frames[secondsMetric], 'seconds', secondsMetric, nodesMap, edgesMap);
|
||||
collectMetricData(frames[failedMetric], 'failed', failedMetric, nodesMap, edgesMap);
|
||||
|
||||
return convertToDataFrames(nodesMap, edgesMap, range);
|
||||
}
|
||||
@ -172,6 +178,14 @@ function createServiceMapDataFrames() {
|
||||
name: Fields.secondaryStat,
|
||||
config: { unit: 'r/sec', displayName: 'Requests per second' },
|
||||
},
|
||||
{
|
||||
name: Fields.arc + 'success',
|
||||
config: { displayName: 'Success', color: { fixedColor: 'green', mode: FieldColorModeId.Fixed } },
|
||||
},
|
||||
{
|
||||
name: Fields.arc + 'failed',
|
||||
config: { displayName: 'Failed', color: { fixedColor: 'red', mode: FieldColorModeId.Fixed } },
|
||||
},
|
||||
]);
|
||||
const edges = createDF('Edges', [
|
||||
{ name: Fields.id },
|
||||
@ -184,13 +198,29 @@ function createServiceMapDataFrames() {
|
||||
return [nodes, edges];
|
||||
}
|
||||
|
||||
function getMetricFrames(responses: DataQueryResponse[]) {
|
||||
const responsesMap = groupBy(responses[0].data, (data) => data.refId);
|
||||
const totalsDFView = new DataFrameView(responsesMap[totalsMetric][0]);
|
||||
const secondsDFView = new DataFrameView(responsesMap[secondsMetric][0]);
|
||||
return [totalsDFView, secondsDFView];
|
||||
/**
|
||||
* Group frames from response based on ref id which is set the same as the metric name so we know which metric is where
|
||||
* and also put it into DataFrameView so it's easier to work with.
|
||||
* @param responses
|
||||
*/
|
||||
function getMetricFrames(responses: DataQueryResponse[]): Record<string, DataFrameView> {
|
||||
return responses[0].data.reduce<Record<string, DataFrameView>>((acc, frame) => {
|
||||
acc[frame.refId] = new DataFrameView(frame);
|
||||
return acc;
|
||||
}, {});
|
||||
}
|
||||
|
||||
/**
 * Sums of metric values accumulated for a single node or edge of the service map.
 * Every field is optional because it only appears once the matching metric has been collected.
 */
type ServiceMapStatistics = {
|
||||
// Sum of `traces_service_graph_request_total` values (number of requests).
total?: number;
|
||||
// Sum of `traces_service_graph_request_server_seconds_sum` values.
seconds?: number;
|
||||
// Sum of `traces_service_graph_request_failed_total` values.
failed?: number;
|
||||
};
|
||||
|
||||
/**
 * Statistics of one edge together with the ids of the two nodes it connects.
 */
type EdgeObject = ServiceMapStatistics & {
|
||||
// Id of the calling service (the `client` value of the metric row).
source: string;
|
||||
// Id of the called service (the `server` value of the metric row).
target: string;
|
||||
};
|
||||
|
||||
/**
|
||||
* Collect data from a metric into a map of nodes and edges. The metric data is modeled as counts of metric per edge
|
||||
* which is a pair of client-server nodes. This means we convert each row of the metric 1-1 to edges and then we assign
|
||||
@ -203,12 +233,16 @@ function getMetricFrames(responses: DataQueryResponse[]) {
|
||||
* @param edgesMap
|
||||
*/
|
||||
function collectMetricData(
|
||||
frame: DataFrameView,
|
||||
stat: 'total' | 'seconds',
|
||||
frame: DataFrameView | undefined,
|
||||
stat: keyof ServiceMapStatistics,
|
||||
metric: string,
|
||||
nodesMap: Record<string, any>,
|
||||
edgesMap: Record<string, any>
|
||||
nodesMap: Record<string, ServiceMapStatistics>,
|
||||
edgesMap: Record<string, EdgeObject>
|
||||
) {
|
||||
if (!frame) {
|
||||
return;
|
||||
}
|
||||
|
||||
// The name of the value column is in this format
|
||||
// TODO figure out if it can be changed
|
||||
const valueName = `Value #${metric}`;
|
||||
@ -218,24 +252,32 @@ function collectMetricData(
|
||||
const edgeId = `${row.client}_${row.server}`;
|
||||
|
||||
if (!edgesMap[edgeId]) {
|
||||
// Create edge as it does not exist yet
|
||||
edgesMap[edgeId] = {
|
||||
target: row.server,
|
||||
source: row.client,
|
||||
[stat]: row[valueName],
|
||||
};
|
||||
} else {
|
||||
// Add stat to edge
|
||||
// We sum the values if the stat already exists, but that should not happen in general as there should be a single row for
|
||||
// an edge.
|
||||
edgesMap[edgeId][stat] = (edgesMap[edgeId][stat] || 0) + row[valueName];
|
||||
}
|
||||
|
||||
if (!nodesMap[row.server]) {
|
||||
// Create node for server
|
||||
nodesMap[row.server] = {
|
||||
[stat]: row[valueName],
|
||||
};
|
||||
} else {
|
||||
// Add stat to server node. Sum up values if there are multiple edges targeting this server node.
|
||||
nodesMap[row.server][stat] = (nodesMap[row.server][stat] || 0) + row[valueName];
|
||||
}
|
||||
|
||||
if (!nodesMap[row.client]) {
|
||||
// Create the client node but don't add the stat as edge stats are attributed to the server node. This means for
|
||||
// example that the number of requests in a node show how many requests it handled not how many it generated.
|
||||
nodesMap[row.client] = {
|
||||
[stat]: 0,
|
||||
};
|
||||
@ -244,8 +286,8 @@ function collectMetricData(
|
||||
}
|
||||
|
||||
function convertToDataFrames(
|
||||
nodesMap: Record<string, any>,
|
||||
edgesMap: Record<string, any>,
|
||||
nodesMap: Record<string, ServiceMapStatistics>,
|
||||
edgesMap: Record<string, EdgeObject>,
|
||||
range: TimeRange
|
||||
): [DataFrame, DataFrame] {
|
||||
const rangeMs = range.to.valueOf() - range.from.valueOf();
|
||||
@ -253,22 +295,24 @@ function convertToDataFrames(
|
||||
for (const nodeId of Object.keys(nodesMap)) {
|
||||
const node = nodesMap[nodeId];
|
||||
nodes.add({
|
||||
id: nodeId,
|
||||
title: nodeId,
|
||||
[Fields.id]: nodeId,
|
||||
[Fields.title]: nodeId,
|
||||
// NaN will not be shown in the node graph. This happens for a root client node which did not process
|
||||
// any requests itself.
|
||||
mainStat: node.total ? (node.seconds / node.total) * 1000 : Number.NaN, // Average response time
|
||||
secondaryStat: node.total ? Math.round((node.total / (rangeMs / 1000)) * 100) / 100 : Number.NaN, // Request per second (to 2 decimals)
|
||||
[Fields.mainStat]: node.total ? (node.seconds! / node.total) * 1000 : Number.NaN, // Average response time
|
||||
[Fields.secondaryStat]: node.total ? Math.round((node.total / (rangeMs / 1000)) * 100) / 100 : Number.NaN, // Request per second (to 2 decimals)
|
||||
[Fields.arc + 'success']: node.total ? (node.total - (node.failed || 0)) / node.total : 1,
|
||||
[Fields.arc + 'failed']: node.total ? (node.failed || 0) / node.total : 0,
|
||||
});
|
||||
}
|
||||
for (const edgeId of Object.keys(edgesMap)) {
|
||||
const edge = edgesMap[edgeId];
|
||||
edges.add({
|
||||
id: edgeId,
|
||||
source: edge.source,
|
||||
target: edge.target,
|
||||
mainStat: edge.total, // Requests
|
||||
secondaryStat: edge.total ? (edge.seconds / edge.total) * 1000 : Number.NaN, // Average response time
|
||||
[Fields.id]: edgeId,
|
||||
[Fields.source]: edge.source,
|
||||
[Fields.target]: edge.target,
|
||||
[Fields.mainStat]: edge.total, // Requests
|
||||
[Fields.secondaryStat]: edge.total ? (edge.seconds! / edge.total) * 1000 : Number.NaN, // Average response time
|
||||
});
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user