Traces: APM table (#48654)

* APM table

* Remove unnecessary changes in Explore.tsx

* Enhancements

* Show empty table if filter returns no results for table

* Error checking

* Combine extra filter options

* Convert service map query to apm query for apm requests

* Simplify links

* Better query building

* Extract method and clean up

* Upgrade filter

* Self review

* Export method

* Update test

* Switch area-chart to lcd-gauge

* Remove AreaChartCell

* Colors, units and other UI upgrades

* Remove 2 queries and reuse existing queries

* Nested observables

* rate/error rate/duration links: open new pane with range query and exemplars turned on

* Align error rate values to rate values col according to span_name

* Rearrange code

* Several improvements

* Filter by span_name

* align the table col values to the same row name (rateName) across the table

* On click tempo link filter by span_name

* Filtering updates

* Ensure serviceQuery null is ok

* Update link expr

* Update duration p90 unit

* Tempo links case

* Update tests

* Self review

* Tests

* Empty state

* Remove some code

* Test: should build expr correctly

* Test: should build link expr correctly

* Test: should get rate aligned values correctly

* Test: should make apm request correctly

* Test: should make tempo link correctly

* Move apm table tests to its own describe

* Feature toggle

* Added to docs for APM table

* Add screenshot to APM table section in docs

* Add feature toggle to test

* Split queries and make method names clearer

* Doc changes

* Add back in config import removed by merge

* Update tempo link and tests

* Update filter

* Set field filterable: false

* Updated doc

* Remove @end from queries

* Add back in tempo link
This commit is contained in:
Joey Tawadrous 2022-06-09 17:56:15 +01:00 committed by GitHub
parent e046f14f36
commit 4ed7ff2ed1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 683 additions and 61 deletions

View File

@ -174,10 +174,10 @@ To display the service graph:
- [Configure the Grafana Agent](https://grafana.com/docs/tempo/next/grafana-agent/service-graphs/#quickstart) to generate service graph data
- Link a Prometheus datasource in the Tempo datasource settings.
- Navigate to [Explore]({{< relref "../explore/" >}})
- Select the Tempo datasource
- Select the **Service Graph** query type and run the query
- Optionally, filter by service name
- Navigate to [Explore]({{< relref "../explore/" >}}).
- Select the Tempo datasource.
- Select the **Service Graph** query type and run the query.
- (Optional): filter by service name.
You can pan and zoom the view with buttons or your mouse. For details about the visualization, refer to [Node graph panel](https://grafana.com/docs/grafana/latest/panels/visualizations/node-graph/).
@ -192,6 +192,25 @@ The color of each circle represents the percentage of requests in each of the fo
Click on the service to see a context menu with additional links for quick navigation to other relevant information.
## APM table
The APM (Application Performance Management) table allows you to view several APM metrics out of the box.
To display the APM table:
1. Enable the `tempoApmTable` feature toggle in your Grafana configuration (ini) file.
1. Link a Prometheus datasource in the Tempo datasource settings.
1. Navigate to [Explore]({{< relref "../explore/_index.md" >}}).
1. Select the Tempo datasource.
1. Select the **Service Graph** query type and run the query.
1. (Optional): filter your results.
Note: The `traces_spanmetrics_calls_total` metric is used to display the name, rate, and error rate columns, and the `traces_spanmetrics_duration_seconds_bucket` metric is used to display the duration column. Both metrics must exist in your Prometheus data source.
Click a row in the rate, error rate, or duration columns to open a query in Prometheus with the span name of that row automatically set in the query. Click a row in the links column to open a query in Tempo with the span name of that row automatically set in the query.
{{< figure src="/static/img/docs/tempo/apm-table.png" class="docs-image--no-shadow" max-width="500px" caption="Screenshot of the Tempo APM table" >}}
## Linking Trace ID from logs
You can link to a Tempo trace from logs in Loki or Elastic by configuring an internal link. See the [Derived fields]({{< relref "loki/#derived-fields" >}}) section in the [Loki data source]({{< relref "loki/" >}}) or [Data links]({{< relref "elasticsearch/#data-links" >}}) section in the [Elastic data source]({{< relref "elasticsearch/" >}}) for configuration instructions.

View File

@ -28,6 +28,7 @@ export interface FeatureToggles {
queryOverLive?: boolean;
panelTitleSearch?: boolean;
tempoServiceGraph?: boolean;
tempoApmTable?: boolean;
prometheus_azure_auth?: boolean;
prometheusAzureOverrideAudience?: boolean;
influxdbBackendMigration?: boolean;

View File

@ -72,6 +72,12 @@ var (
State: FeatureStateBeta,
FrontendOnly: true,
},
{
Name: "tempoApmTable",
Description: "Show APM table",
State: FeatureStateAlpha,
FrontendOnly: true,
},
{
Name: "prometheus_azure_auth",
Description: "Experimental. Azure authentication for Prometheus datasource",

View File

@ -55,6 +55,10 @@ const (
// show service
FlagTempoServiceGraph = "tempoServiceGraph"
// FlagTempoApmTable
// Show APM table
FlagTempoApmTable = "tempoApmTable"
// FlagPrometheusAzureAuth
// Experimental. Azure authentication for Prometheus datasource
FlagPrometheusAzureAuth = "prometheus_azure_auth"

View File

@ -837,7 +837,10 @@ export class PrometheusDatasource
const seriesLabels: Array<Record<string, string[]>> = await Promise.all(
options.series.map((series: string) => this.languageProvider.fetchSeriesLabels(series))
);
const uniqueLabels = [...new Set(...seriesLabels.map((value) => Object.keys(value)))];
// Combines tags from all options.series provided
let tags: string[] = [];
seriesLabels.map((value) => (tags = tags.concat(Object.keys(value))));
const uniqueLabels = [...new Set(tags)];
return uniqueLabels.map((value: any) => ({ text: value }));
} else {
// Get all tags

View File

@ -19,7 +19,6 @@ import {
DataQueryResponse,
DataQueryRequest,
PreferredVisualisationType,
CoreApp,
DataFrameType,
} from '@grafana/data';
import { FetchResponse, getDataSourceSrv, getTemplateSrv } from '@grafana/runtime';
@ -48,10 +47,7 @@ interface TimeAndValue {
const isTableResult = (dataFrame: DataFrame, options: DataQueryRequest<PromQuery>): boolean => {
// We want to process vector and scalar results in Explore as table
if (
options.app === CoreApp.Explore &&
(dataFrame.meta?.custom?.resultType === 'vector' || dataFrame.meta?.custom?.resultType === 'scalar')
) {
if (dataFrame.meta?.custom?.resultType === 'vector' || dataFrame.meta?.custom?.resultType === 'scalar') {
return true;
}

View File

@ -79,6 +79,9 @@ class TempoQueryFieldComponent extends React.PureComponent<Props> {
if (config.featureToggles.tempoServiceGraph) {
queryTypeOptions.push({ value: 'serviceMap', label: 'Service Graph' });
// span names in Tempo search links (generated on the service graph page) are in camel case (for Prometheus queries)
// but the span name dropdown menu in the search tab is lower case
query.spanName = query.spanName?.toLowerCase();
}
if (!datasource?.search?.hide) {

View File

@ -3,6 +3,7 @@ import React, { useEffect, useState } from 'react';
import useAsync from 'react-use/lib/useAsync';
import { GrafanaTheme2 } from '@grafana/data';
import { config } from '@grafana/runtime';
import { Alert, InlineField, InlineFieldRow, useStyles2 } from '@grafana/ui';
import { AdHocFilter } from '../../../../features/variables/adhoc/picker/AdHocFilter';
@ -71,11 +72,9 @@ export function ServiceGraphSection({
datasource={{ uid: graphDatasourceUid }}
filters={filters}
getTagKeysOptions={{
series: [
'traces_service_graph_request_server_seconds_sum',
'traces_service_graph_request_total',
'traces_service_graph_request_failed_total',
],
series: config.featureToggles.tempoApmTable
? ['traces_service_graph_request_total', 'traces_spanmetrics_calls_total']
: ['traces_service_graph_request_total'],
}}
addFilter={(filter: AdHocVariableFilter) => {
onChange({

View File

@ -2,6 +2,7 @@ import { lastValueFrom, Observable, of } from 'rxjs';
import { createFetchResponse } from 'test/helpers/createFetchResponse';
import {
ArrayVector,
DataFrame,
dataFrameToJSON,
DataSourceInstanceSettings,
@ -12,8 +13,19 @@ import {
PluginType,
} from '@grafana/data';
import { BackendDataSourceResponse, FetchResponse, setBackendSrv, setDataSourceSrv } from '@grafana/runtime';
import config from 'app/core/config';
import { DEFAULT_LIMIT, TempoJsonData, TempoDatasource, TempoQuery } from './datasource';
import {
DEFAULT_LIMIT,
TempoJsonData,
TempoDatasource,
TempoQuery,
buildExpr,
buildLinkExpr,
getRateAlignedValues,
makeApmRequest,
makeTempoLink,
} from './datasource';
import mockJson from './mockJsonResponse.json';
import mockServiceGraph from './mockServiceGraph.json';
@ -156,34 +168,6 @@ describe('Tempo data source', () => {
]);
});
it('runs service graph queries', async () => {
const ds = new TempoDatasource({
...defaultSettings,
jsonData: {
serviceMap: {
datasourceUid: 'prom',
},
},
});
setDataSourceSrv(backendSrvWithPrometheus as any);
const response = await lastValueFrom(
ds.query({ targets: [{ queryType: 'serviceMap' }], range: getDefaultTimeRange() } as any)
);
expect(response.data).toHaveLength(2);
expect(response.data[0].name).toBe('Nodes');
expect(response.data[0].fields[0].values.length).toBe(3);
// Test Links
expect(response.data[0].fields[0].config.links.length).toBe(4);
expect(response.data[0].fields[0].config.links).toEqual(serviceGraphLinks);
expect(response.data[1].name).toBe('Edges');
expect(response.data[1].fields[0].values.length).toBe(2);
expect(response.state).toBe(LoadingState.Done);
});
it('should handle json file upload', async () => {
const ds = new TempoDatasource(defaultSettings);
ds.uploadedJson = JSON.stringify(mockJson);
@ -356,12 +340,235 @@ describe('Tempo data source', () => {
});
});
describe('Tempo apm table', () => {
// End-to-end check of a `serviceMap` query with the APM table feature toggle on:
// the response must contain the APM table frame followed by the service graph Nodes and Edges frames.
it('runs service graph queries', async () => {
const ds = new TempoDatasource({
...defaultSettings,
jsonData: {
serviceMap: {
datasourceUid: 'prom',
},
},
});
// NOTE(review): the toggle is switched on here and is not restored inside this test.
config.featureToggles.tempoApmTable = true;
setDataSourceSrv(backendSrvWithPrometheus as any);
const response = await lastValueFrom(
ds.query({ targets: [{ queryType: 'serviceMap' }], range: getDefaultTimeRange() } as any)
);
// Three frames: APM table, Nodes, Edges
expect(response.data).toHaveLength(3);
expect(response.state).toBe(LoadingState.Done);
// APM table
// fields[0]: span names
expect(response.data[0].fields[0].name).toBe('Name');
expect(response.data[0].fields[0].values.toArray().length).toBe(2);
expect(response.data[0].fields[0].values.toArray()[0]).toBe('HTTP Client');
expect(response.data[0].fields[0].values.toArray()[1]).toBe('HTTP GET - root');
// fields[1]: rate values plus a Prometheus range-query link filtered by the row's span name
expect(response.data[0].fields[1].name).toBe('Rate');
expect(response.data[0].fields[1].values.toArray().length).toBe(2);
expect(response.data[0].fields[1].values.toArray()[0]).toBe(12.75164671814457);
expect(response.data[0].fields[1].values.toArray()[1]).toBe(12.121331111401608);
expect(response.data[0].fields[1].config.decimals).toBe(2);
expect(response.data[0].fields[1].config.links[0].title).toBe('Rate');
expect(response.data[0].fields[1].config.links[0].internal.query.expr).toBe(
'topk(5, sum(rate(traces_spanmetrics_calls_total{span_name="${__data.fields[0]}"}[$__rate_interval])) by (span_name))'
);
expect(response.data[0].fields[1].config.links[0].internal.query.range).toBe(true);
expect(response.data[0].fields[1].config.links[0].internal.query.exemplar).toBe(true);
expect(response.data[0].fields[1].config.links[0].internal.query.instant).toBe(false);
// fields[2]: the same rate values rendered as an lcd-gauge
expect(response.data[0].fields[2].values.toArray().length).toBe(2);
expect(response.data[0].fields[2].values.toArray()[0]).toBe(12.75164671814457);
expect(response.data[0].fields[2].values.toArray()[1]).toBe(12.121331111401608);
expect(response.data[0].fields[2].config.color.mode).toBe('continuous-BlPu');
expect(response.data[0].fields[2].config.custom.displayMode).toBe('lcd-gauge');
expect(response.data[0].fields[2].config.decimals).toBe(3);
// fields[3]: error rate values (a plain array, hence `.values[...]` rather than `.toArray()`)
expect(response.data[0].fields[3].name).toBe('Error Rate');
expect(response.data[0].fields[3].values.length).toBe(2);
expect(response.data[0].fields[3].values[0]).toBe(3.75164671814457);
expect(response.data[0].fields[3].values[1]).toBe(3.121331111401608);
expect(response.data[0].fields[3].config.decimals).toBe(2);
expect(response.data[0].fields[3].config.links[0].title).toBe('Error Rate');
expect(response.data[0].fields[3].config.links[0].internal.query.expr).toBe(
'topk(5, sum(rate(traces_spanmetrics_calls_total{span_status="STATUS_CODE_ERROR",span_name="${__data.fields[0]}"}[$__rate_interval])) by (span_name))'
);
expect(response.data[0].fields[3].config.links[0].internal.query.range).toBe(true);
expect(response.data[0].fields[3].config.links[0].internal.query.exemplar).toBe(true);
expect(response.data[0].fields[3].config.links[0].internal.query.instant).toBe(false);
// fields[4]: the same error rate values rendered as an lcd-gauge
expect(response.data[0].fields[4].values.length).toBe(2);
expect(response.data[0].fields[4].values[0]).toBe(3.75164671814457);
expect(response.data[0].fields[4].values[1]).toBe(3.121331111401608);
expect(response.data[0].fields[4].config.color.mode).toBe('continuous-RdYlGr');
expect(response.data[0].fields[4].config.custom.displayMode).toBe('lcd-gauge');
expect(response.data[0].fields[4].config.decimals).toBe(3);
// fields[5]: p90 duration; the fixture only has a sample for 'HTTP GET - root',
// so the first row is expected to hold the string filler '0'
expect(response.data[0].fields[5].name).toBe('Duration (p90)');
expect(response.data[0].fields[5].values.length).toBe(2);
expect(response.data[0].fields[5].values[0]).toBe('0');
expect(response.data[0].fields[5].values[1]).toBe(0.12003505696757232);
expect(response.data[0].fields[5].config.unit).toBe('s');
expect(response.data[0].fields[5].config.links[0].title).toBe('Duration');
expect(response.data[0].fields[5].config.links[0].internal.query.expr).toBe(
'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",span_name="${__data.fields[0]}"}[$__rate_interval])) by (le))'
);
expect(response.data[0].fields[5].config.links[0].internal.query.range).toBe(true);
expect(response.data[0].fields[5].config.links[0].internal.query.exemplar).toBe(true);
expect(response.data[0].fields[5].config.links[0].internal.query.instant).toBe(false);
// fields[6]: link to a Tempo native search filtered by the row's span name
expect(response.data[0].fields[6].config.links[0].url).toBe('');
expect(response.data[0].fields[6].config.links[0].title).toBe('Tempo');
expect(response.data[0].fields[6].config.links[0].internal.query.queryType).toBe('nativeSearch');
expect(response.data[0].fields[6].config.links[0].internal.query.spanName).toBe('${__data.fields[0]}');
// Service graph
expect(response.data[1].name).toBe('Nodes');
expect(response.data[1].fields[0].values.length).toBe(3);
expect(response.data[1].fields[0].config.links.length).toBeGreaterThan(0);
expect(response.data[1].fields[0].config.links).toEqual(serviceGraphLinks);
expect(response.data[2].name).toBe('Edges');
expect(response.data[2].fields[0].values.length).toBe(2);
});
// Verifies how buildExpr fills the `{}` placeholder of a metric expression with label matchers.
it('should build expr correctly', () => {
let targets = { targets: [{ queryType: 'serviceMap' }] } as any;
// Case 1: no metric params, no extra params, no service map filter -> expression is unchanged
let builtQuery = buildExpr(
{ expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))', params: [] },
'',
targets
);
expect(builtQuery).toBe('topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))');
// Case 2: metric params come first, followed by the extra span_name matcher
builtQuery = buildExpr(
{
expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))',
params: ['span_status="STATUS_CODE_ERROR"'],
},
'span_name=~"HTTP Client|HTTP GET|HTTP GET - root|HTTP POST|HTTP POST - post"',
targets
);
expect(builtQuery).toBe(
'topk(5, sum(rate(traces_spanmetrics_calls_total{span_status="STATUS_CODE_ERROR",span_name=~"HTTP Client|HTTP GET|HTTP GET - root|HTTP POST|HTTP POST - post"}[$__range])) by (span_name))'
);
// Case 3: same composition for the duration histogram expression
builtQuery = buildExpr(
{
expr: 'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{}[$__range])) by (le))',
params: ['span_status="STATUS_CODE_ERROR"'],
},
'span_name=~"HTTP Client"',
targets
);
expect(builtQuery).toBe(
'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",span_name=~"HTTP Client"}[$__range])) by (le))'
);
// Case 4: a service map filter is carried over with its `client` label renamed to `service`,
// which here produces two identical `service="app"` matchers
targets = { targets: [{ queryType: 'serviceMap', serviceMapQuery: '{client="app",service="app"}' }] } as any;
builtQuery = buildExpr(
{ expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))', params: [] },
'',
targets
);
expect(builtQuery).toBe(
'topk(5, sum(rate(traces_spanmetrics_calls_total{service="app",service="app"}[$__range])) by (span_name))'
);
});
// Link expressions use `$__rate_interval` in place of the `$__range` used by the table queries.
it('should build link expr correctly', () => {
let builtQuery = buildLinkExpr('topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))');
expect(builtQuery).toBe('topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__rate_interval])) by (span_name))');
});
// Values keyed by span name must line up with the span names of the rate frame;
// span names without a value are expected to be filled with 0.
it('should get rate aligned values correctly', () => {
// Rate frame: five span names (fields[1]) that define the table rows
const resp = [
{
refId:
'topk(5, sum(rate(traces_spanmetrics_calls_total{service="app",service="app"}[$__range])) by (span_name))',
fields: [
{
name: 'Time',
type: 'time',
config: {},
values: [1653828275000, 1653828275000, 1653828275000, 1653828275000, 1653828275000],
},
{
name: 'span_name',
config: {
filterable: true,
},
type: 'string',
values: new ArrayVector(['HTTP Client', 'HTTP GET', 'HTTP GET - root', 'HTTP POST', 'HTTP POST - post']),
},
],
},
];
// Values for only three of the five span names, deliberately listed out of row order
const objToAlign = {
'HTTP GET - root': {
name: 'HTTP GET - root',
value: 0.2724936652307618,
},
'HTTP GET': {
name: 'HTTP GET',
value: 0.2724936652307618,
},
'HTTP POST - post': {
name: 'HTTP POST - post',
value: 0.03697421858453128,
},
};
let value = getRateAlignedValues(resp, objToAlign as any);
// 'HTTP Client' and 'HTTP POST' have no value, so their rows are 0
expect(value.toString()).toBe('0,0.2724936652307618,0.2724936652307618,0,0.03697421858453128');
});
// Each expression must become an instant query target whose refId equals the expression itself.
it('should make apm request correctly', () => {
// NOTE(review): the first expression ends with a stray `"`; it is echoed unchanged in the expectation below.
const apmRequest = makeApmRequest([
'topk(5, sum(rate(traces_spanmetrics_calls_total{service="app"}[$__range])) by (span_name))"',
'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",service="app",service="app",span_name=~"HTTP Client"}[$__range])) by (le))',
]);
expect(apmRequest).toEqual([
{
refId: 'topk(5, sum(rate(traces_spanmetrics_calls_total{service="app"}[$__range])) by (span_name))"',
expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{service="app"}[$__range])) by (span_name))"',
instant: true,
},
{
refId:
'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",service="app",service="app",span_name=~"HTTP Client"}[$__range])) by (le))',
expr: 'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",service="app",service="app",span_name=~"HTTP Client"}[$__range])) by (le))',
instant: true,
},
]);
});
// With an empty service name and a span name, the link query must carry only `spanName`
// (no `serviceName` key) on a `nativeSearch` query.
it('should make tempo link correctly', () => {
const tempoLink = makeTempoLink('Tempo', '', '"${__data.fields[0]}"', 'gdev-tempo');
expect(tempoLink).toEqual({
url: '',
title: 'Tempo',
internal: {
query: {
queryType: 'nativeSearch',
spanName: '"${__data.fields[0]}"',
},
datasourceUid: 'gdev-tempo',
datasourceName: 'Tempo',
},
});
});
});
const backendSrvWithPrometheus = {
async get(uid: string) {
if (uid === 'prom') {
return {
query() {
return of({ data: [totalsPromMetric, secondsPromMetric, failedPromMetric] });
return of({
data: [rateMetric, errorRateMetric, durationMetric, totalsPromMetric, secondsPromMetric, failedPromMetric],
});
},
};
}
@ -406,6 +613,43 @@ const defaultSettings: DataSourceInstanceSettings<TempoJsonData> = {
},
};
// Mock Prometheus frame for the APM rate query: one row per span name.
// The refId is the query expression itself, which is how frames are matched back to their query.
const rateMetric = new MutableDataFrame({
refId: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))',
fields: [
{ name: 'Time', values: [1653725618609, 1653725618609] },
{ name: 'span_name', values: ['HTTP Client', 'HTTP GET - root'] },
{
name: 'Value #topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))',
values: [12.75164671814457, 12.121331111401608],
},
],
});
// Mock Prometheus frame for the APM error rate query, restricted to the span names of the rate frame.
// NOTE(review): the value field name omits the span_name matcher that the refId contains.
const errorRateMetric = new MutableDataFrame({
refId:
'topk(5, sum(rate(traces_spanmetrics_calls_total{span_status="STATUS_CODE_ERROR",span_name=~"HTTP Client|HTTP GET - root"}[$__range])) by (span_name))',
fields: [
{ name: 'Time', values: [1653725618609, 1653725618609] },
{ name: 'span_name', values: ['HTTP Client', 'HTTP GET - root'] },
{
name: 'Value #topk(5, sum(rate(traces_spanmetrics_calls_total{span_status="STATUS_CODE_ERROR"}[$__range])) by (span_name))',
values: [3.75164671814457, 3.121331111401608],
},
],
});
// Mock Prometheus frame for the APM duration query of a single span name ('HTTP GET - root').
// There is no span_name field here: the span name is only present inside the refId.
const durationMetric = new MutableDataFrame({
refId:
'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",span_name=~"HTTP GET - root"}[$__range])) by (le))',
fields: [
{ name: 'Time', values: [1653725618609] },
{
name: 'Value #histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{span_status="STATUS_CODE_ERROR",span_name=~"HTTP GET - root"}[$__range])) by (le))',
values: [0.12003505696757232],
},
],
});
const totalsPromMetric = new MutableDataFrame({
refId: 'traces_service_graph_request_total',
fields: [
@ -485,6 +729,9 @@ const serviceGraphLinks = [
internal: {
query: {
expr: 'rate(traces_service_graph_request_total{server="${__data.fields.id}"}[$__rate_interval])',
instant: false,
range: true,
exemplar: true,
},
datasourceUid: 'prom',
datasourceName: 'Prometheus',
@ -496,6 +743,9 @@ const serviceGraphLinks = [
internal: {
query: {
expr: 'histogram_quantile(0.9, sum(rate(traces_service_graph_request_server_seconds_bucket{server="${__data.fields.id}"}[$__rate_interval])) by (le, client, server))',
instant: false,
range: true,
exemplar: true,
},
datasourceUid: 'prom',
datasourceName: 'Prometheus',
@ -507,6 +757,9 @@ const serviceGraphLinks = [
internal: {
query: {
expr: 'rate(traces_service_graph_request_failed_total{server="${__data.fields.id}"}[$__rate_interval])',
instant: false,
range: true,
exemplar: true,
},
datasourceUid: 'prom',
datasourceName: 'Prometheus',

View File

@ -1,19 +1,22 @@
import { identity, pick, pickBy, groupBy, startCase } from 'lodash';
import { EMPTY, from, merge, Observable, of, throwError } from 'rxjs';
import { catchError, map, mergeMap, toArray } from 'rxjs/operators';
import { catchError, concatMap, map, mergeMap, toArray } from 'rxjs/operators';
import {
DataQuery,
DataQueryRequest,
DataQueryResponse,
DataQueryResponseData,
DataSourceApi,
DataSourceInstanceSettings,
DataSourceJsonData,
FieldType,
isValidGoDuration,
LoadingState,
ScopedVars,
} from '@grafana/data';
import {
config,
BackendSrvRequest,
DataSourceWithBackend,
getBackendSrv,
@ -36,6 +39,9 @@ import {
mapPromMetricsToServiceMap,
serviceMapMetrics,
totalsMetric,
rateMetric,
durationMetric,
errorRateMetric,
} from './graphTransform';
import {
transformTrace,
@ -208,7 +214,20 @@ export class TempoDatasource extends DataSourceWithBackend<TempoQuery, TempoJson
}
if (this.serviceMap?.datasourceUid && targets.serviceMap?.length > 0) {
subQueries.push(serviceMapQuery(options, this.serviceMap.datasourceUid, this.name));
const dsId = this.serviceMap.datasourceUid;
if (config.featureToggles.tempoApmTable) {
subQueries.push(
serviceMapQuery(options, dsId, this.name).pipe(
concatMap((result) =>
rateQuery(options, result, dsId).pipe(
concatMap((result) => errorAndDurationQuery(options, result, dsId, this.name))
)
)
)
);
} else {
subQueries.push(serviceMapQuery(options, dsId, this.name));
}
}
if (targets.traceId?.length > 0) {
@ -394,7 +413,7 @@ export class TempoDatasource extends DataSourceWithBackend<TempoQuery, TempoJson
};
}
function queryServiceMapPrometheus(request: DataQueryRequest<PromQuery>, datasourceUid: string) {
function queryPrometheus(request: DataQueryRequest<PromQuery>, datasourceUid: string) {
return from(getDatasourceSrv().get(datasourceUid)).pipe(
mergeMap((ds) => {
return (ds as PrometheusDatasource).query(request);
@ -403,7 +422,9 @@ function queryServiceMapPrometheus(request: DataQueryRequest<PromQuery>, datasou
}
function serviceMapQuery(request: DataQueryRequest<TempoQuery>, datasourceUid: string, tempoDatasourceUid: string) {
return queryServiceMapPrometheus(makePromServiceMapRequest(request), datasourceUid).pipe(
const serviceMapRequest = makePromServiceMapRequest(request);
return queryPrometheus(serviceMapRequest, datasourceUid).pipe(
// Just collect all the responses first before processing into node graph data
toArray(),
map((responses: DataQueryResponse[]) => {
@ -418,19 +439,22 @@ function serviceMapQuery(request: DataQueryRequest<TempoQuery>, datasourceUid: s
makePromLink(
'Request rate',
`rate(${totalsMetric}{server="\${__data.fields.id}"}[$__rate_interval])`,
datasourceUid
datasourceUid,
false
),
makePromLink(
'Request histogram',
`histogram_quantile(0.9, sum(rate(${histogramMetric}{server="\${__data.fields.id}"}[$__rate_interval])) by (le, client, server))`,
datasourceUid
datasourceUid,
false
),
makePromLink(
'Failed request rate',
`rate(${failedMetric}{server="\${__data.fields.id}"}[$__rate_interval])`,
datasourceUid
datasourceUid,
false
),
makeTempoLink('View traces', `\${__data.fields[0]}`, tempoDatasourceUid),
makeTempoLink('View traces', `\${__data.fields[0]}`, '', tempoDatasourceUid),
],
};
@ -442,13 +466,99 @@ function serviceMapQuery(request: DataQueryRequest<TempoQuery>, datasourceUid: s
);
}
function makePromLink(title: string, metric: string, datasourceUid: string) {
/**
 * Runs the APM rate query against the linked Prometheus data source.
 *
 * The emitted response bundles the frames of the rate query (or an empty list when nothing came back)
 * with the first two frames of the already computed service map response.
 *
 * @param request - the original Tempo request; its first target's service map filter is applied to the rate query
 * @param serviceMapResponse - result of the service map query whose first two frames are passed through
 * @param datasourceUid - uid of the Prometheus data source to query
 * @throws Error with the message of the first Prometheus response that carries an error
 */
function rateQuery(
  request: DataQueryRequest<TempoQuery>,
  serviceMapResponse: DataQueryResponse,
  datasourceUid: string
) {
  const promRequest = makePromServiceMapRequest(request);
  promRequest.targets = makeApmRequest([buildExpr(rateMetric, '', request)]);

  // Invoked once with every Prometheus response after the query stream has completed.
  const toCombinedResponse = (responses: DataQueryResponse[]) => {
    const failed = responses.find((response) => !!response.error);
    if (failed) {
      throw new Error(failed.error!.message);
    }

    const rateFrames = responses[0]?.data ?? [];
    return {
      data: [rateFrames, serviceMapResponse.data[0], serviceMapResponse.data[1]],
      state: LoadingState.Done,
    };
  };

  return queryPrometheus(promRequest, datasourceUid).pipe(toArray(), map(toCombinedResponse));
}
/**
 * Runs the APM error rate and duration queries and assembles the final response.
 *
 * The span names returned by the rate query decide what is queried here: one error rate query
 * covering all span names, and one duration query per span name.
 *
 * @param request - the original Tempo request
 * @param rateResponse - output of the rate query step: `data[0]` holds the rate frames,
 *   `data[1]` and `data[2]` the two service map frames that are passed through
 * @param datasourceUid - uid of the Prometheus data source to query
 * @param tempoDatasourceUid - identifier of the Tempo data source used for the table's Tempo links
 * @throws Error with the message of the first Prometheus response that carries an error
 */
function errorAndDurationQuery(
  request: DataQueryRequest<TempoQuery>,
  rateResponse: DataQueryResponse,
  datasourceUid: string,
  tempoDatasourceUid: string
) {
  const spanNames = rateResponse.data[0][0]?.fields[1]?.values.toArray() ?? [];
  const hasSpanNames = spanNames.length > 0;

  // A single error rate expression matches every span name at once.
  const errorRateBySpanName = hasSpanNames
    ? buildExpr(errorRateMetric, 'span_name=~"' + spanNames.join('|') + '"', request)
    : '';
  // Duration is queried separately for each span name.
  const durationsBySpanName: string[] = hasSpanNames
    ? spanNames.map((name: string) => buildExpr(durationMetric, 'span_name=~"' + name + '"', request))
    : [];

  // Keep the original target order: error rate first, then the durations.
  const apmMetrics: string[] = [];
  if (hasSpanNames) {
    apmMetrics.push(errorRateBySpanName);
    for (const durationExpr of durationsBySpanName) {
      apmMetrics.push(durationExpr);
    }
  }

  const promRequest = makePromServiceMapRequest(request);
  promRequest.targets = makeApmRequest(apmMetrics);

  return queryPrometheus(promRequest, datasourceUid).pipe(
    // Wait for the complete set of responses before building the table
    toArray(),
    map((errorAndDurationResponse: DataQueryResponse[]) => {
      const failed = errorAndDurationResponse.find((response) => !!response.error);
      if (failed) {
        throw new Error(failed.error!.message);
      }

      const apmTable = getApmTable(
        request,
        rateResponse,
        errorAndDurationResponse[0],
        errorRateBySpanName,
        durationsBySpanName,
        datasourceUid,
        tempoDatasourceUid
      );

      // Without any table columns only the service map frames are returned.
      const hasColumns = apmTable.fields.length !== 0;
      return {
        data: hasColumns
          ? [apmTable, rateResponse.data[1], rateResponse.data[2]]
          : [rateResponse.data[1], rateResponse.data[2]],
        state: LoadingState.Done,
      };
    })
  );
}
function makePromLink(title: string, expr: string, datasourceUid: string, instant: boolean) {
return {
url: '',
title,
internal: {
query: {
expr: metric,
expr: expr,
range: !instant,
exemplar: !instant,
instant: instant,
} as PromQuery,
datasourceUid,
datasourceName: 'Prometheus',
@ -456,15 +566,20 @@ function makePromLink(title: string, metric: string, datasourceUid: string) {
};
}
function makeTempoLink(title: string, serviceName: string, datasourceUid: string) {
export function makeTempoLink(title: string, serviceName: string, spanName: string, datasourceUid: string) {
let query = { queryType: 'nativeSearch' } as TempoQuery;
if (serviceName !== '') {
query.serviceName = serviceName;
}
if (spanName !== '') {
query.spanName = spanName;
}
return {
url: '',
title,
internal: {
query: {
queryType: 'nativeSearch',
serviceName: serviceName,
} as TempoQuery,
query,
datasourceUid: datasourceUid,
datasourceName: 'Tempo',
},
@ -479,9 +594,219 @@ function makePromServiceMapRequest(options: DataQueryRequest<TempoQuery>): DataQ
refId: metric,
// options.targets[0] is not correct here, but not sure what should happen if you have multiple queries for
// service map at the same time anyway
expr: `delta(${metric}${options.targets[0].serviceMapQuery || ''}[$__range])`,
expr: `rate(${metric}${options.targets[0].serviceMapQuery || ''}[$__range])`,
instant: true,
};
}),
};
}
/**
 * Builds the APM table frame from the rate, error rate and duration query results.
 *
 * Columns are appended in this order, each only when its source frame is present:
 * Name, Rate, rate gauge, Error Rate, error rate gauge, Duration (p90), Links.
 * Frames are matched to their query through `refId`, which holds the query expression.
 *
 * @param request - the original Tempo request, used to rebuild the query and link expressions
 * @param rateResponse - response whose `data[0]` holds the frames of the rate query
 * @param secondResponse - response of the error rate and duration queries; may be missing when
 *   that query emitted nothing
 * @param errorRateBySpanName - expression (refId) of the error rate query
 * @param durationsBySpanName - expressions (refIds) of the per span name duration queries
 * @param datasourceUid - uid of the Prometheus data source targeted by the metric links
 * @param tempoDatasourceUid - identifier of the Tempo data source targeted by the Tempo link
 * @returns an object with a `fields` array; the array is empty when no rate data is available
 */
function getApmTable(
  request: DataQueryRequest<TempoQuery>,
  rateResponse: DataQueryResponse,
  secondResponse: DataQueryResponse,
  errorRateBySpanName: string,
  durationsBySpanName: string[],
  datasourceUid: string,
  tempoDatasourceUid: string
) {
  const df: any = { fields: [] };

  const rateExpr = buildExpr(rateMetric, '', request);
  // Fall back to empty lists so a missing rate frame list or a missing second response
  // produces an empty table instead of a TypeError.
  const rate = rateResponse.data[0]?.filter((x: { refId: string }) => x.refId === rateExpr) ?? [];
  const secondData = secondResponse?.data ?? [];
  const errorRate = secondData.filter((x) => x.refId === errorRateBySpanName);
  const duration = secondData.filter((x) => durationsBySpanName.includes(x.refId));

  if (rate.length > 0 && rate[0].fields?.length > 2) {
    df.fields.push({
      ...rate[0].fields[1],
      name: 'Name',
      config: {
        filterable: false,
      },
    });

    df.fields.push({
      ...rate[0].fields[2],
      name: 'Rate',
      config: {
        links: [
          makePromLink(
            'Rate',
            buildLinkExpr(buildExpr(rateMetric, 'span_name="${__data.fields[0]}"', request)),
            datasourceUid,
            false
          ),
        ],
        decimals: 2,
      },
    });

    // Same values again, rendered as a gauge in an unnamed column
    df.fields.push({
      ...rate[0].fields[2],
      name: ' ',
      labels: null,
      config: {
        color: {
          mode: 'continuous-BlPu',
        },
        custom: {
          displayMode: 'lcd-gauge',
        },
        decimals: 3,
      },
    });
  }

  if (errorRate.length > 0 && errorRate[0].fields?.length > 2) {
    const errorRateNames = errorRate[0].fields[1]?.values.toArray() ?? [];
    const errorRateValues = errorRate[0].fields[2]?.values.toArray() ?? [];
    const errorRateObj: any = {};
    errorRateNames.forEach((name: string, index: number) => {
      errorRateObj[name] = { name: name, value: errorRateValues[index] };
    });

    // Line the error rates up with the rows of the rate frame
    const values = getRateAlignedValues(rate, errorRateObj);

    df.fields.push({
      ...errorRate[0].fields[2],
      name: 'Error Rate',
      values: values,
      config: {
        links: [
          makePromLink(
            'Error Rate',
            buildLinkExpr(buildExpr(errorRateMetric, 'span_name="${__data.fields[0]}"', request)),
            datasourceUid,
            false
          ),
        ],
        decimals: 2,
      },
    });

    // Same values again, rendered as a gauge in an unnamed column
    df.fields.push({
      ...errorRate[0].fields[2],
      name: ' ',
      values: values,
      labels: null,
      config: {
        color: {
          mode: 'continuous-RdYlGr',
        },
        custom: {
          displayMode: 'lcd-gauge',
        },
        decimals: 3,
      },
    });
  }

  if (duration.length > 0 && duration[0].fields?.length > 1) {
    const durationObj: any = {};
    duration.forEach((d) => {
      // Duration frames have no span_name field, so the span name is recovered from the
      // span_name matcher inside the refId (the query expression).
      const delimiter = d.refId?.includes('span_name=~"') ? 'span_name=~"' : 'span_name="';
      const name = d.refId?.split(delimiter)[1].split('"}')[0];
      durationObj[name] = { name: name, value: d.fields[1].values.toArray()[0] };
    });

    df.fields.push({
      ...duration[0].fields[1],
      name: 'Duration (p90)',
      values: getRateAlignedValues(rate, durationObj),
      config: {
        links: [
          makePromLink(
            'Duration',
            buildLinkExpr(buildExpr(durationMetric, 'span_name="${__data.fields[0]}"', request)),
            datasourceUid,
            false
          ),
        ],
        unit: 's',
      },
    });
  }

  if (df.fields.length > 0 && df.fields[0].values) {
    // One 'Tempo' cell per row; the link itself resolves the row's span name
    df.fields.push({
      name: 'Links',
      type: FieldType.string,
      values: df.fields[0].values.map(() => {
        return 'Tempo';
      }),
      config: {
        links: [makeTempoLink('Tempo', '', `\${__data.fields[0]}`, tempoDatasourceUid)],
      },
    });
  }

  return df;
}
/**
 * Fills the `{}` placeholder of a metric expression with label matchers.
 *
 * The matchers are, in order: the metric's own `params`, the service map filter of the first
 * request target (with its `client` / `server` label names renamed to `service`), and `extraParams`.
 * When the service map filter already contains `span_name`, `extraParams` is left out.
 *
 * @param metric - expression containing a `{}` placeholder plus matchers that always apply
 * @param extraParams - additional matcher(s) as one comma separated string; may be empty
 * @param request - request whose first target may carry a `serviceMapQuery` selector such as `{client="app"}`
 * @returns the expression with the placeholder replaced by the combined matchers
 */
export function buildExpr(
  metric: { expr: string; params: string[] },
  extraParams: string,
  request: DataQueryRequest<TempoQuery>
) {
  // Remove only the braces that wrap the selector, not braces inside a matcher value.
  const selector = (request.targets[0]?.serviceMapQuery ?? '').replace(/^\s*\{/, '').replace(/\}\s*$/, '');

  // map serviceGraph metric tags to APM metric tags
  // Quoted values are matched first and returned untouched, so only label names are renamed
  // (a value such as "server-a" must not become "service-a").
  const apmSelector = selector.replace(/"(?:[^"\\]|\\.)*"|\b(?:client|server)\b(?=\s*[=!])/g, (match) =>
    match.startsWith('"') ? match : 'service'
  );

  const metricParams = apmSelector.includes('span_name')
    ? metric.params.concat(apmSelector)
    : metric.params
        .concat(apmSelector)
        .concat(extraParams)
        .filter((item: string) => item);

  // A replacer function keeps `$` sequences in the matchers from being read as replacement patterns.
  return metric.expr.replace('{}', () => '{' + metricParams.join(',') + '}');
}
/**
 * Converts a table query expression into the expression used by its data link by swapping
 * the first `__range` occurrence for `__rate_interval`.
 *
 * @param expr - expression built for the table query
 * @returns the expression to open from the link
 */
export function buildLinkExpr(expr: string) {
  const rangeVariable = '__range';
  const rateIntervalVariable = '__rate_interval';
  return expr.replace(rangeVariable, rateIntervalVariable);
}
/**
 * Aligns values keyed by span name with the rows of the rate frame.
 *
 * Query result frames can come back in any order, so each value is looked up by the span name of
 * the row it belongs to. The result has one entry per rate row, in the rate frame's own row order;
 * rows without a matching entry get the filler `'0'`. The rate response is not modified.
 *
 * @param rateResp - frames of the rate query; the span names are read from `fields[1]` of the first frame
 * @param objToAlign - values keyed by span name
 * @returns one value per rate row
 */
export function getRateAlignedValues(
  rateResp: DataQueryResponseData[],
  objToAlign: { [x: string]: { value: string } }
) {
  const rateNames: string[] = rateResp[0]?.fields?.[1]?.values.toArray() ?? [];

  return rateNames.map((rateName) => {
    // Own-property check so span names such as "constructor" do not resolve to prototype members.
    if (!Object.prototype.hasOwnProperty.call(objToAlign, rateName)) {
      return '0';
    }
    return objToAlign[rateName].value;
  });
}
/**
 * Turns a list of query expressions into instant query targets.
 * Every expression is used both as the target's `expr` and as its `refId`, so that response
 * frames can be matched back to the expression that produced them.
 *
 * @param metrics - query expressions
 * @returns one target per expression, in the same order
 */
export function makeApmRequest(metrics: any[]) {
  return metrics.map((expression) => ({
    refId: expression,
    expr: expression,
    instant: true,
  }));
}

View File

@ -136,6 +136,19 @@ export const totalsMetric = 'traces_service_graph_request_total';
export const failedMetric = 'traces_service_graph_request_failed_total';
export const histogramMetric = 'traces_service_graph_request_server_seconds_bucket';
/**
 * APM table query: request rate of the top 5 span names.
 * `expr` contains an empty `{}` selector that is filled with label matchers when the query is built;
 * `params` lists matchers that always apply (none here).
 */
export const rateMetric = {
expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))',
params: [],
};
/**
 * APM table query: error rate per span name.
 * Same expression as the rate query, always restricted to spans with status `STATUS_CODE_ERROR`.
 */
export const errorRateMetric = {
expr: 'topk(5, sum(rate(traces_spanmetrics_calls_total{}[$__range])) by (span_name))',
params: ['span_status="STATUS_CODE_ERROR"'],
};
/**
 * APM table query: 90th percentile duration computed from the span duration histogram buckets.
 * NOTE(review): `params` restricts this to spans with status `STATUS_CODE_ERROR`, so the p90 covers
 * error spans only - confirm this is intended.
 */
export const durationMetric = {
expr: 'histogram_quantile(.9, sum(rate(traces_spanmetrics_duration_seconds_bucket{}[$__range])) by (le))',
params: ['span_status="STATUS_CODE_ERROR"'],
};
export const serviceMapMetrics = [
secondsMetric,
totalsMetric,