Tranformations: True OUTER JOIN in the join by field transformation used for tabular data (#72176)

write join for tabular data and add test
This commit is contained in:
Brendan O'Handley 2023-07-26 12:06:45 -04:00 committed by GitHub
parent 60058cb3da
commit d39ec2428e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 211 additions and 5 deletions

View File

@ -8,8 +8,9 @@ import { DataTransformerID } from './ids';
import { joinDataFrames } from './joinDataFrames';
export enum JoinMode {
outer = 'outer',
outer = 'outer', // best for time series, non duplicated join on values
inner = 'inner',
outerTabular = 'outerTabular', // best for tabular data where the join on value can be duplicated
}
export interface JoinByFieldOptions {

View File

@ -2,6 +2,8 @@ import { toDataFrame } from '../../dataframe/processDataFrame';
import { getFieldDisplayName } from '../../field';
import { DataFrame, FieldType } from '../../types/dataFrame';
import { mockTransformationsRegistry } from '../../utils/tests/mockTransformationsRegistry';
import { fieldMatchers } from '../matchers';
import { FieldMatcherID } from '../matchers/ids';
import { calculateFieldTransformer } from './calculateField';
import { JoinMode } from './joinByField';
@ -28,6 +30,8 @@ describe('align frames', () => {
],
});
// the following does not work for tabular joins where the joined on field value is duplicated
// the time will never have a dupicated time which is joined on
it('should perform an outer join', () => {
const out = joinDataFrames({ frames: [series1, series2] })!;
expect(
@ -130,6 +134,89 @@ describe('align frames', () => {
});
});
describe('join tabular data by chosen field', () => {
// join on gender where there are multiple values, duplicate values which can increase the rows
const tableData1 = toDataFrame({
fields: [
{ name: 'gender', type: FieldType.string, values: ['MALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE'] },
{
name: 'day',
type: FieldType.string,
values: ['Wednesday', 'Tuesday', 'Monday', 'Wednesday', 'Tuesday', 'Monday'],
},
{ name: 'count', type: FieldType.number, values: [18, 72, 13, 17, 71, 7] },
],
});
const tableData2 = toDataFrame({
fields: [
{ name: 'gender', type: FieldType.string, values: ['MALE', 'FEMALE'] },
{ name: 'count', type: FieldType.number, values: [103, 95] },
],
});
it('should perform an outer join with duplicated values to join on', () => {
const out = joinDataFrames({
frames: [tableData1, tableData2],
joinBy: fieldMatchers.get(FieldMatcherID.byName).get('gender'),
mode: JoinMode.outerTabular,
})!;
expect(
out.fields.map((f) => ({
name: f.name,
values: f.values,
}))
).toMatchInlineSnapshot(`
[
{
"name": "gender",
"values": [
"MALE",
"MALE",
"MALE",
"FEMALE",
"FEMALE",
"FEMALE",
],
},
{
"name": "day",
"values": [
"Wednesday",
"Tuesday",
"Monday",
"Wednesday",
"Tuesday",
"Monday",
],
},
{
"name": "count",
"values": [
18,
72,
13,
17,
71,
7,
],
},
{
"name": "count",
"values": [
103,
103,
103,
95,
95,
95,
],
},
]
`);
});
});
it('unsorted input keep indexes', () => {
//----------
const series1 = toDataFrame({

View File

@ -151,6 +151,10 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
const nullModes: JoinNullMode[][] = [];
const allData: AlignedData[] = [];
const originalFields: Field[] = [];
// store frame field order for tabular data join
const originalFieldsOrderByFrame: number[][] = [];
// all other fields that are not the join on are in the 1+ position (join is always the 0)
let fieldsOrder = 1;
const joinFieldMatcher = getJoinMatcher(options);
for (let frameIndex = 0; frameIndex < options.frames.length; frameIndex++) {
@ -163,6 +167,7 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
const nullModesFrame: JoinNullMode[] = [NULL_REMOVE];
let join: Field | undefined = undefined;
let fields: Field[] = [];
let frameFieldsOrder = [];
for (let fieldIndex = 0; fieldIndex < frame.fields.length; fieldIndex++) {
const field = frame.fields[fieldIndex];
@ -220,12 +225,22 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
originalFields.push(field);
// clear field displayName state
delete field.state?.displayName;
// store frame field order for tabular data join
frameFieldsOrder.push(fieldsOrder);
fieldsOrder++;
}
// store frame field order for tabular data join
originalFieldsOrderByFrame.push(frameFieldsOrder);
allData.push(a);
}
const joined = join(allData, nullModes, options.mode);
let joined: Array<Array<number | string | null | undefined>> = [];
if (options.mode === JoinMode.outerTabular) {
joined = joinOuterTabular(allData, originalFieldsOrderByFrame, originalFields.length, nullModes);
} else {
joined = join(allData, nullModes, options.mode);
}
return {
// ...options.data[0], // keep name, meta?
@ -237,6 +252,98 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
};
}
// The following full outer join allows for multiple/duplicated joined fields values where as the performant join from uplot creates a unique set of field values to be joined on
// http://www.silota.com/docs/recipes/sql-join-tutorial-javascript-examples.html
// The frame field value which is used join on is sorted to the 0 position of each table data in both tables and nullModes
// (not sure if we need nullModes) for nullModes, the field to join on is given NULL_REMOVE and all other fields are given NULL_EXPAND
function joinOuterTabular(
tables: AlignedData[],
originalFieldsOrderByFrame: number[][],
numberOfFields: number,
nullModes?: number[][]
) {
// we will iterate through all frames and check frames for matches preventing duplicates.
// we will store each matched frame "row" or field values at the same index in the following hash.
let duplicateHash: { [key: string]: Array<number | string | null | undefined> } = {};
// iterate through the tables (frames)
// for each frame we get the field data where the data in the 0 pos is the value to join on
for (let tableIdx = 0; tableIdx < tables.length; tableIdx++) {
// the table (frame) to check for matches in other tables
let table = tables[tableIdx];
// the field value to join on (the join value is always in the 0 position)
let joinOnTableField = table[0];
// now we iterate through the other table (frame) data to look for matches
for (let otherTablesIdx = 0; otherTablesIdx < tables.length; otherTablesIdx++) {
// do not match on the same table
if (otherTablesIdx === tableIdx) {
continue;
}
let otherTable = tables[otherTablesIdx];
let otherTableJoinOnField = otherTable[0];
// iterate through the field to join on from the first table
for (
let joinTableFieldValuesIdx = 0;
joinTableFieldValuesIdx < joinOnTableField.length;
joinTableFieldValuesIdx++
) {
// create the joined data
// this has the orignalFields length and should start out undefined
// joined row + number of other fields in each frame
// the order of each field is important in how we
// 1 check for duplicates
// 2 transform the row back into fields for the joined frame
// 3 when there is no match for the row we keep the vals undefined
const tableJoinOnValue = joinOnTableField[joinTableFieldValuesIdx];
const allOtherFields = numberOfFields - 1;
let joinedRow: Array<number | string | null | undefined> = [tableJoinOnValue].concat(new Array(allOtherFields));
let tableFieldValIdx = 0;
for (let fieldsIdx = 1; fieldsIdx < table.length; fieldsIdx++) {
const joinRowIdx = originalFieldsOrderByFrame[tableIdx][tableFieldValIdx];
joinedRow[joinRowIdx] = table[fieldsIdx][joinTableFieldValuesIdx];
tableFieldValIdx++;
}
for (let otherTableValuesIdx = 0; otherTableValuesIdx < otherTableJoinOnField.length; otherTableValuesIdx++) {
if (joinOnTableField[joinTableFieldValuesIdx] === otherTableJoinOnField[otherTableValuesIdx]) {
let tableFieldValIdx = 0;
for (let fieldsIdx = 1; fieldsIdx < otherTable.length; fieldsIdx++) {
const joinRowIdx = originalFieldsOrderByFrame[otherTablesIdx][tableFieldValIdx];
joinedRow[joinRowIdx] = otherTable[fieldsIdx][otherTableValuesIdx];
tableFieldValIdx++;
}
break;
}
}
// prevent duplicates by entering rows in a hash where keys are the rows
duplicateHash[JSON.stringify(joinedRow)] = joinedRow;
}
}
}
// transform the joined rows into data for a dataframe
let data: Array<Array<number | string | null | undefined>> = [];
for (let field = 0; field < numberOfFields; field++) {
data.push(new Array(0));
}
for (let key in duplicateHash) {
const row = duplicateHash[key];
for (let valIdx = 0; valIdx < row.length; valIdx++) {
data[valIdx].push(row[valIdx]);
}
}
return data;
}
//--------------------------------------------------------------------------------
// Below here is copied from uplot (MIT License)
// https://github.com/leeoniya/uPlot/blob/master/src/utils.js#L325

View File

@ -14,8 +14,19 @@ import { Select, InlineFieldRow, InlineField } from '@grafana/ui';
import { useAllFieldNamesFromDataFrames } from '../utils';
const modes = [
{ value: JoinMode.outer, label: 'OUTER', description: 'Keep all rows from any table with a value' },
{ value: JoinMode.inner, label: 'INNER', description: 'Drop rows that do not match a value in all tables' },
{
value: JoinMode.outer,
label: 'OUTER (TIME SERIES)',
description:
'Keep all rows from any table with a value. Join on distinct field values. Performant and best used for time series.',
},
{
value: JoinMode.outerTabular,
label: 'OUTER (TABULAR)',
description:
'Join on a field value with dupicated values. Non performant outer join best used for tabular(SQL like) data.',
},
{ value: JoinMode.inner, label: 'INNER', description: 'Drop rows that do not match a value in all tables.' },
];
export function SeriesToFieldsTransformerEditor({ input, options, onChange }: TransformerUIProps<JoinByFieldOptions>) {