Transformations: True OUTER JOIN in the join by field transformation used for tabular data (#72176)

write join for tabular data and add test
2025-02-25 18:55:37 -06:00 · 2023-07-26 12:06:45 -04:00 · 2023-07-26 12:06:45 -04:00 · d39ec2428e
commit d39ec2428e
parent 60058cb3da
4 changed files with 211 additions and 5 deletions
--- a/packages/grafana-data/src/transformations/transformers/joinByField.ts
+++ b/packages/grafana-data/src/transformations/transformers/joinByField.ts
@ -8,8 +8,9 @@ import { DataTransformerID } from './ids';
 import { joinDataFrames } from './joinDataFrames';

 export enum JoinMode {
-  outer = 'outer',
+  outer = 'outer', // performant join on distinct (non-duplicated) values of the join field; best for time series
   inner = 'inner',
+  outerTabular = 'outerTabular', // outer join for tabular data, where values of the join field may repeat across rows
 }

 export interface JoinByFieldOptions {
--- a/packages/grafana-data/src/transformations/transformers/joinDataFrames.test.ts
+++ b/packages/grafana-data/src/transformations/transformers/joinDataFrames.test.ts
@ -2,6 +2,8 @@ import { toDataFrame } from '../../dataframe/processDataFrame';
 import { getFieldDisplayName } from '../../field';
 import { DataFrame, FieldType } from '../../types/dataFrame';
 import { mockTransformationsRegistry } from '../../utils/tests/mockTransformationsRegistry';
+import { fieldMatchers } from '../matchers';
+import { FieldMatcherID } from '../matchers/ids';

 import { calculateFieldTransformer } from './calculateField';
 import { JoinMode } from './joinByField';
@ -28,6 +30,8 @@ describe('align frames', () => {
      ],
    });

+    // the following does not work for tabular joins where the value of the joined-on field is duplicated
+    // the time field that is joined on here will never contain a duplicated time
    it('should perform an outer join', () => {
      const out = joinDataFrames({ frames: [series1, series2] })!;
      expect(
@ -130,6 +134,89 @@ describe('align frames', () => {
    });
  });

+  describe('join tabular data by chosen field', () => {
+    // join on 'gender', whose values repeat across rows in tableData1; every row is expected to pick up the matching count from tableData2
+
+    const tableData1 = toDataFrame({
+      fields: [
+        { name: 'gender', type: FieldType.string, values: ['MALE', 'MALE', 'MALE', 'FEMALE', 'FEMALE', 'FEMALE'] },
+        {
+          name: 'day',
+          type: FieldType.string,
+          values: ['Wednesday', 'Tuesday', 'Monday', 'Wednesday', 'Tuesday', 'Monday'],
+        },
+        { name: 'count', type: FieldType.number, values: [18, 72, 13, 17, 71, 7] },
+      ],
+    });
+    const tableData2 = toDataFrame({
+      fields: [
+        { name: 'gender', type: FieldType.string, values: ['MALE', 'FEMALE'] },
+        { name: 'count', type: FieldType.number, values: [103, 95] },
+      ],
+    });
+
+    it('should perform an outer join with duplicated values to join on', () => {
+      const out = joinDataFrames({
+        frames: [tableData1, tableData2],
+        joinBy: fieldMatchers.get(FieldMatcherID.byName).get('gender'),
+        mode: JoinMode.outerTabular,
+      })!;
+      expect(
+        out.fields.map((f) => ({
+          name: f.name,
+          values: f.values,
+        }))
+      ).toMatchInlineSnapshot(`
+        [
+          {
+            "name": "gender",
+            "values": [
+              "MALE",
+              "MALE",
+              "MALE",
+              "FEMALE",
+              "FEMALE",
+              "FEMALE",
+            ],
+          },
+          {
+            "name": "day",
+            "values": [
+              "Wednesday",
+              "Tuesday",
+              "Monday",
+              "Wednesday",
+              "Tuesday",
+              "Monday",
+            ],
+          },
+          {
+            "name": "count",
+            "values": [
+              18,
+              72,
+              13,
+              17,
+              71,
+              7,
+            ],
+          },
+          {
+            "name": "count",
+            "values": [
+              103,
+              103,
+              103,
+              95,
+              95,
+              95,
+            ],
+          },
+        ]
+      `);
+    });
+  });
+
  it('unsorted input keep indexes', () => {
    //----------
    const series1 = toDataFrame({
--- a/packages/grafana-data/src/transformations/transformers/joinDataFrames.ts
+++ b/packages/grafana-data/src/transformations/transformers/joinDataFrames.ts
@ -151,6 +151,10 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
  const nullModes: JoinNullMode[][] = [];
  const allData: AlignedData[] = [];
  const originalFields: Field[] = [];
+  // store frame field order for tabular data join
+  const originalFieldsOrderByFrame: number[][] = [];
+  // all other fields that are not the join on are in the 1+ position (join is always the 0)
+  let fieldsOrder = 1;
  const joinFieldMatcher = getJoinMatcher(options);

  for (let frameIndex = 0; frameIndex < options.frames.length; frameIndex++) {
@ -163,6 +167,7 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
    const nullModesFrame: JoinNullMode[] = [NULL_REMOVE];
    let join: Field | undefined = undefined;
    let fields: Field[] = [];
+    let frameFieldsOrder = [];

    for (let fieldIndex = 0; fieldIndex < frame.fields.length; fieldIndex++) {
      const field = frame.fields[fieldIndex];
@ -220,12 +225,22 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
      originalFields.push(field);
      // clear field displayName state
      delete field.state?.displayName;
+      // store frame field order for tabular data join
+      frameFieldsOrder.push(fieldsOrder);
+      fieldsOrder++;
    }
-
+    // store frame field order for tabular data join
+    originalFieldsOrderByFrame.push(frameFieldsOrder);
    allData.push(a);
  }

-  const joined = join(allData, nullModes, options.mode);
+  let joined: Array<Array<number | string | null | undefined>> = [];
+
+  if (options.mode === JoinMode.outerTabular) {
+    joined = joinOuterTabular(allData, originalFieldsOrderByFrame, originalFields.length, nullModes);
+  } else {
+    joined = join(allData, nullModes, options.mode);
+  }

  return {
    // ...options.data[0], // keep name, meta?
@ -237,6 +252,98 @@ export function joinDataFrames(options: JoinOptions): DataFrame | undefined {
  };
 }

+// Outer join that allows the join field to hold repeated values, whereas the performant join from uplot creates a unique set of field values to be joined on.
+// Reference: http://www.silota.com/docs/recipes/sql-join-tutorial-javascript-examples.html
+// Inputs: each entry of `tables` holds its join-field values at index 0 and its other fields at index 1+; `originalFieldsOrderByFrame[t][k]` is the output column of table t's k-th non-join field; `numberOfFields` is the total number of output columns. Returns column-oriented data, one array per output field.
+// NOTE(review): `nullModes` is accepted but never read in this function (the caller seeds it with NULL_REMOVE for the join field) — confirm whether it is needed.
+function joinOuterTabular(
+  tables: AlignedData[],
+  originalFieldsOrderByFrame: number[][],
+  numberOfFields: number,
+  nullModes?: number[][]
+) {
+  // Every table is paired with every other table, and each row of the current table produces one joined row holding values from that pair only.
+  // Joined rows are stored in the following hash keyed by their JSON serialization, so identical rows are kept only once.
+  let duplicateHash: { [key: string]: Array<number | string | null | undefined> } = {};
+
+  // iterate through the tables (frames)
+  // each table holds the values of its join field at index 0
+  for (let tableIdx = 0; tableIdx < tables.length; tableIdx++) {
+    // the table (frame) whose rows are matched against the other tables
+    let table = tables[tableIdx];
+    // the values of the field to join on (the join field is always at index 0)
+    let joinOnTableField = table[0];
+
+    // now iterate through the other tables (frames) to look for matches
+    for (let otherTablesIdx = 0; otherTablesIdx < tables.length; otherTablesIdx++) {
+      // do not match a table against itself
+      if (otherTablesIdx === tableIdx) {
+        continue;
+      }
+
+      let otherTable = tables[otherTablesIdx];
+      let otherTableJoinOnField = otherTable[0];
+
+      // iterate through the join-field values of the current table
+      for (
+        let joinTableFieldValuesIdx = 0;
+        joinTableFieldValuesIdx < joinOnTableField.length;
+        joinTableFieldValuesIdx++
+      ) {
+        // create the joined row
+        // it has numberOfFields entries: the join value at index 0
+        // followed by one initially unset slot for every other field of every frame
+        // the position of each field in the row matters for
+        // 1 detecting duplicate rows
+        // 2 transforming the rows back into fields for the joined frame
+        // 3 leaving values undefined when the row has no match in the other table
+        const tableJoinOnValue = joinOnTableField[joinTableFieldValuesIdx];
+        const allOtherFields = numberOfFields - 1;
+        let joinedRow: Array<number | string | null | undefined> = [tableJoinOnValue].concat(new Array(allOtherFields));
+
+        let tableFieldValIdx = 0;
+        for (let fieldsIdx = 1; fieldsIdx < table.length; fieldsIdx++) {
+          const joinRowIdx = originalFieldsOrderByFrame[tableIdx][tableFieldValIdx];
+          joinedRow[joinRowIdx] = table[fieldsIdx][joinTableFieldValuesIdx];
+          tableFieldValIdx++;
+        }
+
+        for (let otherTableValuesIdx = 0; otherTableValuesIdx < otherTableJoinOnField.length; otherTableValuesIdx++) {
+          if (joinOnTableField[joinTableFieldValuesIdx] === otherTableJoinOnField[otherTableValuesIdx]) {
+            let tableFieldValIdx = 0;
+            for (let fieldsIdx = 1; fieldsIdx < otherTable.length; fieldsIdx++) {
+              const joinRowIdx = originalFieldsOrderByFrame[otherTablesIdx][tableFieldValIdx];
+              joinedRow[joinRowIdx] = otherTable[fieldsIdx][otherTableValuesIdx];
+              tableFieldValIdx++;
+            }
+
+            break; // only the first matching row of the other table is used
+          }
+        }
+
+        // prevent duplicates by storing rows in a hash keyed by the serialized row
+        duplicateHash[JSON.stringify(joinedRow)] = joinedRow;
+      }
+    }
+  }
+
+  // transform the joined rows into column-oriented data for a dataframe
+  let data: Array<Array<number | string | null | undefined>> = [];
+  for (let field = 0; field < numberOfFields; field++) {
+    data.push(new Array(0));
+  }
+
+  for (let key in duplicateHash) {
+    const row = duplicateHash[key];
+
+    for (let valIdx = 0; valIdx < row.length; valIdx++) {
+      data[valIdx].push(row[valIdx]);
+    }
+  }
+
+  return data;
+}
+
 //--------------------------------------------------------------------------------
 // Below here is copied from uplot (MIT License)
 // https://github.com/leeoniya/uPlot/blob/master/src/utils.js#L325
--- a/public/app/features/transformers/editors/JoinByFieldTransformerEditor.tsx
+++ b/public/app/features/transformers/editors/JoinByFieldTransformerEditor.tsx
@ -14,8 +14,19 @@ import { Select, InlineFieldRow, InlineField } from '@grafana/ui';
 import { useAllFieldNamesFromDataFrames } from '../utils';

 const modes = [
-  { value: JoinMode.outer, label: 'OUTER', description: 'Keep all rows from any table with a value' },
-  { value: JoinMode.inner, label: 'INNER', description: 'Drop rows that do not match a value in all tables' },
+  {
+    value: JoinMode.outer,
+    label: 'OUTER (TIME SERIES)',
+    description:
+      'Keep all rows from any table with a value. Join on distinct field values. Performant and best used for time series.',
+  },
+  {
+    value: JoinMode.outerTabular,
+    label: 'OUTER (TABULAR)',
+    description:
+      'Join on a field value with duplicated values. Non performant outer join best used for tabular(SQL like) data.',
+  },
+  { value: JoinMode.inner, label: 'INNER', description: 'Drop rows that do not match a value in all tables.' },
 ];

 export function SeriesToFieldsTransformerEditor({ input, options, onChange }: TransformerUIProps<JoinByFieldOptions>) {