Merge pull request #1685 from nosqlbench/mwolters/vpf-72

Support mixed-schema KNN data set formats
commit 904ce1c6ec
Jonathan Shook 2023-12-06 01:02:31 -06:00 committed by GitHub
11 changed files with 592 additions and 5 deletions

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This interface is used to parse the raw JSON from the HDF dataset into a CQL predicate.
*/
public interface DatasetParser {
/**
* Return a new instance of the named parser implementation, used to parse the raw JSON
* from the HDF dataset into a CQL predicate.
* @param parsername The name of the parser implementation: "default", "noop", or "jaw"
* @return A new instance of the specified parser class.
*/
static DatasetParser parserFactory(String parsername) {
return switch (parsername) {
case "default" -> new DefaultDatasetParser();
case "noop" -> new NoopDatasetParser();
case "jaw" -> new JAWDatasetParser();
default -> throw new RuntimeException("Unknown parser name: " + parsername);
};
}
String parse(String raw);
}
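
A minimal usage sketch (not part of this change set), using an input shape taken from DefaultDatasetParserTest below:

// Select a parser by name and convert one raw JSON record into a CQL predicate.
DatasetParser parser = DatasetParser.parserFactory("default");
String predicate = parser.parse("{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 53}}}]}}");
// predicate is "WHERE a=53"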

View File

@ -0,0 +1,117 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This is the default
* implementation. It accepts a JSON string of the form found at https://github.com/qdrant/ann-filtering-benchmark-datasets
* and converts it into a CQL predicate in String form. Conditions joined with "and" become equality
* predicates; conditions joined with "or" over a single field are folded into an IN clause.
*/
public class DefaultDatasetParser implements DatasetParser {
private static final String WHERE = "WHERE";
private static final String MATCH = "match";
private static final String AND = "and";
private static final String OR = "or";
private static final String EQ = "=";
private static final String IN = "IN";
private static final String CONDITIONS = "conditions";
private static final String VALUE = "value";
private static final String SPACE = " ";
private static final String SINGLE_QUOTE = "'";
private static final String COMMA = ",";
private static final String LEFT_PAREN = "(";
private static final String RIGHT_PAREN = ")";
private static final Logger logger = LogManager.getLogger(DefaultDatasetParser.class);
@Override
public String parse(String raw) {
logger.debug(() -> "Parsing: " + raw);
JsonObject conditions = JsonParser.parseString(raw).getAsJsonObject().get(CONDITIONS).getAsJsonObject();
if (conditions.has(AND)) {
return parseAndConditionsInline(conditions);
} else if (conditions.has(OR)) {
return parseOrConditionsInline(conditions);
} else {
throw new RuntimeException("Unknown predicate type: " + conditions.keySet());
}
}
// "or" conditions are folded into a single IN clause. This assumes every condition
// in the array targets the same field, which holds for the supported dataset format.
private String parseOrConditionsInline(JsonObject conditions) {
StringBuilder builder = new StringBuilder(WHERE);
boolean first = true;
for (JsonElement e : conditions.get(OR).getAsJsonArray()) {
JsonObject condition = e.getAsJsonObject();
String field = condition.keySet().iterator().next();
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
if (match != null) {
if (first) {
builder.append(SPACE).append(field).append(SPACE).append(IN).append(LEFT_PAREN);
first = false;
} else {
builder.append(COMMA);
}
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
builder.append(
isNumeric ?
match.getAsJsonObject().get(VALUE).getAsString() :
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
);
} else {
logger.error(() -> "No match found for: " + condition.keySet());
}
}
builder.append(RIGHT_PAREN);
return builder.toString();
}
// "and" conditions become equality predicates joined with the "and" keyword.
private String parseAndConditionsInline(JsonObject conditions) {
StringBuilder builder = new StringBuilder(WHERE);
boolean first = true;
for (JsonElement e : conditions.get(AND).getAsJsonArray()) {
JsonObject condition = e.getAsJsonObject();
String field = condition.keySet().iterator().next();
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
if (match != null) {
if (first) {
first = false;
} else {
builder.append(SPACE).append(AND);
}
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
builder.append(SPACE).append(field).append(EQ);
builder.append(
isNumeric ?
match.getAsJsonObject().get(VALUE).getAsString() :
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
);
} else {
logger.error(() -> "No match found for: " + condition.keySet());
}
}
return builder.toString();
}
}
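
A hedged sketch of the "or" path, using an input shape from the tests below:

String raw = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}]}}";
String cql = new DefaultDatasetParser().parse(raw); // "WHERE a IN(9,71)"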

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
/**
* Binding function that accepts a long input value for the cycle and returns a string consisting of the
* CQL predicate parsed from a single record in an HDF5 dataset
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfDatasetToCqlPredicates implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset dataset;
private final int recordCount;
private final DatasetParser parser;
/**
* Create a new binding function that accepts a long input value for the cycle and returns a string
* consisting of the CQL predicate parsed from the selected record.
* @param filename The HDF5 file to read
* @param datasetname The path of the dataset within the HDF5 file
* @param parsername The name of the DatasetParser implementation to use
*/
public HdfDatasetToCqlPredicates(String filename, String datasetname, String parsername) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
dataset = hdfFile.getDatasetByPath(datasetname);
recordCount = dataset.getDimensions()[0];
parser = DatasetParser.parserFactory(parsername);
}
public HdfDatasetToCqlPredicates(String filename, String datasetname) {
this(filename, datasetname, "default");
}
@Override
public String apply(long l) {
long[] sliceOffset = {(l % recordCount)};
int[] sliceDimensions = {1};
String raw = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
return parser.parse(raw);
}
}
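
A hedged usage sketch; the file and dataset names here are hypothetical. Cycle values wrap modulo the record count, so any long input selects a valid record.

LongFunction<String> toPredicate =
new HdfDatasetToCqlPredicates("predicates.hdf5", "/test", "default");
String whereClause = toPredicate.apply(42L); // one record's JSON, parsed into "WHERE ..."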

View File

@ -0,0 +1,33 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
* accepts a string consisting of the desired CQL predicate as translated from the original jsonl files
* and simply adds the WHERE keyword to the beginning of the string if it is not already present, hence
* the name: the Just Add Where (JAW) parser.
*/
public class JAWDatasetParser implements DatasetParser {
private static final String WHERE = "WHERE";
@Override
public String parse(String raw) {
if (!raw.toUpperCase().startsWith(WHERE)) {
raw = WHERE + " " + raw;
}
return raw;
}
}
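
A hedged sketch of the behavior:

String cql = new JAWDatasetParser().parse("a=53 and b=54"); // "WHERE a=53 and b=54"
String unchanged = new JAWDatasetParser().parse("WHERE a=53"); // returned as-is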

View File

@ -0,0 +1,28 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
* accepts a string consisting of the desired CQL predicate as translated from the original jsonl files and
* simply returns the raw string, hence the name NoopDatasetParser.
*/
public class NoopDatasetParser implements DatasetParser {
@Override
public String parse(String raw) {
return raw;
}
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class DefaultDatasetParserTest {
String test1 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 53}}}]}}";
String test2 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": \"thirteen\"}}}, {\"b\": {\"match\": {\"value\": \"fifty-four\"}}}]}}";
String test3 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 13}}}, {\"b\": {\"match\": {\"value\": 54}}}, {\"a\": {\"match\": {\"value\": 154}}}]}}";
String test4 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}]}}";
String test5 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}, {\"a\": {\"match\": {\"value\": 7}}}]}}";
String test6 = "{\"conditions\": {\"or\": [{\"b\": {\"match\": {\"value\": \"foo\"}}}, {\"b\": {\"match\": {\"value\": \"bar\"}}}]}}";
@Test
public void testParse() {
DefaultDatasetParser parser = new DefaultDatasetParser();
String parsed = parser.parse(test1);
assertEquals("WHERE a=53", parsed);
parsed = parser.parse(test2);
assertEquals("WHERE a='thirteen' and b='fifty-four'", parsed);
parsed = parser.parse(test3);
assertEquals("WHERE a=13 and b=54 and a=154", parsed);
parsed = parser.parse(test4);
assertEquals("WHERE a IN(9,71)", parsed);
parsed = parser.parse(test5);
assertEquals("WHERE a IN(9,71,7)", parsed);
parsed = parser.parse(test6);
assertEquals("WHERE b IN('foo','bar')", parsed);
}
}

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
/**
* This function reads a vector dataset from an HDF5 file. The entire dataset is parsed into a single
* String Object with the discrete values separated by the user-supplied separator character. It is
* intended for use only with small datasets where the entire dataset can be read into memory and there
* is no need to read individual vectors from the dataset.
* The lambda function simply returns the String representation of the dataset.
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfDatasetToString implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset dataset;
private final String separator;
private final String datasetAsString;
/**
* Create a new binding function that accepts a long input value for the cycle and returns a string representation
* of the specified dataset.
* @param filename The HDF5 file to read
* @param dataset The path of the dataset within the HDF5 file
* @param separator The separator to place between consecutive values
*/
public HdfDatasetToString(String filename, String dataset, String separator) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
this.dataset = hdfFile.getDatasetByPath(dataset);
this.separator = separator;
this.datasetAsString = parseDataset();
}
public HdfDatasetToString(String filename, String dataset) {
this(filename, dataset, ",");
}
private String parseDataset() {
String[] columnDataset = (String[])dataset.getData();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < columnDataset.length; i++) {
sb.append(columnDataset[i]);
if (i < columnDataset.length - 1) {
sb.append(separator);
}
}
return sb.toString();
}
@Override
public String apply(long value) {
return datasetAsString;
}
}
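
A hedged usage sketch; the file and dataset names are hypothetical. The dataset is stringified once at construction time, so every cycle returns the same value.

LongFunction<String> wholeDataset = new HdfDatasetToString("labels.hdf5", "/labels", ",");
String csv = wholeDataset.apply(0L); // e.g. "a,b,c" for a three-element string dataset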

View File

@ -21,10 +21,11 @@ import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
+import java.util.Arrays;
import java.util.function.LongFunction;
/**
-* This function reads a dataset from an HDF5 file. The dataset itself is not
+* This function reads a dataset of any supported type from an HDF5 file. The dataset itself is not
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
* reads a single vector from the dataset, based on the long input value.
*/
@ -42,11 +43,30 @@ public class HdfDatasetToStrings extends AbstractHdfFileToVectorType implements
int[] sliceDimensions = new int[dims.length];
sliceDimensions[0] = 1;
if (dims.length > 1) {
-for (int i = 1; i < dims.length; i++) {
-sliceDimensions[i] = dims[i];
-}
+System.arraycopy(dims, 1, sliceDimensions, 1, dims.length - 1);
}
-return ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
+String payload = null;
+switch(dataset.getJavaType().getSimpleName().toLowerCase()) {
+case "string" ->
+payload = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
+case "int" ->
+payload = Arrays.toString(((int[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "float" ->
+payload = Arrays.toString(((float[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "short" ->
+payload = Arrays.toString(((short[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "long" ->
+payload = Arrays.toString(((long[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "double" ->
+payload = Arrays.toString(((double[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "char" ->
+payload = String.valueOf(((char[][])dataset.getData(sliceOffset, sliceDimensions))[0]);
+}
+if (payload == null) {
+throw new RuntimeException("Unsupported datatype: " + dataset.getJavaType().getSimpleName());
+}
+payload = payload.replaceAll("\\[", "").replaceAll("]", "");
+return payload;
}
}
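
A hedged sketch of the payload formatting for a non-string dataset: Arrays.toString produces bracketed output, and the brackets are then stripped.

float[] row = {0.1f, 0.2f, 0.3f}; // one slice taken from the dataset
String payload = Arrays.toString(row).replaceAll("\\[", "").replaceAll("]", "");
// payload is "0.1, 0.2, 0.3"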

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
@ThreadSafeMapper
@Categories(Category.experimental)
/**
* Binding function that pairs the entries of two String datasets by ordinal position,
* joining each pair with the intra-separator and the pairs with the inter-separator.
*/
public class HdfDatasetsToString implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset DSLeft;
private final Dataset DSRight;
private final String intraSeparator;
private final String interSeparator;
public HdfDatasetsToString(String filename, String DSNameLeft, String DSNameRight, String intraSeparator, String interSeparator) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
DSLeft = hdfFile.getDatasetByPath(DSNameLeft);
DSRight = hdfFile.getDatasetByPath(DSNameRight);
this.intraSeparator = intraSeparator;
this.interSeparator = interSeparator;
}
/*
* Read the column names from the left-hand dataset and the column data types from the
* right-hand dataset, then pair them by ordinal to form a csv-style schema string.
*/
@Override
public String apply(long value) {
Object columnDataset = DSLeft.getData();
Object columnTypeDataset = DSRight.getData();
return pairByOrdinal((String[]) columnDataset, (String[])columnTypeDataset);
}
private String pairByOrdinal(String[] columnDataset, String[] columnTypeDataset) {
if (columnDataset.length != columnTypeDataset.length) {
throw new RuntimeException("Left hand dataset and right hand dataset must be the same length");
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < columnDataset.length; i++) {
sb.append(columnDataset[i]).append(intraSeparator).append(columnTypeDataset[i]);
if (i < columnDataset.length - 1) {
sb.append(interSeparator);
}
}
return sb.toString();
}
}
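
A hedged sketch; the file and dataset names are hypothetical:

// Given DSLeft = ["a","b"] and DSRight = ["int","text"]:
LongFunction<String> schema =
new HdfDatasetsToString("schema.hdf5", "/columns", "/columnTypes", " ", ",");
String csvSchema = schema.apply(0L); // "a int,b text"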

View File

@ -43,6 +43,12 @@ public class EmbeddingGeneratorFactory {
}
return generators.get(type);
}
case "long" -> {
if (!generators.containsKey(type)) {
generators.put(type, new LongEmbeddingGenerator());
}
return generators.get(type);
}
default -> throw new RuntimeException("Unknown embedding type: " + type);
}
}

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
* EmbeddingGenerator for long-typed HDF5 datasets. Widening to float may lose precision
* for large values, and narrowing to int uses Math.toIntExact, which throws on overflow.
*/
public class LongEmbeddingGenerator implements EmbeddingGenerator {
@Override
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be long[1][x]
long[] vector = ((long[][]) o)[0];
Float[] vector2 = new Float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return List.of(vector2);
}
@Override
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
float[] vector2 = new float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return vector2;
}
@Override
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
Long[] vector2 = new Long[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = vector[i];
}
return List.of(vector2);
}
@Override
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
return ((long[][]) o)[0];
}
@Override
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
Integer[] vector2 = new Integer[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = Math.toIntExact(vector[i]);
}
return List.of(vector2);
}
@Override
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
int[] vector2 = new int[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = Math.toIntExact(vector[i]);
}
return vector2;
}
}
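
A hedged sketch of the conversions, given the long[1][x] shape noted above:

EmbeddingGenerator gen = new LongEmbeddingGenerator();
long[][] slice = {{1L, 2L, 3L}};
List<Float> floats = gen.generateFloatListEmbeddingFrom(slice, new int[]{1, 3}); // [1.0, 2.0, 3.0]
int[] ints = gen.generateIntArrayEmbeddingFrom(slice, new int[]{1, 3}); // {1, 2, 3}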