diff --git a/adapter-dynamodb/pom.xml b/adapter-dynamodb/pom.xml
index f796250e5..b46909ea0 100644
--- a/adapter-dynamodb/pom.xml
+++ b/adapter-dynamodb/pom.xml
@@ -43,7 +43,7 @@
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-dynamodb</artifactId>
-            <version>1.12.500</version>
+            <version>1.12.507</version>
diff --git a/adapter-kafka/pom.xml b/adapter-kafka/pom.xml
index 552ea0131..7bd4f07c7 100644
--- a/adapter-kafka/pom.xml
+++ b/adapter-kafka/pom.xml
@@ -34,7 +34,7 @@
- 3.5.0
+ 3.5.1
@@ -60,7 +60,7 @@
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>3.13.0</version>
diff --git a/adapter-mongodb/pom.xml b/adapter-mongodb/pom.xml
index 25b523b89..8692fa642 100644
--- a/adapter-mongodb/pom.xml
+++ b/adapter-mongodb/pom.xml
@@ -42,7 +42,7 @@
            <groupId>org.mongodb</groupId>
            <artifactId>mongodb-driver-sync</artifactId>
-            <version>4.10.1</version>
+            <version>4.10.2</version>
diff --git a/adapter-pulsar/pom.xml b/adapter-pulsar/pom.xml
index fd4ba41f1..1c6f28c5d 100644
--- a/adapter-pulsar/pom.xml
+++ b/adapter-pulsar/pom.xml
@@ -34,7 +34,7 @@
- 3.0.0
+ 3.0.1
@@ -66,7 +66,7 @@
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>3.13.0</version>
diff --git a/docsys/pom.xml b/docsys/pom.xml
index 9585b1405..8b44d73d4 100644
--- a/docsys/pom.xml
+++ b/docsys/pom.xml
@@ -22,7 +22,7 @@
docsys
http://nosqlbench.io/
- 3.1.2
+ 3.1.3
@@ -94,7 +94,7 @@
            <groupId>org.glassfish.jersey.media</groupId>
            <artifactId>jersey-media-json-jackson</artifactId>
-            <version>3.1.2</version>
+            <version>3.1.3</version>
diff --git a/hdf-loader/pom.xml b/hdf-loader/pom.xml
new file mode 100644
index 000000000..78063bd3e
--- /dev/null
+++ b/hdf-loader/pom.xml
@@ -0,0 +1,100 @@
+
+
+
+
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>hdf-loader</artifactId>
+
+    <packaging>jar</packaging>
+
+    <parent>
+        <artifactId>mvn-defaults</artifactId>
+        <groupId>io.nosqlbench</groupId>
+        <version>${revision}</version>
+        <relativePath>../mvn-defaults</relativePath>
+    </parent>
+
+    <name>${project.artifactId}</name>
+
+    <properties>
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>org.snakeyaml</groupId>
+            <artifactId>snakeyaml-engine</artifactId>
+            <version>2.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.yaml</groupId>
+            <artifactId>snakeyaml</artifactId>
+            <version>2.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.datastax.oss</groupId>
+            <artifactId>java-driver-core</artifactId>
+            <version>4.16.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-core</artifactId>
+            <version>2.15.2</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.deeplearning4j</groupId>
+            <artifactId>deeplearning4j-core</artifactId>
+            <version>1.0.0-M2.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.nd4j</groupId>
+            <artifactId>nd4j-native</artifactId>
+            <version>1.0.0-M2.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.deeplearning4j</groupId>
+            <artifactId>deeplearning4j-nlp</artifactId>
+            <version>1.0.0-M2.1</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.jhdf</groupId>
+            <artifactId>jhdf</artifactId>
+            <version>0.6.10</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.nosqlbench</groupId>
+            <artifactId>nb-api</artifactId>
+            <version>5.17.3-SNAPSHOT</version>
+            <scope>compile</scope>
+        </dependency>
+
+    </dependencies>
+
+</project>
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java
new file mode 100644
index 000000000..1a6ba6fa6
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf;
+
+import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import io.nosqlbench.loader.hdf.readers.Hdf5Reader;
+import io.nosqlbench.loader.hdf.readers.HdfReader;
+import io.nosqlbench.loader.hdf.writers.AstraVectorWriter;
+import io.nosqlbench.loader.hdf.writers.FileVectorWriter;
+import io.nosqlbench.loader.hdf.writers.NoopVectorWriter;
+import io.nosqlbench.loader.hdf.writers.VectorWriter;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+public class HdfLoader {
+ private static final Logger logger = LogManager.getLogger(HdfLoader.class);
+ public static final String FILEWRITER = "filewriter";
+ public static final String ASTRA = "astra";
+ public static final String NOOP = "noop";
+ public static final String HDF5 = "hdf5";
+ public static final String HDF4 = "hdf4";
+
+ public static void main (String[] args) {
+ if (args.length == 0) {
+ System.out.println("Usage: hdf-loader <config file>");
+ System.exit(1);
+ }
+ try {
+ LoaderConfig config = new LoaderConfig(args[0]);
+ logger.info("Starting loader with config: " + config);
+ HdfReader reader = null;
+ VectorWriter writer = null;
+
+ String format = config.getFormat();
+ switch (format.toLowerCase()) {
+ case HDF4 -> {
+ logger.info("HDF4 format not yet supported");
+ System.exit(1);
+ }
+ case HDF5 -> {
+ logger.info("HDF5 format selected");
+ reader = new Hdf5Reader(config);
+ }
+ default -> {
+ logger.info("Unknown format: " + format);
+ System.exit(1);
+ }
+ }
+
+ String writerType = config.getWriter();
+ logger.info("Using writer type: " + writerType);
+ switch (writerType.toLowerCase()) {
+ case FILEWRITER -> writer = new FileVectorWriter(config);
+ case ASTRA -> writer = new AstraVectorWriter(config);
+ case NOOP -> writer = new NoopVectorWriter();
+ default -> {
+ logger.info("Unknown writer type: " + writerType);
+ System.exit(1);
+ }
+ }
+ reader.setWriter(writer);
+ logger.info("Starting main read loop");
+ reader.read();
+ } catch (Exception e) {
+ logger.error(e);
+ System.exit(1);
+ }
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java
new file mode 100644
index 000000000..f8c02137f
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.config;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.yaml.snakeyaml.Yaml;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+public class LoaderConfig {
+ private static final Logger logger = LogManager.getLogger(LoaderConfig.class);
+ private static final Yaml yaml = new Yaml();
+ private final Map<String, Object> configMap;
+
+ public LoaderConfig(String filePath) throws IOException {
+ FileReader fileReader = new FileReader(filePath);
+ configMap = yaml.load(fileReader);
+ for (Map.Entry<String, Object> entry : configMap.entrySet()) {
+ logger.debug(entry.getKey() + " : " + entry.getValue());
+ }
+ }
+
+ public Object getRawValue(String key) {
+ return configMap.get(key);
+ }
+
+ public String getStringValue(String key) {
+ return configMap.get(key).toString();
+ }
+
+ public List<String> getDatasets() {
+ return (List<String>) configMap.get("datasets");
+ }
+
+ public String getFormat() {
+ return (String) configMap.getOrDefault("format", "HD5");
+ }
+
+ public Map<String, String> getAstra() {
+ return (Map<String, String>) configMap.get("astra");
+ }
+
+ public String getEmbedding() {
+ return (String) configMap.getOrDefault("embedding", "Deeplearning4j");
+ }
+
+ public String getWriter() {
+ return (String) configMap.getOrDefault("writer", "filewriter");
+ }
+
+ public String getSourceFile() {
+ return (String) configMap.get("sourceFile");
+ }
+
+ public String getTargetFile() {
+ return (String) configMap.getOrDefault("targetFile", "./vectors.txt");
+ }
+
+ public int getThreads() {
+ return (int) configMap.getOrDefault("threads", 5);
+ }
+
+ public int getQueueSize() {
+ return (int) configMap.getOrDefault("queueSize", 1000);
+ }
+}
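For reference, a minimal sketch of how these accessors are consumed; the file name is illustrative, and the defaults noted in the comments come directly from the getOrDefault calls above:

    import io.nosqlbench.loader.hdf.config.LoaderConfig;

    public class LoaderConfigExample {
        public static void main(String[] args) throws Exception {
            // Load the YAML config; keys are the ones read by the getters above.
            LoaderConfig config = new LoaderConfig("config.yaml");

            String format = config.getFormat();      // defaults to "HD5" when absent
            String writer = config.getWriter();      // defaults to "filewriter"
            String source = config.getSourceFile();  // no default; must be present
            int threads = config.getThreads();       // defaults to 5
            int queueSize = config.getQueueSize();   // defaults to 1000

            System.out.printf("format=%s writer=%s source=%s threads=%d queue=%d%n",
                format, writer, source, threads, queueSize);
        }
    }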
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java
new file mode 100644
index 000000000..07b96dfea
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
+
+ @Override
+ public float[][] generateEmbeddingFrom(Object o, int[] dims) {
+ return switch (dims.length) {
+ case 1 -> new float[][]{convertToFloat((double[]) o)};
+ case 2 -> convertToFloats((double[][]) o);
+ case 3 -> flatten(o, dims);
+ default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
+ };
+ }
+
+ private float[][] convertToFloats(double[][] o) {
+ float[][] floats = new float[o.length][];
+ for (int i = 0; i < o.length; i++) {
+ floats[i] = convertToFloat(o[i]);
+ }
+ return floats;
+ }
+
+ public float[] convertToFloat(double[] doubleArray) {
+ if (doubleArray == null) {
+ return null;
+ }
+ float[] floatArray = new float[doubleArray.length];
+ for (int i = 0; i < doubleArray.length; i++) {
+ floatArray[i] = (float) doubleArray[i];
+ }
+ return floatArray;
+ }
+
+ private float[][] flatten(Object o, int[] dims) {
+ double[][][] arr = (double[][][]) o;
+ float[][] flat = new float[dims[0]][dims[1] * dims[2]];
+ for (int i = 0; i < dims[0]; i++) {
+ for (int j = 0; j < dims[1]; j++) {
+ for (int k = 0; k < dims[2]; k++) {
+ flat[i][j * dims[2] + k] = (float)arr[i][j][k];
+ }
+ }
+ }
+ return flat;
+ }
+}
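The flatten method above collapses a dims[0] x dims[1] x dims[2] block into dims[0] row vectors of length dims[1] * dims[2]. A standalone sketch of the same index arithmetic, with illustrative sizes:

    public class FlattenExample {
        public static void main(String[] args) {
            int[] dims = {2, 3, 4};   // 2 vectors, each stored as a 3 x 4 block
            double[][][] src = new double[dims[0]][dims[1]][dims[2]];
            src[1][2][3] = 42.0;

            float[][] flat = new float[dims[0]][dims[1] * dims[2]];
            for (int i = 0; i < dims[0]; i++) {
                for (int j = 0; j < dims[1]; j++) {
                    for (int k = 0; k < dims[2]; k++) {
                        // element (j, k) of block i lands in column j * dims[2] + k
                        flat[i][j * dims[2] + k] = (float) src[i][j][k];
                    }
                }
            }
            System.out.println(flat[1][2 * dims[2] + 3]);   // prints 42.0
        }
    }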
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java
new file mode 100644
index 000000000..22fcad5ed
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+public interface EmbeddingGenerator {
+ float[][] generateEmbeddingFrom(Object o, int[] dims);
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java
new file mode 100644
index 000000000..a7b677e65
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class EmbeddingGeneratorFactory {
+ private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
+
+ public static EmbeddingGenerator getGenerator(String type) {
+ String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
+ if (typeLower.equals("integer")) typeLower = "int";
+ switch (typeLower) {
+ case "string" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new StringEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ case "float" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new FloatEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ case "double" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new DoubleEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ case "int" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new IntEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ default -> throw new RuntimeException("Unknown embedding type: " + type);
+ }
+ }
+}
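A short sketch of how the factory is used from the reader: the type string mirrors dataset.getJavaType().getSimpleName().toLowerCase(), and the array and dims below are illustrative:

    import io.nosqlbench.loader.hdf.embedding.EmbeddingGenerator;
    import io.nosqlbench.loader.hdf.embedding.EmbeddingGeneratorFactory;

    public class GeneratorExample {
        public static void main(String[] args) {
            float[][] data = new float[3][4];   // 3 vectors of 4 elements each
            int[] dims = {3, 4};

            EmbeddingGenerator generator = EmbeddingGeneratorFactory.getGenerator("float");
            float[][] vectors = generator.generateEmbeddingFrom(data, dims);

            System.out.println(vectors.length + " vectors of " + vectors[0].length + " elements");
        }
    }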
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java
new file mode 100644
index 000000000..9245e53f5
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+public class FloatEmbeddingGenerator implements EmbeddingGenerator {
+
+ @Override
+ public float[][] generateEmbeddingFrom(Object o, int[] dims) {
+ return switch (dims.length) {
+ case 1 -> new float[][]{(float[]) o};
+ case 2 -> (float[][]) o;
+ case 3 -> flatten(o, dims);
+ default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
+ };
+ }
+
+ private float[][] flatten(Object o, int[] dims) {
+ float[][][] arr = (float[][][]) o;
+ float[][] flat = new float[dims[0]][dims[1] * dims[2]];
+ for (int i = 0; i < dims[0]; i++) {
+ for (int j = 0; j < dims[1]; j++) {
+ if (dims[2] >= 0) System.arraycopy(arr[i][j], 0, flat[i], j * dims[2] + 0, dims[2]);
+ }
+ }
+ return flat;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java
new file mode 100644
index 000000000..c4f0c1988
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+public class IntEmbeddingGenerator implements EmbeddingGenerator {
+ @Override
+ public float[][] generateEmbeddingFrom(Object o, int[] dims) {
+ switch (dims.length) {
+ case 1 -> {
+ float[] arr = new float[dims[0]];
+ for (int i = 0; i < dims[0]; i++) {
+ arr[i] = ((int[]) o)[i];
+ }
+ return new float[][]{arr};
+ }
+ case 2 -> {
+ float[][] arr = new float[dims[0]][dims[1]];
+ for (int i = 0; i < dims[0]; i++) {
+ for (int j = 0; j < dims[1]; j++) {
+ arr[i][j] = ((int[][]) o)[i][j];
+ }
+ }
+ return arr;
+ }
+ case 3 -> {
+ return flatten(o, dims);
+ }
+ default ->
+ throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
+ }
+ }
+
+ private float[][] flatten(Object o, int[] dims) {
+ int[][][] arr = (int[][][]) o;
+ float[][] flat = new float[dims[0]][dims[1] * dims[2]];
+ for (int i = 0; i < dims[0]; i++) {
+ for (int j = 0; j < dims[1]; j++) {
+ for (int k = 0; k < dims[2]; k++) {
+ flat[i][j * dims[2] + k] = arr[i][j][k];
+ }
+ }
+ }
+ return flat;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java
new file mode 100644
index 000000000..01ffb9af4
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.embedding;
+
+import org.deeplearning4j.models.word2vec.Word2Vec;
+import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
+import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
+import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
+import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
+import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
+
+import java.util.Arrays;
+import java.util.Collections;
+
+public class StringEmbeddingGenerator implements EmbeddingGenerator {
+ private final TokenizerFactory tokenizerFactory= new DefaultTokenizerFactory();
+
+ @Override
+ public float[][] generateEmbeddingFrom(Object o, int[] dims) {
+ switch (dims.length) {
+ case 1 -> {
+ return generateWordEmbeddings((String[]) o);
+ }
+ default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
+ }
+
+ }
+
+ private float[][] generateWordEmbeddings(String[] text) {
+ SentenceIterator iter = new CollectionSentenceIterator(Collections.singletonList(text));
+ /*Word2Vec vec = new Word2Vec.Builder()
+ .minWordFrequency(1)
+ .iterations(1)
+ .layerSize(targetDims)
+ .seed(42)
+ .windowSize(5)
+ .iterate(iter)
+ .tokenizerFactory(tokenizerFactory)
+ .build();
+*/
+ return null;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java
new file mode 100644
index 000000000..af3810202
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.readers;
+
+import io.jhdf.HdfFile;
+import io.jhdf.api.Dataset;
+import io.jhdf.api.Group;
+import io.jhdf.api.Node;
+import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import io.nosqlbench.loader.hdf.embedding.EmbeddingGenerator;
+import io.nosqlbench.loader.hdf.writers.VectorWriter;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.LinkedBlockingQueue;
+
+import static io.nosqlbench.loader.hdf.embedding.EmbeddingGeneratorFactory.getGenerator;
+
+public class Hdf5Reader implements HdfReader {
+ private static final Logger logger = LogManager.getLogger(Hdf5Reader.class);
+ public static final String ALL = "all";
+ private VectorWriter writer;
+ private final LoaderConfig config;
+ private final ExecutorService executorService;
+ private final LinkedBlockingQueue<float[]> queue;
+ private List<String> datasets;
+ private final float[] SHUTDOWN = new float[0];
+ public Hdf5Reader(LoaderConfig config) {
+ this.config = config;
+ executorService = Executors.newCachedThreadPool();
+ queue = new LinkedBlockingQueue<>(config.getQueueSize());
+ }
+
+ @Override
+ public void setWriter(VectorWriter writer) {
+ this.writer = writer;
+ writer.setQueue(queue);
+ }
+
+ public void extractDatasets(Group parent) {
+ Map<String, Node> nodes = parent.getChildren();
+ for (String key : nodes.keySet()) {
+ Node node = nodes.get(key);
+ if (node instanceof Dataset) {
+ datasets.add(node.getPath());
+ }
+ else if (node.isGroup()) {
+ extractDatasets((Group) node);
+ }
+ }
+ }
+
+ @Override
+ public void read() {
+ HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile()));
+ datasets = config.getDatasets();
+ if (datasets.get(0).equalsIgnoreCase(ALL)) {
+ extractDatasets(hdfFile);
+ }
+ List<Future<?>> futures = new ArrayList<>();
+ executorService.submit(writer);
+ for (String ds : datasets) {
+ if (ds.equalsIgnoreCase(ALL)) {
+ continue;
+ }
+ Future<?> future = executorService.submit(() -> {
+ logger.info("Processing dataset: " + ds);
+ Dataset dataset = hdfFile.getDatasetByPath(ds);
+ int[] dims = dataset.getDimensions();
+ String type = dataset.getJavaType().getSimpleName().toLowerCase();
+ EmbeddingGenerator generator = getGenerator(type);
+ Object data;
+ if (dataset.getSizeInBytes() > Integer.MAX_VALUE) {
+ logger.info("slicing large dataset: " + ds);
+ // TODO: For now this will be implemented to handle numeric types with
+ // 2 dimensions where the 1st dimension is the number of vectors and the 2nd
+ // dimension is the number of dimensions in the vector.
+ long[] sliceOffset = new long[dims.length];
+ int[] sliceDimensions = new int[dims.length];
+ sliceDimensions[1] = dims[1];
+ int noOfSlices = (int) (dataset.getSizeInBytes() / Integer.MAX_VALUE) + 1;
+ int sliceSize = dims[0] / noOfSlices;
+ for (int i = 0; i < noOfSlices; i++) {
+ sliceOffset[0] = (long) i * sliceSize;
+ sliceDimensions[0] = sliceSize;
+ data = dataset.getData(sliceOffset, sliceDimensions);
+ float[][] vectors = generator.generateEmbeddingFrom(data, dims);
+ for (float[] vector : vectors) {
+ try {
+ queue.put(vector);
+ } catch (InterruptedException e) {
+ logger.error(e.getMessage(), e);
+ }
+ }
+ }
+ } else {
+ data = dataset.getData();
+ float[][] vectors = generator.generateEmbeddingFrom(data, dims);
+ for (float[] vector : vectors) {
+ try {
+ queue.put(vector);
+ } catch (InterruptedException e) {
+ logger.error(e.getMessage(), e);
+ }
+ }
+ }
+ });
+ futures.add(future);
+ }
+ for (Future<?> future : futures) {
+ try {
+ future.get();
+ } catch (Exception e) {
+ logger.error(e.getMessage(), e);
+ }
+ }
+ hdfFile.close();
+ writer.shutdown();
+ try {
+ queue.put(SHUTDOWN);
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ executorService.shutdown();
+ }
+}
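The large-dataset branch above slices row-wise whenever the dataset exceeds Integer.MAX_VALUE bytes. The arithmetic is easier to see in isolation; the sizes below are made up:

    public class SliceMathExample {
        public static void main(String[] args) {
            long sizeInBytes = 5L * Integer.MAX_VALUE;   // pretend dataset size
            int rows = 1_000_000;                        // dims[0], number of vectors
            int cols = 128;                              // dims[1], vector length

            // Same computation as Hdf5Reader: slice count, then rows per slice.
            int noOfSlices = (int) (sizeInBytes / Integer.MAX_VALUE) + 1;
            int sliceSize = rows / noOfSlices;

            for (int i = 0; i < noOfSlices; i++) {
                long[] sliceOffset = {(long) i * sliceSize, 0};
                int[] sliceDimensions = {sliceSize, cols};
                System.out.printf("slice %d: offset=%d rows=%d cols=%d%n",
                    i, sliceOffset[0], sliceDimensions[0], sliceDimensions[1]);
            }
            // Note: any remainder of rows % noOfSlices is not covered by this scheme as written.
        }
    }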
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java
new file mode 100644
index 000000000..f9304e6c9
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.readers;
+
+import io.nosqlbench.loader.hdf.writers.VectorWriter;
+
+public interface HdfReader {
+ void setWriter(VectorWriter writer);
+
+ void read();
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java
new file mode 100644
index 000000000..4c1c070e3
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.writers;
+
+import java.util.concurrent.LinkedBlockingQueue;
+
+public abstract class AbstractVectorWriter implements VectorWriter {
+ protected LinkedBlockingQueue<float[]> queue;
+ protected boolean shutdown = false;
+
+ public void setQueue(LinkedBlockingQueue<float[]> queue) {
+ this.queue = queue;
+ }
+
+ @Override
+ public void run() {
+ while (!shutdown || !queue.isEmpty()) {
+ try {
+ float[] vector = queue.take();
+ if (vector.length==0) {
+ break;
+ }
+ writeVector(vector);
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ protected abstract void writeVector(float[] vector);
+
+}
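To illustrate the writer contract (a queue injected by the reader, a run loop that drains it, a shutdown flag plus a zero-length sentinel to stop), here is a hypothetical subclass; it is only a sketch and not part of this change:

    import io.nosqlbench.loader.hdf.writers.AbstractVectorWriter;

    import java.util.concurrent.LinkedBlockingQueue;

    public class CountingVectorWriter extends AbstractVectorWriter {
        private long count = 0;

        @Override
        protected void writeVector(float[] vector) {
            count++;   // a real writer would persist the vector here
        }

        @Override
        public void shutdown() {
            shutdown = true;
        }

        public static void main(String[] args) throws InterruptedException {
            CountingVectorWriter writer = new CountingVectorWriter();
            LinkedBlockingQueue<float[]> queue = new LinkedBlockingQueue<>(10);
            writer.setQueue(queue);

            Thread t = new Thread(writer);
            t.start();
            queue.put(new float[]{1f, 2f, 3f});
            writer.shutdown();
            queue.put(new float[0]);   // zero-length sentinel ends the run loop
            t.join();
            System.out.println("wrote " + writer.count + " vector(s)");
        }
    }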
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java
new file mode 100644
index 000000000..29bbf6191
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.writers;
+
+import com.datastax.oss.driver.api.core.CqlSession;
+import com.datastax.oss.driver.api.core.cql.PreparedStatement;
+import com.datastax.oss.driver.api.core.data.CqlVector;
+import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.nio.file.Paths;
+import java.util.Map;
+
+public class AstraVectorWriter extends AbstractVectorWriter {
+ private static final Logger logger = LogManager.getLogger(AstraVectorWriter.class);
+ private final CqlSession session;
+ PreparedStatement insert_vector;
+
+ public AstraVectorWriter(LoaderConfig config) {
+ Map<String, String> astraParams = config.getAstra();
+ session = CqlSession.builder()
+ .withCloudSecureConnectBundle(Paths.get(astraParams.get("scb")))
+ .withAuthCredentials(astraParams.get("clientId"), astraParams.get("clientSecret"))
+ .withKeyspace(astraParams.get("keyspace"))
+ .build();
+ logger.info("Astra session initialized");
+ insert_vector = session.prepare(astraParams.get("query"));
+ }
+//TODO: this is insanely slow. Needs work on threading/batching
+ @Override
+ protected void writeVector(float[] vector) {
+ Float[] vector2 = new Float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = vector[i];
+ }
+ CqlVector.Builder vectorBuilder = CqlVector.builder();
+ vectorBuilder.add(vector2);
+ session.execute(insert_vector.bind(getPartitionValue(vector), vectorBuilder.build()));
+ }
+
+ private String getPartitionValue(float[] vector) {
+ float sum = 0;
+ for (float f : vector) {
+ sum += f;
+ }
+ return String.valueOf(sum);
+ }
+
+ @Override
+ public void shutdown() {
+ shutdown = true;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java
new file mode 100644
index 000000000..710b419d3
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.writers;
+
+import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.*;
+
+public class FileVectorWriter extends AbstractVectorWriter {
+ private static final Logger logger = LogManager.getLogger(FileVectorWriter.class);
+ private final BufferedWriter targetFile;
+ public FileVectorWriter(LoaderConfig config) throws IOException {
+ String targetFileName = config.getTargetFile();
+ targetFile = new BufferedWriter(new FileWriter(targetFileName));
+ logger.info("Writing to file: " + targetFileName);
+ }
+
+ @Override
+ protected void writeVector(float[] vector) {
+ try {
+ targetFile.write("[");
+ for (int i = 0; i < vector.length; i++) {
+ targetFile.write(String.valueOf(vector[i]));
+ if (i < vector.length - 1) {
+ targetFile.write(",");
+ }
+ }
+ targetFile.write("]");
+ targetFile.write("\n");
+ targetFile.flush();
+ } catch (IOException e) {
+ logger.error(e.getMessage(), e);
+ }
+ }
+
+ @Override
+ public void shutdown() {
+ shutdown = true;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java
new file mode 100644
index 000000000..51788ac4f
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.writers;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+public class NoopVectorWriter extends AbstractVectorWriter {
+ private static final Logger logger = LogManager.getLogger(NoopVectorWriter.class);
+
+ @Override
+ protected void writeVector(float[] vector) {
+ //No-op
+ logger.debug(vector);
+ }
+
+ @Override
+ public void shutdown() {
+ shutdown = true;
+ }
+}
diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java
new file mode 100644
index 000000000..7e1da2edb
--- /dev/null
+++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.loader.hdf.writers;
+
+import java.util.concurrent.LinkedBlockingQueue;
+
+public interface VectorWriter extends Runnable {
+ void setQueue(LinkedBlockingQueue<float[]> queue);
+
+ void shutdown();
+}
diff --git a/hdf-loader/src/main/resources/config.yaml b/hdf-loader/src/main/resources/config.yaml
new file mode 100644
index 000000000..c3e2338de
--- /dev/null
+++ b/hdf-loader/src/main/resources/config.yaml
@@ -0,0 +1,13 @@
+format: HDF5
+sourceFile: /home/mwolters138/Downloads/h5ex_t_float.h5 #/home/mwolters138/Documents/hdf5/datasets/pass/glove-25-angular.hdf5
+datasets:
+ - all
+embedding: word2vec
+writer: filewriter
+astra:
+ scb: /home/mwolters138/Dev/testing/secure-connect-vector-correctness.zip
+ clientId: IvpdaZejwNuvWeupsIkWTHeL
+ clientSecret: .bxut2-OQL,dWunZeQbjZC0vMHd88UWXKS.xT,nl95zQC0B0xU9FzSWK3HSUGO11o_7pr7wG7+EMaZqegkKlr4fZ54__furPMtWPGiPp,2cZ1q15vrWwc9_-AcgeCbuf
+ keyspace: baselines128dot
+ query: INSERT INTO vectors25(key, value) VALUES (?,?)
+targetFile: /home/mwolters138/vectors.txt
diff --git a/mvn-defaults/pom.xml b/mvn-defaults/pom.xml
index b3e24a184..c864e0676 100644
--- a/mvn-defaults/pom.xml
+++ b/mvn-defaults/pom.xml
@@ -184,12 +184,22 @@
            <groupId>org.xerial.snappy</groupId>
            <artifactId>snappy-java</artifactId>
-            <version>1.1.10.1</version>
+            <version>1.1.10.3</version>
            <groupId>com.datastax.oss</groupId>
            <artifactId>java-driver-query-builder</artifactId>
-            <version>4.16.0</version>
+            <version>4.17.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.snakeyaml</groupId>
+            <artifactId>snakeyaml-engine</artifactId>
+            <version>2.6</version>
+        </dependency>
+        <dependency>
+            <groupId>org.xerial.snappy</groupId>
+            <artifactId>snappy-java</artifactId>
+            <version>1.1.10.3</version>
            <groupId>com.esri.geometry</groupId>
@@ -199,7 +209,7 @@
            <groupId>io.netty</groupId>
            <artifactId>netty-handler</artifactId>
-            <version>4.1.94.Final</version>
+            <version>4.1.95.Final</version>
            <groupId>io.netty</groupId>
@@ -221,7 +231,7 @@
            <groupId>com.github.docker-java</groupId>
            <artifactId>docker-java-api</artifactId>
-            <version>3.3.1</version>
+            <version>3.3.2</version>
            <groupId>org.slf4j</groupId>
@@ -278,7 +288,7 @@
            <groupId>com.github.docker-java</groupId>
            <artifactId>docker-java-transport-okhttp</artifactId>
-            <version>3.3.1</version>
+            <version>3.3.2</version>
            <groupId>org.slf4j</groupId>
@@ -289,7 +299,7 @@
            <groupId>com.github.docker-java</groupId>
            <artifactId>docker-java</artifactId>
-            <version>3.3.1</version>
+            <version>3.3.2</version>
            <groupId>org.slf4j</groupId>
@@ -301,7 +311,7 @@
            <groupId>com.github.oshi</groupId>
            <artifactId>oshi-core-java11</artifactId>
-            <version>6.4.3</version>
+            <version>6.4.4</version>
            <groupId>com.google.code.gson</groupId>
@@ -311,7 +321,7 @@
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
-            <version>1.12.498</version>
+            <version>1.12.513</version>
            <groupId>com.elega9t</groupId>
@@ -326,7 +336,7 @@
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
-            <version>3.12.0</version>
+            <version>3.13.0</version>
            <groupId>com.squareup</groupId>
@@ -382,7 +392,7 @@
            <groupId>org.graalvm.sdk</groupId>
            <artifactId>graal-sdk</artifactId>
-            <version>22.3.2</version>
+            <version>22.3.3</version>
            <groupId>org.graalvm.js</groupId>
@@ -393,12 +403,12 @@
            <groupId>org.graalvm.js</groupId>
            <artifactId>js-scriptengine</artifactId>
-            <version>22.3.2</version>
+            <version>22.3.3</version>
            <groupId>org.graalvm.tools</groupId>
            <artifactId>profiler</artifactId>
-            <version>22.3.2</version>
+            <version>22.3.3</version>
            <scope>runtime</scope>
diff --git a/pom.xml b/pom.xml
index a79b86a73..173c73523 100644
--- a/pom.xml
+++ b/pom.xml
@@ -67,6 +67,7 @@
        <module>adapter-kafka</module>
        <module>adapter-amqp</module>
        <module>adapter-jdbc</module>
+        <module>hdf-loader</module>
        <module>virtdata-api</module>
@@ -76,6 +77,7 @@
        <module>virtdata-lib-random</module>
        <module>virtdata-lib-curves4</module>
        <module>virtdata-lib-realer</module>
+        <module>virtdata-lib-hdf5</module>
        <module>virtdata-userlibs</module>
@@ -114,6 +116,7 @@
        <module>adapter-amqp</module>
        <module>adapter-jdbc</module>
        <module>adapter-pinecone</module>
+        <module>hdf-loader</module>
        <module>virtdata-api</module>
@@ -123,6 +126,7 @@
        <module>virtdata-lib-random</module>
        <module>virtdata-lib-curves4</module>
        <module>virtdata-lib-realer</module>
+        <module>virtdata-lib-hdf5</module>
        <module>virtdata-userlibs</module>
diff --git a/virtdata-lib-basics/pom.xml b/virtdata-lib-basics/pom.xml
index 1d069da92..1e67c931f 100644
--- a/virtdata-lib-basics/pom.xml
+++ b/virtdata-lib-basics/pom.xml
@@ -82,6 +82,7 @@
5.1.1
test
+
diff --git a/virtdata-lib-hdf5/pom.xml b/virtdata-lib-hdf5/pom.xml
new file mode 100644
index 000000000..6f0ebf6d2
--- /dev/null
+++ b/virtdata-lib-hdf5/pom.xml
@@ -0,0 +1,62 @@
+
+
+
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <artifactId>mvn-defaults</artifactId>
+        <groupId>io.nosqlbench</groupId>
+        <version>${revision}</version>
+        <relativePath>../mvn-defaults</relativePath>
+    </parent>
+
+    <artifactId>virtdata-lib-hdf5</artifactId>
+    <packaging>jar</packaging>
+    <name>virtdata-lib-hdf5</name>
+    <url>http://nosqlbench.io/</url>
+    <description>
+        With inspiration from other libraries
+    </description>
+
+    <dependencies>
+
+        <dependency>
+            <groupId>io.nosqlbench</groupId>
+            <artifactId>virtdata-lib-basics</artifactId>
+            <version>${revision}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.jhdf</groupId>
+            <artifactId>jhdf</artifactId>
+            <version>0.6.10</version>
+        </dependency>
+
+    </dependencies>
+
+    <build>
+        <testResources>
+            <testResource>
+                <directory>src/test/resources</directory>
+                <includes>
+                    <include>h5ex_t_float.h5</include>
+                </includes>
+                <filtering>true</filtering>
+            </testResource>
+        </testResources>
+    </build>
+
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java
new file mode 100644
index 000000000..833fecda1
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long;
+
+import io.jhdf.HdfFile;
+import io.jhdf.api.Dataset;
+import io.nosqlbench.api.content.NBIO;
+
+import java.nio.file.Paths;
+
+public abstract class AbstractHdfFileToVector {
+ protected final HdfFile hdfFile;
+ protected final Dataset dataset;
+ protected final int[] dims;
+
+ public AbstractHdfFileToVector(String filename, String datasetName) {
+ //hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath());
+ hdfFile = new HdfFile(Paths.get(filename));
+ //TODO: implement a function to get the dataset by name only without needing the full path
+ dataset = hdfFile.getDatasetByPath(datasetName);
+ dims = dataset.getDimensions();
+ }
+
+ protected Object getDataFrom(long l) {
+ long[] sliceOffset = new long[dims.length];
+ sliceOffset[0] = (l % dims[0]);
+ int[] sliceDimensions = new int[dims.length];
+ sliceDimensions[0] = 1;
+ // Do we want to give the option of reducing vector dimensions here?
+ sliceDimensions[1] = dims[1];
+ return dataset.getData(sliceOffset, sliceDimensions);
+ }
+}
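The slice selection above wraps the long input over the first dataset dimension, so cycle values larger than the number of vectors simply re-read earlier rows. A tiny sketch of that mapping with illustrative sizes:

    public class SliceOffsetExample {
        public static void main(String[] args) {
            int[] dims = {1_000, 25};   // 1,000 vectors of 25 elements each

            for (long l : new long[]{0L, 7L, 999L, 1_000L, 2_500L}) {
                long[] sliceOffset = new long[dims.length];
                sliceOffset[0] = l % dims[0];            // row (vector) to read
                int[] sliceDimensions = {1, dims[1]};    // one full vector
                System.out.printf("l=%d -> row %d, %d elements%n",
                    l, sliceOffset[0], sliceDimensions[1]);
            }
        }
    }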
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java
new file mode 100644
index 000000000..be5408ac9
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
+
+import io.nosqlbench.virtdata.api.annotations.Categories;
+import io.nosqlbench.virtdata.api.annotations.Category;
+import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
+import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
+import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
+import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
+
+import java.util.function.LongFunction;
+
+/**
+ * This function reads a vector dataset from an HDF5 file. The dataset itself is not
+ * read into memory, only the metadata (the "dataset" Java Object). The lambda function
+ * reads a single vector from the dataset, based on the long input value. As currently
+ * written this class will only work for datasets with 2 dimensions where the 1st dimension
+ * specifies the number of vectors and the 2nd dimension specifies the number of elements in
+ * each vector. Only datatypes short, int, and float are supported at this time.
+ *
+ * This implementation is specific to returning an array of floats
+ */
+@ThreadSafeMapper
+@Categories(Category.experimental)
+public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {
+ private final EmbeddingGenerator embeddingGenerator;
+
+ public HdfFileToVectorArray(String filename, String datasetName) {
+ super(filename, datasetName);
+ embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
+ }
+ @Override
+ public float[] apply(long l) {
+ Object data = getDataFrom(l);
+ return embeddingGenerator.generateArrayEmbeddingFrom(data, dims);
+ }
+
+}
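A short usage sketch for this binding, reading vectors by cycle number; the file path and dataset name match the unit test further down in this diff:

    import io.nosqlbench.virtdata.library.hdf5.from_long.to_array.HdfFileToVectorArray;

    import java.util.Arrays;
    import java.util.function.LongFunction;

    public class HdfFileToVectorArrayExample {
        public static void main(String[] args) {
            LongFunction<float[]> fn = new HdfFileToVectorArray(
                "src/test/resources/h5ex_t_float.h5", "/DS1");

            // Each cycle value selects one row (vector) from the dataset.
            for (long cycle = 0; cycle < 4; cycle++) {
                float[] vector = fn.apply(cycle);
                System.out.println(cycle + " -> " + Arrays.toString(vector));
            }
        }
    }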
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java
new file mode 100644
index 000000000..2f018df1f
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
+
+import io.nosqlbench.virtdata.api.annotations.Categories;
+import io.nosqlbench.virtdata.api.annotations.Category;
+import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
+import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
+import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
+import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
+
+import java.util.List;
+import java.util.function.LongFunction;
+
+/**
+ * This function reads a vector dataset from an HDF5 file. The dataset itself is not
+ * read into memory, only the metadata (the "dataset" Java Object). The lambda function
+ * reads a single vector from the dataset, based on the long input value. As currently
+ * written this class will only work for datasets with 2 dimensions where the 1st dimension
+ * specifies the number of vectors and the 2nd dimension specifies the number of elements in
+ * each vector. Only datatypes short, int, and float are supported at this time.
+ *
+ * This implementation is specific to returning a List of Floats, so as to work with the
+ * normalization functions e.g. NormalizeListVector and its variants.
+ */
+@ThreadSafeMapper
+@Categories(Category.experimental)
+public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {
+ private final EmbeddingGenerator embeddingGenerator;
+
+ public HdfFileToVectorList(String filename, String datasetName) {
+ super(filename, datasetName);
+ embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
+ }
+ @Override
+ public List<Float> apply(long l) {
+ Object data = getDataFrom(l);
+ return embeddingGenerator.generateListEmbeddingFrom(data, dims);
+ }
+
+}
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java
new file mode 100644
index 000000000..8ebeba3ac
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.helpers;
+
+import java.util.List;
+
+public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
+
+ @Override
+ public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
+ // in this case o will always be double[1][x]
+ double[] vector = ((double[][]) o)[0];
+ Float[] vector2 = new Float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = (float) vector[i];
+ }
+ return List.of(vector2);
+ }
+
+ @Override
+ public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
+ double[] vector = ((double[][]) o)[0];
+ float[] vector2 = new float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = (float) vector[i];
+ }
+ return vector2;
+ }
+
+}
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java
new file mode 100644
index 000000000..865d9f1a4
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.helpers;
+
+import java.util.List;
+
+public interface EmbeddingGenerator {
+ List<Float> generateListEmbeddingFrom(Object o, int[] dims);
+
+ float[] generateArrayEmbeddingFrom(Object o, int[] dims);
+}
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java
new file mode 100644
index 000000000..23c2fe578
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.helpers;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class EmbeddingGeneratorFactory {
+ private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
+
+ public static EmbeddingGenerator getGenerator(String type) {
+ String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
+ switch (typeLower) {
+ case "float" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new FloatEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ case "int" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new IntEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ case "double" -> {
+ if (!generators.containsKey(type)) {
+ generators.put(type, new DoubleEmbeddingGenerator());
+ }
+ return generators.get(type);
+ }
+ default -> throw new RuntimeException("Unknown embedding type: " + type);
+ }
+ }
+}
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java
new file mode 100644
index 000000000..d3aa92354
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.helpers;
+
+import java.util.List;
+
+public class FloatEmbeddingGenerator implements EmbeddingGenerator {
+
+ @Override
+ public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
+ // in this case o will always be float[1][x]
+ float[] vector = ((float[][]) o)[0];
+ Float[] vector2 = new Float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = vector[i];
+ }
+ return List.of(vector2);
+ }
+
+ @Override
+ public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
+ return ((float[][]) o)[0];
+ }
+
+}
diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java
new file mode 100644
index 000000000..769f81482
--- /dev/null
+++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.helpers;
+
+import java.util.List;
+
+public class IntEmbeddingGenerator implements EmbeddingGenerator {
+ @Override
+ public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
+ // in this case o will always be int[1][x]
+ int[] vector = ((int[][]) o)[0];
+ Float[] vector2 = new Float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = (float) vector[i];
+ }
+ return List.of(vector2);
+ }
+
+ @Override
+ public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
+ int[] vector = ((int[][]) o)[0];
+ float[] vector2 = new float[vector.length];
+ for (int i = 0; i < vector.length; i++) {
+ vector2[i] = (float) vector[i];
+ }
+ return vector2;
+ }
+}
diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java
new file mode 100644
index 000000000..2d88c2468
--- /dev/null
+++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
+
+import org.junit.jupiter.api.Test;
+
+public class HdfFileToArrayTest {
+
+ @Test
+ public void testHdfFileToVector() {
+ final float[][] results = new float[][]{
+ {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
+ {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
+ {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
+ {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
+ };
+
+ HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray(
+ "src/test/resources/h5ex_t_float.h5",
+ "/DS1");
+
+ float[] read;
+ for (int i = 0; i < 4; i++) {
+ read = hdfFileToVector.apply(i);
+ for (int j = 0; j < 7; j++) {
+ assert (read[j] == results[i][j]);
+ }
+ }
+ }
+}
diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java
new file mode 100644
index 000000000..4ba0f104a
--- /dev/null
+++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+public class HdfFileToVectorTest {
+
+ @Test
+ public void testHdfFileToVector() {
+ final float[][] results = new float[][]{
+ {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
+ {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
+ {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
+ {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
+ };
+
+ HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList(
+ "src/test/resources/h5ex_t_float.h5",
+ "/DS1");
+
+ List<Float> read;
+ for (int i = 0; i < 4; i++) {
+ read = hdfFileToVector.apply(i);
+ for (int j = 0; j < 7; j++) {
+ assert (read.get(j) == results[i][j]);
+ }
+ }
+ }
+}
diff --git a/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5 b/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
new file mode 100644
index 000000000..9c8cb981d
Binary files /dev/null and b/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5 differ
diff --git a/virtdata-userlibs/pom.xml b/virtdata-userlibs/pom.xml
index 09be05153..c5d144b27 100644
--- a/virtdata-userlibs/pom.xml
+++ b/virtdata-userlibs/pom.xml
@@ -66,6 +66,13 @@
            <artifactId>virtdata-lib-curves4</artifactId>
            <version>${revision}</version>
+
+        <dependency>
+            <groupId>io.nosqlbench</groupId>
+            <artifactId>virtdata-lib-hdf5</artifactId>
+            <version>${revision}</version>
+        </dependency>
+
            <groupId>io.nosqlbench</groupId>
            <artifactId>docsys</artifactId>