diff --git a/adapter-dynamodb/pom.xml b/adapter-dynamodb/pom.xml index f796250e5..b46909ea0 100644 --- a/adapter-dynamodb/pom.xml +++ b/adapter-dynamodb/pom.xml @@ -43,7 +43,7 @@ com.amazonaws aws-java-sdk-dynamodb - 1.12.500 + 1.12.507 diff --git a/adapter-kafka/pom.xml b/adapter-kafka/pom.xml index 552ea0131..7bd4f07c7 100644 --- a/adapter-kafka/pom.xml +++ b/adapter-kafka/pom.xml @@ -34,7 +34,7 @@ - 3.5.0 + 3.5.1 @@ -60,7 +60,7 @@ org.apache.commons commons-lang3 - 3.12.0 + 3.13.0 diff --git a/adapter-mongodb/pom.xml b/adapter-mongodb/pom.xml index 25b523b89..8692fa642 100644 --- a/adapter-mongodb/pom.xml +++ b/adapter-mongodb/pom.xml @@ -42,7 +42,7 @@ org.mongodb mongodb-driver-sync - 4.10.1 + 4.10.2 diff --git a/adapter-pulsar/pom.xml b/adapter-pulsar/pom.xml index fd4ba41f1..1c6f28c5d 100644 --- a/adapter-pulsar/pom.xml +++ b/adapter-pulsar/pom.xml @@ -34,7 +34,7 @@ - 3.0.0 + 3.0.1 @@ -66,7 +66,7 @@ org.apache.commons commons-lang3 - 3.12.0 + 3.13.0 diff --git a/docsys/pom.xml b/docsys/pom.xml index 9585b1405..8b44d73d4 100644 --- a/docsys/pom.xml +++ b/docsys/pom.xml @@ -22,7 +22,7 @@ docsys http://nosqlbench.io/ - 3.1.2 + 3.1.3 @@ -94,7 +94,7 @@ org.glassfish.jersey.media jersey-media-json-jackson - 3.1.2 + 3.1.3 diff --git a/hdf-loader/pom.xml b/hdf-loader/pom.xml new file mode 100644 index 000000000..78063bd3e --- /dev/null +++ b/hdf-loader/pom.xml @@ -0,0 +1,100 @@ + + + + + 4.0.0 + hdf-loader + + jar + + + mvn-defaults + io.nosqlbench + ${revision} + ../mvn-defaults + + + ${project.artifactId} + + + 17 + 17 + UTF-8 + + + + + + org.snakeyaml + snakeyaml-engine + 2.6 + + + org.yaml + snakeyaml + 2.0 + + + + com.datastax.oss + java-driver-core + 4.16.0 + + + + + com.fasterxml.jackson.core + jackson-core + 2.15.2 + + + + + org.deeplearning4j + deeplearning4j-core + 1.0.0-M2.1 + + + + org.nd4j + nd4j-native + 1.0.0-M2.1 + + + + org.deeplearning4j + deeplearning4j-nlp + 1.0.0-M2.1 + + + + io.jhdf + jhdf + 0.6.10 + + + io.nosqlbench + nb-api + 5.17.3-SNAPSHOT + compile + + + + + diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java new file mode 100644 index 000000000..1a6ba6fa6 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/HdfLoader.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf; + +import io.nosqlbench.loader.hdf.config.LoaderConfig; +import io.nosqlbench.loader.hdf.readers.Hdf5Reader; +import io.nosqlbench.loader.hdf.readers.HdfReader; +import io.nosqlbench.loader.hdf.writers.AstraVectorWriter; +import io.nosqlbench.loader.hdf.writers.FileVectorWriter; +import io.nosqlbench.loader.hdf.writers.NoopVectorWriter; +import io.nosqlbench.loader.hdf.writers.VectorWriter; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +public class HdfLoader { + private static final Logger logger = LogManager.getLogger(HdfLoader.class); + public static final String FILEWRITER = "filewriter"; + public static final String ASTRA = "astra"; + public static final String NOOP = "noop"; + public static final String HDF5 = "hdf5"; + public static final String HDF4 = "hdf4"; + + public static void main (String[] args) { + if (args.length == 0) { + System.out.println("Usage: hdf-loader "); + System.exit(1); + } + try { + LoaderConfig config = new LoaderConfig(args[0]); + logger.info("Starting loader with config: " + config); + HdfReader reader = null; + VectorWriter writer = null; + + String format = config.getFormat(); + switch (format.toLowerCase()) { + case HDF4 -> { + logger.info("HDF4 format not yet supported"); + System.exit(1); + } + case HDF5 -> { + logger.info("HDF5 format selected"); + reader = new Hdf5Reader(config); + } + default -> { + logger.info("Unknown format: " + format); + System.exit(1); + } + } + + String writerType = config.getWriter(); + logger.info("Using writer type: " + writerType); + switch (writerType.toLowerCase()) { + case FILEWRITER -> writer = new FileVectorWriter(config); + case ASTRA -> writer = new AstraVectorWriter(config); + case NOOP -> writer = new NoopVectorWriter(); + default -> { + logger.info("Unknown writer type: " + writerType); + System.exit(1); + } + } + reader.setWriter(writer); + logger.info("Starting main read loop"); + reader.read(); + } catch (Exception e) { + logger.error(e); + System.exit(1); + } + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java new file mode 100644 index 000000000..f8c02137f --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/config/LoaderConfig.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.config; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.yaml.snakeyaml.Yaml; + +import java.io.FileReader; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class LoaderConfig { + private static final Logger logger = LogManager.getLogger(LoaderConfig.class); + private static final Yaml yaml = new Yaml(); + private final Map configMap; + + public LoaderConfig(String filePath) throws IOException { + FileReader fileReader = new FileReader(filePath); + configMap = yaml.load(fileReader); + for (Map.Entry entry : configMap.entrySet()) { + logger.debug(entry.getKey() + " : " + entry.getValue()); + } + } + + public Object getRawValue(String key) { + return configMap.get(key); + } + + public String getStringValue(String key) { + return configMap.get(key).toString(); + } + + public List getDatasets() { + return (List) configMap.get("datasets"); + } + + public String getFormat() { + return (String) configMap.getOrDefault("format", "HD5"); + } + + public Map getAstra() { + return (Map) configMap.get("astra"); + } + + public String getEmbedding() { + return (String) configMap.getOrDefault("embedding", "Deeplearning4j"); + } + + public String getWriter() { + return (String) configMap.getOrDefault("writer", "filewriter"); + } + + public String getSourceFile() { + return (String) configMap.get("sourceFile"); + } + + public String getTargetFile() { + return (String) configMap.getOrDefault("targetFile", "./vectors.txt"); + } + + public int getThreads() { + return (int) configMap.getOrDefault("threads", 5); + } + + public int getQueueSize() { + return (int) configMap.getOrDefault("queueSize", 1000); + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java new file mode 100644 index 000000000..07b96dfea --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/DoubleEmbeddingGenerator.java @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.embedding; + +public class DoubleEmbeddingGenerator implements EmbeddingGenerator { + + @Override + public float[][] generateEmbeddingFrom(Object o, int[] dims) { + return switch (dims.length) { + case 1 -> new float[][]{convertToFloat((double[]) o)}; + case 2 -> convertToFloats((double[][]) o); + case 3 -> flatten(o, dims); + default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length); + }; + } + + private float[][] convertToFloats(double[][] o) { + float[][] floats = new float[o.length][]; + for (int i = 0; i < o.length; i++) { + floats[i] = convertToFloat(o[i]); + } + return floats; + } + + public float[] convertToFloat(double[] doubleArray) { + if (doubleArray == null) { + return null; + } + float[] floatArray = new float[doubleArray.length]; + for (int i = 0; i < doubleArray.length; i++) { + floatArray[i] = (float) doubleArray[i]; + } + return floatArray; + } + + private float[][] flatten(Object o, int[] dims) { + double[][][] arr = (double[][][]) o; + float[][] flat = new float[dims[0]][dims[1] * dims[2]]; + for (int i = 0; i < dims[0]; i++) { + for (int j = 0; j < dims[1]; j++) { + for (int k = 0; k < dims[2]; k++) { + flat[i][j * dims[2] + k] = (float)arr[i][j][k]; + } + } + } + return flat; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java new file mode 100644 index 000000000..22fcad5ed --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGenerator.java @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.embedding; + +public interface EmbeddingGenerator { + float[][] generateEmbeddingFrom(Object o, int[] dims); +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java new file mode 100644 index 000000000..a7b677e65 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/EmbeddingGeneratorFactory.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.embedding; + +import java.util.HashMap; +import java.util.Map; + +public class EmbeddingGeneratorFactory { + private static final Map generators = new HashMap<>(); + + public static EmbeddingGenerator getGenerator(String type) { + String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase(); + if (typeLower.equals("integer")) typeLower = "int"; + switch (typeLower) { + case "string" -> { + if (!generators.containsKey(type)) { + generators.put(type, new StringEmbeddingGenerator()); + } + return generators.get(type); + } + case "float" -> { + if (!generators.containsKey(type)) { + generators.put(type, new FloatEmbeddingGenerator()); + } + return generators.get(type); + } + case "double" -> { + if (!generators.containsKey(type)) { + generators.put(type, new DoubleEmbeddingGenerator()); + } + return generators.get(type); + } + case "int" -> { + if (!generators.containsKey(type)) { + generators.put(type, new IntEmbeddingGenerator()); + } + return generators.get(type); + } + default -> throw new RuntimeException("Unknown embedding type: " + type); + } + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java new file mode 100644 index 000000000..9245e53f5 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/FloatEmbeddingGenerator.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.embedding; + +public class FloatEmbeddingGenerator implements EmbeddingGenerator { + + @Override + public float[][] generateEmbeddingFrom(Object o, int[] dims) { + return switch (dims.length) { + case 1 -> new float[][]{(float[]) o}; + case 2 -> (float[][]) o; + case 3 -> flatten(o, dims); + default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length); + }; + } + + private float[][] flatten(Object o, int[] dims) { + float[][][] arr = (float[][][]) o; + float[][] flat = new float[dims[0]][dims[1] * dims[2]]; + for (int i = 0; i < dims[0]; i++) { + for (int j = 0; j < dims[1]; j++) { + if (dims[2] >= 0) System.arraycopy(arr[i][j], 0, flat[i], j * dims[2] + 0, dims[2]); + } + } + return flat; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java new file mode 100644 index 000000000..c4f0c1988 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/IntEmbeddingGenerator.java @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.embedding; + +public class IntEmbeddingGenerator implements EmbeddingGenerator { + @Override + public float[][] generateEmbeddingFrom(Object o, int[] dims) { + switch (dims.length) { + case 1 -> { + float[] arr = new float[dims[0]]; + for (int i = 0; i < dims[0]; i++) { + arr[i] = ((int[]) o)[i]; + } + return new float[][]{arr}; + } + case 2 -> { + float[][] arr = new float[dims[0]][dims[1]]; + for (int i = 0; i < dims[0]; i++) { + for (int j = 0; j < dims[1]; j++) { + arr[i][j] = ((int[][]) o)[i][j]; + } + } + return arr; + } + case 3 -> { + return flatten(o, dims); + } + default -> + throw new RuntimeException("unsupported embedding dimensionality: " + dims.length); + } + } + + private float[][] flatten(Object o, int[] dims) { + int[][][] arr = (int[][][]) o; + float[][] flat = new float[dims[0]][dims[1] * dims[2]]; + for (int i = 0; i < dims[0]; i++) { + for (int j = 0; j < dims[1]; j++) { + for (int k = 0; k < dims[2]; k++) { + flat[i][j * dims[2] + k] = arr[i][j][k]; + } + } + } + return flat; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java new file mode 100644 index 000000000..01ffb9af4 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/embedding/StringEmbeddingGenerator.java @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.embedding; + +import org.deeplearning4j.models.word2vec.Word2Vec; +import org.deeplearning4j.text.sentenceiterator.BasicLineIterator; +import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator; +import org.deeplearning4j.text.sentenceiterator.SentenceIterator; +import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory; +import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; + +import java.util.Arrays; +import java.util.Collections; + +public class StringEmbeddingGenerator implements EmbeddingGenerator { + private final TokenizerFactory tokenizerFactory= new DefaultTokenizerFactory(); + + @Override + public float[][] generateEmbeddingFrom(Object o, int[] dims) { + switch (dims.length) { + case 1 -> { + return generateWordEmbeddings((String[]) o); + } + default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length); + } + + } + + private float[][] generateWordEmbeddings(String[] text) { + SentenceIterator iter = new CollectionSentenceIterator(Collections.singletonList(text)); + /*Word2Vec vec = new Word2Vec.Builder() + .minWordFrequency(1) + .iterations(1) + .layerSize(targetDims) + .seed(42) + .windowSize(5) + .iterate(iter) + .tokenizerFactory(tokenizerFactory) + .build(); +*/ + return null; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java new file mode 100644 index 000000000..af3810202 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/Hdf5Reader.java @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.readers; + +import io.jhdf.HdfFile; +import io.jhdf.api.Dataset; +import io.jhdf.api.Group; +import io.jhdf.api.Node; +import io.nosqlbench.loader.hdf.config.LoaderConfig; +import io.nosqlbench.loader.hdf.embedding.EmbeddingGenerator; +import io.nosqlbench.loader.hdf.writers.VectorWriter; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.LinkedBlockingQueue; + +import static io.nosqlbench.loader.hdf.embedding.EmbeddingGeneratorFactory.getGenerator; + +public class Hdf5Reader implements HdfReader { + private static final Logger logger = LogManager.getLogger(Hdf5Reader.class); + public static final String ALL = "all"; + private VectorWriter writer; + private final LoaderConfig config; + private final ExecutorService executorService; + private final LinkedBlockingQueue queue; + private List datasets; + private final float[] SHUTDOWN = new float[0]; + public Hdf5Reader(LoaderConfig config) { + this.config = config; + executorService = Executors.newCachedThreadPool(); + queue = new LinkedBlockingQueue<>(config.getQueueSize()); + } + + @Override + public void setWriter(VectorWriter writer) { + this.writer = writer; + writer.setQueue(queue); + } + + public void extractDatasets(Group parent) { + Map nodes = parent.getChildren(); + for (String key : nodes.keySet()) { + Node node = nodes.get(key); + if (node instanceof Dataset) { + datasets.add(node.getPath()); + } + else if (node.isGroup()) { + extractDatasets((Group) node); + } + } + } + + @Override + public void read() { + HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile())); + datasets = config.getDatasets(); + if (datasets.get(0).equalsIgnoreCase(ALL)) { + extractDatasets(hdfFile); + } + List> futures = new ArrayList<>(); + executorService.submit(writer); + for (String ds : datasets) { + if (ds.equalsIgnoreCase(ALL)) { + continue; + } + Future future = executorService.submit(() -> { + logger.info("Processing dataset: " + ds); + Dataset dataset = hdfFile.getDatasetByPath(ds); + int[] dims = dataset.getDimensions(); + String type = dataset.getJavaType().getSimpleName().toLowerCase(); + EmbeddingGenerator generator = getGenerator(type); + Object data; + if (dataset.getSizeInBytes() > Integer.MAX_VALUE) { + logger.info("slicing large dataset: " + ds); + // TODO: For now this will be implemented to handle numeric types with + // 2 dimensions where the 1st dimension is the number of vectors and the 2nd + // dimension is the number of dimensions in the vector. 
+ long[] sliceOffset = new long[dims.length]; + int[] sliceDimensions = new int[dims.length]; + sliceDimensions[1] = dims[1]; + int noOfSlices = (int) (dataset.getSizeInBytes() / Integer.MAX_VALUE) + 1; + int sliceSize = dims[0] / noOfSlices; + for (int i = 0; i < noOfSlices; i++) { + sliceOffset[0] = (long) i * sliceSize; + sliceDimensions[0] = sliceSize; + data = dataset.getData(sliceOffset, sliceDimensions); + float[][] vectors = generator.generateEmbeddingFrom(data, dims); + for (float[] vector : vectors) { + try { + queue.put(vector); + } catch (InterruptedException e) { + logger.error(e.getMessage(), e); + } + } + } + } else { + data = dataset.getData(); + float[][] vectors = generator.generateEmbeddingFrom(data, dims); + for (float[] vector : vectors) { + try { + queue.put(vector); + } catch (InterruptedException e) { + logger.error(e.getMessage(), e); + } + } + } + }); + futures.add(future); + } + for (Future future : futures) { + try { + future.get(); + } catch (Exception e) { + logger.error(e.getMessage(), e); + } + } + hdfFile.close(); + writer.shutdown(); + try { + queue.put(SHUTDOWN); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + executorService.shutdown(); + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java new file mode 100644 index 000000000..f9304e6c9 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/readers/HdfReader.java @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.readers; + +import io.nosqlbench.loader.hdf.writers.VectorWriter; + +public interface HdfReader { + void setWriter(VectorWriter writer); + + void read(); +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java new file mode 100644 index 000000000..4c1c070e3 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AbstractVectorWriter.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.loader.hdf.writers; + +import java.util.concurrent.LinkedBlockingQueue; + +public abstract class AbstractVectorWriter implements VectorWriter { + protected LinkedBlockingQueue queue; + protected boolean shutdown = false; + + public void setQueue(LinkedBlockingQueue queue) { + this.queue = queue; + } + + @Override + public void run() { + while (!shutdown || !queue.isEmpty()) { + try { + float[] vector = queue.take(); + if (vector.length==0) { + break; + } + writeVector(vector); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + } + + protected abstract void writeVector(float[] vector); + +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java new file mode 100644 index 000000000..29bbf6191 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/AstraVectorWriter.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.writers; + +import com.datastax.oss.driver.api.core.CqlSession; +import com.datastax.oss.driver.api.core.cql.PreparedStatement; +import com.datastax.oss.driver.api.core.data.CqlVector; +import io.nosqlbench.loader.hdf.config.LoaderConfig; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.nio.file.Paths; +import java.util.Map; + +public class AstraVectorWriter extends AbstractVectorWriter { + private static final Logger logger = LogManager.getLogger(AstraVectorWriter.class); + private final CqlSession session; + PreparedStatement insert_vector; + + public AstraVectorWriter(LoaderConfig config) { + Map astraParams = config.getAstra(); + session = CqlSession.builder() + .withCloudSecureConnectBundle(Paths.get(astraParams.get("scb"))) + .withAuthCredentials(astraParams.get("clientId"), astraParams.get("clientSecret")) + .withKeyspace(astraParams.get("keyspace")) + .build(); + logger.info("Astra session initialized"); + insert_vector = session.prepare(astraParams.get("query")); + } +//TODO: this is insanely slow. 
Needs work on threading/batching + @Override + protected void writeVector(float[] vector) { + Float[] vector2 = new Float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = vector[i]; + } + CqlVector.Builder vectorBuilder = CqlVector.builder(); + vectorBuilder.add(vector2); + session.execute(insert_vector.bind(getPartitionValue(vector), vectorBuilder.build())); + } + + private String getPartitionValue(float[] vector) { + float sum = 0; + for (float f : vector) { + sum += f; + } + return String.valueOf(sum); + } + + @Override + public void shutdown() { + shutdown = true; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java new file mode 100644 index 000000000..710b419d3 --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/FileVectorWriter.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.writers; + +import io.nosqlbench.loader.hdf.config.LoaderConfig; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.*; + +public class FileVectorWriter extends AbstractVectorWriter { + private static final Logger logger = LogManager.getLogger(FileVectorWriter.class); + private final BufferedWriter targetFile; + public FileVectorWriter(LoaderConfig config) throws IOException { + String targetFileName = config.getTargetFile(); + targetFile = new BufferedWriter(new FileWriter(targetFileName)); + logger.info("Writing to file: " + targetFileName); + } + + @Override + protected void writeVector(float[] vector) { + try { + targetFile.write("["); + for (int i = 0; i < vector.length; i++) { + targetFile.write(String.valueOf(vector[i])); + if (i < vector.length - 1) { + targetFile.write(","); + } + } + targetFile.write("]"); + targetFile.write("\n"); + targetFile.flush(); + } catch (IOException e) { + logger.error(e.getMessage(), e); + } + } + + @Override + public void shutdown() { + shutdown = true; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java new file mode 100644 index 000000000..51788ac4f --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/NoopVectorWriter.java @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.writers; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +public class NoopVectorWriter extends AbstractVectorWriter { + private static final Logger logger = LogManager.getLogger(NoopVectorWriter.class); + + @Override + protected void writeVector(float[] vector) { + //No-op + logger.debug(vector); + } + + @Override + public void shutdown() { + shutdown = true; + } +} diff --git a/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java new file mode 100644 index 000000000..7e1da2edb --- /dev/null +++ b/hdf-loader/src/main/java/io/nosqlbench/loader/hdf/writers/VectorWriter.java @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.loader.hdf.writers; + +import java.util.concurrent.LinkedBlockingQueue; + +public interface VectorWriter extends Runnable { + void setQueue(LinkedBlockingQueue queue); + + void shutdown(); +} diff --git a/hdf-loader/src/main/resources/config.yaml b/hdf-loader/src/main/resources/config.yaml new file mode 100644 index 000000000..c3e2338de --- /dev/null +++ b/hdf-loader/src/main/resources/config.yaml @@ -0,0 +1,13 @@ +format: HDF5 +sourceFile: /home/mwolters138/Downloads/h5ex_t_float.h5 #/home/mwolters138/Documents/hdf5/datasets/pass/glove-25-angular.hdf5 +datasets: + - all +embedding: word2vec +writer: filewriter +astra: + scb: /home/mwolters138/Dev/testing/secure-connect-vector-correctness.zip + clientId: IvpdaZejwNuvWeupsIkWTHeL + clientSecret: .bxut2-OQL,dWunZeQbjZC0vMHd88UWXKS.xT,nl95zQC0B0xU9FzSWK3HSUGO11o_7pr7wG7+EMaZqegkKlr4fZ54__furPMtWPGiPp,2cZ1q15vrWwc9_-AcgeCbuf + keyspace: baselines128dot + query: INSERT INTO vectors25(key, value) VALUES (?,?) 
+targetFile: /home/mwolters138/vectors.txt diff --git a/mvn-defaults/pom.xml b/mvn-defaults/pom.xml index b3e24a184..c864e0676 100644 --- a/mvn-defaults/pom.xml +++ b/mvn-defaults/pom.xml @@ -184,12 +184,22 @@ org.xerial.snappy snappy-java - 1.1.10.1 + 1.1.10.3 com.datastax.oss java-driver-query-builder - 4.16.0 + 4.17.0 + + + org.snakeyaml + snakeyaml-engine + 2.6 + + + org.xerial.snappy + snappy-java + 1.1.10.3 com.esri.geometry @@ -199,7 +209,7 @@ io.netty netty-handler - 4.1.94.Final + 4.1.95.Final io.netty @@ -221,7 +231,7 @@ com.github.docker-java docker-java-api - 3.3.1 + 3.3.2 org.slf4j @@ -278,7 +288,7 @@ com.github.docker-java docker-java-transport-okhttp - 3.3.1 + 3.3.2 org.slf4j @@ -289,7 +299,7 @@ com.github.docker-java docker-java - 3.3.1 + 3.3.2 org.slf4j @@ -301,7 +311,7 @@ com.github.oshi oshi-core-java11 - 6.4.3 + 6.4.4 com.google.code.gson @@ -311,7 +321,7 @@ com.amazonaws aws-java-sdk-s3 - 1.12.498 + 1.12.513 com.elega9t @@ -326,7 +336,7 @@ org.apache.commons commons-lang3 - 3.12.0 + 3.13.0 com.squareup @@ -382,7 +392,7 @@ org.graalvm.sdk graal-sdk - 22.3.2 + 22.3.3 org.graalvm.js @@ -393,12 +403,12 @@ org.graalvm.js js-scriptengine - 22.3.2 + 22.3.3 org.graalvm.tools profiler - 22.3.2 + 22.3.3 runtime diff --git a/pom.xml b/pom.xml index a79b86a73..173c73523 100644 --- a/pom.xml +++ b/pom.xml @@ -67,6 +67,7 @@ adapter-kafka adapter-amqp adapter-jdbc + hdf-loader virtdata-api @@ -76,6 +77,7 @@ virtdata-lib-random virtdata-lib-curves4 virtdata-lib-realer + virtdata-lib-hdf5 virtdata-userlibs @@ -114,6 +116,7 @@ adapter-amqp adapter-jdbc adapter-pinecone + hdf-loader virtdata-api @@ -123,6 +126,7 @@ virtdata-lib-random virtdata-lib-curves4 virtdata-lib-realer + virtdata-lib-hdf5 virtdata-userlibs diff --git a/virtdata-lib-basics/pom.xml b/virtdata-lib-basics/pom.xml index 1d069da92..1e67c931f 100644 --- a/virtdata-lib-basics/pom.xml +++ b/virtdata-lib-basics/pom.xml @@ -82,6 +82,7 @@ 5.1.1 test + diff --git a/virtdata-lib-hdf5/pom.xml b/virtdata-lib-hdf5/pom.xml new file mode 100644 index 000000000..6f0ebf6d2 --- /dev/null +++ b/virtdata-lib-hdf5/pom.xml @@ -0,0 +1,62 @@ + + + + 4.0.0 + + + mvn-defaults + io.nosqlbench + ${revision} + ../mvn-defaults + + + virtdata-lib-hdf5 + jar + virtdata-lib-hdf5 + http://nosqlbench.io/ + + With inspiration from other libraries + + + + + io.nosqlbench + virtdata-lib-basics + ${revision} + + + + io.jhdf + jhdf + 0.6.10 + + + + + + + + src/test/resources + + h5ex_t_float.h5 + + true + + + + + diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java new file mode 100644 index 000000000..833fecda1 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVector.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.virtdata.library.hdf5.from_long; + +import io.jhdf.HdfFile; +import io.jhdf.api.Dataset; +import io.nosqlbench.api.content.NBIO; + +import java.nio.file.Paths; + +public abstract class AbstractHdfFileToVector { + protected final HdfFile hdfFile; + protected final Dataset dataset; + protected final int[] dims; + + public AbstractHdfFileToVector(String filename, String datasetName) { + //hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath()); + hdfFile = new HdfFile(Paths.get(filename)); + //TODO: implement a function to get the dataset by name only without needing the full path + dataset = hdfFile.getDatasetByPath(datasetName); + dims = dataset.getDimensions(); + } + + protected Object getDataFrom(long l) { + long[] sliceOffset = new long[dims.length]; + sliceOffset[0] = (l % dims[0]); + int[] sliceDimensions = new int[dims.length]; + sliceDimensions[0] = 1; + // Do we want to give the option of reducing vector dimensions here? + sliceDimensions[1] = dims[1]; + return dataset.getData(sliceOffset, sliceDimensions); + } +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java new file mode 100644 index 000000000..be5408ac9 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToVectorArray.java @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.from_long.to_array; + +import io.nosqlbench.virtdata.api.annotations.Categories; +import io.nosqlbench.virtdata.api.annotations.Category; +import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper; +import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory; + +import java.util.function.LongFunction; + +/** + * This function reads a vector dataset from an HDF5 file. The dataset itself is not + * read into memory, only the metadata (the "dataset" Java Object). The lambda function + * reads a single vector from the dataset, based on the long input value. As currently + * written this class will only work for datasets with 2 dimensions where the 1st dimension + * specifies the number of vectors and the 2nd dimension specifies the number of elements in + * each vector. Only datatypes short, int, and float are supported at this time. + *
+ * This implementation is specific to returning an array of floats + */ +@ThreadSafeMapper +@Categories(Category.experimental) +public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction { + private final EmbeddingGenerator embeddingGenerator; + + public HdfFileToVectorArray(String filename, String datasetName) { + super(filename, datasetName); + embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase()); + } + @Override + public float[] apply(long l) { + Object data = getDataFrom(l); + return embeddingGenerator.generateArrayEmbeddingFrom(data, dims); + } + +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java new file mode 100644 index 000000000..2f018df1f --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorList.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.from_long.to_list; + +import io.nosqlbench.virtdata.api.annotations.Categories; +import io.nosqlbench.virtdata.api.annotations.Category; +import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper; +import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory; + +import java.util.List; +import java.util.function.LongFunction; + +/** + * This function reads a vector dataset from an HDF5 file. The dataset itself is not + * read into memory, only the metadata (the "dataset" Java Object). The lambda function + * reads a single vector from the dataset, based on the long input value. As currently + * written this class will only work for datasets with 2 dimensions where the 1st dimension + * specifies the number of vectors and the 2nd dimension specifies the number of elements in + * each vector. Only datatypes short, int, and float are supported at this time. + *
+ * This implementation is specific to returning a List of Floats, so as to work with the + * normalization functions e.g. NormalizeListVector and its variants. + */ +@ThreadSafeMapper +@Categories(Category.experimental) +public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction> { + private final EmbeddingGenerator embeddingGenerator; + + public HdfFileToVectorList(String filename, String datasetName) { + super(filename, datasetName); + embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase()); + } + @Override + public List apply(long l) { + Object data = getDataFrom(l); + return embeddingGenerator.generateListEmbeddingFrom(data, dims); + } + +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java new file mode 100644 index 000000000..8ebeba3ac --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/DoubleEmbeddingGenerator.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.helpers; + +import java.util.List; + +public class DoubleEmbeddingGenerator implements EmbeddingGenerator { + + @Override + public List generateListEmbeddingFrom(Object o, int[] dims) { + // in this case o will always be double[1][x] + double[] vector = ((double[][]) o)[0]; + Float[] vector2 = new Float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = (float) vector[i]; + } + return List.of(vector2); + } + + @Override + public float[] generateArrayEmbeddingFrom(Object o, int[] dims) { + double[] vector = ((double[][]) o)[0]; + float[] vector2 = new float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = (float) vector[i]; + } + return vector2; + } + +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java new file mode 100644 index 000000000..865d9f1a4 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGenerator.java @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.virtdata.library.hdf5.helpers; + +import java.util.List; + +public interface EmbeddingGenerator { + List generateListEmbeddingFrom(Object o, int[] dims); + + float[] generateArrayEmbeddingFrom(Object o, int[] dims); +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java new file mode 100644 index 000000000..23c2fe578 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/EmbeddingGeneratorFactory.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.helpers; + +import java.util.HashMap; +import java.util.Map; + +public class EmbeddingGeneratorFactory { + private static final Map generators = new HashMap<>(); + + public static EmbeddingGenerator getGenerator(String type) { + String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase(); + switch (typeLower) { + case "float" -> { + if (!generators.containsKey(type)) { + generators.put(type, new FloatEmbeddingGenerator()); + } + return generators.get(type); + } + case "int" -> { + if (!generators.containsKey(type)) { + generators.put(type, new IntEmbeddingGenerator()); + } + return generators.get(type); + } + case "double" -> { + if (!generators.containsKey(type)) { + generators.put(type, new DoubleEmbeddingGenerator()); + } + return generators.get(type); + } + default -> throw new RuntimeException("Unknown embedding type: " + type); + } + } +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java new file mode 100644 index 000000000..d3aa92354 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/FloatEmbeddingGenerator.java @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.virtdata.library.hdf5.helpers; + +import java.util.List; + +public class FloatEmbeddingGenerator implements EmbeddingGenerator { + + @Override + public List generateListEmbeddingFrom(Object o, int[] dims) { + // in this case o will always be float[1][x] + float[] vector = ((float[][]) o)[0]; + Float[] vector2 = new Float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = vector[i]; + } + return List.of(vector2); + } + + @Override + public float[] generateArrayEmbeddingFrom(Object o, int[] dims) { + return ((float[][]) o)[0]; + } + +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java new file mode 100644 index 000000000..769f81482 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/helpers/IntEmbeddingGenerator.java @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.helpers; + +import java.util.List; + +public class IntEmbeddingGenerator implements EmbeddingGenerator { + @Override + public List generateListEmbeddingFrom(Object o, int[] dims) { + // in this case o will always be int[1][x] + int[] vector = ((int[][]) o)[0]; + Float[] vector2 = new Float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = (float) vector[i]; + } + return List.of(vector2); + } + + @Override + public float[] generateArrayEmbeddingFrom(Object o, int[] dims) { + int[] vector = ((int[][]) o)[0]; + float[] vector2 = new float[vector.length]; + for (int i = 0; i < vector.length; i++) { + vector2[i] = (float) vector[i]; + } + return vector2; + } +} diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java new file mode 100644 index 000000000..2d88c2468 --- /dev/null +++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_array/HdfFileToArrayTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.virtdata.library.hdf5.from_long.to_array; + +import org.junit.jupiter.api.Test; + +public class HdfFileToArrayTest { + + @Test + public void testHdfFileToVector() { + final float[][] results = new float[][]{ + {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f}, + {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f}, + {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f}, + {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f} + }; + + HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray( + "src/test/resources/h5ex_t_float.h5", + "/DS1"); + + float[] read; + for (int i = 0; i < 4; i++) { + read = hdfFileToVector.apply(i); + for (int j = 0; j < 7; j++) { + assert (read[j] == results[i][j]); + } + } + } +} diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java new file mode 100644 index 000000000..4ba0f104a --- /dev/null +++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_list/HdfFileToVectorTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.from_long.to_list; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +public class HdfFileToVectorTest { + + @Test + public void testHdfFileToVector() { + final float[][] results = new float[][]{ + {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f}, + {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f}, + {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f}, + {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f} + }; + + HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList( + "src/test/resources/h5ex_t_float.h5", + "/DS1"); + + List read; + for (int i = 0; i < 4; i++) { + read = hdfFileToVector.apply(i); + for (int j = 0; j < 7; j++) { + assert (read.get(j) == results[i][j]); + } + } + } +} diff --git a/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5 b/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5 new file mode 100644 index 000000000..9c8cb981d Binary files /dev/null and b/virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5 differ diff --git a/virtdata-userlibs/pom.xml b/virtdata-userlibs/pom.xml index 09be05153..c5d144b27 100644 --- a/virtdata-userlibs/pom.xml +++ b/virtdata-userlibs/pom.xml @@ -66,6 +66,13 @@ virtdata-lib-curves4 ${revision} + + + io.nosqlbench + virtdata-lib-hdf5 + ${revision} + + io.nosqlbench docsys