From 5017f40927b7eb42990db6c1ff14ba7c27fdcd0b Mon Sep 17 00:00:00 2001 From: MarkWolters Date: Mon, 20 Nov 2023 09:13:04 -0600 Subject: [PATCH] binding for hdf5 dataset to string --- virtdata-lib-hdf5/pom.xml | 1 + .../AbstractHdfFileToVectorType.java | 2 +- .../to_string/HdfDatasetToStrings.java | 50 ++++++++++++++++++ .../to_string/HdfDatasetToStringsTest.java | 45 ++++++++++++++++ .../src/test/resources/hdf5_test_strings.h5 | Bin 0 -> 6208 bytes 5 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStrings.java create mode 100644 virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStringsTest.java create mode 100644 virtdata-lib-hdf5/src/test/resources/hdf5_test_strings.h5 diff --git a/virtdata-lib-hdf5/pom.xml b/virtdata-lib-hdf5/pom.xml index b15119c65..689978460 100644 --- a/virtdata-lib-hdf5/pom.xml +++ b/virtdata-lib-hdf5/pom.xml @@ -53,6 +53,7 @@ src/test/resources h5ex_t_float.h5 + hdf5_test_strings.h5 **/*.ivec **/*.fvec diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVectorType.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVectorType.java index cab669196..88e94af08 100644 --- a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVectorType.java +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/AbstractHdfFileToVectorType.java @@ -38,7 +38,7 @@ public abstract class AbstractHdfFileToVectorType { int[] sliceDimensions = new int[dims.length]; sliceDimensions[0] = 1; // Do we want to give the option of reducing vector dimensions here? - sliceDimensions[1] = dims[1]; + sliceDimensions[1] = dims.length > 1 ? 
dims[1] : 1; return dataset.getData(sliceOffset, sliceDimensions); } } diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStrings.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStrings.java new file mode 100644 index 000000000..0e2e7edf4 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStrings.java @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.hdf5.from_long.to_string; + +import io.nosqlbench.virtdata.api.annotations.Categories; +import io.nosqlbench.virtdata.api.annotations.Category; +import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper; +import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator; +import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory; + +import java.util.function.LongFunction; + +/** + * This function reads a dataset from an HDF5 file. The dataset itself is not + * read into memory, only the metadata (the "dataset" Java Object). The lambda function + * reads a single string from the dataset, based on the long input value. 
+ */
+@ThreadSafeMapper
+@Categories(Category.experimental)
+public class HdfDatasetToStrings extends AbstractHdfFileToVectorType implements LongFunction<String> {
+
+    /**
+     * Creates a binding over one dataset of an HDF5 file.
+     *
+     * @param filename    path of the HDF5 file; handed unchanged to the parent class
+     * @param datasetName name of the dataset inside that file; handed unchanged to the parent class
+     */
+    public HdfDatasetToStrings(String filename, String datasetName) {
+        super(filename, datasetName);
+    }
+
+    /**
+     * Reads the string stored at row {@code l % dims[0]} of the dataset.
+     *
+     * <p>The slice requested from the dataset is one row wide in the first dimension and
+     * spans every remaining dimension in full. The value handed back by the dataset is
+     * unwrapped, taking element 0 at each array level, until a non-array value is reached.
+     *
+     * @param l the input cycle value; it is wrapped onto the first dimension with {@code %}
+     * @return the first element of the selected slice, or {@code null} if the dataset returned null
+     * @throws IllegalStateException if the dataset returns an empty array for the slice
+     * @throws ClassCastException    if the unwrapped element is not a {@link String}
+     */
+    @Override
+    public String apply(long l) {
+        long[] sliceOffset = new long[dims.length];
+        sliceOffset[0] = (l % dims[0]);
+        int[] sliceDimensions = new int[dims.length];
+        sliceDimensions[0] = 1;
+        // Copy only the dimensions that exist. Writing index 1 unconditionally runs past
+        // the end of the array when the dataset is one-dimensional.
+        for (int i = 1; i < dims.length; i++) {
+            sliceDimensions[i] = dims[i];
+        }
+        Object slice = dataset.getData(sliceOffset, sliceDimensions);
+        // NOTE(review): a sliced read is expected to come back as an array shaped like
+        // sliceDimensions rather than as a bare String, so a direct cast would fail.
+        // Peel the array levels off instead of casting the container.
+        while (slice != null && slice.getClass().isArray()) {
+            if (java.lang.reflect.Array.getLength(slice) == 0) {
+                throw new IllegalStateException("Empty slice returned for input " + l);
+            }
+            slice = java.lang.reflect.Array.get(slice, 0);
+        }
+        return (String) slice;
+    }
+
+}
diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStringsTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStringsTest.java
new file mode 100644
index 000000000..a79b0043a
--- /dev/null
+++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/hdf5/from_long/to_string/HdfDatasetToStringsTest.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2023 nosqlbench
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
+
+import io.nosqlbench.virtdata.library.hdf5.from_long.to_list.HdfFileToFloatList;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Checks that {@link HdfDatasetToStrings} returns the expected string for each
+ * index of the {@code /strings} dataset in the bundled HDF5 fixture.
+ */
+public class HdfDatasetToStringsTest {
+
+    /**
+     * Reads every entry of the fixture's {@code /strings} dataset through the binding
+     * and compares it with the expected value at the same index.
+     */
+    @Test
+    public void testHdfFileToVector() {
+        final String[] results = new String[]{
+            "String 1",
+            "String 2",
+            "String 3",
+            "String 4"
+        };
+
+        // Use the fixture committed under src/test/resources rather than a path that
+        // exists only on one developer's machine.
+        HdfDatasetToStrings hdfFileToVector = new HdfDatasetToStrings(
+            "src/test/resources/hdf5_test_strings.h5",
+            "/strings");
+
+        // A real assertion instead of the assert keyword, which is skipped when the
+        // JVM runs without -ea and reports nothing useful on failure.
+        for (int i = 0; i < results.length; i++) {
+            assertEquals(results[i], hdfFileToVector.apply(i), "unexpected string at index " + i);
+        }
+    }
+}
diff --git a/virtdata-lib-hdf5/src/test/resources/hdf5_test_strings.h5 b/virtdata-lib-hdf5/src/test/resources/hdf5_test_strings.h5
new file mode 100644
index 0000000000000000000000000000000000000000..c14af7336fb33cdbc9d33e4d8ae780680ae025dc
GIT binary patch
literal 6208
zcmeH{!A=4}42C93w;6uQT$M8{nE1gdNlFc4T
zOi27ocG_u&?(CPg+h%%yGPoLuN{rHzvuyhE1F0^|<9m-UVFc(6^jBd<5dDeZ?{bd(
z-E%MVx|Vd(^Seem8THJH?x6lx1e*HxSwHv{U+q=;jXHA}>(ABu$J;vBocbCM9H@iz
zEf)$rppru{2g)+_jY*l}8o5kyy`w&M(kizvG4G4Q+!6W8kOEmMQ~PzZw~oPBJk6%s
z(YUM(%VW3|??3`1a0mhI;|k5JbE>`9(c0c?tGD+$^B(SIk9odfAlt07b15u~JdcNC
zAJcf1`>%a`bo@6yPdomJ=2Y38n&wIZBtQZrKmsH{0wh2JBtQZrKmsK2_XK_c;cq=>

literal 0
HcmV?d00001