updating hdf5 bindings

This commit is contained in:
Mark Wolters 2023-08-18 12:40:54 -04:00
parent 81f004c845
commit d0e7c3dd07
13 changed files with 364 additions and 28 deletions

View File

@ -18,16 +18,15 @@ package io.nosqlbench.virtdata.library.hdf5.from_long;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import java.nio.file.Paths;
public abstract class AbstractHdfFileToVector {
public abstract class AbstractHdfFileToVectorType {
protected final HdfFile hdfFile;
protected final Dataset dataset;
protected final int[] dims;
public AbstractHdfFileToVector(String filename, String datasetName) {
public AbstractHdfFileToVectorType(String filename, String datasetName) {
//hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath());
hdfFile = new HdfFile(Paths.get(filename));
//TODO: implement a function to get the dataset by name only without needing the full path

View File

@ -19,7 +19,7 @@ package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
@ -37,17 +37,17 @@ import java.util.function.LongFunction;
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {
public class HdfFileToFloatArray extends AbstractHdfFileToVectorType implements LongFunction<float[]> {
private final EmbeddingGenerator embeddingGenerator;
public HdfFileToVectorArray(String filename, String datasetName) {
public HdfFileToFloatArray(String filename, String datasetName) {
super(filename, datasetName);
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
}
@Override
public float[] apply(long l) {
Object data = getDataFrom(l);
return embeddingGenerator.generateArrayEmbeddingFrom(data, dims);
return embeddingGenerator.generateFloatArrayEmbeddingFrom(data, dims);
}
}

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.function.LongFunction;
/**
 * Maps a long input value to one vector of an HDF5 dataset, returned as an {@code int[]}.
 * <p>
 * The dataset itself is not read into memory, only the metadata (the "dataset" Java object).
 * Each call reads a single vector from the dataset, based on the long input value. As currently
 * written this class will only work for datasets with 2 dimensions where the 1st dimension
 * specifies the number of vectors and the 2nd dimension specifies the number of elements in
 * each vector. Only datatypes short, int, and float are supported at this time.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToIntArray extends AbstractHdfFileToVectorType implements LongFunction<int[]> {
    /** Converter selected once, at construction time, from the dataset's Java element type. */
    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    forwarded to the superclass constructor, which opens the HDF5 file
     * @param datasetName forwarded to the superclass constructor
     */
    public HdfFileToIntArray(String filename, String datasetName) {
        super(filename, datasetName);
        // The generator factory is keyed by the lower-cased simple name of the dataset's Java
        // type. Lower-case with Locale.ROOT so the key never depends on the JVM default locale
        // (under a Turkish locale, for example, "I" lower-cases to a dotless "ı").
        final String typeKey = dataset.getJavaType().getSimpleName().toLowerCase(java.util.Locale.ROOT);
        embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(typeKey);
    }

    /**
     * Reads the data selected by the input value and converts it to an int array.
     *
     * @param l the input value passed to {@code getDataFrom}
     * @return the generator's int-array conversion of the data that was read
     */
    @Override
    public int[] apply(long l) {
        final Object data = getDataFrom(l);
        return embeddingGenerator.generateIntArrayEmbeddingFrom(data, dims);
    }
}

View File

@ -0,0 +1,53 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.function.LongFunction;
/**
 * Maps a long input value to one vector of an HDF5 dataset, returned as a {@code long[]}.
 * <p>
 * The dataset itself is not read into memory, only the metadata (the "dataset" Java object).
 * Each call reads a single vector from the dataset, based on the long input value. As currently
 * written this class will only work for datasets with 2 dimensions where the 1st dimension
 * specifies the number of vectors and the 2nd dimension specifies the number of elements in
 * each vector. Only datatypes short, int, and float are supported at this time.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToLongArray extends AbstractHdfFileToVectorType implements LongFunction<long[]> {
    /** Converter selected once, at construction time, from the dataset's Java element type. */
    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    forwarded to the superclass constructor, which opens the HDF5 file
     * @param datasetName forwarded to the superclass constructor
     */
    public HdfFileToLongArray(String filename, String datasetName) {
        super(filename, datasetName);
        // The generator factory is keyed by the lower-cased simple name of the dataset's Java
        // type. Lower-case with Locale.ROOT so the key never depends on the JVM default locale
        // (under a Turkish locale, for example, "I" lower-cases to a dotless "ı").
        final String typeKey = dataset.getJavaType().getSimpleName().toLowerCase(java.util.Locale.ROOT);
        embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(typeKey);
    }

    /**
     * Reads the data selected by the input value and converts it to a long array.
     *
     * @param l the input value passed to {@code getDataFrom}
     * @return the generator's long-array conversion of the data that was read
     */
    @Override
    public long[] apply(long l) {
        final Object data = getDataFrom(l);
        return embeddingGenerator.generateLongArrayEmbeddingFrom(data, dims);
    }
}

View File

@ -19,7 +19,7 @@ package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
@ -39,17 +39,17 @@ import java.util.function.LongFunction;
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {
public class HdfFileToFloatList extends AbstractHdfFileToVectorType implements LongFunction<List<Float>> {
private final EmbeddingGenerator embeddingGenerator;
public HdfFileToVectorList(String filename, String datasetName) {
public HdfFileToFloatList(String filename, String datasetName) {
super(filename, datasetName);
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
}
@Override
public List<Float> apply(long l) {
Object data = getDataFrom(l);
return embeddingGenerator.generateListEmbeddingFrom(data, dims);
return embeddingGenerator.generateFloatListEmbeddingFrom(data, dims);
}
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.List;
import java.util.function.LongFunction;
/**
 * Maps a long input value to one vector of an HDF5 dataset, returned as a
 * {@code List<Integer>}.
 * <p>
 * The dataset itself is not read into memory, only the metadata (the "dataset" Java object).
 * Each call reads a single vector from the dataset, based on the long input value. As currently
 * written this class will only work for datasets with 2 dimensions where the 1st dimension
 * specifies the number of vectors and the 2nd dimension specifies the number of elements in
 * each vector. Only datatypes short, int, and float are supported at this time.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToIntList extends AbstractHdfFileToVectorType implements LongFunction<List<Integer>> {
    /** Converter selected once, at construction time, from the dataset's Java element type. */
    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    forwarded to the superclass constructor, which opens the HDF5 file
     * @param datasetName forwarded to the superclass constructor
     */
    public HdfFileToIntList(String filename, String datasetName) {
        super(filename, datasetName);
        // The generator factory is keyed by the lower-cased simple name of the dataset's Java
        // type. Lower-case with Locale.ROOT so the key never depends on the JVM default locale
        // (under a Turkish locale, for example, "I" lower-cases to a dotless "ı").
        final String typeKey = dataset.getJavaType().getSimpleName().toLowerCase(java.util.Locale.ROOT);
        embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(typeKey);
    }

    /**
     * Reads the data selected by the input value and converts it to a list of Integers.
     *
     * @param l the input value passed to {@code getDataFrom}
     * @return the generator's Integer-list conversion of the data that was read
     */
    @Override
    public List<Integer> apply(long l) {
        final Object data = getDataFrom(l);
        return embeddingGenerator.generateIntListEmbeddingFrom(data, dims);
    }
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.List;
import java.util.function.LongFunction;
/**
 * Maps a long input value to one vector of an HDF5 dataset, returned as a {@code List<Long>}.
 * <p>
 * The dataset itself is not read into memory, only the metadata (the "dataset" Java object).
 * Each call reads a single vector from the dataset, based on the long input value. As currently
 * written this class will only work for datasets with 2 dimensions where the 1st dimension
 * specifies the number of vectors and the 2nd dimension specifies the number of elements in
 * each vector. Only datatypes short, int, and float are supported at this time.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToLongList extends AbstractHdfFileToVectorType implements LongFunction<List<Long>> {
    /** Converter selected once, at construction time, from the dataset's Java element type. */
    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    forwarded to the superclass constructor, which opens the HDF5 file
     * @param datasetName forwarded to the superclass constructor
     */
    public HdfFileToLongList(String filename, String datasetName) {
        super(filename, datasetName);
        // The generator factory is keyed by the lower-cased simple name of the dataset's Java
        // type. Lower-case with Locale.ROOT so the key never depends on the JVM default locale
        // (under a Turkish locale, for example, "I" lower-cases to a dotless "ı").
        final String typeKey = dataset.getJavaType().getSimpleName().toLowerCase(java.util.Locale.ROOT);
        embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(typeKey);
    }

    /**
     * Reads the data selected by the input value and converts it to a list of Longs.
     *
     * @param l the input value passed to {@code getDataFrom}
     * @return the generator's Long-list conversion of the data that was read
     */
    @Override
    public List<Long> apply(long l) {
        final Object data = getDataFrom(l);
        return embeddingGenerator.generateLongListEmbeddingFrom(data, dims);
    }
}

View File

@ -21,18 +21,18 @@ import java.util.List;
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
@Override
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be double[1][x]
double[] vector = ((double[][]) o)[0];
Float[] vector2 = new Float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return List.of(vector2);
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be double[1][x]
double[] vector = ((double[][]) o)[0];
Float[] vector2 = new Float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return List.of(vector2);
}
@Override
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
double[] vector = ((double[][]) o)[0];
float[] vector2 = new float[vector.length];
for (int i = 0; i < vector.length; i++) {
@ -41,4 +41,44 @@ public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
return vector2;
}
/**
 * Converts the first row of a {@code double[][]} into an unmodifiable list of Longs.
 * Each element is converted with a {@code (long)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code double[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the converted values, in the same order as the source row
 */
@Override
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
    final double[] row = ((double[][]) o)[0];
    final Long[] boxed = new Long[row.length];
    int idx = 0;
    for (double value : row) {
        boxed[idx++] = (long) value;
    }
    return List.of(boxed);
}
/**
 * Converts the first row of a {@code double[][]} into a new {@code long[]}.
 * Each element is converted with a {@code (long)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code double[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return a newly allocated array with the converted values
 */
@Override
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
    final double[] row = ((double[][]) o)[0];
    final long[] converted = new long[row.length];
    int idx = 0;
    for (double value : row) {
        converted[idx++] = (long) value;
    }
    return converted;
}
/**
 * Converts the first row of a {@code double[][]} into an unmodifiable list of Integers.
 * Each element is converted with an {@code (int)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code double[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the converted values, in the same order as the source row
 */
@Override
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
    final double[] row = ((double[][]) o)[0];
    final Integer[] boxed = new Integer[row.length];
    int idx = 0;
    for (double value : row) {
        boxed[idx++] = (int) value;
    }
    return List.of(boxed);
}
/**
 * Converts the first row of a {@code double[][]} into a new {@code int[]}.
 * Each element is converted with an {@code (int)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code double[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return a newly allocated array with the converted values
 */
@Override
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
    final double[] row = ((double[][]) o)[0];
    final int[] converted = new int[row.length];
    int idx = 0;
    for (double value : row) {
        converted[idx++] = (int) value;
    }
    return converted;
}
}

View File

@ -19,7 +19,15 @@ package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
public interface EmbeddingGenerator {
List<Float> generateListEmbeddingFrom(Object o, int[] dims);
List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims);
float[] generateArrayEmbeddingFrom(Object o, int[] dims);
float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims);
List<Long> generateLongListEmbeddingFrom(Object data, int[] dims);
long[] generateLongArrayEmbeddingFrom(Object data, int[] dims);
List<Integer> generateIntListEmbeddingFrom(Object data, int[] dims);
int[] generateIntArrayEmbeddingFrom(Object data, int[] dims);
}

View File

@ -21,7 +21,7 @@ import java.util.List;
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
@Override
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be float[1][x]
float[] vector = ((float[][]) o)[0];
Float[] vector2 = new Float[vector.length];
@ -32,8 +32,48 @@ public class FloatEmbeddingGenerator implements EmbeddingGenerator {
}
@Override
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
return ((float[][]) o)[0];
}
/**
 * Converts the first row of a {@code float[][]} into an unmodifiable list of Longs.
 * Each element is converted with a {@code (long)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code float[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the converted values, in the same order as the source row
 */
@Override
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
    final float[] row = ((float[][]) o)[0];
    final Long[] boxed = new Long[row.length];
    int idx = 0;
    for (float value : row) {
        boxed[idx++] = (long) value;
    }
    return List.of(boxed);
}
/**
 * Converts the first row of a {@code float[][]} into a new {@code long[]}.
 * Each element is converted with a {@code (long)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code float[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return a newly allocated array with the converted values
 */
@Override
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
    final float[] row = ((float[][]) o)[0];
    final long[] converted = new long[row.length];
    int idx = 0;
    for (float value : row) {
        converted[idx++] = (long) value;
    }
    return converted;
}
/**
 * Converts the first row of a {@code float[][]} into an unmodifiable list of Integers.
 * Each element is converted with an {@code (int)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code float[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the converted values, in the same order as the source row
 */
@Override
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
    final float[] row = ((float[][]) o)[0];
    final Integer[] boxed = new Integer[row.length];
    int idx = 0;
    for (float value : row) {
        boxed[idx++] = (int) value;
    }
    return List.of(boxed);
}
/**
 * Converts the first row of a {@code float[][]} into a new {@code int[]}.
 * Each element is converted with an {@code (int)} narrowing cast.
 *
 * @param o    the data to convert; cast to {@code float[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return a newly allocated array with the converted values
 */
@Override
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
    final float[] row = ((float[][]) o)[0];
    final int[] converted = new int[row.length];
    int idx = 0;
    for (float value : row) {
        converted[idx++] = (int) value;
    }
    return converted;
}
}

View File

@ -20,7 +20,7 @@ import java.util.List;
public class IntEmbeddingGenerator implements EmbeddingGenerator {
@Override
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be int[1][x]
int[] vector = ((int[][]) o)[0];
Float[] vector2 = new Float[vector.length];
@ -31,7 +31,7 @@ public class IntEmbeddingGenerator implements EmbeddingGenerator {
}
@Override
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
int[] vector = ((int[][]) o)[0];
float[] vector2 = new float[vector.length];
for (int i = 0; i < vector.length; i++) {
@ -39,4 +39,39 @@ public class IntEmbeddingGenerator implements EmbeddingGenerator {
}
return vector2;
}
/**
 * Converts the first row of an {@code int[][]} into an unmodifiable list of Longs.
 * Each element is widened from {@code int} to {@code long}.
 *
 * @param o    the data to convert; cast to {@code int[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the widened values, in the same order as the source row
 */
@Override
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
    final int[] row = ((int[][]) o)[0];
    final Long[] boxed = new Long[row.length];
    int idx = 0;
    for (int value : row) {
        boxed[idx++] = (long) value;
    }
    return List.of(boxed);
}
/**
 * Converts the first row of an {@code int[][]} into a new {@code long[]}.
 * Each element is widened from {@code int} to {@code long}.
 *
 * @param o    the data to convert; cast to {@code int[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return a newly allocated array with the widened values
 */
@Override
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
    final int[] row = ((int[][]) o)[0];
    final long[] widened = new long[row.length];
    int idx = 0;
    for (int value : row) {
        widened[idx++] = (long) value;
    }
    return widened;
}
/**
 * Boxes the first row of an {@code int[][]} into an unmodifiable list of Integers.
 *
 * @param o    the data to convert; cast to {@code int[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return the boxed values, in the same order as the source row
 */
@Override
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
    final int[] row = ((int[][]) o)[0];
    final Integer[] boxed = new Integer[row.length];
    int idx = 0;
    for (int value : row) {
        boxed[idx++] = value;
    }
    return List.of(boxed);
}
/**
 * Returns the first row of an {@code int[][]} as-is. No copy is made, so the returned
 * array is the same object held by the input.
 *
 * @param o    the data to read; cast to {@code int[][]}, only row 0 is used
 * @param dims not used by this implementation
 * @return row 0 of the input
 */
@Override
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
    final int[][] rows = (int[][]) o;
    final int[] firstRow = rows[0];
    return firstRow;
}
}

View File

@ -29,7 +29,7 @@ public class HdfFileToArrayTest {
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
};
HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray(
HdfFileToFloatArray hdfFileToVector = new HdfFileToFloatArray(
"src/test/resources/h5ex_t_float.h5",
"/DS1");

View File

@ -31,7 +31,7 @@ public class HdfFileToVectorTest {
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
};
HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList(
HdfFileToFloatList hdfFileToVector = new HdfFileToFloatList(
"src/test/resources/h5ex_t_float.h5",
"/DS1");