mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-01-11 08:22:04 -06:00
adding binding function for hdf5 files
This commit is contained in:
parent
0ecf2252b5
commit
7109c4d09f
@ -82,6 +82,17 @@
|
||||
<version>5.1.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<!-- HDF5 reader used by the HdfFileToVector binding function.
     Declared once: repeating the same groupId:artifactId in one POM is
     reported by Maven as a duplicate declaration. "compile" is the default
     scope, so it does not need to be spelled out. -->
<dependency>
    <groupId>io.jhdf</groupId>
    <artifactId>jhdf</artifactId>
    <version>0.6.10</version>
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding.EmbeddingGenerator;
|
||||
import io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding.EmbeddingGeneratorFactory;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value. As currently
|
||||
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfFileToVector implements LongFunction<List<Float>> {
|
||||
private final HdfFile hdfFile;
|
||||
private final Dataset dataset;
|
||||
private final int[] dims;
|
||||
private final EmbeddingGenerator embeddingGenerator;
|
||||
|
||||
public HdfFileToVector(String filename, String datasetName) {
|
||||
hdfFile = new HdfFile(Paths.get(filename));
|
||||
//TODO: implement a function to get the dataset by name only without needing the full path
|
||||
dataset = hdfFile.getDatasetByPath(datasetName);
|
||||
dims = dataset.getDimensions();
|
||||
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||
}
|
||||
@Override
|
||||
public List<Float> apply(long l) {
|
||||
long[] sliceOffset = new long[dims.length];
|
||||
sliceOffset[0] = (l % dims[0]);
|
||||
int[] sliceDimensions = new int[dims.length];
|
||||
sliceDimensions[0] = 1;
|
||||
// Do we want to give the option of reducing vector dimensions here?
|
||||
sliceDimensions[1] = dims[1];
|
||||
Object data = dataset.getData(sliceOffset, sliceDimensions);
|
||||
|
||||
return embeddingGenerator.generateEmbeddingFrom(data, dims);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Converts a raw block of dataset values into a vector of floats.
 *
 * <p>The interface has a single abstract method, so it can be implemented with a lambda.
 */
@FunctionalInterface
public interface EmbeddingGenerator {
    /**
     * Builds a float vector from the supplied raw data.
     *
     * @param o    the raw data to convert; the concrete array type expected depends on the
     *             implementation
     * @param dims the dimensions of the dataset the data was read from
     * @return the data as a list of floats
     */
    List<Float> generateEmbeddingFrom(Object o, int[] dims);
}
|
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class EmbeddingGeneratorFactory {
|
||||
private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
|
||||
|
||||
public static EmbeddingGenerator getGenerator(String type) {
|
||||
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||
switch (typeLower) {
|
||||
case "float" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new FloatEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "int" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new IntEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public List<Float> generateEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be float[1][x]
|
||||
float[] vector = ((float[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.basics.shared.from_long.to_vector.embedding;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||
@Override
|
||||
public List<Float> generateEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be int[1][x]
|
||||
int[] vector = ((int[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user