mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
merging main into vector-mergup
This commit is contained in:
62
virtdata-lib-hdf5/pom.xml
Normal file
62
virtdata-lib-hdf5/pom.xml
Normal file
@@ -0,0 +1,62 @@
|
||||
<!--
|
||||
~ Copyright (c) 2023 nosqlbench
|
||||
~
|
||||
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
~ you may not use this file except in compliance with the License.
|
||||
~ You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Parent POM: the sibling mvn-defaults module (located via relativePath). -->
    <parent>
        <artifactId>mvn-defaults</artifactId>
        <groupId>io.nosqlbench</groupId>
        <version>${revision}</version>
        <relativePath>../mvn-defaults</relativePath>
    </parent>

    <artifactId>virtdata-lib-hdf5</artifactId>
    <packaging>jar</packaging>
    <name>virtdata-lib-hdf5</name>
    <url>http://nosqlbench.io/</url>

    <description>With inspiration from other libraries</description>

    <dependencies>

        <!-- Sibling virtdata module, versioned together with this one. -->
        <dependency>
            <groupId>io.nosqlbench</groupId>
            <artifactId>virtdata-lib-basics</artifactId>
            <version>${revision}</version>
        </dependency>

        <!-- HDF5 reader library (supplies the io.jhdf classes used by this module). -->
        <dependency>
            <groupId>io.jhdf</groupId>
            <artifactId>jhdf</artifactId>
            <version>0.6.10</version>
        </dependency>

    </dependencies>

    <build>
        <testResources>
            <!--
              Test resources are filtered, but the binary HDF5 fixture is excluded from
              this resource set. NOTE(review): presumably this keeps resource filtering
              from touching the binary file; the tests open it directly through its
              src/test/resources path.
            -->
            <testResource>
                <directory>src/test/resources</directory>
                <excludes>
                    <exclude>h5ex_t_float.h5</exclude>
                </excludes>
                <filtering>true</filtering>
            </testResource>
        </testResources>
    </build>

</project>
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public abstract class AbstractHdfFileToVector {
|
||||
protected final HdfFile hdfFile;
|
||||
protected final Dataset dataset;
|
||||
protected final int[] dims;
|
||||
|
||||
public AbstractHdfFileToVector(String filename, String datasetName) {
|
||||
//hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath());
|
||||
hdfFile = new HdfFile(Paths.get(filename));
|
||||
//TODO: implement a function to get the dataset by name only without needing the full path
|
||||
dataset = hdfFile.getDatasetByPath(datasetName);
|
||||
dims = dataset.getDimensions();
|
||||
}
|
||||
|
||||
protected Object getDataFrom(long l) {
|
||||
long[] sliceOffset = new long[dims.length];
|
||||
sliceOffset[0] = (l % dims[0]);
|
||||
int[] sliceDimensions = new int[dims.length];
|
||||
sliceDimensions[0] = 1;
|
||||
// Do we want to give the option of reducing vector dimensions here?
|
||||
sliceDimensions[1] = dims[1];
|
||||
return dataset.getData(sliceOffset, sliceDimensions);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value. As currently
|
||||
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||
* <p>
|
||||
* This implementation is specific to returning an array of floats
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {
|
||||
private final EmbeddingGenerator embeddingGenerator;
|
||||
|
||||
public HdfFileToVectorArray(String filename, String datasetName) {
|
||||
super(filename, datasetName);
|
||||
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||
}
|
||||
@Override
|
||||
public float[] apply(long l) {
|
||||
Object data = getDataFrom(l);
|
||||
return embeddingGenerator.generateArrayEmbeddingFrom(data, dims);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,55 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value. As currently
|
||||
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||
* <p>
|
||||
* This implementation is specific to returning a List of Floats, so as to work with the
|
||||
* normalization functions e.g. NormalizeListVector and its variants.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {
|
||||
private final EmbeddingGenerator embeddingGenerator;
|
||||
|
||||
public HdfFileToVectorList(String filename, String datasetName) {
|
||||
super(filename, datasetName);
|
||||
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||
}
|
||||
@Override
|
||||
public List<Float> apply(long l) {
|
||||
Object data = getDataFrom(l);
|
||||
return embeddingGenerator.generateListEmbeddingFrom(data, dims);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be double[1][x]
|
||||
double[] vector = ((double[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
double[] vector = ((double[][]) o)[0];
|
||||
float[] vector2 = new float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Converts the raw data read for one vector into a float-based embedding, either as a
 * list of boxed Floats or as a primitive float array.
 */
public interface EmbeddingGenerator {

    /**
     * Converts the given data into a list of Floats.
     *
     * @param o    the raw data for one vector; the concrete array type expected is
     *             defined by each implementation
     * @param dims the dimensions supplied by the caller alongside the data
     * @return the vector's elements as Floats
     */
    List<Float> generateListEmbeddingFrom(Object o, int[] dims);

    /**
     * Converts the given data into a primitive float array.
     *
     * @param o    the raw data for one vector; the concrete array type expected is
     *             defined by each implementation
     * @param dims the dimensions supplied by the caller alongside the data
     * @return the vector's elements as floats
     */
    float[] generateArrayEmbeddingFrom(Object o, int[] dims);

}
|
||||
@@ -0,0 +1,49 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class EmbeddingGeneratorFactory {
|
||||
private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
|
||||
|
||||
public static EmbeddingGenerator getGenerator(String type) {
|
||||
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||
switch (typeLower) {
|
||||
case "float" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new FloatEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "int" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new IntEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "double" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new DoubleEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be float[1][x]
|
||||
float[] vector = ((float[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
return ((float[][]) o)[0];
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be int[1][x]
|
||||
int[] vector = ((int[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
int[] vector = ((int[][]) o)[0];
|
||||
float[] vector2 = new float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class HdfFileToArrayTest {
|
||||
|
||||
@Test
|
||||
public void testHdfFileToVector() {
|
||||
final float[][] results = new float[][]{
|
||||
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||
};
|
||||
|
||||
HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray(
|
||||
"src/test/resources/h5ex_t_float.h5",
|
||||
"/DS1");
|
||||
|
||||
float[] read;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
read = hdfFileToVector.apply(i);
|
||||
for (int j = 0; j < 7; j++) {
|
||||
assert (read[j] == results[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class HdfFileToVectorTest {
|
||||
|
||||
@Test
|
||||
public void testHdfFileToVector() {
|
||||
final float[][] results = new float[][]{
|
||||
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||
};
|
||||
|
||||
HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList(
|
||||
"src/test/resources/h5ex_t_float.h5",
|
||||
"/DS1");
|
||||
|
||||
List<Float> read;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
read = hdfFileToVector.apply(i);
|
||||
for (int j = 0; j < 7; j++) {
|
||||
assert (read.get(j) == results[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
Binary file not shown.
Reference in New Issue
Block a user