merging main into vector-mergup

This commit is contained in:
Jonathan Shook
2023-08-16 16:19:49 -05:00
38 changed files with 1526 additions and 20 deletions

62
virtdata-lib-hdf5/pom.xml Normal file
View File

@@ -0,0 +1,62 @@
<!--
~ Copyright (c) 2023 nosqlbench
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Build defaults are inherited from the sibling mvn-defaults module. -->
<parent>
<artifactId>mvn-defaults</artifactId>
<groupId>io.nosqlbench</groupId>
<version>${revision}</version>
<relativePath>../mvn-defaults</relativePath>
</parent>
<artifactId>virtdata-lib-hdf5</artifactId>
<packaging>jar</packaging>
<name>virtdata-lib-hdf5</name>
<url>http://nosqlbench.io/</url>
<description>With inspiration from other libraries</description>
<dependencies>
<!-- Versioned in lock-step with this module through the shared revision property. -->
<dependency>
<groupId>io.nosqlbench</groupId>
<artifactId>virtdata-lib-basics</artifactId>
<version>${revision}</version>
</dependency>
<!-- HDF5 reader library; supplies the io.jhdf classes imported by the binding functions. -->
<dependency>
<groupId>io.jhdf</groupId>
<artifactId>jhdf</artifactId>
<version>0.6.10</version>
</dependency>
</dependencies>
<build>
<testResources>
<!--
Test resources are copied with property filtering enabled. The binary HDF5
fixture is excluded from this resource set; the unit tests open it through its
source path (src/test/resources/h5ex_t_float.h5) instead of the classpath.
NOTE(review): presumably excluded so that filtering cannot alter the binary
file; confirm.
-->
<testResource>
<directory>src/test/resources</directory>
<excludes>
<exclude>h5ex_t_float.h5</exclude>
</excludes>
<filtering>true</filtering>
</testResource>
</testResources>
</build>
</project>

View File

@@ -0,0 +1,47 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import java.nio.file.Paths;
/**
 * Base class for mapping functions that read a single row ("vector") at a time from a
 * two-dimensional dataset stored in an HDF5 file.
 * <p>
 * The constructor opens the file, looks the dataset up by its full path and records its
 * dimensions. Rows are then read on demand by {@link #getDataFrom(long)}.
 */
public abstract class AbstractHdfFileToVector {
    protected final HdfFile hdfFile;
    protected final Dataset dataset;
    protected final int[] dims;

    /**
     * Opens the HDF5 file and resolves the dataset to read vectors from.
     *
     * @param filename    path of the HDF5 file on the local filesystem
     * @param datasetName full path of the dataset inside the file, e.g. {@code /DS1}
     * @throws IllegalArgumentException if the dataset is not two-dimensional or has no rows
     */
    public AbstractHdfFileToVector(String filename, String datasetName) {
        // NOTE: the file is opened straight from the local filesystem; an NBIO based lookup
        // was sketched here earlier but is not enabled.
        hdfFile = new HdfFile(Paths.get(filename));
        //TODO: implement a function to get the dataset by name only without needing the full path
        dataset = hdfFile.getDatasetByPath(datasetName);
        dims = dataset.getDimensions();
        // getDataFrom indexes dims[0] and dims[1] and divides by dims[0], so reject any other
        // shape here instead of failing later on the first read.
        if (dims.length != 2 || dims[0] < 1) {
            hdfFile.close();
            throw new IllegalArgumentException(
                "Dataset '" + datasetName + "' in '" + filename
                    + "' must be two-dimensional with at least one row, but has dimensions "
                    + java.util.Arrays.toString(dims));
        }
    }

    /**
     * Reads one row of the dataset.
     *
     * @param l input value; it is wrapped onto the valid row range {@code [0, dims[0])},
     *          including for negative values
     * @return the slice returned by the dataset for a {@code 1 x dims[1]} window starting at
     *         the selected row
     */
    protected Object getDataFrom(long l) {
        long[] sliceOffset = new long[dims.length];
        // floorMod (unlike %) never yields a negative row index for negative inputs.
        sliceOffset[0] = Math.floorMod(l, dims[0]);
        int[] sliceDimensions = new int[dims.length];
        sliceDimensions[0] = 1;
        // Do we want to give the option of reducing vector dimensions here?
        sliceDimensions[1] = dims[1];
        return dataset.getData(sliceOffset, sliceDimensions);
    }
}

View File

@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.function.LongFunction;
/**
 * Binding function that maps a long input to one vector of an HDF5 dataset, returned as
 * a {@code float[]}.
 * <p>
 * Construction only opens the file and resolves the dataset handle and its dimensions;
 * each call to {@link #apply(long)} then reads a single row. The dataset is expected to be
 * two-dimensional: the first dimension counts the vectors and the second one the elements
 * of each vector. The conversion to floats is delegated to the {@link EmbeddingGenerator}
 * that {@link EmbeddingGeneratorFactory} selects for the lower-cased simple name of the
 * dataset's Java element type.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {

    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    path of the HDF5 file
     * @param datasetName full path of the dataset inside the file
     */
    public HdfFileToVectorArray(String filename, String datasetName) {
        super(filename, datasetName);
        final String elementTypeName = dataset.getJavaType().getSimpleName().toLowerCase();
        this.embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(elementTypeName);
    }

    /**
     * Reads the row selected by {@code l} and converts it to a float array.
     */
    @Override
    public float[] apply(long l) {
        return embeddingGenerator.generateArrayEmbeddingFrom(getDataFrom(l), dims);
    }
}

View File

@@ -0,0 +1,55 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
import java.util.List;
import java.util.function.LongFunction;
/**
 * Binding function that maps a long input to one vector of an HDF5 dataset, returned as
 * a {@code List<Float>} so that it can be fed to list based functions such as
 * NormalizeListVector and its variants.
 * <p>
 * Construction only opens the file and resolves the dataset handle and its dimensions;
 * each call to {@link #apply(long)} then reads a single row. The dataset is expected to be
 * two-dimensional: the first dimension counts the vectors and the second one the elements
 * of each vector. The conversion to floats is delegated to the {@link EmbeddingGenerator}
 * that {@link EmbeddingGeneratorFactory} selects for the lower-cased simple name of the
 * dataset's Java element type.
 */
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {

    private final EmbeddingGenerator embeddingGenerator;

    /**
     * @param filename    path of the HDF5 file
     * @param datasetName full path of the dataset inside the file
     */
    public HdfFileToVectorList(String filename, String datasetName) {
        super(filename, datasetName);
        final String elementTypeName = dataset.getJavaType().getSimpleName().toLowerCase();
        this.embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(elementTypeName);
    }

    /**
     * Reads the row selected by {@code l} and converts it to a list of floats.
     */
    @Override
    public List<Float> apply(long l) {
        return embeddingGenerator.generateListEmbeddingFrom(getDataFrom(l), dims);
    }
}

View File

@@ -0,0 +1,44 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
 * {@link EmbeddingGenerator} for slices delivered as {@code double[][]}. Only the first
 * row of the slice is used and every element is narrowed to {@code float}.
 */
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {

    /**
     * Converts the first row of a {@code double[1][x]} slice into an immutable list of
     * boxed floats. The {@code dims} argument is not used.
     */
    @Override
    public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
        final double[] row = firstRow(o);
        final Float[] boxed = new Float[row.length];
        int idx = 0;
        for (double value : row) {
            boxed[idx++] = (float) value;
        }
        return List.of(boxed);
    }

    /**
     * Converts the first row of a {@code double[1][x]} slice into a new float array.
     * The {@code dims} argument is not used.
     */
    @Override
    public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
        final double[] row = firstRow(o);
        final float[] narrowed = new float[row.length];
        int idx = 0;
        for (double value : row) {
            narrowed[idx++] = (float) value;
        }
        return narrowed;
    }

    /** Casts the slice to {@code double[][]} and returns its first row. */
    private static double[] firstRow(Object slice) {
        return ((double[][]) slice)[0];
    }
}

View File

@@ -0,0 +1,25 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
 * Converts a slice read from an HDF5 dataset into a vector of floats, either as a
 * {@code List<Float>} or as a {@code float[]}.
 * <p>
 * The implementations in this package (float, int and double) each cast the slice to a
 * two-dimensional primitive array of their element type and convert its first row.
 */
public interface EmbeddingGenerator {
/**
 * @param o    the slice object to convert
 * @param dims the dimensions of the source dataset, as passed by the caller; the
 *             implementations in this package do not read this argument
 * @return the vector as a list of floats
 */
List<Float> generateListEmbeddingFrom(Object o, int[] dims);
/**
 * @param o    the slice object to convert
 * @param dims the dimensions of the source dataset, as passed by the caller; the
 *             implementations in this package do not read this argument
 * @return the vector as a float array
 */
float[] generateArrayEmbeddingFrom(Object o, int[] dims);
}

View File

@@ -0,0 +1,49 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Hands out the {@link EmbeddingGenerator} that matches the element type name of a dataset.
 * <p>
 * Generators are created on first use and then shared. The cache is a concurrent map keyed
 * by the normalized type name, so lookups from several threads are safe and differently
 * cased or aliased names ("Float", "short") resolve to the same instance as their
 * canonical form.
 */
public class EmbeddingGeneratorFactory {
    private static final Map<String, EmbeddingGenerator> generators = new ConcurrentHashMap<>();

    /**
     * Returns the generator for the given element type name.
     *
     * @param type element type name, compared case-insensitively; {@code "float"},
     *             {@code "int"}, {@code "short"} (handled by the int generator) and
     *             {@code "double"} are recognized
     * @return the shared generator instance for that type
     * @throws IllegalArgumentException if the type name is not recognized
     */
    public static EmbeddingGenerator getGenerator(String type) {
        // Locale.ROOT keeps the normalization independent of the default locale
        // (e.g. "INT" must not become a dotless-i variant under a Turkish locale).
        String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase(Locale.ROOT);
        return switch (typeLower) {
            case "float" -> generators.computeIfAbsent(typeLower, key -> new FloatEmbeddingGenerator());
            case "int" -> generators.computeIfAbsent(typeLower, key -> new IntEmbeddingGenerator());
            case "double" -> generators.computeIfAbsent(typeLower, key -> new DoubleEmbeddingGenerator());
            default -> throw new IllegalArgumentException("Unknown embedding type: " + type);
        };
    }
}

View File

@@ -0,0 +1,39 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
 * {@link EmbeddingGenerator} for slices delivered as {@code float[][]}. Only the first
 * row of the slice is used.
 */
public class FloatEmbeddingGenerator implements EmbeddingGenerator {

    /**
     * Boxes the first row of a {@code float[1][x]} slice into an immutable list.
     * The {@code dims} argument is not used.
     */
    @Override
    public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
        final float[] row = ((float[][]) o)[0];
        final Float[] boxed = new Float[row.length];
        int idx = 0;
        for (float value : row) {
            boxed[idx++] = value;
        }
        return List.of(boxed);
    }

    /**
     * Returns the first row of a {@code float[1][x]} slice as is, without copying it.
     * The {@code dims} argument is not used.
     */
    @Override
    public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
        final float[][] slice = (float[][]) o;
        return slice[0];
    }
}

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
 * {@link EmbeddingGenerator} for integer valued slices. Only the first row of the slice is
 * used and every element is widened to {@code float}.
 * <p>
 * {@code EmbeddingGeneratorFactory} routes both the "int" and the "short" type names to this
 * class, so a slice may arrive either as {@code int[][]} or as {@code short[][]}; both are
 * accepted. Any other slice type fails with a {@link ClassCastException}.
 */
public class IntEmbeddingGenerator implements EmbeddingGenerator {

    /**
     * Converts the first row of the slice into an immutable list of boxed floats.
     *
     * @param o    an {@code int[1][x]} or {@code short[1][x]} slice
     * @param dims not used
     * @return the row as a list of floats
     */
    @Override
    public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
        float[] values = generateArrayEmbeddingFrom(o, dims);
        Float[] boxed = new Float[values.length];
        for (int i = 0; i < values.length; i++) {
            boxed[i] = values[i];
        }
        return List.of(boxed);
    }

    /**
     * Converts the first row of the slice into a new float array.
     *
     * @param o    an {@code int[1][x]} or {@code short[1][x]} slice
     * @param dims not used
     * @return the row as a float array
     */
    @Override
    public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
        // 16-bit data is expected to arrive as short[][]; casting that to int[][] would
        // throw, so it gets its own branch.
        if (o instanceof short[][]) {
            short[] shorts = ((short[][]) o)[0];
            float[] converted = new float[shorts.length];
            for (int i = 0; i < shorts.length; i++) {
                converted[i] = (float) shorts[i];
            }
            return converted;
        }
        int[] vector = ((int[][]) o)[0];
        float[] vector2 = new float[vector.length];
        for (int i = 0; i < vector.length; i++) {
            vector2[i] = (float) vector[i];
        }
        return vector2;
    }
}

View File

@@ -0,0 +1,44 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
import org.junit.jupiter.api.Test;
/**
 * Reads every row of the {@code /DS1} dataset of the bundled HDF5 fixture through
 * {@link HdfFileToVectorArray} and compares it with the expected values.
 */
public class HdfFileToArrayTest {

    @Test
    public void testHdfFileToVector() {
        // Expected contents of /DS1, one inner array per row.
        final float[][] expected = {
            {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
            {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
            {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
            {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
        };

        final HdfFileToVectorArray mapper =
            new HdfFileToVectorArray("src/test/resources/h5ex_t_float.h5", "/DS1");

        for (int row = 0; row < expected.length; row++) {
            final float[] actual = mapper.apply(row);
            for (int col = 0; col < expected[row].length; col++) {
                assert (actual[col] == expected[row][col]);
            }
        }
    }
}

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
import org.junit.jupiter.api.Test;
import java.util.List;
/**
 * Reads every row of the {@code /DS1} dataset of the bundled HDF5 fixture through
 * {@link HdfFileToVectorList} and compares it with the expected values.
 */
public class HdfFileToVectorTest {

    @Test
    public void testHdfFileToVector() {
        // Expected contents of /DS1, one inner array per row.
        final float[][] expected = {
            {0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
            {2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
            {4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
            {6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
        };

        final HdfFileToVectorList mapper =
            new HdfFileToVectorList("src/test/resources/h5ex_t_float.h5", "/DS1");

        for (int row = 0; row < expected.length; row++) {
            final List<Float> actual = mapper.apply(row);
            for (int col = 0; col < expected[row].length; col++) {
                assert (actual.get(col) == expected[row][col]);
            }
        }
    }
}

Binary file not shown.