mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
Mwolters/pred enhancements (#2159)
* adding new binding for single array int types * new binding for var length int arrays * new binding for var length int arrays cleanup * add embedding to convert int array to string * add script for hdf5 generation * list version of var length int embedding * updating target byte code for groovy scripts * adding pytorch version * removing datagen files to put them in tools repo * make IntArrayToString threadsafe
This commit is contained in:
parent
cacd39231f
commit
d617796f99
@ -36,10 +36,13 @@ public abstract class AbstractHdfFileToVectorType {
|
|||||||
long[] sliceOffset = new long[dims.length];
|
long[] sliceOffset = new long[dims.length];
|
||||||
sliceOffset[0] = (l % dims[0]);
|
sliceOffset[0] = (l % dims[0]);
|
||||||
int[] sliceDimensions = new int[dims.length];
|
int[] sliceDimensions = new int[dims.length];
|
||||||
// We always want to read a single vector
|
// We always want to read a single value
|
||||||
sliceDimensions[0] = 1;
|
sliceDimensions[0] = 1;
|
||||||
// Number of elements in the vector
|
if (dims.length > 1) {
|
||||||
sliceDimensions[1] = dims[1];
|
sliceDimensions[1] = dims[1];
|
||||||
return dataset.getData(sliceOffset, sliceDimensions);
|
return dataset.getData(sliceOffset, sliceDimensions);
|
||||||
|
} else {
|
||||||
|
return dataset.getData();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,62 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
|
||||||
|
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||||
|
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||||
|
* reads a single vector from the dataset, based on the long input value. As currently
|
||||||
|
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||||
|
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||||
|
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||||
|
* <p>
|
||||||
|
* This implementation is specific to returning an array of ints from a dataset that contains
|
||||||
|
* variable length arrays of ints.
|
||||||
|
*/
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class HdfFileToVarLengthIntArray extends AbstractHdfFileToVectorType implements LongFunction<int[]> {
|
||||||
|
|
||||||
|
public HdfFileToVarLengthIntArray(String filename, String datasetName) {
|
||||||
|
super(filename, datasetName);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public int[] apply(long l) {
|
||||||
|
Object data = getDataFrom(l);
|
||||||
|
return extractIds(data, l);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param o The dataset from hdf5 file. In this case because the number of ints in each array is variable length it
|
||||||
|
* is returned by hdf5 as a one dimensional array of Objects
|
||||||
|
* @param l The current cycle
|
||||||
|
* @return The int array at the offset of l
|
||||||
|
*/
|
||||||
|
private int[] extractIds(Object o, long l) {
|
||||||
|
Object[] objects = (Object[]) o;
|
||||||
|
Object o1 = objects[(int) l % objects.length];
|
||||||
|
return (int[]) o1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_int;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||||
|
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||||
|
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||||
|
* reads a single vector from the dataset, based on the long input value. As currently
|
||||||
|
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||||
|
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||||
|
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||||
|
* <p>
|
||||||
|
* This implementation is specific to returning a single int
|
||||||
|
*/
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class HdfFileToInt extends AbstractHdfFileToVectorType implements LongFunction<Integer> {
|
||||||
|
private final EmbeddingGenerator embeddingGenerator;
|
||||||
|
|
||||||
|
public HdfFileToInt(String filename, String datasetName) {
|
||||||
|
super(filename, datasetName);
|
||||||
|
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public Integer apply(long l) {
|
||||||
|
Object data = getDataFrom(l);
|
||||||
|
return embeddingGenerator.generateIntFrom(data, l);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,67 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||||
|
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||||
|
* reads a single vector from the dataset, based on the long input value. As currently
|
||||||
|
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||||
|
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||||
|
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||||
|
* <p>
|
||||||
|
* This implementation is specific to returning a List of type Integer from a dataset that contains
|
||||||
|
* variable length arrays of ints.
|
||||||
|
*/
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class HdfFileToVarLengthIntList extends AbstractHdfFileToVectorType implements LongFunction<List<Integer>> {
|
||||||
|
|
||||||
|
public HdfFileToVarLengthIntList(String filename, String datasetName) {
|
||||||
|
super(filename, datasetName);
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public List<Integer> apply(long l) {
|
||||||
|
Object data = getDataFrom(l);
|
||||||
|
return extractIds(data, l);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param o The dataset from hdf5 file. In this case because the number of ints in each array is variable length it
|
||||||
|
* is returned by hdf5 as a one dimensional array of Objects
|
||||||
|
* @param l The current cycle
|
||||||
|
* @return A List of the integers at the offset of l
|
||||||
|
*/
|
||||||
|
private List<Integer> extractIds(Object o, long l) {
|
||||||
|
Object[] objects = (Object[]) o;
|
||||||
|
int[] ints = (int[]) objects[(int) l % objects.length];
|
||||||
|
Integer[] integers = new Integer[ints.length];
|
||||||
|
for (int i = 0; i < ints.length; i++) {
|
||||||
|
integers[i] = ints[i];
|
||||||
|
}
|
||||||
|
return List.of(integers);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class IntArrayToString implements Function<int[],String> {
    // One reusable builder per thread so concurrent callers never share a buffer.
    private final ThreadLocal<StringBuilder> threadSb = ThreadLocal.withInitial(StringBuilder::new);

    /**
     * Render an int array as its values joined by commas, without brackets or spaces.
     *
     * @param ints The values to render
     * @return The comma separated values, or an empty string for an empty array
     */
    @Override
    public String apply(int[] ints) {
        StringBuilder sb = threadSb.get();
        sb.setLength(0);
        for (int idx = 0; idx < ints.length; idx++) {
            // The separator is decided by position. Comparing the element value with the value
            // of the last element drops commas whenever a value repeats the final one.
            if (idx > 0) {
                sb.append(",");
            }
            sb.append(ints[idx]);
        }
        return sb.toString();
    }
}
|
@ -81,4 +81,10 @@ public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
|||||||
return vector2;
|
return vector2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int generateIntFrom(Object o, long l) {
|
||||||
|
double[] source = (double[]) o;
|
||||||
|
return (int) source[(int) (l % source.length)];
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -30,4 +30,6 @@ public interface EmbeddingGenerator {
|
|||||||
List<Integer> generateIntListEmbeddingFrom(Object data, int[] dims);
|
List<Integer> generateIntListEmbeddingFrom(Object data, int[] dims);
|
||||||
|
|
||||||
int[] generateIntArrayEmbeddingFrom(Object data, int[] dims);
|
int[] generateIntArrayEmbeddingFrom(Object data, int[] dims);
|
||||||
|
|
||||||
|
/**
 * Produce a single int from the given data for the given cycle.
 *
 * @param data The source data to select a value from
 * @param l    The current cycle, used by implementations to choose which value is returned
 * @return The selected value as an int
 */
int generateIntFrom(Object data, long l);
|
||||||
}
|
}
|
||||||
|
@ -76,4 +76,10 @@ public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
|||||||
return vector2;
|
return vector2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int generateIntFrom(Object o, long l) {
|
||||||
|
float[] source = (float[]) o;
|
||||||
|
return (int) source[(int) (l % source.length)];
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -74,4 +74,10 @@ public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
|||||||
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
|
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
|
||||||
return ((int[][]) o)[0];
|
return ((int[][]) o)[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int generateIntFrom(Object o, long l) {
|
||||||
|
int[] source = (int[]) o;
|
||||||
|
return source[(int) (l % source.length)];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -75,4 +75,10 @@ public class LongEmbeddingGenerator implements EmbeddingGenerator {
|
|||||||
}
|
}
|
||||||
return vector2;
|
return vector2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int generateIntFrom(Object o, long l) {
|
||||||
|
long[] source = (long[]) o;
|
||||||
|
return Math.toIntExact(source[(int) (l % source.length)]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user