mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
Merge branch 'main' of github.com:nosqlbench/nosqlbench
This commit is contained in:
commit
da23c66622
101
hdf-loader/pom.xml
Normal file
101
hdf-loader/pom.xml
Normal file
@ -0,0 +1,101 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  ~ Copyright (c) 2023 nosqlbench
  ~
  ~ Licensed under the Apache License, Version 2.0 (the "License");
  ~ you may not use this file except in compliance with the License.
  ~ You may obtain a copy of the License at
  ~
  ~     http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  ~
  -->

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <artifactId>hdf-loader</artifactId>

    <packaging>jar</packaging>

    <parent>
        <artifactId>mvn-defaults</artifactId>
        <groupId>io.nosqlbench</groupId>
        <version>${revision}</version>
        <relativePath>../mvn-defaults</relativePath>
    </parent>

    <name>${project.artifactId}</name>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>

        <!-- NOTE(review): two SnakeYAML variants are declared (org.snakeyaml
             snakeyaml-engine and org.yaml snakeyaml). The module code visible
             here only imports org.yaml.snakeyaml; confirm whether
             snakeyaml-engine is actually needed. -->
        <dependency>
            <groupId>org.snakeyaml</groupId>
            <artifactId>snakeyaml-engine</artifactId>
            <version>2.6</version>
        </dependency>
        <dependency>
            <groupId>org.yaml</groupId>
            <artifactId>snakeyaml</artifactId>
            <version>2.0</version>
        </dependency>

        <dependency>
            <groupId>com.datastax.oss</groupId>
            <artifactId>java-driver-core</artifactId>
            <version>4.16.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.15.2</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.deeplearning4j/deeplearning4j-core -->
        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-core</artifactId>
            <version>1.0.0-M2.1</version>
        </dependency>

        <dependency>
            <groupId>org.nd4j</groupId>
            <artifactId>nd4j-native</artifactId>
            <version>1.0.0-M2.1</version>
        </dependency>

        <dependency>
            <groupId>org.deeplearning4j</groupId>
            <artifactId>deeplearning4j-nlp</artifactId>
            <version>1.0.0-M2.1</version>
        </dependency>

        <dependency>
            <groupId>io.jhdf</groupId>
            <artifactId>jhdf</artifactId>
            <version>0.6.10</version>
        </dependency>
        <!-- NOTE(review): sibling module pinned to a hard-coded SNAPSHOT while
             the parent uses ${revision}; verify this should not be ${revision}
             to stay in lockstep with the reactor version. -->
        <dependency>
            <groupId>io.nosqlbench</groupId>
            <artifactId>nb-api</artifactId>
            <version>5.17.3-SNAPSHOT</version>
            <scope>compile</scope>
        </dependency>

    </dependencies>

</project>
|
@ -0,0 +1,84 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf;
|
||||
|
||||
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||
import io.nosqlbench.loader.hdf.readers.Hdf5Reader;
|
||||
import io.nosqlbench.loader.hdf.readers.HdfReader;
|
||||
import io.nosqlbench.loader.hdf.writers.AstraVectorWriter;
|
||||
import io.nosqlbench.loader.hdf.writers.FileVectorWriter;
|
||||
import io.nosqlbench.loader.hdf.writers.NoopVectorWriter;
|
||||
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
public class HdfLoader {
|
||||
private static final Logger logger = LogManager.getLogger(HdfLoader.class);
|
||||
public static final String FILEWRITER = "filewriter";
|
||||
public static final String ASTRA = "astra";
|
||||
public static final String NOOP = "noop";
|
||||
public static final String HDF5 = "hdf5";
|
||||
public static final String HDF4 = "hdf4";
|
||||
|
||||
public static void main (String[] args) {
|
||||
if (args.length == 0) {
|
||||
System.out.println("Usage: hdf-loader <filename>");
|
||||
System.exit(1);
|
||||
}
|
||||
try {
|
||||
LoaderConfig config = new LoaderConfig(args[0]);
|
||||
logger.info("Starting loader with config: " + config);
|
||||
HdfReader reader = null;
|
||||
VectorWriter writer = null;
|
||||
|
||||
String format = config.getFormat();
|
||||
switch (format.toLowerCase()) {
|
||||
case HDF4 -> {
|
||||
logger.info("HDF4 format not yet supported");
|
||||
System.exit(1);
|
||||
}
|
||||
case HDF5 -> {
|
||||
logger.info("HDF5 format selected");
|
||||
reader = new Hdf5Reader(config);
|
||||
}
|
||||
default -> {
|
||||
logger.info("Unknown format: " + format);
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
String writerType = config.getWriter();
|
||||
logger.info("Using writer type: " + writerType);
|
||||
switch (writerType.toLowerCase()) {
|
||||
case FILEWRITER -> writer = new FileVectorWriter(config);
|
||||
case ASTRA -> writer = new AstraVectorWriter(config);
|
||||
case NOOP -> writer = new NoopVectorWriter();
|
||||
default -> {
|
||||
logger.info("Unknown writer type: " + writerType);
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
reader.setWriter(writer);
|
||||
logger.info("Starting main read loop");
|
||||
reader.read();
|
||||
} catch (Exception e) {
|
||||
logger.error(e);
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,85 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.config;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.yaml.snakeyaml.Yaml;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class LoaderConfig {
|
||||
private static final Logger logger = LogManager.getLogger(LoaderConfig.class);
|
||||
private static final Yaml yaml = new Yaml();
|
||||
private final Map<String, Object> configMap;
|
||||
|
||||
public LoaderConfig(String filePath) throws IOException {
|
||||
FileReader fileReader = new FileReader(filePath);
|
||||
configMap = yaml.load(fileReader);
|
||||
for (Map.Entry<String, Object> entry : configMap.entrySet()) {
|
||||
logger.debug(entry.getKey() + " : " + entry.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
public Object getRawValue(String key) {
|
||||
return configMap.get(key);
|
||||
}
|
||||
|
||||
public String getStringValue(String key) {
|
||||
return configMap.get(key).toString();
|
||||
}
|
||||
|
||||
public List<String> getDatasets() {
|
||||
return (List<String>) configMap.get("datasets");
|
||||
}
|
||||
|
||||
public String getFormat() {
|
||||
return (String) configMap.getOrDefault("format", "HD5");
|
||||
}
|
||||
|
||||
public Map<String,String> getAstra() {
|
||||
return (Map<String,String>) configMap.get("astra");
|
||||
}
|
||||
|
||||
public String getEmbedding() {
|
||||
return (String) configMap.getOrDefault("embedding", "Deeplearning4j");
|
||||
}
|
||||
|
||||
public String getWriter() {
|
||||
return (String) configMap.getOrDefault("writer", "filewriter");
|
||||
}
|
||||
|
||||
public String getSourceFile() {
|
||||
return (String) configMap.get("sourceFile");
|
||||
}
|
||||
|
||||
public String getTargetFile() {
|
||||
return (String) configMap.getOrDefault("targetFile", "./vectors.txt");
|
||||
}
|
||||
|
||||
public int getThreads() {
|
||||
return (int) configMap.getOrDefault("threads", 5);
|
||||
}
|
||||
|
||||
public int getQueueSize() {
|
||||
return (int) configMap.getOrDefault("queueSize", 1000);
|
||||
}
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||
return switch (dims.length) {
|
||||
case 1 -> new float[][]{convertToFloat((double[]) o)};
|
||||
case 2 -> convertToFloats((double[][]) o);
|
||||
case 3 -> flatten(o, dims);
|
||||
default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||
};
|
||||
}
|
||||
|
||||
private float[][] convertToFloats(double[][] o) {
|
||||
float[][] floats = new float[o.length][];
|
||||
for (int i = 0; i < o.length; i++) {
|
||||
floats[i] = convertToFloat(o[i]);
|
||||
}
|
||||
return floats;
|
||||
}
|
||||
|
||||
public float[] convertToFloat(double[] doubleArray) {
|
||||
if (doubleArray == null) {
|
||||
return null;
|
||||
}
|
||||
float[] floatArray = new float[doubleArray.length];
|
||||
for (int i = 0; i < doubleArray.length; i++) {
|
||||
floatArray[i] = (float) doubleArray[i];
|
||||
}
|
||||
return floatArray;
|
||||
}
|
||||
|
||||
private float[][] flatten(Object o, int[] dims) {
|
||||
double[][][] arr = (double[][][]) o;
|
||||
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
for (int j = 0; j < dims[1]; j++) {
|
||||
for (int k = 0; k < dims[2]; k++) {
|
||||
flat[i][j * dims[2] + k] = (float)arr[i][j][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return flat;
|
||||
}
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
/**
 * Converts a raw HDF dataset payload into a matrix of float vectors, one
 * embedding per row. Implementations exist per source element type
 * (double, float, int, String).
 */
public interface EmbeddingGenerator {
    /**
     * @param o    the raw dataset payload; the expected runtime array type
     *             depends on the implementation and on dims.length
     * @param dims the dataset's dimensions, used to select the conversion path
     * @return one float vector per row of the source data
     */
    public float[][] generateEmbeddingFrom(Object o, int[] dims);
}
|
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class EmbeddingGeneratorFactory {
|
||||
private static final Map<String,EmbeddingGenerator> generators = new HashMap<>();
|
||||
|
||||
public static EmbeddingGenerator getGenerator(String type) {
|
||||
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||
if (typeLower.equals("integer")) typeLower = "int";
|
||||
switch (typeLower) {
|
||||
case "string" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new StringEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "float" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new FloatEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "double" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new DoubleEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "int" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new IntEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||
return switch (dims.length) {
|
||||
case 1 -> new float[][]{(float[]) o};
|
||||
case 2 -> (float[][]) o;
|
||||
case 3 -> flatten(o, dims);
|
||||
default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||
};
|
||||
}
|
||||
|
||||
private float[][] flatten(Object o, int[] dims) {
|
||||
float[][][] arr = (float[][][]) o;
|
||||
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
for (int j = 0; j < dims[1]; j++) {
|
||||
for (int k = 0; k < dims[2]; k++) {
|
||||
flat[i][j * dims[2] + k] = arr[i][j][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return flat;
|
||||
}
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||
@Override
|
||||
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||
switch (dims.length) {
|
||||
case 1 -> {
|
||||
float[] arr = new float[dims[0]];
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
arr[i] = ((int[]) o)[i];
|
||||
}
|
||||
return new float[][]{arr};
|
||||
}
|
||||
case 2 -> {
|
||||
float[][] arr = new float[dims[0]][dims[1]];
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
for (int j = 0; j < dims[1]; j++) {
|
||||
arr[i][j] = ((int[][]) o)[i][j];
|
||||
}
|
||||
}
|
||||
return arr;
|
||||
}
|
||||
case 3 -> {
|
||||
return flatten(o, dims);
|
||||
}
|
||||
default ->
|
||||
throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||
}
|
||||
}
|
||||
|
||||
private float[][] flatten(Object o, int[] dims) {
|
||||
int[][][] arr = (int[][][]) o;
|
||||
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||
for (int i = 0; i < dims[0]; i++) {
|
||||
for (int j = 0; j < dims[1]; j++) {
|
||||
for (int k = 0; k < dims[2]; k++) {
|
||||
flat[i][j * dims[2] + k] = arr[i][j][k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return flat;
|
||||
}
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.embedding;
|
||||
|
||||
import org.deeplearning4j.models.word2vec.Word2Vec;
|
||||
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
|
||||
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
|
||||
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
|
||||
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
|
||||
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class StringEmbeddingGenerator implements EmbeddingGenerator {
    // Prepared for the Word2Vec pipeline sketched below; currently unused
    // while that code remains commented out.
    private TokenizerFactory tokenizerFactory= new DefaultTokenizerFactory();

    /**
     * Converts a 1-D String dataset into word embeddings. Only 1-D input is
     * supported; any other dimensionality throws.
     */
    @Override
    public float[][] generateEmbeddingFrom(Object o, int[] dims) {
        switch (dims.length) {
            case 1 -> {
                return generateWordEmbeddings((String[]) o);
            }
            default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
        }

    }

    // NOTE(review): unfinished. A sentence iterator is built but the Word2Vec
    // training below is commented out and the method returns null. Callers
    // that iterate the result (e.g. Hdf5Reader's vector loop) will NPE on
    // string datasets — confirm intended behavior before enabling this path.
    private float[][] generateWordEmbeddings(String[] text) {
        SentenceIterator iter = new CollectionSentenceIterator(Arrays.asList(text));
        /*Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(1)
                .iterations(1)
                .layerSize(targetDims)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(tokenizerFactory)
                .build();
         */
        return null;
    }
}
|
@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.readers;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.jhdf.api.Group;
|
||||
import io.jhdf.api.Node;
|
||||
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||
import io.nosqlbench.loader.hdf.embedding.EmbeddingGenerator;
|
||||
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import static io.nosqlbench.loader.hdf.embedding.EmbeddingGeneratorFactory.getGenerator;
|
||||
|
||||
/**
 * Reads vectors from an HDF5 file (via jhdf), converts each dataset's raw data
 * to float vectors with a type-appropriate EmbeddingGenerator, and hands them
 * to a VectorWriter through a bounded blocking queue. One pooled task per
 * dataset; the writer runs on the same pool.
 */
public class Hdf5Reader implements HdfReader {
    private static final Logger logger = LogManager.getLogger(Hdf5Reader.class);
    // Config value meaning "process every dataset found in the file".
    public static final String ALL = "all";
    private VectorWriter writer;
    private final LoaderConfig config;
    private final ExecutorService executorService;
    // Hand-off channel to the writer; capacity comes from config.getQueueSize().
    private final LinkedBlockingQueue<float[]> queue;
    private List<String> datasets;
    // Zero-length sentinel understood by AbstractVectorWriter as end-of-stream.
    private final float[] SHUTDOWN = new float[0];
    public Hdf5Reader(LoaderConfig config) {
        this.config = config;
        executorService = Executors.newCachedThreadPool();
        queue = new LinkedBlockingQueue<>(config.getQueueSize());
    }

    /** Attaches the writer and gives it the shared hand-off queue. */
    @Override
    public void setWriter(VectorWriter writer) {
        this.writer = writer;
        writer.setQueue(queue);
    }

    /**
     * Recursively collects the paths of all Dataset nodes under parent into
     * this.datasets. NOTE(review): this appends into the list returned by
     * config.getDatasets() — it mutates the config-backed list in place.
     */
    public void extractDatasets(Group parent) {
        Map<String, Node> nodes = parent.getChildren();
        for (String key : nodes.keySet()) {
            Node node = nodes.get(key);
            if (node instanceof Dataset) {
                datasets.add(node.getPath());
            }
            else if (node.isGroup()) {
                extractDatasets((Group) node);
            }
        }
    }

    /**
     * Main read loop: starts the writer, submits one conversion task per
     * dataset, waits for all tasks, then signals shutdown via the sentinel.
     */
    @Override
    public void read() {
        HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile()));
        datasets = config.getDatasets();
        // First entry "all" means: discover every dataset in the file.
        if (datasets.get(0).equalsIgnoreCase(ALL)) {
            extractDatasets(hdfFile);
        }
        List<Future<?>> futures = new ArrayList<>();
        // The writer's run() drains the queue on its own pool thread.
        executorService.submit(writer);
        for (String ds : datasets) {
            // Skip the "all" marker itself if it was in the configured list.
            if (ds.equalsIgnoreCase(ALL)) {
                continue;
            }
            Future<?> future = executorService.submit(() -> {
                logger.info("Processing dataset: " + ds);
                Dataset dataset = hdfFile.getDatasetByPath(ds);
                int[] dims = dataset.getDimensions();
                String type = dataset.getJavaType().getSimpleName().toLowerCase();
                EmbeddingGenerator generator = getGenerator(type);
                Object data;
                // Datasets over 2 GiB cannot be materialized in one Java array;
                // read them slice-by-slice along the first dimension.
                if (dataset.getSizeInBytes() > Integer.MAX_VALUE) {
                    logger.info("slicing large dataset: " + ds);
                    // TODO: For now this will be implemented to handle numeric types with
                    // 2 dimensions where the 1st dimension is the number of vectors and the 2nd
                    // dimension is the number of dimensions in the vector.
                    long[] sliceOffset = new long[dims.length];
                    int[] sliceDimensions = new int[dims.length];
                    sliceDimensions[1] = dims[1];
                    int noOfSlices = (int) (dataset.getSizeInBytes() / Integer.MAX_VALUE) + 1;
                    int sliceSize = dims[0] / noOfSlices;
                    // NOTE(review): integer division drops dims[0] % noOfSlices
                    // trailing rows — they are never read. Confirm and handle
                    // the remainder in a final partial slice.
                    for (int i = 0; i < noOfSlices; i++) {
                        sliceOffset[0] = (long) i * sliceSize;
                        sliceDimensions[0] = sliceSize;
                        data = dataset.getData(sliceOffset, sliceDimensions);
                        // NOTE(review): the FULL dims are passed here although
                        // `data` only holds sliceSize rows; generators that
                        // size loops from dims (e.g. the int generator) would
                        // over-read. Verify against the generator used.
                        float[][] vectors = generator.generateEmbeddingFrom(data, dims);
                        for (float[] vector : vectors) {
                            try {
                                queue.put(vector);
                            } catch (InterruptedException e) {
                                logger.error(e.getMessage(), e);
                            }
                        }
                    }
                } else {
                    data = dataset.getData();
                    float[][] vectors = generator.generateEmbeddingFrom(data, dims);
                    for (float[] vector : vectors) {
                        try {
                            // Blocks when the queue is full, throttling readers
                            // to the writer's pace.
                            queue.put(vector);
                        } catch (InterruptedException e) {
                            logger.error(e.getMessage(), e);
                        }
                    }
                }
            });
            futures.add(future);
        }
        // Wait for every dataset task; failures are logged, not fatal.
        for (Future<?> future : futures) {
            try {
                future.get();
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        }
        hdfFile.close();
        // Flag shutdown first, then enqueue the sentinel so a writer blocked
        // in queue.take() wakes up and exits its loop.
        writer.shutdown();
        try {
            queue.put(SHUTDOWN);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        executorService.shutdown();
    }
}
|
@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.readers;
|
||||
|
||||
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||
|
||||
/**
 * A source of vectors read from an HDF file. Implementations pull data from
 * the file and push converted float vectors to the attached VectorWriter.
 */
public interface HdfReader {
    /** Attaches the destination writer; must be called before read(). */
    void setWriter(VectorWriter writer);

    /** Runs the read loop until the source is exhausted. */
    void read();
}
|
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.writers;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public abstract class AbstractVectorWriter implements VectorWriter {
|
||||
protected LinkedBlockingQueue<float[]> queue;
|
||||
protected boolean shutdown = false;
|
||||
|
||||
public void setQueue(LinkedBlockingQueue<float[]> queue) {
|
||||
this.queue = queue;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
while (!shutdown || !queue.isEmpty()) {
|
||||
try {
|
||||
float[] vector = queue.take();
|
||||
if (vector.length==0) {
|
||||
break;
|
||||
}
|
||||
writeVector(vector);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract void writeVector(float[] vector);
|
||||
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.writers;
|
||||
|
||||
import com.datastax.oss.driver.api.core.CqlSession;
|
||||
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
|
||||
import com.datastax.oss.driver.api.core.data.CqlVector;
|
||||
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Writes vectors to an Astra/Cassandra table through the DataStax driver,
 * one synchronous INSERT per vector.
 */
public class AstraVectorWriter extends AbstractVectorWriter {
    private static final Logger logger = LogManager.getLogger(AstraVectorWriter.class);
    private final CqlSession session;
    PreparedStatement insert_vector;

    /**
     * Opens a CqlSession from the config's "astra" map (keys: scb, clientId,
     * clientSecret, keyspace, query) and prepares the configured INSERT.
     * NOTE(review): the session is never closed — shutdown() below only sets
     * the flag. Confirm where session.close() should happen once the queue
     * has fully drained.
     */
    public AstraVectorWriter(LoaderConfig config) {
        Map<String,String> astraParams = config.getAstra();
        session = CqlSession.builder()
            .withCloudSecureConnectBundle(Paths.get(astraParams.get("scb")))
            .withAuthCredentials(astraParams.get("clientId"), astraParams.get("clientSecret"))
            .withKeyspace(astraParams.get("keyspace"))
            .build();
        logger.info("Astra session initialized");
        insert_vector = session.prepare(astraParams.get("query"));
    }
    //TODO: this is insanely slow. Needs work on threading/batching
    @Override
    protected void writeVector(float[] vector) {
        // Box to Float[] because CqlVector.Builder#add takes object elements.
        Float[] vector2 = new Float[vector.length];
        for (int i = 0; i < vector.length; i++) {
            vector2[i] = vector[i];
        }
        CqlVector.Builder vectorBuilder = CqlVector.builder();
        vectorBuilder.add(vector2);
        // Bind (partitionKey, vector) — the statement text comes from config.
        session.execute(insert_vector.bind(getPartitionValue(vector), vectorBuilder.build()));
    }

    // Derives a partition key from the vector itself: the string form of the
    // sum of its components. NOTE(review): distinct vectors with equal sums
    // collide on the same partition key — confirm this is acceptable for the
    // target schema.
    private String getPartitionValue(float[] vector) {
        float sum = 0;
        for (float f : vector) {
            sum += f;
        }
        return String.valueOf(sum);
    }

    /** Requests loop exit; see AbstractVectorWriter.run() for the protocol. */
    @Override
    public void shutdown() {
        shutdown = true;
    }
}
|
@ -0,0 +1,57 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.writers;
|
||||
|
||||
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
public class FileVectorWriter extends AbstractVectorWriter {
|
||||
private static final Logger logger = LogManager.getLogger(FileVectorWriter.class);
|
||||
private final BufferedWriter targetFile;
|
||||
public FileVectorWriter(LoaderConfig config) throws IOException {
|
||||
String targetFileName = config.getTargetFile();
|
||||
targetFile = new BufferedWriter(new FileWriter(targetFileName));
|
||||
logger.info("Writing to file: " + targetFileName);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeVector(float[] vector) {
|
||||
try {
|
||||
targetFile.write("[");
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
targetFile.write(String.valueOf(vector[i]));
|
||||
if (i < vector.length - 1) {
|
||||
targetFile.write(",");
|
||||
}
|
||||
}
|
||||
targetFile.write("]");
|
||||
targetFile.write("\n");
|
||||
targetFile.flush();
|
||||
} catch (IOException e) {
|
||||
logger.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
shutdown = true;
|
||||
}
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.writers;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
public class NoopVectorWriter extends AbstractVectorWriter {
|
||||
private static final Logger logger = LogManager.getLogger(NoopVectorWriter.class);
|
||||
|
||||
@Override
|
||||
protected void writeVector(float[] vector) {
|
||||
//No-op
|
||||
logger.debug(vector);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
shutdown = true;
|
||||
}
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.loader.hdf.writers;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
/**
 * A sink for float vectors produced by the HDF loader. Implementations drain
 * vectors from the supplied queue on their own thread (hence {@link Runnable}).
 */
public interface VectorWriter extends Runnable {
    /** Supplies the queue the writer drains; call before {@code run()} is started. */
    void setQueue(LinkedBlockingQueue<float[]> queue);

    /** Requests a graceful stop of the writer loop. */
    void shutdown();
}
|
13
hdf-loader/src/main/resources/config.yaml
Normal file
13
hdf-loader/src/main/resources/config.yaml
Normal file
@ -0,0 +1,13 @@
|
||||
format: HDF5
|
||||
sourceFile: /home/mwolters138/Downloads/h5ex_t_float.h5 #/home/mwolters138/Documents/hdf5/datasets/pass/glove-25-angular.hdf5
|
||||
datasets:
|
||||
- all
|
||||
embedding: word2vec
|
||||
writer: filewriter
|
||||
astra:
|
||||
scb: /home/mwolters138/Dev/testing/secure-connect-vector-correctness.zip
|
||||
clientId: IvpdaZejwNuvWeupsIkWTHeL # SECURITY: live Astra credentials committed to version control — rotate them and load from an env var or secret store
clientSecret: .bxut2-OQL,dWunZeQbjZC0vMHd88UWXKS.xT,nl95zQC0B0xU9FzSWK3HSUGO11o_7pr7wG7+EMaZqegkKlr4fZ54__furPMtWPGiPp,2cZ1q15vrWwc9_-AcgeCbuf # SECURITY: this secret is now public; rotate it immediately
|
||||
keyspace: baselines128dot
|
||||
query: INSERT INTO vectors25(key, value) VALUES (?,?)
|
||||
targetFile: /home/mwolters138/vectors.txt
|
@ -326,7 +326,7 @@
|
||||
<dependency>
|
||||
<groupId>com.amazonaws</groupId>
|
||||
<artifactId>aws-java-sdk-s3</artifactId>
|
||||
<version>1.12.507</version>
|
||||
<version>1.12.513</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.elega9t</groupId>
|
||||
@ -341,7 +341,7 @@
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
<version>3.12.0</version>
|
||||
<version>3.13.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.squareup</groupId>
|
||||
|
4
pom.xml
4
pom.xml
@ -67,6 +67,7 @@
|
||||
<module.adapter-kafka>adapter-kafka</module.adapter-kafka>
|
||||
<module.adapter-amqp>adapter-amqp</module.adapter-amqp>
|
||||
<module.adapter-jdbc>adapter-jdbc</module.adapter-jdbc>
|
||||
<module.hdf-loader>hdf-loader</module.hdf-loader>
|
||||
|
||||
<!-- VIRTDATA MODULES -->
|
||||
<module.virtdata-api>virtdata-api</module.virtdata-api>
|
||||
@ -76,6 +77,7 @@
|
||||
<module.virtdata-lib-random>virtdata-lib-random</module.virtdata-lib-random>
|
||||
<module.virtdata-lib-curves4>virtdata-lib-curves4</module.virtdata-lib-curves4>
|
||||
<module.virtdata-lib-realer>virtdata-lib-realer</module.virtdata-lib-realer>
|
||||
<module.virtdata-lib-hdf5>virtdata-lib-hdf5</module.virtdata-lib-hdf5>
|
||||
<module.virtdata-userlibs>virtdata-userlibs</module.virtdata-userlibs>
|
||||
</properties>
|
||||
<modules>
|
||||
@ -114,6 +116,7 @@
|
||||
<module>adapter-amqp</module>
|
||||
<module>adapter-jdbc</module>
|
||||
<module>adapter-pinecone</module>
|
||||
<module>hdf-loader</module>
|
||||
|
||||
<!-- VIRTDATA MODULES -->
|
||||
<module>virtdata-api</module>
|
||||
@ -123,6 +126,7 @@
|
||||
<module>virtdata-lib-random</module>
|
||||
<module>virtdata-lib-curves4</module>
|
||||
<module>virtdata-lib-realer</module>
|
||||
<module>virtdata-lib-hdf5</module>
|
||||
<module>virtdata-userlibs</module>
|
||||
|
||||
<!-- Documentation -->
|
||||
|
@ -82,6 +82,7 @@
|
||||
<version>5.1.1</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
62
virtdata-lib-hdf5/pom.xml
Normal file
62
virtdata-lib-hdf5/pom.xml
Normal file
@ -0,0 +1,62 @@
|
||||
<!--
|
||||
~ Copyright (c) 2023 nosqlbench
|
||||
~
|
||||
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
~ you may not use this file except in compliance with the License.
|
||||
~ You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<artifactId>mvn-defaults</artifactId>
|
||||
<groupId>io.nosqlbench</groupId>
|
||||
<version>${revision}</version>
|
||||
<relativePath>../mvn-defaults</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>virtdata-lib-hdf5</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>virtdata-lib-hdf5</name>
|
||||
<url>http://nosqlbench.io/</url>
|
||||
|
||||
<description>With inspiration from other libraries</description>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.nosqlbench</groupId>
|
||||
<artifactId>virtdata-lib-basics</artifactId>
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.jhdf</groupId>
|
||||
<artifactId>jhdf</artifactId>
|
||||
<version>0.6.10</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<testResources>
|
||||
<testResource>
|
||||
<directory>src/test/resources</directory>
|
||||
<excludes>
|
||||
<exclude>h5ex_t_float.h5</exclude>
|
||||
</excludes>
|
||||
<filtering>true</filtering>
|
||||
</testResource>
|
||||
</testResources>
|
||||
</build>
|
||||
|
||||
</project>
|
@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
|
||||
import java.nio.file.Paths;
|
||||
|
||||
public abstract class AbstractHdfFileToVector {
|
||||
protected final HdfFile hdfFile;
|
||||
protected final Dataset dataset;
|
||||
protected final int[] dims;
|
||||
|
||||
public AbstractHdfFileToVector(String filename, String datasetName) {
|
||||
//hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath());
|
||||
hdfFile = new HdfFile(Paths.get(filename));
|
||||
//TODO: implement a function to get the dataset by name only without needing the full path
|
||||
dataset = hdfFile.getDatasetByPath(datasetName);
|
||||
dims = dataset.getDimensions();
|
||||
}
|
||||
|
||||
protected Object getDataFrom(long l) {
|
||||
long[] sliceOffset = new long[dims.length];
|
||||
sliceOffset[0] = (l % dims[0]);
|
||||
int[] sliceDimensions = new int[dims.length];
|
||||
sliceDimensions[0] = 1;
|
||||
// Do we want to give the option of reducing vector dimensions here?
|
||||
sliceDimensions[1] = dims[1];
|
||||
return dataset.getData(sliceOffset, sliceDimensions);
|
||||
}
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value. As currently
|
||||
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||
* <p>
|
||||
* This implementation is specific to returning an array of floats
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {
|
||||
private final EmbeddingGenerator embeddingGenerator;
|
||||
|
||||
public HdfFileToVectorArray(String filename, String datasetName) {
|
||||
super(filename, datasetName);
|
||||
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||
}
|
||||
@Override
|
||||
public float[] apply(long l) {
|
||||
Object data = getDataFrom(l);
|
||||
return embeddingGenerator.generateArrayEmbeddingFrom(data, dims);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value. As currently
|
||||
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||
* <p>
|
||||
* This implementation is specific to returning a List of Floats, so as to work with the
|
||||
* normalization functions e.g. NormalizeListVector and its variants.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {
|
||||
private final EmbeddingGenerator embeddingGenerator;
|
||||
|
||||
public HdfFileToVectorList(String filename, String datasetName) {
|
||||
super(filename, datasetName);
|
||||
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||
}
|
||||
@Override
|
||||
public List<Float> apply(long l) {
|
||||
Object data = getDataFrom(l);
|
||||
return embeddingGenerator.generateListEmbeddingFrom(data, dims);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be double[1][x]
|
||||
double[] vector = ((double[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
double[] vector = ((double[][]) o)[0];
|
||||
float[] vector2 = new float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Converts a raw 2-D slice read from an HDF5 dataset (e.g. float[1][x],
 * int[1][x], double[1][x]) into a float embedding.
 */
public interface EmbeddingGenerator {
    /** Returns the single row of {@code o} as a List of Floats; {@code dims} gives the dataset dimensions. */
    List<Float> generateListEmbeddingFrom(Object o, int[] dims);

    /** Returns the single row of {@code o} as a primitive float array; {@code dims} gives the dataset dimensions. */
    float[] generateArrayEmbeddingFrom(Object o, int[] dims);
}
|
@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class EmbeddingGeneratorFactory {
|
||||
private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
|
||||
|
||||
public static EmbeddingGenerator getGenerator(String type) {
|
||||
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||
switch (typeLower) {
|
||||
case "float" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new FloatEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "int" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new IntEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "double" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new DoubleEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,40 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be float[1][x]
|
||||
float[] vector = ((float[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
return ((float[][]) o)[0];
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||
@Override
|
||||
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be int[1][x]
|
||||
int[] vector = ((int[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
int[] vector = ((int[][]) o)[0];
|
||||
float[] vector2 = new float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class HdfFileToArrayTest {
|
||||
|
||||
@Test
|
||||
public void testHdfFileToVector() {
|
||||
final float[][] results = new float[][]{
|
||||
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||
};
|
||||
|
||||
HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray(
|
||||
"src/test/resources/h5ex_t_float.h5",
|
||||
"/DS1");
|
||||
|
||||
float[] read;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
read = hdfFileToVector.apply(i);
|
||||
for (int j = 0; j < 7; j++) {
|
||||
assert (read[j] == results[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class HdfFileToVectorTest {
|
||||
|
||||
@Test
|
||||
public void testHdfFileToVector() {
|
||||
final float[][] results = new float[][]{
|
||||
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||
};
|
||||
|
||||
HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList(
|
||||
"src/test/resources/h5ex_t_float.h5",
|
||||
"/DS1");
|
||||
|
||||
List<Float> read;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
read = hdfFileToVector.apply(i);
|
||||
for (int j = 0; j < 7; j++) {
|
||||
assert (read.get(j) == results[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
Binary file not shown.
@ -66,6 +66,13 @@
|
||||
<artifactId>virtdata-lib-curves4</artifactId>
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.nosqlbench</groupId>
|
||||
<artifactId>virtdata-lib-hdf5</artifactId>
|
||||
<version>${revision}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.nosqlbench</groupId>
|
||||
<artifactId>docsys</artifactId>
|
||||
|
Loading…
Reference in New Issue
Block a user