mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
merging main into vector-mergup
This commit is contained in:
commit
34dcc7bc1b
@ -43,7 +43,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.amazonaws</groupId>
|
<groupId>com.amazonaws</groupId>
|
||||||
<artifactId>aws-java-sdk-dynamodb</artifactId>
|
<artifactId>aws-java-sdk-dynamodb</artifactId>
|
||||||
<version>1.12.500</version>
|
<version>1.12.507</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
@ -34,7 +34,7 @@
|
|||||||
</description>
|
</description>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<kafka.version>3.5.0</kafka.version>
|
<kafka.version>3.5.1</kafka.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
@ -60,7 +60,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
<version>3.12.0</version>
|
<version>3.13.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- https://mvnrepository.com/artifact/commons-beanutils/commons-beanutils -->
|
<!-- https://mvnrepository.com/artifact/commons-beanutils/commons-beanutils -->
|
||||||
|
@ -42,7 +42,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.mongodb</groupId>
|
<groupId>org.mongodb</groupId>
|
||||||
<artifactId>mongodb-driver-sync</artifactId>
|
<artifactId>mongodb-driver-sync</artifactId>
|
||||||
<version>4.10.1</version>
|
<version>4.10.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
@ -34,7 +34,7 @@
|
|||||||
</description>
|
</description>
|
||||||
|
|
||||||
<properties>
|
<properties>
|
||||||
<pulsar.version>3.0.0</pulsar.version>
|
<pulsar.version>3.0.1</pulsar.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
@ -66,7 +66,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
<version>3.12.0</version>
|
<version>3.13.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<!-- https://mvnrepository.com/artifact/commons-beanutils/commons-beanutils -->
|
<!-- https://mvnrepository.com/artifact/commons-beanutils/commons-beanutils -->
|
||||||
|
@ -22,7 +22,7 @@
|
|||||||
<name>docsys</name>
|
<name>docsys</name>
|
||||||
<url>http://nosqlbench.io/</url>
|
<url>http://nosqlbench.io/</url>
|
||||||
<properties>
|
<properties>
|
||||||
<jersey.version>3.1.2</jersey.version>
|
<jersey.version>3.1.3</jersey.version>
|
||||||
</properties>
|
</properties>
|
||||||
|
|
||||||
<parent>
|
<parent>
|
||||||
@ -94,7 +94,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.glassfish.jersey.media</groupId>
|
<groupId>org.glassfish.jersey.media</groupId>
|
||||||
<artifactId>jersey-media-json-jackson</artifactId>
|
<artifactId>jersey-media-json-jackson</artifactId>
|
||||||
<version>3.1.2</version>
|
<version>3.1.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
100
hdf-loader/pom.xml
Normal file
100
hdf-loader/pom.xml
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!--
|
||||||
|
~ Copyright (c) 2023 nosqlbench
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<artifactId>hdf-loader</artifactId>
|
||||||
|
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<artifactId>mvn-defaults</artifactId>
|
||||||
|
<groupId>io.nosqlbench</groupId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
<relativePath>../mvn-defaults</relativePath>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<name>${project.artifactId}</name>
|
||||||
|
|
||||||
|
<properties>
|
||||||
|
<maven.compiler.source>17</maven.compiler.source>
|
||||||
|
<maven.compiler.target>17</maven.compiler.target>
|
||||||
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
|
</properties>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.snakeyaml</groupId>
|
||||||
|
<artifactId>snakeyaml-engine</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.yaml</groupId>
|
||||||
|
<artifactId>snakeyaml</artifactId>
|
||||||
|
<version>2.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.datastax.oss</groupId>
|
||||||
|
<artifactId>java-driver-core</artifactId>
|
||||||
|
<version>4.16.0</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
|
<artifactId>jackson-core</artifactId>
|
||||||
|
<version>2.15.2</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<!-- https://mvnrepository.com/artifact/org.deeplearning4j/deeplearning4j-core -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.deeplearning4j</groupId>
|
||||||
|
<artifactId>deeplearning4j-core</artifactId>
|
||||||
|
<version>1.0.0-M2.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.nd4j</groupId>
|
||||||
|
<artifactId>nd4j-native</artifactId>
|
||||||
|
<version>1.0.0-M2.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.deeplearning4j</groupId>
|
||||||
|
<artifactId>deeplearning4j-nlp</artifactId>
|
||||||
|
<version>1.0.0-M2.1</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>io.jhdf</groupId>
|
||||||
|
<artifactId>jhdf</artifactId>
|
||||||
|
<version>0.6.10</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>io.nosqlbench</groupId>
|
||||||
|
<artifactId>nb-api</artifactId>
|
||||||
|
<version>5.17.3-SNAPSHOT</version>
|
||||||
|
<scope>compile</scope>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
</project>
|
@ -0,0 +1,83 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf;
|
||||||
|
|
||||||
|
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||||
|
import io.nosqlbench.loader.hdf.readers.Hdf5Reader;
|
||||||
|
import io.nosqlbench.loader.hdf.readers.HdfReader;
|
||||||
|
import io.nosqlbench.loader.hdf.writers.AstraVectorWriter;
|
||||||
|
import io.nosqlbench.loader.hdf.writers.FileVectorWriter;
|
||||||
|
import io.nosqlbench.loader.hdf.writers.NoopVectorWriter;
|
||||||
|
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
|
||||||
|
public class HdfLoader {
|
||||||
|
private static final Logger logger = LogManager.getLogger(HdfLoader.class);
|
||||||
|
public static final String FILEWRITER = "filewriter";
|
||||||
|
public static final String ASTRA = "astra";
|
||||||
|
public static final String NOOP = "noop";
|
||||||
|
public static final String HDF5 = "hdf5";
|
||||||
|
public static final String HDF4 = "hdf4";
|
||||||
|
|
||||||
|
public static void main (String[] args) {
|
||||||
|
if (args.length == 0) {
|
||||||
|
System.out.println("Usage: hdf-loader <filename>");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
LoaderConfig config = new LoaderConfig(args[0]);
|
||||||
|
logger.info("Starting loader with config: " + config);
|
||||||
|
HdfReader reader = null;
|
||||||
|
VectorWriter writer = null;
|
||||||
|
|
||||||
|
String format = config.getFormat();
|
||||||
|
switch (format.toLowerCase()) {
|
||||||
|
case HDF4 -> {
|
||||||
|
logger.info("HDF4 format not yet supported");
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
case HDF5 -> {
|
||||||
|
logger.info("HDF5 format selected");
|
||||||
|
reader = new Hdf5Reader(config);
|
||||||
|
}
|
||||||
|
default -> {
|
||||||
|
logger.info("Unknown format: " + format);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
String writerType = config.getWriter();
|
||||||
|
logger.info("Using writer type: " + writerType);
|
||||||
|
switch (writerType.toLowerCase()) {
|
||||||
|
case FILEWRITER -> writer = new FileVectorWriter(config);
|
||||||
|
case ASTRA -> writer = new AstraVectorWriter(config);
|
||||||
|
case NOOP -> writer = new NoopVectorWriter();
|
||||||
|
default -> {
|
||||||
|
logger.info("Unknown writer type: " + writerType);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reader.setWriter(writer);
|
||||||
|
logger.info("Starting main read loop");
|
||||||
|
reader.read();
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error(e);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,84 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.config;
|
||||||
|
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
import org.yaml.snakeyaml.Yaml;
|
||||||
|
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class LoaderConfig {
|
||||||
|
private static final Logger logger = LogManager.getLogger(LoaderConfig.class);
|
||||||
|
private static final Yaml yaml = new Yaml();
|
||||||
|
private final Map<String, Object> configMap;
|
||||||
|
|
||||||
|
public LoaderConfig(String filePath) throws IOException {
|
||||||
|
FileReader fileReader = new FileReader(filePath);
|
||||||
|
configMap = yaml.load(fileReader);
|
||||||
|
for (Map.Entry<String, Object> entry : configMap.entrySet()) {
|
||||||
|
logger.debug(entry.getKey() + " : " + entry.getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object getRawValue(String key) {
|
||||||
|
return configMap.get(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getStringValue(String key) {
|
||||||
|
return configMap.get(key).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getDatasets() {
|
||||||
|
return (List<String>) configMap.get("datasets");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFormat() {
|
||||||
|
return (String) configMap.getOrDefault("format", "HD5");
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String,String> getAstra() {
|
||||||
|
return (Map<String,String>) configMap.get("astra");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getEmbedding() {
|
||||||
|
return (String) configMap.getOrDefault("embedding", "Deeplearning4j");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getWriter() {
|
||||||
|
return (String) configMap.getOrDefault("writer", "filewriter");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSourceFile() {
|
||||||
|
return (String) configMap.get("sourceFile");
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTargetFile() {
|
||||||
|
return (String) configMap.getOrDefault("targetFile", "./vectors.txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getThreads() {
|
||||||
|
return (int) configMap.getOrDefault("threads", 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getQueueSize() {
|
||||||
|
return (int) configMap.getOrDefault("queueSize", 1000);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,62 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
return switch (dims.length) {
|
||||||
|
case 1 -> new float[][]{convertToFloat((double[]) o)};
|
||||||
|
case 2 -> convertToFloats((double[][]) o);
|
||||||
|
case 3 -> flatten(o, dims);
|
||||||
|
default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private float[][] convertToFloats(double[][] o) {
|
||||||
|
float[][] floats = new float[o.length][];
|
||||||
|
for (int i = 0; i < o.length; i++) {
|
||||||
|
floats[i] = convertToFloat(o[i]);
|
||||||
|
}
|
||||||
|
return floats;
|
||||||
|
}
|
||||||
|
|
||||||
|
public float[] convertToFloat(double[] doubleArray) {
|
||||||
|
if (doubleArray == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
float[] floatArray = new float[doubleArray.length];
|
||||||
|
for (int i = 0; i < doubleArray.length; i++) {
|
||||||
|
floatArray[i] = (float) doubleArray[i];
|
||||||
|
}
|
||||||
|
return floatArray;
|
||||||
|
}
|
||||||
|
|
||||||
|
private float[][] flatten(Object o, int[] dims) {
|
||||||
|
double[][][] arr = (double[][][]) o;
|
||||||
|
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||||
|
for (int i = 0; i < dims[0]; i++) {
|
||||||
|
for (int j = 0; j < dims[1]; j++) {
|
||||||
|
for (int k = 0; k < dims[2]; k++) {
|
||||||
|
flat[i][j * dims[2] + k] = (float)arr[i][j][k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return flat;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,21 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
/**
 * Strategy for converting raw HDF dataset data into 2-D float embeddings.
 * Implementations exist per primitive element type (int, float, double, string).
 */
public interface EmbeddingGenerator {
    /**
     * Converts {@code o} (a primitive array whose rank matches
     * {@code dims.length}) into an array of float vectors.
     *
     * @param o    the raw dataset data; concrete array type is implementation-specific
     * @param dims the dataset's dimensions; dims[0] is the vector count for rank >= 2
     * @return one float vector per outermost element
     */
    float[][] generateEmbeddingFrom(Object o, int[] dims);
}
|
@ -0,0 +1,56 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class EmbeddingGeneratorFactory {
|
||||||
|
private static final Map<String,EmbeddingGenerator> generators = new HashMap<>();
|
||||||
|
|
||||||
|
public static EmbeddingGenerator getGenerator(String type) {
|
||||||
|
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||||
|
if (typeLower.equals("integer")) typeLower = "int";
|
||||||
|
switch (typeLower) {
|
||||||
|
case "string" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new StringEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
case "float" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new FloatEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
case "double" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new DoubleEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
case "int" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new IntEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,41 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
return switch (dims.length) {
|
||||||
|
case 1 -> new float[][]{(float[]) o};
|
||||||
|
case 2 -> (float[][]) o;
|
||||||
|
case 3 -> flatten(o, dims);
|
||||||
|
default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private float[][] flatten(Object o, int[] dims) {
|
||||||
|
float[][][] arr = (float[][][]) o;
|
||||||
|
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||||
|
for (int i = 0; i < dims[0]; i++) {
|
||||||
|
for (int j = 0; j < dims[1]; j++) {
|
||||||
|
if (dims[2] >= 0) System.arraycopy(arr[i][j], 0, flat[i], j * dims[2] + 0, dims[2]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return flat;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,59 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
@Override
|
||||||
|
public float[][] generateEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
switch (dims.length) {
|
||||||
|
case 1 -> {
|
||||||
|
float[] arr = new float[dims[0]];
|
||||||
|
for (int i = 0; i < dims[0]; i++) {
|
||||||
|
arr[i] = ((int[]) o)[i];
|
||||||
|
}
|
||||||
|
return new float[][]{arr};
|
||||||
|
}
|
||||||
|
case 2 -> {
|
||||||
|
float[][] arr = new float[dims[0]][dims[1]];
|
||||||
|
for (int i = 0; i < dims[0]; i++) {
|
||||||
|
for (int j = 0; j < dims[1]; j++) {
|
||||||
|
arr[i][j] = ((int[][]) o)[i][j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return arr;
|
||||||
|
}
|
||||||
|
case 3 -> {
|
||||||
|
return flatten(o, dims);
|
||||||
|
}
|
||||||
|
default ->
|
||||||
|
throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private float[][] flatten(Object o, int[] dims) {
|
||||||
|
int[][][] arr = (int[][][]) o;
|
||||||
|
float[][] flat = new float[dims[0]][dims[1] * dims[2]];
|
||||||
|
for (int i = 0; i < dims[0]; i++) {
|
||||||
|
for (int j = 0; j < dims[1]; j++) {
|
||||||
|
for (int k = 0; k < dims[2]; k++) {
|
||||||
|
flat[i][j * dims[2] + k] = arr[i][j][k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return flat;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.embedding;
|
||||||
|
|
||||||
|
import org.deeplearning4j.models.word2vec.Word2Vec;
|
||||||
|
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
|
||||||
|
import org.deeplearning4j.text.sentenceiterator.CollectionSentenceIterator;
|
||||||
|
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
|
||||||
|
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
|
||||||
|
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
|
/**
 * Placeholder generator for string-typed datasets. Only rank-1 input is
 * accepted, and the actual Word2Vec embedding step is commented out, so this
 * currently always returns null for rank-1 data.
 */
public class StringEmbeddingGenerator implements EmbeddingGenerator {
    private final TokenizerFactory tokenizerFactory= new DefaultTokenizerFactory();

    @Override
    public float[][] generateEmbeddingFrom(Object o, int[] dims) {
        switch (dims.length) {
            case 1 -> {
                return generateWordEmbeddings((String[]) o);
            }
            default -> throw new RuntimeException("unsupported embedding dimensionality: " + dims.length);
        }

    }

    // NOTE(review): incomplete stub — the Word2Vec pipeline is commented out
    // and this always returns null, which will NPE in callers that iterate
    // the result (e.g. Hdf5Reader's vector loop). Confirm intended behavior.
    private float[][] generateWordEmbeddings(String[] text) {
        // NOTE(review): singletonList(text) wraps the WHOLE array as one
        // element, so the iterator yields a single String[].toString() value —
        // presumably Arrays.asList(text) (one sentence per element) was
        // intended. TODO confirm before enabling the builder below.
        SentenceIterator iter = new CollectionSentenceIterator(Collections.singletonList(text));
        /*Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(1)
                .iterations(1)
                .layerSize(targetDims)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(tokenizerFactory)
                .build();
        */
        return null;
    }
}
|
@ -0,0 +1,147 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.readers;
|
||||||
|
|
||||||
|
import io.jhdf.HdfFile;
|
||||||
|
import io.jhdf.api.Dataset;
|
||||||
|
import io.jhdf.api.Group;
|
||||||
|
import io.jhdf.api.Node;
|
||||||
|
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||||
|
import io.nosqlbench.loader.hdf.embedding.EmbeddingGenerator;
|
||||||
|
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
|
||||||
|
import static io.nosqlbench.loader.hdf.embedding.EmbeddingGeneratorFactory.getGenerator;
|
||||||
|
|
||||||
|
/**
 * Reads HDF5 datasets (via jhdf), converts their contents to float vectors
 * with a type-appropriate EmbeddingGenerator, and feeds the vectors to a
 * VectorWriter through a bounded queue. Each dataset is processed by its own
 * task on a cached thread pool.
 */
public class Hdf5Reader implements HdfReader {
    private static final Logger logger = LogManager.getLogger(Hdf5Reader.class);
    // Sentinel dataset name meaning "walk the file and process every dataset".
    public static final String ALL = "all";
    private VectorWriter writer;
    private final LoaderConfig config;
    private final ExecutorService executorService;
    // Hand-off channel between reader tasks and the single writer; capacity
    // comes from config.getQueueSize().
    private final LinkedBlockingQueue<float[]> queue;
    private List<String> datasets;
    // Zero-length sentinel signalling end-of-stream to the writer.
    private final float[] SHUTDOWN = new float[0];
    public Hdf5Reader(LoaderConfig config) {
        this.config = config;
        executorService = Executors.newCachedThreadPool();
        queue = new LinkedBlockingQueue<>(config.getQueueSize());
    }

    @Override
    public void setWriter(VectorWriter writer) {
        this.writer = writer;
        writer.setQueue(queue);
    }

    /**
     * Recursively collects the paths of all datasets under {@code parent}
     * into the {@code datasets} field (must be initialized by read() first).
     */
    public void extractDatasets(Group parent) {
        Map<String, Node> nodes = parent.getChildren();
        for (String key : nodes.keySet()) {
            Node node = nodes.get(key);
            if (node instanceof Dataset) {
                datasets.add(node.getPath());
            }
            else if (node.isGroup()) {
                extractDatasets((Group) node);
            }
        }
    }

    /**
     * Main read loop: opens the source file, resolves the dataset list
     * (expanding "all"), starts the writer, fans out one task per dataset,
     * waits for all tasks, then closes the file and signals shutdown.
     */
    @Override
    public void read() {
        HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile()));
        datasets = config.getDatasets();
        if (datasets.get(0).equalsIgnoreCase(ALL)) {
            // NOTE(review): this appends to the config-supplied list; assumes
            // that list is mutable — TODO confirm (YAML-parsed lists usually are).
            extractDatasets(hdfFile);
        }
        List<Future<?>> futures = new ArrayList<>();
        executorService.submit(writer);
        for (String ds : datasets) {
            // Skip the "all" placeholder itself; it was only a directive.
            if (ds.equalsIgnoreCase(ALL)) {
                continue;
            }
            Future<?> future = executorService.submit(() -> {
                logger.info("Processing dataset: " + ds);
                Dataset dataset = hdfFile.getDatasetByPath(ds);
                int[] dims = dataset.getDimensions();
                String type = dataset.getJavaType().getSimpleName().toLowerCase();
                EmbeddingGenerator generator = getGenerator(type);
                Object data;
                // jhdf cannot materialize > 2 GiB in one call, so oversized
                // datasets are read in row slices.
                if (dataset.getSizeInBytes() > Integer.MAX_VALUE) {
                    logger.info("slicing large dataset: " + ds);
                    // TODO: For now this will be implemented to handle numeric types with
                    // 2 dimensions where the 1st dimension is the number of vectors and the 2nd
                    // dimension is the number of dimensions in the vector.
                    long[] sliceOffset = new long[dims.length];
                    int[] sliceDimensions = new int[dims.length];
                    sliceDimensions[1] = dims[1];
                    int noOfSlices = (int) (dataset.getSizeInBytes() / Integer.MAX_VALUE) + 1;
                    // NOTE(review): integer division drops the remainder —
                    // the last dims[0] % noOfSlices rows are never read. Confirm.
                    int sliceSize = dims[0] / noOfSlices;
                    for (int i = 0; i < noOfSlices; i++) {
                        sliceOffset[0] = (long) i * sliceSize;
                        sliceDimensions[0] = sliceSize;
                        data = dataset.getData(sliceOffset, sliceDimensions);
                        // NOTE(review): passes full-dataset dims, but data here
                        // has sliceDimensions rows — looks like it should pass
                        // sliceDimensions. Confirm against generator behavior.
                        float[][] vectors = generator.generateEmbeddingFrom(data, dims);
                        for (float[] vector : vectors) {
                            try {
                                queue.put(vector);
                            } catch (InterruptedException e) {
                                // NOTE(review): interrupt status is not restored
                                // (Thread.currentThread().interrupt()) — confirm intended.
                                logger.error(e.getMessage(), e);
                            }
                        }
                    }
                } else {
                    data = dataset.getData();
                    float[][] vectors = generator.generateEmbeddingFrom(data, dims);
                    for (float[] vector : vectors) {
                        try {
                            queue.put(vector);
                        } catch (InterruptedException e) {
                            logger.error(e.getMessage(), e);
                        }
                    }
                }
            });
            futures.add(future);
        }
        // Block until every dataset task finishes (errors are logged, not rethrown).
        for (Future<?> future : futures) {
            try {
                future.get();
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        }
        hdfFile.close();
        // NOTE(review): writer.shutdown() is invoked BEFORE the SHUTDOWN
        // sentinel is enqueued — verify the writer drains the queue after
        // shutdown() or this ordering may drop the sentinel/tail vectors.
        writer.shutdown();
        try {
            queue.put(SHUTDOWN);
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        executorService.shutdown();
    }
}
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.readers;
|
||||||
|
|
||||||
|
import io.nosqlbench.loader.hdf.writers.VectorWriter;
|
||||||
|
|
||||||
|
public interface HdfReader {
|
||||||
|
void setWriter(VectorWriter writer);
|
||||||
|
|
||||||
|
void read();
|
||||||
|
}
|
@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.writers;
|
||||||
|
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
|
||||||
|
public abstract class AbstractVectorWriter implements VectorWriter {
|
||||||
|
protected LinkedBlockingQueue<float[]> queue;
|
||||||
|
protected boolean shutdown = false;
|
||||||
|
|
||||||
|
public void setQueue(LinkedBlockingQueue<float[]> queue) {
|
||||||
|
this.queue = queue;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
while (!shutdown || !queue.isEmpty()) {
|
||||||
|
try {
|
||||||
|
float[] vector = queue.take();
|
||||||
|
if (vector.length==0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
writeVector(vector);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract void writeVector(float[] vector);
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,68 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.writers;
|
||||||
|
|
||||||
|
import com.datastax.oss.driver.api.core.CqlSession;
|
||||||
|
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
|
||||||
|
import com.datastax.oss.driver.api.core.data.CqlVector;
|
||||||
|
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class AstraVectorWriter extends AbstractVectorWriter {
|
||||||
|
private static final Logger logger = LogManager.getLogger(AstraVectorWriter.class);
|
||||||
|
private final CqlSession session;
|
||||||
|
PreparedStatement insert_vector;
|
||||||
|
|
||||||
|
public AstraVectorWriter(LoaderConfig config) {
|
||||||
|
Map<String,String> astraParams = config.getAstra();
|
||||||
|
session = CqlSession.builder()
|
||||||
|
.withCloudSecureConnectBundle(Paths.get(astraParams.get("scb")))
|
||||||
|
.withAuthCredentials(astraParams.get("clientId"), astraParams.get("clientSecret"))
|
||||||
|
.withKeyspace(astraParams.get("keyspace"))
|
||||||
|
.build();
|
||||||
|
logger.info("Astra session initialized");
|
||||||
|
insert_vector = session.prepare(astraParams.get("query"));
|
||||||
|
}
|
||||||
|
//TODO: this is insanely slow. Needs work on threading/batching
|
||||||
|
@Override
|
||||||
|
protected void writeVector(float[] vector) {
|
||||||
|
Float[] vector2 = new Float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = vector[i];
|
||||||
|
}
|
||||||
|
CqlVector.Builder vectorBuilder = CqlVector.builder();
|
||||||
|
vectorBuilder.add(vector2);
|
||||||
|
session.execute(insert_vector.bind(getPartitionValue(vector), vectorBuilder.build()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getPartitionValue(float[] vector) {
|
||||||
|
float sum = 0;
|
||||||
|
for (float f : vector) {
|
||||||
|
sum += f;
|
||||||
|
}
|
||||||
|
return String.valueOf(sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
shutdown = true;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,56 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.writers;
|
||||||
|
|
||||||
|
import io.nosqlbench.loader.hdf.config.LoaderConfig;
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
public class FileVectorWriter extends AbstractVectorWriter {
|
||||||
|
private static final Logger logger = LogManager.getLogger(FileVectorWriter.class);
|
||||||
|
private final BufferedWriter targetFile;
|
||||||
|
public FileVectorWriter(LoaderConfig config) throws IOException {
|
||||||
|
String targetFileName = config.getTargetFile();
|
||||||
|
targetFile = new BufferedWriter(new FileWriter(targetFileName));
|
||||||
|
logger.info("Writing to file: " + targetFileName);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeVector(float[] vector) {
|
||||||
|
try {
|
||||||
|
targetFile.write("[");
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
targetFile.write(String.valueOf(vector[i]));
|
||||||
|
if (i < vector.length - 1) {
|
||||||
|
targetFile.write(",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
targetFile.write("]");
|
||||||
|
targetFile.write("\n");
|
||||||
|
targetFile.flush();
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error(e.getMessage(), e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
shutdown = true;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,35 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.writers;
|
||||||
|
|
||||||
|
import org.apache.logging.log4j.LogManager;
|
||||||
|
import org.apache.logging.log4j.Logger;
|
||||||
|
|
||||||
|
public class NoopVectorWriter extends AbstractVectorWriter {
|
||||||
|
private static final Logger logger = LogManager.getLogger(NoopVectorWriter.class);
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected void writeVector(float[] vector) {
|
||||||
|
//No-op
|
||||||
|
logger.debug(vector);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void shutdown() {
|
||||||
|
shutdown = true;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.loader.hdf.writers;
|
||||||
|
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
|
|
||||||
|
public interface VectorWriter extends Runnable {
|
||||||
|
void setQueue(LinkedBlockingQueue<float[]> queue);
|
||||||
|
|
||||||
|
void shutdown();
|
||||||
|
}
|
13
hdf-loader/src/main/resources/config.yaml
Normal file
13
hdf-loader/src/main/resources/config.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
format: HDF5
|
||||||
|
sourceFile: /home/mwolters138/Downloads/h5ex_t_float.h5 #/home/mwolters138/Documents/hdf5/datasets/pass/glove-25-angular.hdf5
|
||||||
|
datasets:
|
||||||
|
- all
|
||||||
|
embedding: word2vec
|
||||||
|
writer: filewriter
|
||||||
|
astra:
|
||||||
|
scb: /home/mwolters138/Dev/testing/secure-connect-vector-correctness.zip
|
||||||
|
clientId: IvpdaZejwNuvWeupsIkWTHeL
|
||||||
|
clientSecret: .bxut2-OQL,dWunZeQbjZC0vMHd88UWXKS.xT,nl95zQC0B0xU9FzSWK3HSUGO11o_7pr7wG7+EMaZqegkKlr4fZ54__furPMtWPGiPp,2cZ1q15vrWwc9_-AcgeCbuf
|
||||||
|
keyspace: baselines128dot
|
||||||
|
query: INSERT INTO vectors25(key, value) VALUES (?,?)
|
||||||
|
targetFile: /home/mwolters138/vectors.txt
|
@ -184,12 +184,22 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.xerial.snappy</groupId>
|
<groupId>org.xerial.snappy</groupId>
|
||||||
<artifactId>snappy-java</artifactId>
|
<artifactId>snappy-java</artifactId>
|
||||||
<version>1.1.10.1</version>
|
<version>1.1.10.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.datastax.oss</groupId>
|
<groupId>com.datastax.oss</groupId>
|
||||||
<artifactId>java-driver-query-builder</artifactId>
|
<artifactId>java-driver-query-builder</artifactId>
|
||||||
<version>4.16.0</version>
|
<version>4.17.0</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.snakeyaml</groupId>
|
||||||
|
<artifactId>snakeyaml-engine</artifactId>
|
||||||
|
<version>2.6</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.xerial.snappy</groupId>
|
||||||
|
<artifactId>snappy-java</artifactId>
|
||||||
|
<version>1.1.10.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.esri.geometry</groupId>
|
<groupId>com.esri.geometry</groupId>
|
||||||
@ -199,7 +209,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>io.netty</groupId>
|
<groupId>io.netty</groupId>
|
||||||
<artifactId>netty-handler</artifactId>
|
<artifactId>netty-handler</artifactId>
|
||||||
<version>4.1.94.Final</version>
|
<version>4.1.95.Final</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>io.netty</groupId>
|
<groupId>io.netty</groupId>
|
||||||
@ -221,7 +231,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.docker-java</groupId>
|
<groupId>com.github.docker-java</groupId>
|
||||||
<artifactId>docker-java-api</artifactId>
|
<artifactId>docker-java-api</artifactId>
|
||||||
<version>3.3.1</version>
|
<version>3.3.2</version>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
@ -278,7 +288,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.docker-java</groupId>
|
<groupId>com.github.docker-java</groupId>
|
||||||
<artifactId>docker-java-transport-okhttp</artifactId>
|
<artifactId>docker-java-transport-okhttp</artifactId>
|
||||||
<version>3.3.1</version>
|
<version>3.3.2</version>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
@ -289,7 +299,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.docker-java</groupId>
|
<groupId>com.github.docker-java</groupId>
|
||||||
<artifactId>docker-java</artifactId>
|
<artifactId>docker-java</artifactId>
|
||||||
<version>3.3.1</version>
|
<version>3.3.2</version>
|
||||||
<exclusions>
|
<exclusions>
|
||||||
<exclusion>
|
<exclusion>
|
||||||
<groupId>org.slf4j</groupId>
|
<groupId>org.slf4j</groupId>
|
||||||
@ -301,7 +311,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.oshi</groupId>
|
<groupId>com.github.oshi</groupId>
|
||||||
<artifactId>oshi-core-java11</artifactId>
|
<artifactId>oshi-core-java11</artifactId>
|
||||||
<version>6.4.3</version>
|
<version>6.4.4</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.code.gson</groupId>
|
<groupId>com.google.code.gson</groupId>
|
||||||
@ -311,7 +321,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.amazonaws</groupId>
|
<groupId>com.amazonaws</groupId>
|
||||||
<artifactId>aws-java-sdk-s3</artifactId>
|
<artifactId>aws-java-sdk-s3</artifactId>
|
||||||
<version>1.12.498</version>
|
<version>1.12.513</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.elega9t</groupId>
|
<groupId>com.elega9t</groupId>
|
||||||
@ -326,7 +336,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-lang3</artifactId>
|
<artifactId>commons-lang3</artifactId>
|
||||||
<version>3.12.0</version>
|
<version>3.13.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.squareup</groupId>
|
<groupId>com.squareup</groupId>
|
||||||
@ -382,7 +392,7 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.graalvm.sdk</groupId>
|
<groupId>org.graalvm.sdk</groupId>
|
||||||
<artifactId>graal-sdk</artifactId>
|
<artifactId>graal-sdk</artifactId>
|
||||||
<version>22.3.2</version>
|
<version>22.3.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.graalvm.js</groupId>
|
<groupId>org.graalvm.js</groupId>
|
||||||
@ -393,12 +403,12 @@
|
|||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.graalvm.js</groupId>
|
<groupId>org.graalvm.js</groupId>
|
||||||
<artifactId>js-scriptengine</artifactId>
|
<artifactId>js-scriptengine</artifactId>
|
||||||
<version>22.3.2</version>
|
<version>22.3.3</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.graalvm.tools</groupId>
|
<groupId>org.graalvm.tools</groupId>
|
||||||
<artifactId>profiler</artifactId>
|
<artifactId>profiler</artifactId>
|
||||||
<version>22.3.2</version>
|
<version>22.3.3</version>
|
||||||
<scope>runtime</scope>
|
<scope>runtime</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
|
4
pom.xml
4
pom.xml
@ -67,6 +67,7 @@
|
|||||||
<module.adapter-kafka>adapter-kafka</module.adapter-kafka>
|
<module.adapter-kafka>adapter-kafka</module.adapter-kafka>
|
||||||
<module.adapter-kafka>adapter-amqp</module.adapter-kafka>
|
<module.adapter-kafka>adapter-amqp</module.adapter-kafka>
|
||||||
<module.adapter-jdbc>adapter-jdbc</module.adapter-jdbc>
|
<module.adapter-jdbc>adapter-jdbc</module.adapter-jdbc>
|
||||||
|
<module.hdf-loader>hdf-loader</module.hdf-loader>
|
||||||
|
|
||||||
<!-- VIRTDATA MODULES -->
|
<!-- VIRTDATA MODULES -->
|
||||||
<module.virtdata-api>virtdata-api</module.virtdata-api>
|
<module.virtdata-api>virtdata-api</module.virtdata-api>
|
||||||
@ -76,6 +77,7 @@
|
|||||||
<module.virtdata-lib-random>virtdata-lib-random</module.virtdata-lib-random>
|
<module.virtdata-lib-random>virtdata-lib-random</module.virtdata-lib-random>
|
||||||
<module.virtdata-lib-curves4>virtdata-lib-curves4</module.virtdata-lib-curves4>
|
<module.virtdata-lib-curves4>virtdata-lib-curves4</module.virtdata-lib-curves4>
|
||||||
<module.virtdata-lib-realer>virtdata-lib-realer</module.virtdata-lib-realer>
|
<module.virtdata-lib-realer>virtdata-lib-realer</module.virtdata-lib-realer>
|
||||||
|
<module.virtdata-lib-realer>virtdata-lib-hdf5</module.virtdata-lib-realer>
|
||||||
<module.virtdata-userlibs>virtdata-userlibs</module.virtdata-userlibs>
|
<module.virtdata-userlibs>virtdata-userlibs</module.virtdata-userlibs>
|
||||||
</properties>
|
</properties>
|
||||||
<modules>
|
<modules>
|
||||||
@ -114,6 +116,7 @@
|
|||||||
<module>adapter-amqp</module>
|
<module>adapter-amqp</module>
|
||||||
<module>adapter-jdbc</module>
|
<module>adapter-jdbc</module>
|
||||||
<module>adapter-pinecone</module>
|
<module>adapter-pinecone</module>
|
||||||
|
<module>hdf-loader</module>
|
||||||
|
|
||||||
<!-- VIRTDATA MODULES -->
|
<!-- VIRTDATA MODULES -->
|
||||||
<module>virtdata-api</module>
|
<module>virtdata-api</module>
|
||||||
@ -123,6 +126,7 @@
|
|||||||
<module>virtdata-lib-random</module>
|
<module>virtdata-lib-random</module>
|
||||||
<module>virtdata-lib-curves4</module>
|
<module>virtdata-lib-curves4</module>
|
||||||
<module>virtdata-lib-realer</module>
|
<module>virtdata-lib-realer</module>
|
||||||
|
<module>virtdata-lib-hdf5</module>
|
||||||
<module>virtdata-userlibs</module>
|
<module>virtdata-userlibs</module>
|
||||||
|
|
||||||
<!-- Documentation -->
|
<!-- Documentation -->
|
||||||
|
@ -82,6 +82,7 @@
|
|||||||
<version>5.1.1</version>
|
<version>5.1.1</version>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
62
virtdata-lib-hdf5/pom.xml
Normal file
62
virtdata-lib-hdf5/pom.xml
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
<!--
|
||||||
|
~ Copyright (c) 2023 nosqlbench
|
||||||
|
~
|
||||||
|
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
~ you may not use this file except in compliance with the License.
|
||||||
|
~ You may obtain a copy of the License at
|
||||||
|
~
|
||||||
|
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
~
|
||||||
|
~ Unless required by applicable law or agreed to in writing, software
|
||||||
|
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
~ See the License for the specific language governing permissions and
|
||||||
|
~ limitations under the License.
|
||||||
|
-->
|
||||||
|
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
<parent>
|
||||||
|
<artifactId>mvn-defaults</artifactId>
|
||||||
|
<groupId>io.nosqlbench</groupId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
<relativePath>../mvn-defaults</relativePath>
|
||||||
|
</parent>
|
||||||
|
|
||||||
|
<artifactId>virtdata-lib-hdf5</artifactId>
|
||||||
|
<packaging>jar</packaging>
|
||||||
|
<name>virtdata-lib-hdf5</name>
|
||||||
|
<url>http://nosqlbench.io/</url>
|
||||||
|
|
||||||
|
<description>With inspiration from other libraries</description>
|
||||||
|
|
||||||
|
<dependencies>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>io.nosqlbench</groupId>
|
||||||
|
<artifactId>virtdata-lib-basics</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>io.jhdf</groupId>
|
||||||
|
<artifactId>jhdf</artifactId>
|
||||||
|
<version>0.6.10</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
|
</dependencies>
|
||||||
|
|
||||||
|
<build>
|
||||||
|
<testResources>
|
||||||
|
<testResource>
|
||||||
|
<directory>src/test/resources</directory>
|
||||||
|
<excludes>
|
||||||
|
<exclude>h5ex_t_float.h5</exclude>
|
||||||
|
</excludes>
|
||||||
|
<filtering>true</filtering>
|
||||||
|
</testResource>
|
||||||
|
</testResources>
|
||||||
|
</build>
|
||||||
|
|
||||||
|
</project>
|
@ -0,0 +1,47 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long;
|
||||||
|
|
||||||
|
import io.jhdf.HdfFile;
|
||||||
|
import io.jhdf.api.Dataset;
|
||||||
|
import io.nosqlbench.api.content.NBIO;
|
||||||
|
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
|
public abstract class AbstractHdfFileToVector {
|
||||||
|
protected final HdfFile hdfFile;
|
||||||
|
protected final Dataset dataset;
|
||||||
|
protected final int[] dims;
|
||||||
|
|
||||||
|
public AbstractHdfFileToVector(String filename, String datasetName) {
|
||||||
|
//hdfFile = new HdfFile(NBIO.all().search(filename).first().get().asPath());
|
||||||
|
hdfFile = new HdfFile(Paths.get(filename));
|
||||||
|
//TODO: implement a function to get the dataset by name only without needing the full path
|
||||||
|
dataset = hdfFile.getDatasetByPath(datasetName);
|
||||||
|
dims = dataset.getDimensions();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Object getDataFrom(long l) {
|
||||||
|
long[] sliceOffset = new long[dims.length];
|
||||||
|
sliceOffset[0] = (l % dims[0]);
|
||||||
|
int[] sliceDimensions = new int[dims.length];
|
||||||
|
sliceDimensions[0] = 1;
|
||||||
|
// Do we want to give the option of reducing vector dimensions here?
|
||||||
|
sliceDimensions[1] = dims[1];
|
||||||
|
return dataset.getData(sliceOffset, sliceDimensions);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,53 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||||
|
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||||
|
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||||
|
* reads a single vector from the dataset, based on the long input value. As currently
|
||||||
|
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||||
|
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||||
|
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||||
|
* <p>
|
||||||
|
* This implementation is specific to returning an array of floats
|
||||||
|
*/
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class HdfFileToVectorArray extends AbstractHdfFileToVector implements LongFunction<float[]> {
|
||||||
|
private final EmbeddingGenerator embeddingGenerator;
|
||||||
|
|
||||||
|
public HdfFileToVectorArray(String filename, String datasetName) {
|
||||||
|
super(filename, datasetName);
|
||||||
|
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public float[] apply(long l) {
|
||||||
|
Object data = getDataFrom(l);
|
||||||
|
return embeddingGenerator.generateArrayEmbeddingFrom(data, dims);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,55 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||||
|
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVector;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGenerator;
|
||||||
|
import io.nosqlbench.virtdata.library.hdf5.helpers.EmbeddingGeneratorFactory;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This function reads a vector dataset from an HDF5 file. The dataset itself is not
|
||||||
|
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||||
|
* reads a single vector from the dataset, based on the long input value. As currently
|
||||||
|
* written this class will only work for datasets with 2 dimensions where the 1st dimension
|
||||||
|
* specifies the number of vectors and the 2nd dimension specifies the number of elements in
|
||||||
|
* each vector. Only datatypes short, int, and float are supported at this time.
|
||||||
|
* <p>
|
||||||
|
* This implementation is specific to returning a List of Floats, so as to work with the
|
||||||
|
* normalization functions e.g. NormalizeListVector and its variants.
|
||||||
|
*/
|
||||||
|
@ThreadSafeMapper
|
||||||
|
@Categories(Category.experimental)
|
||||||
|
public class HdfFileToVectorList extends AbstractHdfFileToVector implements LongFunction<List<Float>> {
|
||||||
|
private final EmbeddingGenerator embeddingGenerator;
|
||||||
|
|
||||||
|
public HdfFileToVectorList(String filename, String datasetName) {
|
||||||
|
super(filename, datasetName);
|
||||||
|
embeddingGenerator = EmbeddingGeneratorFactory.getGenerator(dataset.getJavaType().getSimpleName().toLowerCase());
|
||||||
|
}
|
||||||
|
@Override
|
||||||
|
public List<Float> apply(long l) {
|
||||||
|
Object data = getDataFrom(l);
|
||||||
|
return embeddingGenerator.generateListEmbeddingFrom(data, dims);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class DoubleEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
// in this case o will always be double[1][x]
|
||||||
|
double[] vector = ((double[][]) o)[0];
|
||||||
|
Float[] vector2 = new Float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = (float) vector[i];
|
||||||
|
}
|
||||||
|
return List.of(vector2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
double[] vector = ((double[][]) o)[0];
|
||||||
|
float[] vector2 = new float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = (float) vector[i];
|
||||||
|
}
|
||||||
|
return vector2;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,25 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Strategy for converting a raw HDF5 dataset slice into a float vector embedding.
 * Implementations are type-specific (float, double, int sources) and are expected
 * to cast {@code o} to the matching primitive 2-D array type.
 */
public interface EmbeddingGenerator {
    /**
     * Builds a boxed embedding from the raw dataset slice.
     *
     * @param o    the raw slice; concrete array type depends on the implementation
     * @param dims the dataset dimensions as reported by the source
     * @return the embedding as a {@code List<Float>}
     */
    List<Float> generateListEmbeddingFrom(Object o, int[] dims);

    /**
     * Builds a primitive embedding from the raw dataset slice.
     *
     * @param o    the raw slice; concrete array type depends on the implementation
     * @param dims the dataset dimensions as reported by the source
     * @return the embedding as a {@code float[]}
     */
    float[] generateArrayEmbeddingFrom(Object o, int[] dims);
}
|
@ -0,0 +1,49 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class EmbeddingGeneratorFactory {
|
||||||
|
private static final Map<String, EmbeddingGenerator> generators = new HashMap<>();
|
||||||
|
|
||||||
|
public static EmbeddingGenerator getGenerator(String type) {
|
||||||
|
String typeLower = type.equalsIgnoreCase("short") ? "int" : type.toLowerCase();
|
||||||
|
switch (typeLower) {
|
||||||
|
case "float" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new FloatEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
case "int" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new IntEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
case "double" -> {
|
||||||
|
if (!generators.containsKey(type)) {
|
||||||
|
generators.put(type, new DoubleEmbeddingGenerator());
|
||||||
|
}
|
||||||
|
return generators.get(type);
|
||||||
|
}
|
||||||
|
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class FloatEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
// in this case o will always be float[1][x]
|
||||||
|
float[] vector = ((float[][]) o)[0];
|
||||||
|
Float[] vector2 = new Float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = vector[i];
|
||||||
|
}
|
||||||
|
return List.of(vector2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
return ((float[][]) o)[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,42 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class IntEmbeddingGenerator implements EmbeddingGenerator {
|
||||||
|
@Override
|
||||||
|
public List<Float> generateListEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
// in this case o will always be int[1][x]
|
||||||
|
int[] vector = ((int[][]) o)[0];
|
||||||
|
Float[] vector2 = new Float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = (float) vector[i];
|
||||||
|
}
|
||||||
|
return List.of(vector2);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] generateArrayEmbeddingFrom(Object o, int[] dims) {
|
||||||
|
int[] vector = ((int[][]) o)[0];
|
||||||
|
float[] vector2 = new float[vector.length];
|
||||||
|
for (int i = 0; i < vector.length; i++) {
|
||||||
|
vector2[i] = (float) vector[i];
|
||||||
|
}
|
||||||
|
return vector2;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,44 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_array;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
public class HdfFileToArrayTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHdfFileToVector() {
|
||||||
|
final float[][] results = new float[][]{
|
||||||
|
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||||
|
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||||
|
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||||
|
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||||
|
};
|
||||||
|
|
||||||
|
HdfFileToVectorArray hdfFileToVector = new HdfFileToVectorArray(
|
||||||
|
"src/test/resources/h5ex_t_float.h5",
|
||||||
|
"/DS1");
|
||||||
|
|
||||||
|
float[] read;
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
read = hdfFileToVector.apply(i);
|
||||||
|
for (int j = 0; j < 7; j++) {
|
||||||
|
assert (read[j] == results[i][j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,46 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.hdf5.from_long.to_list;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class HdfFileToVectorTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHdfFileToVector() {
|
||||||
|
final float[][] results = new float[][]{
|
||||||
|
{0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f},
|
||||||
|
{2.0f,1.6666666f,2.4f,3.2857144f,4.2222223f,5.181818f,6.1538463f},
|
||||||
|
{4.0f,2.3333333f,2.8f,3.5714285f,4.4444447f,5.3636365f,6.3076925f},
|
||||||
|
{6.0f,3.0f,3.2f,3.857143f,4.6666665f,5.5454545f,6.4615383f}
|
||||||
|
};
|
||||||
|
|
||||||
|
HdfFileToVectorList hdfFileToVector = new HdfFileToVectorList(
|
||||||
|
"src/test/resources/h5ex_t_float.h5",
|
||||||
|
"/DS1");
|
||||||
|
|
||||||
|
List<Float> read;
|
||||||
|
for (int i = 0; i < 4; i++) {
|
||||||
|
read = hdfFileToVector.apply(i);
|
||||||
|
for (int j = 0; j < 7; j++) {
|
||||||
|
assert (read.get(j) == results[i][j]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
BIN
virtdata-lib-hdf5/src/test/resources/h5ex_t_float.h5
Normal file
Binary file not shown.
@ -66,6 +66,13 @@
|
|||||||
<artifactId>virtdata-lib-curves4</artifactId>
|
<artifactId>virtdata-lib-curves4</artifactId>
|
||||||
<version>${revision}</version>
|
<version>${revision}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>io.nosqlbench</groupId>
|
||||||
|
<artifactId>virtdata-lib-hdf5</artifactId>
|
||||||
|
<version>${revision}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>io.nosqlbench</groupId>
|
<groupId>io.nosqlbench</groupId>
|
||||||
<artifactId>docsys</artifactId>
|
<artifactId>docsys</artifactId>
|
||||||
|
Loading…
Reference in New Issue
Block a user