Mirror of https://github.com/nosqlbench/nosqlbench.git, synced 2025-02-25 18:55:28 -06:00
removing non-functional library reference
pom.xml
@@ -52,18 +52,17 @@
         <version>2.0</version>
     </dependency>

-    <dependency>
-        <groupId>org.hdfgroup</groupId>
-        <artifactId>hdf-java</artifactId>
-        <version>2.6.1</version>
-    </dependency>
-
     <dependency>
         <groupId>com.datastax.oss</groupId>
         <artifactId>java-driver-core</artifactId>
         <version>4.16.0</version>
     </dependency>

+    <dependency>
+        <groupId>io.jhdf</groupId>
+        <artifactId>jhdf</artifactId>
+        <version>0.6.10</version>
+    </dependency>
 </dependencies>

 </project>
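For orientation: the dropped org.hdfgroup:hdf-java artifact wraps native HDF5 bindings and is what the commit message calls the non-functional library reference, while io.jhdf:jhdf is a pure-Java HDF5 reader. A minimal sketch of what the replacement dependency supports (file and dataset paths here are invented for illustration):

    import io.jhdf.HdfFile;
    import io.jhdf.api.Dataset;

    import java.nio.file.Paths;

    public class JhdfSmoke {
        public static void main(String[] args) {
            // HdfFile is AutoCloseable and needs no native HDF5 installation.
            try (HdfFile hdfFile = new HdfFile(Paths.get("/tmp/example.h5"))) {
                Dataset dataset = hdfFile.getDatasetByPath("/group/vectors");
                // getData() materializes the dataset as a Java array whose
                // element type mirrors the HDF5 datatype.
                Object data = dataset.getData();
                System.out.println(java.util.Arrays.toString(dataset.getDimensions()));
            }
        }
    }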
HdfLoader.java
@@ -49,9 +49,7 @@ public class HdfLoader {
                 logger.info("HDF4 format not yet supported");
                 System.exit(1);
             }
-            case HDF5 -> {
-                reader = new Hdf5Reader(config);
-            }
+            case HDF5 -> reader = new Hdf5Reader(config);
             default -> {
                 logger.info("Unknown format: " + format);
                 System.exit(1);
LoaderConfig.java
@@ -71,6 +71,10 @@ public class LoaderConfig {
         return (String) configMap.get("sourceFile");
     }

+    public String getTargetFile() {
+        return (String) configMap.getOrDefault("targetFile", "./vectors.txt");
+    }
+
     public int getThreads() {
         return (int) configMap.getOrDefault("threads", 1);
     }
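The added getTargetFile() follows the same pattern as the existing getThreads(): read from the raw config map and fall back to a literal default when the key is absent. A small self-contained sketch of that behavior (the map contents are illustrative, not from the repo):

    import java.util.HashMap;
    import java.util.Map;

    public class ConfigDefaultsDemo {
        public static void main(String[] args) {
            // Stand-in for the map a YAML parser would hand to LoaderConfig.
            Map<String, Object> configMap = new HashMap<>();
            configMap.put("sourceFile", "/tmp/data.h5");

            // Absent keys resolve to the hard-coded defaults.
            String targetFile = (String) configMap.getOrDefault("targetFile", "./vectors.txt");
            int threads = (int) configMap.getOrDefault("threads", 1);

            System.out.println(targetFile); // ./vectors.txt
            System.out.println(threads);    // 1
        }
    }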
Hdf5Reader.java
@@ -17,16 +17,17 @@

 package io.nosqlbench.loader.hdf.readers;

+import io.jhdf.HdfFile;
+import io.jhdf.api.Dataset;
+import io.jhdf.api.Group;
+import io.jhdf.api.Node;
+import io.jhdf.object.datatype.DataType;
 import io.nosqlbench.loader.hdf.config.LoaderConfig;
 import io.nosqlbench.loader.hdf.writers.VectorWriter;
-
-import ncsa.hdf.hdf5lib.H5;
-import ncsa.hdf.hdf5lib.HDF5Constants;
-import ncsa.hdf.hdf5lib.exceptions.HDF5Exception;
-import ncsa.hdf.hdf5lib.exceptions.HDF5LibraryException;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;

+import java.nio.file.Paths;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
@@ -40,6 +41,7 @@ public class Hdf5Reader implements HdfReader {
     private final LoaderConfig config;
     private final ExecutorService executorService;
     private final LinkedBlockingQueue<float[]> queue;
+    private List<String> datasets;
     public Hdf5Reader(LoaderConfig config) {
         this.config = config;
         executorService = Executors.newFixedThreadPool(config.getThreads());
@@ -49,76 +51,44 @@ public class Hdf5Reader implements HdfReader {
     @Override
     public void setWriter(VectorWriter writer) {
         this.writer = writer;
         writer.setQueue(queue);
         Thread t = new Thread(writer);
         t.start();
     }

+    public void extractDatasets(Group parent) {
+        Map<String, Node> nodes = parent.getChildren();
+        for (String key : nodes.keySet()) {
+            Node node = nodes.get(key);
+            if (node instanceof Dataset) {
+                datasets.add(((Dataset)node).getPath());
+            } else if (node.isGroup()) {
+                extractDatasets((Group) node);
+            }
+        }
+    }
+
     @Override
-    public void read() throws HDF5LibraryException {
-        String sourceFile = config.getSourceFile();
-        int fileId = H5.H5Fopen(sourceFile, HDF5Constants.H5F_ACC_RDONLY, HDF5Constants.H5P_DEFAULT);
-        List<String> datasets = config.getDatasets();
+    public void read() {
+        HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile()));
+        datasets = config.getDatasets();
         if (datasets.get(0).equalsIgnoreCase(ALL)) {
-            try {
-                int numObjects = H5.H5Fget_obj_count(fileId, HDF5Constants.H5F_OBJ_ALL);
-                String[] objNames = new String[numObjects];
-                int[] objTypes = new int[numObjects];
-                long[] refArray = new long[numObjects];
-                //H5.H5Fget_obj_ids(fileId, HDF5Constants.H5F_OBJ_ALL, numObjects, objNames, objTypes);
-                H5.H5Gget_obj_info_all(fileId, null, objNames, objTypes, refArray);
-
-                for (int i = 0; i < numObjects; i++) {
-                    String objName = objNames[i];
-                    int objType = objTypes[i];
-                    if (objType == HDF5Constants.H5G_DATASET) {
-                        datasets.add(objName);
-                    }
-                }
-            } catch (HDF5Exception e) {
-                logger.error("Error getting all datasets from file: " + sourceFile, e);
-            }
+            extractDatasets(hdfFile);
         }
-        for (String dataset : config.getDatasets()) {
-            if (dataset.equalsIgnoreCase(ALL)) {
+        for (String ds : datasets) {
+            if (ds.equalsIgnoreCase(ALL)) {
                 continue;
             }
-            executorService.submit(() -> {
-                // Your lambda code that runs in a separate thread for each object
-                logger.info("Processing dataset: " + dataset);
-                try {
-                    int datasetId = H5.H5Dopen(fileId, dataset);
-                    // Get the dataspace of the dataset
-                    int dataspaceId = H5.H5Dget_space(datasetId);
-                    // Get the number of dimensions in the dataspace
-                    int numDimensions = H5.H5Sget_simple_extent_ndims(dataspaceId);
-                    float[] vector = new float[numDimensions];
-                    long[] dims = new long[numDimensions];
-                    // Get the datatype of the dataset
-                    int datatypeId = H5.H5Dget_type(datasetId);
-                    // Get the size of each dimension
-                    H5.H5Sget_simple_extent_dims(dataspaceId, dims, null);
+            //executorService.submit(() -> {
+                logger.info("Processing dataset: " + ds);
+                Dataset dataset = hdfFile.getDatasetByPath(ds);
+                DataType dataType = dataset.getDataType();
+                long l = dataset.getSize();
+                int[] dims = dataset.getDimensions();
+
-                    // Read the data from the dataset
-                    double[] data = new double[(int) dims[0]];
-                    H5.H5Dread(datasetId, datatypeId, HDF5Constants.H5S_ALL, HDF5Constants.H5S_ALL,
-                        HDF5Constants.H5P_DEFAULT, data);
+                //queue.put(vector);
-
-                    // Close the dataspace, datatype, and dataset
-                    H5.H5Sclose(dataspaceId);
-                    H5.H5Tclose(datatypeId);
-                    H5.H5Dclose(datasetId);
-
-                    // Now you have the data, and you can convert it into vector embeddings
-                    //INDArray dataArray = Nd4j.create(data);
-                    //WordVectors wordVectors = new WordVectorsImpl();
-                    //wordVectors.setLookupTable(dataArray);
-                    //WordVectorSerializer.writeWordVectors(wordVectors, "vector_embeddings.txt");
-
-                    queue.put(vector);
-                } catch (HDF5Exception e) {
-                    logger.error(e);
-                } catch (InterruptedException e) {
-                    throw new RuntimeException(e);
-                }
-            });
+            // });
         }
     }
 }
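Note that the rewritten read() still stubs out the actual data transfer (queue.put remains commented). A hedged sketch of how the jhdf objects above could feed the writer queue, assuming each dataset is a 2-D float matrix whose rows are the vectors (that cast is an assumption about the input files, not something this commit establishes):

    import io.jhdf.HdfFile;
    import io.jhdf.api.Dataset;

    import java.util.concurrent.LinkedBlockingQueue;

    public class DatasetDrain {
        static void drain(HdfFile hdfFile, String path,
                          LinkedBlockingQueue<float[]> queue) throws InterruptedException {
            Dataset dataset = hdfFile.getDatasetByPath(path);
            // jhdf returns the dataset as a Java array typed after the HDF5
            // datatype; a float matrix comes back as float[][].
            float[][] rows = (float[][]) dataset.getData();
            for (float[] vector : rows) {
                queue.put(vector); // blocks when the writer falls behind
            }
            queue.put(new float[0]); // zero-length array: writer shutdown signal
        }
    }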
HdfReader.java
@@ -18,10 +18,9 @@
 package io.nosqlbench.loader.hdf.readers;

 import io.nosqlbench.loader.hdf.writers.VectorWriter;
-import ncsa.hdf.hdf5lib.exceptions.HDF5LibraryException;

 public interface HdfReader {
     void setWriter(VectorWriter writer);

-    void read() throws HDF5LibraryException;
+    void read();
 }
AbstractVectorWriter.java
@@ -21,4 +21,26 @@ import java.util.concurrent.LinkedBlockingQueue;

 public abstract class AbstractVectorWriter implements VectorWriter {
     protected LinkedBlockingQueue<float[]> queue;

+    public void setQueue(LinkedBlockingQueue<float[]> queue) {
+        this.queue = queue;
+    }
+
+    @Override
+    public void run() {
+        while (true) {
+            try {
+                float[] vector = queue.take();
+                if (vector.length==0) {
+                    break;
+                }
+                writeVector(vector);
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    protected abstract void writeVector(float[] vector);
+
 }
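run() treats a zero-length array as a poison pill: producers push new float[0] after the last real vector so the writer thread exits cleanly. A runnable sketch of that contract (the anonymous writer just prints):

    import java.util.concurrent.LinkedBlockingQueue;

    public class PoisonPillDemo {
        public static void main(String[] args) throws InterruptedException {
            LinkedBlockingQueue<float[]> queue = new LinkedBlockingQueue<>();
            AbstractVectorWriter writer = new AbstractVectorWriter() {
                @Override
                protected void writeVector(float[] vector) {
                    System.out.println("wrote a " + vector.length + "-dim vector");
                }
            };
            writer.setQueue(queue);
            Thread t = new Thread(writer); // VectorWriter extends Runnable
            t.start();
            queue.put(new float[]{1f, 2f, 3f});
            queue.put(new float[0]); // poison pill: run() breaks out of its loop
            t.join();
        }
    }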
AstraVectorWriter.java
@@ -18,9 +18,13 @@
 package io.nosqlbench.loader.hdf.writers;

 import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import com.datastax.oss.driver.api.core.data.CqlVector;

 public class AstraVectorWriter extends AbstractVectorWriter {
     public AstraVectorWriter(LoaderConfig config) {
     }

+    @Override
+    protected void writeVector(float[] vector) {
+
+    }
 }
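writeVector() is an empty stub at this point; the new CqlVector import only suggests the eventual shape of the Astra path. A speculative sketch of binding a float[] as a vector with the 4.16 driver (session, keyspace, and statement are invented for illustration, not defined by this commit):

    import com.datastax.oss.driver.api.core.CqlSession;
    import com.datastax.oss.driver.api.core.data.CqlVector;

    public class AstraWriteSketch {
        static void write(CqlSession session, float[] vector) {
            // Box the primitives; CqlVector.newInstance takes object elements.
            Float[] boxed = new Float[vector.length];
            for (int i = 0; i < vector.length; i++) {
                boxed[i] = vector[i];
            }
            CqlVector<Float> cqlVector = CqlVector.newInstance(boxed);
            // Hypothetical table; the commit does not define any schema.
            session.execute(
                session.prepare("INSERT INTO vectors.embeddings (id, v) VALUES (uuid(), ?)")
                       .bind(cqlVector));
        }
    }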
FileVectorWriter.java
@@ -18,8 +18,33 @@
 package io.nosqlbench.loader.hdf.writers;

 import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;

-public class FileVectorWriter implements VectorWriter {
-    public FileVectorWriter(LoaderConfig config) {
+import java.io.*;
+
+public class FileVectorWriter extends AbstractVectorWriter {
+    private static final Logger logger = LogManager.getLogger(FileVectorWriter.class);
+    private final BufferedWriter targetFile;
+    public FileVectorWriter(LoaderConfig config) throws IOException {
+        String targetFileName = config.getTargetFile();
+        targetFile = new BufferedWriter(new FileWriter(targetFileName));
+    }
+
+    @Override
+    protected void writeVector(float[] vector) {
+        try {
+            targetFile.write("[");
+            for (int i = 0; i < vector.length; i++) {
+                targetFile.write(String.valueOf(vector[i]));
+                if (i < vector.length - 1) {
+                    targetFile.write(",");
+                }
+            }
+            targetFile.write("]");
+            targetFile.write("\n");
+        } catch (IOException e) {
+            logger.error(e.getMessage(), e);
+        }
     }
 }
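FileVectorWriter renders each vector as a bracketed, comma-separated line such as [0.1,0.2,0.3]. The BufferedWriter above is never flushed or closed, so buffered output can be lost on exit; this standalone sketch of the same line format uses try-with-resources instead (the output path is arbitrary):

    import java.io.BufferedWriter;
    import java.io.FileWriter;
    import java.io.IOException;

    public class VectorLineFormatDemo {
        public static void main(String[] args) throws IOException {
            try (BufferedWriter out = new BufferedWriter(new FileWriter("/tmp/vectors.txt"))) {
                float[] vector = {0.1f, 0.2f, 0.3f};
                StringBuilder line = new StringBuilder("[");
                for (int i = 0; i < vector.length; i++) {
                    line.append(vector[i]);
                    if (i < vector.length - 1) {
                        line.append(',');
                    }
                }
                out.write(line.append(']').toString()); // e.g. [0.1,0.2,0.3]
                out.newLine(); // one vector per line, same as FileVectorWriter
            }
        }
    }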
VectorWriter.java
@@ -17,5 +17,8 @@

 package io.nosqlbench.loader.hdf.writers;

-public interface VectorWriter {
+import java.util.concurrent.LinkedBlockingQueue;
+
+public interface VectorWriter extends Runnable {
+    void setQueue(LinkedBlockingQueue<float[]> queue);
 }
loader configuration (YAML)
@@ -1,5 +1,5 @@
 format: HDF5
-sourceFile: /home/mwolters138/Downloads/NEONDSTowerTemperatureData.hdf5
+sourceFile: /home/mwolters138/Downloads/NEONDSImagingSpectrometerData.h5 #h5ex_t_arrayatt.h5
 datasets:
   - all
 embedding: word2vec