removing non-functional library reference

Mark Wolters
2023-08-04 09:02:59 -04:00
parent 81f0473766
commit a78ef4b2a3
10 changed files with 106 additions and 82 deletions

View File

@@ -52,18 +52,17 @@
       <version>2.0</version>
     </dependency>
-    <dependency>
-      <groupId>org.hdfgroup</groupId>
-      <artifactId>hdf-java</artifactId>
-      <version>2.6.1</version>
-    </dependency>
     <dependency>
       <groupId>com.datastax.oss</groupId>
       <artifactId>java-driver-core</artifactId>
       <version>4.16.0</version>
     </dependency>
     <dependency>
       <groupId>io.jhdf</groupId>
       <artifactId>jhdf</artifactId>
       <version>0.6.10</version>
     </dependency>
   </dependencies>
 </project>
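
Context for the removal: the org.hdfgroup:hdf-java artifact is a JNI wrapper that is not published to Maven Central and needs a native HDF5 installation, which is presumably what made the reference non-functional; io.jhdf is a pure-Java HDF5 reader. A minimal smoke test of the jhdf API as it is used later in this commit (file and dataset paths are illustrative, not from the repo):

    import io.jhdf.HdfFile;
    import io.jhdf.api.Dataset;
    import java.nio.file.Paths;

    public class JhdfSmokeTest {
        public static void main(String[] args) {
            // Pure-Java HDF5 read; no native libhdf5 required.
            try (HdfFile hdfFile = new HdfFile(Paths.get("/tmp/example.h5"))) {
                Dataset dataset = hdfFile.getDatasetByPath("/group/vectors");
                System.out.println(java.util.Arrays.toString(dataset.getDimensions()));
            }
        }
    }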

View File

@@ -49,9 +49,7 @@ public class HdfLoader {
                 logger.info("HDF4 format not yet supported");
                 System.exit(1);
             }
-            case HDF5 -> {
-                reader = new Hdf5Reader(config);
-            }
+            case HDF5 -> reader = new Hdf5Reader(config);
             default -> {
                 logger.info("Unknown format: " + format);
                 System.exit(1);

View File

@@ -71,6 +71,10 @@ public class LoaderConfig {
         return (String) configMap.get("sourceFile");
     }
+    public String getTargetFile() {
+        return (String) configMap.getOrDefault("targetFile", "./vectors.txt");
+    }
     public int getThreads() {
         return (int) configMap.getOrDefault("threads", 1);
     }
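
The new getter falls back to ./vectors.txt when the key is absent. A sketch of that behavior in isolation, assuming configMap holds the parsed YAML shown at the bottom of this commit (the loading code itself is not part of the diff):

    import java.util.Map;

    public class GetOrDefaultDemo {
        public static void main(String[] args) {
            Map<String, Object> configMap = Map.of("sourceFile", "/tmp/data.h5"); // no targetFile key
            String target = (String) configMap.getOrDefault("targetFile", "./vectors.txt");
            System.out.println(target); // prints ./vectors.txt
        }
    }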

View File

@@ -17,16 +17,17 @@
 package io.nosqlbench.loader.hdf.readers;
+import io.jhdf.HdfFile;
+import io.jhdf.api.Dataset;
+import io.jhdf.api.Group;
+import io.jhdf.api.Node;
+import io.jhdf.object.datatype.DataType;
 import io.nosqlbench.loader.hdf.config.LoaderConfig;
 import io.nosqlbench.loader.hdf.writers.VectorWriter;
-import ncsa.hdf.hdf5lib.H5;
-import ncsa.hdf.hdf5lib.HDF5Constants;
-import ncsa.hdf.hdf5lib.exceptions.HDF5Exception;
-import ncsa.hdf.hdf5lib.exceptions.HDF5LibraryException;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import java.nio.file.Paths;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
@@ -40,6 +41,7 @@ public class Hdf5Reader implements HdfReader {
     private final LoaderConfig config;
     private final ExecutorService executorService;
     private final LinkedBlockingQueue<float[]> queue;
+    private List<String> datasets;
     public Hdf5Reader(LoaderConfig config) {
         this.config = config;
         executorService = Executors.newFixedThreadPool(config.getThreads());
@@ -49,76 +51,44 @@ public class Hdf5Reader implements HdfReader {
     @Override
     public void setWriter(VectorWriter writer) {
         this.writer = writer;
         writer.setQueue(queue);
         Thread t = new Thread(writer);
         t.start();
     }
+    public void extractDatasets(Group parent) {
+        Map<String, Node> nodes = parent.getChildren();
+        for (String key : nodes.keySet()) {
+            Node node = nodes.get(key);
+            if (node instanceof Dataset) {
+                datasets.add(((Dataset)node).getPath());
+            } else if (node.isGroup()) {
+                extractDatasets((Group) node);
+            }
+        }
+    }
     @Override
-    public void read() throws HDF5LibraryException {
-        String sourceFile = config.getSourceFile();
-        int fileId = H5.H5Fopen(sourceFile, HDF5Constants.H5F_ACC_RDONLY, HDF5Constants.H5P_DEFAULT);
-        List<String> datasets = config.getDatasets();
+    public void read() {
+        HdfFile hdfFile = new HdfFile(Paths.get(config.getSourceFile()));
+        datasets = config.getDatasets();
         if (datasets.get(0).equalsIgnoreCase(ALL)) {
-            try {
-                int numObjects = H5.H5Fget_obj_count(fileId, HDF5Constants.H5F_OBJ_ALL);
-                String[] objNames = new String[numObjects];
-                int[] objTypes = new int[numObjects];
-                long[] refArray = new long[numObjects];
-                //H5.H5Fget_obj_ids(fileId, HDF5Constants.H5F_OBJ_ALL, numObjects, objNames, objTypes);
-                H5.H5Gget_obj_info_all(fileId, null, objNames, objTypes, refArray);
-                for (int i = 0; i < numObjects; i++) {
-                    String objName = objNames[i];
-                    int objType = objTypes[i];
-                    if (objType == HDF5Constants.H5G_DATASET) {
-                        datasets.add(objName);
-                    }
-                }
-            } catch (HDF5Exception e) {
-                logger.error("Error getting all datasets from file: " + sourceFile, e);
-            }
+            extractDatasets(hdfFile);
         }
-        for (String dataset : config.getDatasets()) {
-            if (dataset.equalsIgnoreCase(ALL)) {
+        for (String ds : datasets) {
+            if (ds.equalsIgnoreCase(ALL)) {
                 continue;
             }
-            executorService.submit(() -> {
-                // Your lambda code that runs in a separate thread for each object
-                logger.info("Processing dataset: " + dataset);
-                try {
-                    int datasetId = H5.H5Dopen(fileId, dataset);
-                    // Get the dataspace of the dataset
-                    int dataspaceId = H5.H5Dget_space(datasetId);
-                    // Get the number of dimensions in the dataspace
-                    int numDimensions = H5.H5Sget_simple_extent_ndims(dataspaceId);
-                    float[] vector = new float[numDimensions];
-                    long[] dims = new long[numDimensions];
-                    // Get the datatype of the dataset
-                    int datatypeId = H5.H5Dget_type(datasetId);
-                    // Get the size of each dimension
-                    H5.H5Sget_simple_extent_dims(dataspaceId, dims, null);
+            //executorService.submit(() -> {
+                logger.info("Processing dataset: " + ds);
+                Dataset dataset = hdfFile.getDatasetByPath(ds);
+                DataType dataType = dataset.getDataType();
+                long l = dataset.getSize();
+                int[] dims = dataset.getDimensions();
                 // Read the data from the dataset
                 double[] data = new double[(int) dims[0]];
-                H5.H5Dread(datasetId, datatypeId, HDF5Constants.H5S_ALL, HDF5Constants.H5S_ALL,
-                    HDF5Constants.H5P_DEFAULT, data);
+                //queue.put(vector);
                 // Close the dataspace, datatype, and dataset
-                H5.H5Sclose(dataspaceId);
-                H5.H5Tclose(datatypeId);
-                H5.H5Dclose(datasetId);
                 // Now you have the data, and you can convert it into vector embeddings
                 //INDArray dataArray = Nd4j.create(data);
                 //WordVectors wordVectors = new WordVectorsImpl();
                 //wordVectors.setLookupTable(dataArray);
                 //WordVectorSerializer.writeWordVectors(wordVectors, "vector_embeddings.txt");
-                queue.put(vector);
-            } catch (HDF5Exception e) {
-                logger.error(e);
-            } catch (InterruptedException e) {
-                throw new RuntimeException(e);
-            }
-            });
+            // });
         }
     }
 }
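
As committed, the jhdf branch still allocates data but no longer defines vector, so the queue handoff stays commented out. A sketch of one way the row-wise handoff could be finished (a hypothetical helper, not part of this commit, assuming 2-D float32 datasets where each row is one vector):

    import io.jhdf.HdfFile;
    import io.jhdf.api.Dataset;
    import java.util.concurrent.LinkedBlockingQueue;

    final class DatasetRows {
        // Materialize a 2-D float dataset and enqueue one vector per row.
        static void enqueueRows(HdfFile hdfFile, String path,
                                LinkedBlockingQueue<float[]> queue) throws InterruptedException {
            Dataset dataset = hdfFile.getDatasetByPath(path);
            Object data = dataset.getData(); // jhdf returns a Java array matching the dataset's type and rank
            if (data instanceof float[][]) {
                for (float[] row : (float[][]) data) {
                    queue.put(row); // blocks when the writer falls behind
                }
            }
        }
    }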

View File

@@ -18,10 +18,9 @@
 package io.nosqlbench.loader.hdf.readers;
 import io.nosqlbench.loader.hdf.writers.VectorWriter;
-import ncsa.hdf.hdf5lib.exceptions.HDF5LibraryException;
 public interface HdfReader {
     void setWriter(VectorWriter writer);
-    void read() throws HDF5LibraryException;
+    void read();
 }

View File

@@ -21,4 +21,26 @@ import java.util.concurrent.LinkedBlockingQueue;
 public abstract class AbstractVectorWriter implements VectorWriter {
     protected LinkedBlockingQueue<float[]> queue;
+    public void setQueue(LinkedBlockingQueue<float[]> queue) {
+        this.queue = queue;
+    }
+    @Override
+    public void run() {
+        while (true) {
+            try {
+                float[] vector = queue.take();
+                if (vector.length==0) {
+                    break;
+                }
+                writeVector(vector);
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+    protected abstract void writeVector(float[] vector);
 }
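
run() treats a zero-length array as a poison pill: queued vectors drain in FIFO order and the loop exits at the marker. An illustrative shutdown sequence (the class and variable names here are mine, not the commit's):

    import java.util.concurrent.LinkedBlockingQueue;

    public class PoisonPillDemo {
        public static void main(String[] args) throws InterruptedException {
            LinkedBlockingQueue<float[]> queue = new LinkedBlockingQueue<>();
            Thread writer = new Thread(() -> {
                while (true) {
                    try {
                        float[] v = queue.take();
                        if (v.length == 0) break; // same sentinel check as AbstractVectorWriter.run()
                        System.out.println(java.util.Arrays.toString(v));
                    } catch (InterruptedException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
            writer.start();
            queue.put(new float[] {1f, 2f, 3f});
            queue.put(new float[0]); // poison pill: writer drains earlier items, then exits
            writer.join();
        }
    }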

View File

@@ -18,9 +18,13 @@
 package io.nosqlbench.loader.hdf.writers;
 import io.nosqlbench.loader.hdf.config.LoaderConfig;
+import com.datastax.oss.driver.api.core.data.CqlVector;
 public class AstraVectorWriter extends AbstractVectorWriter {
     public AstraVectorWriter(LoaderConfig config) {
     }
+    @Override
+    protected void writeVector(float[] vector) {
+    }
 }
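
writeVector is still an empty stub here; the new CqlVector import signals where it is headed. A hedged sketch of the float[]-to-CqlVector mapping, assuming the driver's CqlVector.newInstance factory is available in the 4.16 version pulled in above (session and statement wiring are omitted because they would be pure assumptions):

    import com.datastax.oss.driver.api.core.data.CqlVector;

    final class VectorMapping {
        // Box a float[] into the driver's value type for vector<float, N> columns.
        static CqlVector<Float> toCqlVector(float[] vector) {
            Float[] boxed = new Float[vector.length];
            for (int i = 0; i < vector.length; i++) {
                boxed[i] = vector[i];
            }
            return CqlVector.newInstance(boxed);
        }
    }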

View File

@@ -18,8 +18,33 @@
 package io.nosqlbench.loader.hdf.writers;
 import io.nosqlbench.loader.hdf.config.LoaderConfig;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-public class FileVectorWriter implements VectorWriter {
-    public FileVectorWriter(LoaderConfig config) {
+import java.io.*;
+public class FileVectorWriter extends AbstractVectorWriter {
+    private static final Logger logger = LogManager.getLogger(FileVectorWriter.class);
+    private final BufferedWriter targetFile;
+    public FileVectorWriter(LoaderConfig config) throws IOException {
+        String targetFileName = config.getTargetFile();
+        targetFile = new BufferedWriter(new FileWriter(targetFileName));
+    }
+    @Override
+    protected void writeVector(float[] vector) {
+        try {
+            targetFile.write("[");
+            for (int i = 0; i < vector.length; i++) {
+                targetFile.write(String.valueOf(vector[i]));
+                if (i < vector.length - 1) {
+                    targetFile.write(",");
+                }
+            }
+            targetFile.write("]");
+            targetFile.write("\n");
+        } catch (IOException e) {
+            logger.error(e.getMessage(), e);
+        }
+    }
 }
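
Each vector becomes one bracketed, comma-separated line; note that the BufferedWriter is never flushed or closed in the code shown here, so trailing buffered output can be lost until a later change adds that. The serialization, reproduced standalone:

    public class VectorLineFormatDemo {
        public static void main(String[] args) {
            // Mirrors FileVectorWriter.writeVector: "[v0,v1,...,vn]" plus a newline.
            float[] vector = {0.1f, 0.2f, 0.3f};
            StringBuilder line = new StringBuilder("[");
            for (int i = 0; i < vector.length; i++) {
                line.append(vector[i]);
                if (i < vector.length - 1) line.append(",");
            }
            line.append("]");
            System.out.println(line); // prints: [0.1,0.2,0.3]
        }
    }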

View File

@@ -17,5 +17,8 @@
 package io.nosqlbench.loader.hdf.writers;
-public interface VectorWriter {
+import java.util.concurrent.LinkedBlockingQueue;
+public interface VectorWriter extends Runnable {
+    void setQueue(LinkedBlockingQueue<float[]> queue);
 }

View File

@@ -1,5 +1,5 @@
 format: HDF5
-sourceFile: /home/mwolters138/Downloads/NEONDSTowerTemperatureData.hdf5
+sourceFile: /home/mwolters138/Downloads/NEONDSImagingSpectrometerData.h5 #h5ex_t_arrayatt.h5
 datasets:
   - all
 embedding: word2vec