Add support for reading all datasets in a file via the "all" keyword

This commit is contained in:
Mark Wolters
2023-08-01 11:17:04 -04:00
parent 29fe0e455b
commit 81f0473766
3 changed files with 33 additions and 10 deletions

View File

@@ -47,8 +47,8 @@ public class LoaderConfig {
return configMap.get(key).toString();
}
public List<Map<String,String>> getDatasets() {
return (List<Map<String,String>>) configMap.get("datasets");
/**
 * Returns the value stored under the {@code "datasets"} key of the configuration map,
 * cast to a list of dataset names.
 *
 * <p>The cast is unchecked: element types are not verified here, so a configuration whose
 * {@code datasets} entries are not plain strings would only fail later, when an element is
 * used as a {@code String}.
 *
 * <p>NOTE(review): the stored object is returned as-is (no copy is made here), so callers
 * that modify the returned list are presumably modifying the configuration's own list —
 * confirm this is intended.
 *
 * @return the configured dataset names; presumably {@code null} when the key is absent,
 *         depending on the type of {@code configMap} — TODO confirm
 */
public List<String> getDatasets() {
// Unchecked cast: relies on the configuration supplying a list of strings under "datasets".
return (List<String>) configMap.get("datasets");
}
public String getFormat() {

View File

@@ -27,6 +27,7 @@ import ncsa.hdf.hdf5lib.exceptions.HDF5LibraryException;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -34,6 +35,7 @@ import java.util.concurrent.LinkedBlockingQueue;
public class Hdf5Reader implements HdfReader {
private static final Logger logger = LogManager.getLogger(Hdf5Reader.class);
public static final String ALL = "all";
private VectorWriter writer;
private final LoaderConfig config;
private final ExecutorService executorService;
@@ -53,12 +55,36 @@ public class Hdf5Reader implements HdfReader {
public void read() throws HDF5LibraryException {
String sourceFile = config.getSourceFile();
int fileId = H5.H5Fopen(sourceFile, HDF5Constants.H5F_ACC_RDONLY, HDF5Constants.H5P_DEFAULT);
for (Map<String,String> dataset : config.getDatasets()) {
List<String> datasets = config.getDatasets();
if (datasets.get(0).equalsIgnoreCase(ALL)) {
try {
int numObjects = H5.H5Fget_obj_count(fileId, HDF5Constants.H5F_OBJ_ALL);
String[] objNames = new String[numObjects];
int[] objTypes = new int[numObjects];
long[] refArray = new long[numObjects];
//H5.H5Fget_obj_ids(fileId, HDF5Constants.H5F_OBJ_ALL, numObjects, objNames, objTypes);
H5.H5Gget_obj_info_all(fileId, null, objNames, objTypes, refArray);
for (int i = 0; i < numObjects; i++) {
String objName = objNames[i];
int objType = objTypes[i];
if (objType == HDF5Constants.H5G_DATASET) {
datasets.add(objName);
}
}
} catch (HDF5Exception e) {
logger.error("Error getting all datasets from file: " + sourceFile, e);
}
}
for (String dataset : config.getDatasets()) {
if (dataset.equalsIgnoreCase(ALL)) {
continue;
}
executorService.submit(() -> {
// Your lambda code that runs in a separate thread for each object
logger.info("Processing dataset: " + dataset.get("name"));
logger.info("Processing dataset: " + dataset);
try {
int datasetId = H5.H5Dopen(fileId, dataset.get("name"));
int datasetId = H5.H5Dopen(fileId, dataset);
// Get the dataspace of the dataset
int dataspaceId = H5.H5Dget_space(datasetId);
// Get the number of dimensions in the dataspace

View File

@@ -1,10 +1,7 @@
format: HDF5
sourceFile: /home/mwolters138/Downloads/embeddings.h5
sourceFile: /home/mwolters138/Downloads/NEONDSTowerTemperatureData.hdf5
datasets:
- name: dataset1
type: string
- name: dataset2
type: int
- all
embedding: word2vec
writer: filewriter
astra: