mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2024-12-23 15:40:44 -06:00
Merge pull request #1685 from nosqlbench/mwolters/vpf-72
Support mixed-schema KNN data set formats
This commit is contained in:
commit
904ce1c6ec
@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
|
||||
/**
|
||||
* This interface is used to parse the raw JSON from the HDF dataset into a CQL predicate.
|
||||
*/
|
||||
public interface DatasetParser {
|
||||
|
||||
/**
|
||||
* Return the specified class to parse the raw JSON from the HDF dataset into a CQL predicate.
|
||||
* @param parsername
|
||||
* @return A new instance of the specified parser class.
|
||||
*/
|
||||
static DatasetParser parserFactory(String parsername) {
|
||||
return switch (parsername) {
|
||||
case "default" -> new DefaultDatasetParser();
|
||||
case "noop" -> new NoopDatasetParser();
|
||||
case "jaw" -> new JAWDatasetParser();
|
||||
default -> throw new RuntimeException("Unknown parser name: " + parsername);
|
||||
};
|
||||
}
|
||||
|
||||
String parse(String raw);
|
||||
}
|
@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParser;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
|
||||
/**
|
||||
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This is the default
|
||||
* implementation. It accepts a JSON string of the form found at https://github.com/qdrant/ann-filtering-benchmark-datasets
|
||||
* and converts it into a CQL predicate in String form
|
||||
*/
|
||||
public class DefaultDatasetParser implements DatasetParser {
|
||||
private static final String WHERE = "WHERE";
|
||||
private static final String MATCH = "match";
|
||||
private static final String AND = "and";
|
||||
private static final String OR = "or";
|
||||
private static final String EQ = "=";
|
||||
private static final String IN = "IN";
|
||||
private static final String CONDITIONS = "conditions";
|
||||
private static final String VALUE = "value";
|
||||
private static final String SPACE = " ";
|
||||
private static final String SINGLE_QUOTE = "'";
|
||||
private static final String COMMA = ",";
|
||||
private static final String LEFT_PAREN = "(";
|
||||
private static final String RIGHT_PAREN = ")";
|
||||
private static final Logger logger = LogManager.getLogger(DefaultDatasetParser.class);
|
||||
|
||||
@Override
|
||||
public String parse(String raw) {
|
||||
logger.debug(() -> "Parsing: " + raw);
|
||||
JsonObject conditions = JsonParser.parseString(raw).getAsJsonObject().get(CONDITIONS).getAsJsonObject();
|
||||
if (conditions.has(AND)) {
|
||||
return parseAndConditionsInline(conditions);
|
||||
} else if (conditions.has(OR)) {
|
||||
return parseOrConditionsInline(conditions);
|
||||
} else {
|
||||
throw new RuntimeException("Unknown predicate type: " + conditions.keySet());
|
||||
}
|
||||
}
|
||||
|
||||
private String parseOrConditionsInline(JsonObject conditions) {
|
||||
StringBuilder builder = new StringBuilder(WHERE);
|
||||
boolean first = true;
|
||||
for (JsonElement e : conditions.get(OR).getAsJsonArray()) {
|
||||
JsonObject condition = e.getAsJsonObject();
|
||||
String field = condition.keySet().iterator().next();
|
||||
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
|
||||
if (match != null) {
|
||||
if (first) {
|
||||
builder.append(SPACE).append(field).append(SPACE).append(IN).append(LEFT_PAREN);
|
||||
first = false;
|
||||
} else {
|
||||
builder.append(COMMA);
|
||||
}
|
||||
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
|
||||
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
|
||||
|
||||
builder.append(
|
||||
isNumeric ?
|
||||
match.getAsJsonObject().get(VALUE).getAsString() :
|
||||
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
|
||||
);
|
||||
} else {
|
||||
logger.error(() -> "No match found for: " + condition.keySet());
|
||||
}
|
||||
}
|
||||
builder.append(RIGHT_PAREN);
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
private String parseAndConditionsInline(JsonObject conditions) {
|
||||
StringBuilder builder = new StringBuilder(WHERE);
|
||||
boolean first = true;
|
||||
for (JsonElement e : conditions.get(AND).getAsJsonArray()) {
|
||||
JsonObject condition = e.getAsJsonObject();
|
||||
String field = condition.keySet().iterator().next();
|
||||
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
|
||||
if (match != null) {
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
builder.append(SPACE).append(AND);
|
||||
}
|
||||
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
|
||||
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
|
||||
builder.append(SPACE).append(field).append(EQ);
|
||||
builder.append(
|
||||
isNumeric ?
|
||||
match.getAsJsonObject().get(VALUE).getAsString() :
|
||||
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
|
||||
);
|
||||
} else {
|
||||
logger.error(() -> "No match found for: " + condition.keySet());
|
||||
}
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* Binding function that accepts a long input value for the cycle and returns a string consisting of the
|
||||
* CQL predicate parsed from a single record in an HDF5 dataset
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfDatasetToCqlPredicates implements LongFunction<String> {
|
||||
private final HdfFile hdfFile;
|
||||
private final Dataset dataset;
|
||||
private final int recordCount;
|
||||
private final DatasetParser parser;
|
||||
|
||||
/**
|
||||
* Create a new binding function that accepts a long input value for the cycle and returns a string
|
||||
* @param filename
|
||||
* @param datasetname
|
||||
* @param parsername
|
||||
*/
|
||||
public HdfDatasetToCqlPredicates(String filename, String datasetname, String parsername) {
|
||||
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
|
||||
dataset = hdfFile.getDatasetByPath(datasetname);
|
||||
recordCount = dataset.getDimensions()[0];
|
||||
parser = DatasetParser.parserFactory(parsername);
|
||||
}
|
||||
|
||||
public HdfDatasetToCqlPredicates(String filename, String datasetname) {
|
||||
this(filename, datasetname, "default");
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(long l) {
|
||||
long[] sliceOffset = {(l % recordCount)};
|
||||
int[] sliceDimensions = {1};
|
||||
String raw = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
|
||||
return parser.parse(raw);
|
||||
}
|
||||
}
|
@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
/**
|
||||
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
|
||||
* accepts a string consisting of the desired CQL predicate as translated from the original jsonl files
|
||||
* and simply adds the WHERE keyword to the beginning of the string if it is not already present, hence
|
||||
* the new Just Add Where (JAW) parser.
|
||||
*/
|
||||
public class JAWDatasetParser implements DatasetParser {
|
||||
private static final String WHERE = "WHERE";
|
||||
@Override
|
||||
public String parse(String raw) {
|
||||
if (!raw.toUpperCase().startsWith(WHERE)) {
|
||||
raw = WHERE + " " + raw;
|
||||
}
|
||||
return raw;
|
||||
}
|
||||
}
|
@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
/**
 * This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
 * accepts a string consisting of the desired CQL predicate as translated from the original jsonl files and
 * simply returns the raw string, hence the name NoopDatasetParser.
 */
public class NoopDatasetParser implements DatasetParser {
    /**
     * Identity transform: the record is assumed to already be a complete CQL predicate.
     *
     * @param raw the record text
     * @return the input, unchanged
     */
    @Override
    public String parse(String raw) {
        return raw;
    }
}
|
@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.datamappers.functions.hdf_to_cql;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
/**
 * Unit tests for {@link DefaultDatasetParser}, covering numeric and string values
 * under both "and" and "or" condition groups.
 */
public class DefaultDatasetParserTest {
    // "and" with a single numeric condition
    String test1 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 53}}}]}}";
    // "and" with two string conditions on different fields
    String test2 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": \"thirteen\"}}}, {\"b\": {\"match\": {\"value\": \"fifty-four\"}}}]}}";
    // "and" with three numeric conditions, one field repeated
    String test3 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 13}}}, {\"b\": {\"match\": {\"value\": 54}}}, {\"a\": {\"match\": {\"value\": 154}}}]}}";
    // "or" with two numeric conditions on the same field -> IN clause
    String test4 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}]}}";
    // "or" with three numeric conditions on the same field
    String test5 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}, {\"a\": {\"match\": {\"value\": 7}}}]}}";
    // "or" with string values -> IN clause with single-quoted literals
    String test6 = "{\"conditions\": {\"or\": [{\"b\": {\"match\": {\"value\": \"foo\"}}}, {\"b\": {\"match\": {\"value\": \"bar\"}}}]}}";


    @Test
    public void testParse() {
        DefaultDatasetParser parser = new DefaultDatasetParser();
        // numeric values are rendered unquoted
        String parsed = parser.parse(test1);
        assertEquals("WHERE a=53", parsed);

        // string values are rendered single-quoted and joined with "and"
        parsed = parser.parse(test2);
        assertEquals("WHERE a='thirteen' and b='fifty-four'", parsed);

        parsed = parser.parse(test3);
        assertEquals("WHERE a=13 and b=54 and a=154", parsed);

        // "or" groups collapse into an IN(...) clause on the first field
        parsed = parser.parse(test4);
        assertEquals("WHERE a IN(9,71)", parsed);

        parsed = parser.parse(test5);
        assertEquals("WHERE a IN(9,71,7)", parsed);

        parsed = parser.parse(test6);
        assertEquals("WHERE b IN('foo','bar')", parsed);
    }

}
|
@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a vector dataset from an HDF5 file. The entire dataset is parsed into a single
|
||||
* String Object with the discreet values separated by the user supplied separator character. It is
|
||||
* intended for use only with small datasets where the entire dataset can be read into memory and there
|
||||
* is no need to read individual vectors from the dataset.
|
||||
* The lambda function simply returns the String representation of the dataset.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfDatasetToString implements LongFunction<String> {
|
||||
private final HdfFile hdfFile;
|
||||
private final Dataset dataset;
|
||||
private final String separator;
|
||||
private final String datasetAsString;
|
||||
|
||||
/**
|
||||
* Create a new binding function that accepts a long input value for the cycle and returns a string representation
|
||||
* of the specified dataset
|
||||
* @param filename
|
||||
* @param dataset
|
||||
* @param separator
|
||||
*/
|
||||
public HdfDatasetToString(String filename, String dataset, String separator) {
|
||||
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
|
||||
this.dataset = hdfFile.getDatasetByPath(dataset);
|
||||
this.separator = separator;
|
||||
this.datasetAsString = parseDataset();
|
||||
}
|
||||
|
||||
public HdfDatasetToString(String filename, String dataset) {
|
||||
this(filename, dataset, ",");
|
||||
}
|
||||
|
||||
private String parseDataset() {
|
||||
String[] columnDataset = (String[])dataset.getData();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < columnDataset.length; i++) {
|
||||
sb.append(columnDataset[i]);
|
||||
if (i < columnDataset.length - 1) {
|
||||
sb.append(separator);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String apply(long value) {
|
||||
return datasetAsString;
|
||||
}
|
||||
|
||||
}
|
@ -21,10 +21,11 @@ import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* This function reads a dataset from an HDF5 file. The dataset itself is not
|
||||
* This function reads a dataset of any supported type from an HDF5 file. The dataset itself is not
|
||||
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
|
||||
* reads a single vector from the dataset, based on the long input value.
|
||||
*/
|
||||
@ -42,11 +43,30 @@ public class HdfDatasetToStrings extends AbstractHdfFileToVectorType implements
|
||||
int[] sliceDimensions = new int[dims.length];
|
||||
sliceDimensions[0] = 1;
|
||||
if (dims.length > 1) {
|
||||
for (int i = 1; i < dims.length; i++) {
|
||||
sliceDimensions[i] = dims[i];
|
||||
}
|
||||
System.arraycopy(dims, 1, sliceDimensions, 1, dims.length - 1);
|
||||
}
|
||||
return ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
|
||||
String payload = null;
|
||||
switch(dataset.getJavaType().getSimpleName().toLowerCase()) {
|
||||
case "string" ->
|
||||
payload = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
|
||||
case "int" ->
|
||||
payload = Arrays.toString(((int[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
case "float" ->
|
||||
payload = Arrays.toString(((float[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
case "short" ->
|
||||
payload = Arrays.toString(((short[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
case "long" ->
|
||||
payload = Arrays.toString(((long[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
case "double" ->
|
||||
payload = Arrays.toString(((double[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
case "char" ->
|
||||
payload = String.valueOf(((char[][])dataset.getData(sliceOffset, sliceDimensions))[0]);
|
||||
}
|
||||
if (payload == null) {
|
||||
throw new RuntimeException("Unsupported datatype: " + dataset.getJavaType().getSimpleName());
|
||||
}
|
||||
payload = payload.replaceAll("\\[", "").replaceAll("]", "");
|
||||
return payload;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
|
||||
|
||||
import io.jhdf.HdfFile;
|
||||
import io.jhdf.api.Dataset;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.experimental)
|
||||
public class HdfDatasetsToString implements LongFunction<String> {
|
||||
private final HdfFile hdfFile;
|
||||
private final Dataset DSLeft;
|
||||
private final Dataset DSRight;
|
||||
private final String intraSeparator;
|
||||
private final String interSeparator;
|
||||
|
||||
public HdfDatasetsToString(String filename, String DSNameLeft, String DSNameRight, String intraSeparator, String interSeparator) {
|
||||
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
|
||||
DSLeft = hdfFile.getDatasetByPath(DSNameLeft);
|
||||
DSRight = hdfFile.getDatasetByPath(DSNameRight);
|
||||
this.intraSeparator = intraSeparator;
|
||||
this.interSeparator = interSeparator;
|
||||
}
|
||||
|
||||
/*
|
||||
* Read the column names from the columns DS and store them in an array
|
||||
* Read the column data types from the columnTypes DS and store them in an array
|
||||
* Create a csv schema string from the column names and data types
|
||||
*/
|
||||
@Override
|
||||
public String apply(long value) {
|
||||
Object columnDataset = DSLeft.getData();
|
||||
Object columnTypeDataset = DSRight.getData();
|
||||
return pairByOrdinal((String[]) columnDataset, (String[])columnTypeDataset);
|
||||
}
|
||||
|
||||
private String pairByOrdinal(String[] columnDataset, String[] columnTypeDataset) {
|
||||
if (columnDataset.length != columnTypeDataset.length) {
|
||||
throw new RuntimeException("Left hand dataset and right hand dataset must be the same length");
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < columnDataset.length; i++) {
|
||||
sb.append(columnDataset[i]).append(intraSeparator).append(columnTypeDataset[i]);
|
||||
if (i < columnDataset.length - 1) {
|
||||
sb.append(interSeparator);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
@ -43,6 +43,12 @@ public class EmbeddingGeneratorFactory {
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
case "long" -> {
|
||||
if (!generators.containsKey(type)) {
|
||||
generators.put(type, new LongEmbeddingGenerator());
|
||||
}
|
||||
return generators.get(type);
|
||||
}
|
||||
default -> throw new RuntimeException("Unknown embedding type: " + type);
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.hdf5.helpers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class LongEmbeddingGenerator implements EmbeddingGenerator {
|
||||
@Override
|
||||
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
|
||||
// in this case o will always be long[1][x]
|
||||
long[] vector = ((long[][]) o)[0];
|
||||
Float[] vector2 = new Float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
long[] vector = ((long[][]) o)[0];
|
||||
float[] vector2 = new float[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = (float) vector[i];
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
|
||||
long[] vector = ((long[][]) o)[0];
|
||||
Long[] vector2 = new Long[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = vector[i];
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
return ((long[][]) o)[0];
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
|
||||
long[] vector = ((long[][]) o)[0];
|
||||
Integer[] vector2 = new Integer[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = Math.toIntExact(vector[i]);
|
||||
}
|
||||
return List.of(vector2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
|
||||
long[] vector = ((long[][]) o)[0];
|
||||
int[] vector2 = new int[vector.length];
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector2[i] = Math.toIntExact(vector[i]);
|
||||
}
|
||||
return vector2;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user