Merge pull request #1685 from nosqlbench/mwolters/vpf-72

Support mixed-schema KNN data set formats
commit 904ce1c6ec
Jonathan Shook 2023-12-06 01:02:31 -06:00 committed by GitHub
11 changed files with 592 additions and 5 deletions

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This interface is used to parse the raw JSON from the HDF dataset into a CQL predicate.
*/
public interface DatasetParser {
/**
* Return a new instance of the named parser implementation, used to parse the raw JSON
* from the HDF dataset into a CQL predicate.
* @param parsername The name of the parser implementation: "default", "noop", or "jaw"
* @return A new instance of the specified parser class.
*/
static DatasetParser parserFactory(String parsername) {
return switch (parsername) {
case "default" -> new DefaultDatasetParser();
case "noop" -> new NoopDatasetParser();
case "jaw" -> new JAWDatasetParser();
default -> throw new RuntimeException("Unknown parser name: " + parsername);
};
}
String parse(String raw);
}
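
A minimal usage sketch (not part of this change set), using an input shape taken from DefaultDatasetParserTest below:

// Select a parser by name and convert one raw JSON record into a CQL predicate.
DatasetParser parser = DatasetParser.parserFactory("default");
String predicate = parser.parse("{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 53}}}]}}");
// predicate is "WHERE a=53"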

View File

@ -0,0 +1,117 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This is the default
* implementation. It accepts a JSON string of the form found at https://github.com/qdrant/ann-filtering-benchmark-datasets
* and converts it into a CQL predicate in String form. Conditions joined with "and" become equality
* predicates; conditions joined with "or" over a single field are folded into an IN clause.
*/
public class DefaultDatasetParser implements DatasetParser {
private static final String WHERE = "WHERE";
private static final String MATCH = "match";
private static final String AND = "and";
private static final String OR = "or";
private static final String EQ = "=";
private static final String IN = "IN";
private static final String CONDITIONS = "conditions";
private static final String VALUE = "value";
private static final String SPACE = " ";
private static final String SINGLE_QUOTE = "'";
private static final String COMMA = ",";
private static final String LEFT_PAREN = "(";
private static final String RIGHT_PAREN = ")";
private static final Logger logger = LogManager.getLogger(DefaultDatasetParser.class);
@Override
public String parse(String raw) {
logger.debug(() -> "Parsing: " + raw);
JsonObject conditions = JsonParser.parseString(raw).getAsJsonObject().get(CONDITIONS).getAsJsonObject();
if (conditions.has(AND)) {
return parseAndConditionsInline(conditions);
} else if (conditions.has(OR)) {
return parseOrConditionsInline(conditions);
} else {
throw new RuntimeException("Unknown predicate type: " + conditions.keySet());
}
}
// "or" conditions are folded into a single IN clause. This assumes every condition
// in the array targets the same field, which holds for the supported dataset format.
private String parseOrConditionsInline(JsonObject conditions) {
StringBuilder builder = new StringBuilder(WHERE);
boolean first = true;
for (JsonElement e : conditions.get(OR).getAsJsonArray()) {
JsonObject condition = e.getAsJsonObject();
String field = condition.keySet().iterator().next();
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
if (match != null) {
if (first) {
builder.append(SPACE).append(field).append(SPACE).append(IN).append(LEFT_PAREN);
first = false;
} else {
builder.append(COMMA);
}
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
builder.append(
isNumeric ?
match.getAsJsonObject().get(VALUE).getAsString() :
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
);
} else {
logger.error(() -> "No match found for: " + condition.keySet());
}
}
builder.append(RIGHT_PAREN);
return builder.toString();
}
// "and" conditions become equality predicates joined with the "and" keyword.
private String parseAndConditionsInline(JsonObject conditions) {
StringBuilder builder = new StringBuilder(WHERE);
boolean first = true;
for (JsonElement e : conditions.get(AND).getAsJsonArray()) {
JsonObject condition = e.getAsJsonObject();
String field = condition.keySet().iterator().next();
JsonElement match = condition.get(field).getAsJsonObject().get(MATCH);
if (match != null) {
if (first) {
first = false;
} else {
builder.append(SPACE).append(AND);
}
boolean isNumeric = match.getAsJsonObject().get(VALUE).isJsonPrimitive() &&
match.getAsJsonObject().get(VALUE).getAsJsonPrimitive().isNumber();
builder.append(SPACE).append(field).append(EQ);
builder.append(
isNumeric ?
match.getAsJsonObject().get(VALUE).getAsString() :
SINGLE_QUOTE + match.getAsJsonObject().get(VALUE).getAsString() + SINGLE_QUOTE
);
} else {
logger.error(() -> "No match found for: " + condition.keySet());
}
}
return builder.toString();
}
}
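
A hedged sketch of the "or" path, using an input shape from the tests below:

String raw = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}]}}";
String cql = new DefaultDatasetParser().parse(raw); // "WHERE a IN(9,71)"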

View File

@ -0,0 +1,64 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
/**
* Binding function that accepts a long input value for the cycle and returns a string consisting of the
* CQL predicate parsed from a single record in an HDF5 dataset
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfDatasetToCqlPredicates implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset dataset;
private final int recordCount;
private final DatasetParser parser;
/**
* Create a new binding function that accepts a long input value for the cycle and returns a string
* consisting of the CQL predicate parsed from the selected record.
* @param filename The HDF5 file to read
* @param datasetname The path of the dataset within the HDF5 file
* @param parsername The name of the DatasetParser implementation to use
*/
public HdfDatasetToCqlPredicates(String filename, String datasetname, String parsername) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
dataset = hdfFile.getDatasetByPath(datasetname);
recordCount = dataset.getDimensions()[0];
parser = DatasetParser.parserFactory(parsername);
}
public HdfDatasetToCqlPredicates(String filename, String datasetname) {
this(filename, datasetname, "default");
}
@Override
public String apply(long l) {
long[] sliceOffset = {(l % recordCount)};
int[] sliceDimensions = {1};
String raw = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
return parser.parse(raw);
}
}
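
A hedged usage sketch; the file and dataset names here are hypothetical. Cycle values wrap modulo the record count, so any long input selects a valid record.

LongFunction<String> toPredicate =
new HdfDatasetToCqlPredicates("predicates.hdf5", "/test", "default");
String whereClause = toPredicate.apply(42L); // one record's JSON, parsed into "WHERE ..."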

View File

@ -0,0 +1,33 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
* accepts a string consisting of the desired CQL predicate as translated from the original jsonl files
* and simply adds the WHERE keyword to the beginning of the string if it is not already present, hence
* the name: the Just Add Where (JAW) parser.
*/
public class JAWDatasetParser implements DatasetParser {
private static final String WHERE = "WHERE";
@Override
public String parse(String raw) {
if (!raw.toUpperCase().startsWith(WHERE)) {
raw = WHERE + " " + raw;
}
return raw;
}
}
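
A hedged sketch of the behavior:

String cql = new JAWDatasetParser().parse("a=53 and b=54"); // "WHERE a=53 and b=54"
String unchanged = new JAWDatasetParser().parse("WHERE a=53"); // returned as-is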

View File

@ -0,0 +1,28 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
/**
* This class is used to parse the raw JSON from the HDF dataset into a CQL predicate. This implementation
* accepts a string consisting of the desired CQL predicate as translated from the original jsonl files and
* simply returns the raw string, hence the name NoopDatasetParser.
*/
public class NoopDatasetParser implements DatasetParser {
@Override
public String parse(String raw) {
return raw;
}
}

View File

@ -0,0 +1,54 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.datamappers.functions.hdf_to_cql;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class DefaultDatasetParserTest {
String test1 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 53}}}]}}";
String test2 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": \"thirteen\"}}}, {\"b\": {\"match\": {\"value\": \"fifty-four\"}}}]}}";
String test3 = "{\"conditions\": {\"and\": [{\"a\": {\"match\": {\"value\": 13}}}, {\"b\": {\"match\": {\"value\": 54}}}, {\"a\": {\"match\": {\"value\": 154}}}]}}";
String test4 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}]}}";
String test5 = "{\"conditions\": {\"or\": [{\"a\": {\"match\": {\"value\": 9}}}, {\"a\": {\"match\": {\"value\": 71}}}, {\"a\": {\"match\": {\"value\": 7}}}]}}";
String test6 = "{\"conditions\": {\"or\": [{\"b\": {\"match\": {\"value\": \"foo\"}}}, {\"b\": {\"match\": {\"value\": \"bar\"}}}]}}";
@Test
public void testParse() {
DefaultDatasetParser parser = new DefaultDatasetParser();
String parsed = parser.parse(test1);
assertEquals("WHERE a=53", parsed);
parsed = parser.parse(test2);
assertEquals("WHERE a='thirteen' and b='fifty-four'", parsed);
parsed = parser.parse(test3);
assertEquals("WHERE a=13 and b=54 and a=154", parsed);
parsed = parser.parse(test4);
assertEquals("WHERE a IN(9,71)", parsed);
parsed = parser.parse(test5);
assertEquals("WHERE a IN(9,71,7)", parsed);
parsed = parser.parse(test6);
assertEquals("WHERE b IN('foo','bar')", parsed);
}
}

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
/**
* This function reads a vector dataset from an HDF5 file. The entire dataset is parsed into a single
* String Object with the discrete values separated by the user-supplied separator character. It is
* intended for use only with small datasets where the entire dataset can be read into memory and there
* is no need to read individual vectors from the dataset.
* The lambda function simply returns the String representation of the dataset.
*/
@ThreadSafeMapper
@Categories(Category.experimental)
public class HdfDatasetToString implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset dataset;
private final String separator;
private final String datasetAsString;
/**
* Create a new binding function that accepts a long input value for the cycle and returns a string representation
* of the specified dataset.
* @param filename The HDF5 file to read
* @param dataset The path of the dataset within the HDF5 file
* @param separator The separator to place between consecutive values
*/
public HdfDatasetToString(String filename, String dataset, String separator) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
this.dataset = hdfFile.getDatasetByPath(dataset);
this.separator = separator;
this.datasetAsString = parseDataset();
}
public HdfDatasetToString(String filename, String dataset) {
this(filename, dataset, ",");
}
private String parseDataset() {
String[] columnDataset = (String[])dataset.getData();
StringBuilder sb = new StringBuilder();
for (int i = 0; i < columnDataset.length; i++) {
sb.append(columnDataset[i]);
if (i < columnDataset.length - 1) {
sb.append(separator);
}
}
return sb.toString();
}
@Override
public String apply(long value) {
return datasetAsString;
}
}
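
A hedged usage sketch; the file and dataset names are hypothetical. The dataset is stringified once at construction time, so every cycle returns the same value.

LongFunction<String> wholeDataset = new HdfDatasetToString("labels.hdf5", "/labels", ",");
String csv = wholeDataset.apply(0L); // e.g. "a,b,c" for a three-element string dataset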

View File

@ -21,10 +21,11 @@ import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import io.nosqlbench.virtdata.library.hdf5.from_long.AbstractHdfFileToVectorType;
+import java.util.Arrays;
import java.util.function.LongFunction;
/**
-* This function reads a dataset from an HDF5 file. The dataset itself is not
+* This function reads a dataset of any supported type from an HDF5 file. The dataset itself is not
* read into memory, only the metadata (the "dataset" Java Object). The lambda function
* reads a single vector from the dataset, based on the long input value.
*/
@ -42,11 +43,30 @@ public class HdfDatasetToStrings extends AbstractHdfFileToVectorType implements
int[] sliceDimensions = new int[dims.length];
sliceDimensions[0] = 1;
if (dims.length > 1) {
-for (int i = 1; i < dims.length; i++) {
-sliceDimensions[i] = dims[i];
-}
+System.arraycopy(dims, 1, sliceDimensions, 1, dims.length - 1);
}
-return ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
+String payload = null;
+switch(dataset.getJavaType().getSimpleName().toLowerCase()) {
+case "string" ->
+payload = ((String[])dataset.getData(sliceOffset, sliceDimensions))[0];
+case "int" ->
+payload = Arrays.toString(((int[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "float" ->
+payload = Arrays.toString(((float[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "short" ->
+payload = Arrays.toString(((short[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "long" ->
+payload = Arrays.toString(((long[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "double" ->
+payload = Arrays.toString(((double[][]) dataset.getData(sliceOffset, sliceDimensions))[0]);
+case "char" ->
+payload = String.valueOf(((char[][])dataset.getData(sliceOffset, sliceDimensions))[0]);
+}
+if (payload == null) {
+throw new RuntimeException("Unsupported datatype: " + dataset.getJavaType().getSimpleName());
+}
+payload = payload.replaceAll("\\[", "").replaceAll("]", "");
+return payload;
}
}
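
A hedged sketch of the payload formatting for a non-string dataset: Arrays.toString produces bracketed output, and the brackets are then stripped.

float[] row = {0.1f, 0.2f, 0.3f}; // one slice taken from the dataset
String payload = Arrays.toString(row).replaceAll("\\[", "").replaceAll("]", "");
// payload is "0.1, 0.2, 0.3"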

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.hdf5.from_long.to_string;
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.util.function.LongFunction;
@ThreadSafeMapper
@Categories(Category.experimental)
/**
* Binding function that pairs the entries of two String datasets by ordinal position,
* joining each pair with the intra-separator and the pairs with the inter-separator.
*/
public class HdfDatasetsToString implements LongFunction<String> {
private final HdfFile hdfFile;
private final Dataset DSLeft;
private final Dataset DSRight;
private final String intraSeparator;
private final String interSeparator;
public HdfDatasetsToString(String filename, String DSNameLeft, String DSNameRight, String intraSeparator, String interSeparator) {
hdfFile = new HdfFile(NBIO.all().search(filename).one().asPath());
DSLeft = hdfFile.getDatasetByPath(DSNameLeft);
DSRight = hdfFile.getDatasetByPath(DSNameRight);
this.intraSeparator = intraSeparator;
this.interSeparator = interSeparator;
}
/*
* Read the column names from the left-hand dataset and the column data types from the
* right-hand dataset, then pair them by ordinal to form a csv-style schema string.
*/
@Override
public String apply(long value) {
Object columnDataset = DSLeft.getData();
Object columnTypeDataset = DSRight.getData();
return pairByOrdinal((String[]) columnDataset, (String[])columnTypeDataset);
}
private String pairByOrdinal(String[] columnDataset, String[] columnTypeDataset) {
if (columnDataset.length != columnTypeDataset.length) {
throw new RuntimeException("Left hand dataset and right hand dataset must be the same length");
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < columnDataset.length; i++) {
sb.append(columnDataset[i]).append(intraSeparator).append(columnTypeDataset[i]);
if (i < columnDataset.length - 1) {
sb.append(interSeparator);
}
}
return sb.toString();
}
}
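
A hedged sketch; the file and dataset names are hypothetical:

// Given DSLeft = ["a","b"] and DSRight = ["int","text"]:
LongFunction<String> schema =
new HdfDatasetsToString("schema.hdf5", "/columns", "/columnTypes", " ", ",");
String csvSchema = schema.apply(0L); // "a int,b text"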

View File

@ -43,6 +43,12 @@ public class EmbeddingGeneratorFactory {
}
return generators.get(type);
}
case "long" -> {
if (!generators.containsKey(type)) {
generators.put(type, new LongEmbeddingGenerator());
}
return generators.get(type);
}
default -> throw new RuntimeException("Unknown embedding type: " + type);
}
}

View File

@ -0,0 +1,78 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package io.nosqlbench.virtdata.library.hdf5.helpers;
import java.util.List;
/**
* EmbeddingGenerator for long-typed HDF5 datasets. Widening to float may lose precision
* for large values, and narrowing to int uses Math.toIntExact, which throws on overflow.
*/
public class LongEmbeddingGenerator implements EmbeddingGenerator {
@Override
public List<Float> generateFloatListEmbeddingFrom(Object o, int[] dims) {
// in this case o will always be long[1][x]
long[] vector = ((long[][]) o)[0];
Float[] vector2 = new Float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return List.of(vector2);
}
@Override
public float[] generateFloatArrayEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
float[] vector2 = new float[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = (float) vector[i];
}
return vector2;
}
@Override
public List<Long> generateLongListEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
Long[] vector2 = new Long[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = vector[i];
}
return List.of(vector2);
}
@Override
public long[] generateLongArrayEmbeddingFrom(Object o, int[] dims) {
return ((long[][]) o)[0];
}
@Override
public List<Integer> generateIntListEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
Integer[] vector2 = new Integer[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = Math.toIntExact(vector[i]);
}
return List.of(vector2);
}
@Override
public int[] generateIntArrayEmbeddingFrom(Object o, int[] dims) {
long[] vector = ((long[][]) o)[0];
int[] vector2 = new int[vector.length];
for (int i = 0; i < vector.length; i++) {
vector2[i] = Math.toIntExact(vector[i]);
}
return vector2;
}
}
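
A hedged sketch of the conversions, given the long[1][x] shape noted above:

EmbeddingGenerator gen = new LongEmbeddingGenerator();
long[][] slice = {{1L, 2L, 3L}};
List<Float> floats = gen.generateFloatListEmbeddingFrom(slice, new int[]{1, 3}); // [1.0, 2.0, 3.0]
int[] ints = gen.generateIntArrayEmbeddingFrom(slice, new int[]{1, 3}); // {1, 2, 3}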