make using relevancy measures in verifiers easy and complete

This commit is contained in:
Jonathan Shook 2023-09-08 15:01:59 -05:00
parent e17d581578
commit ea847fed37
15 changed files with 501 additions and 55 deletions

View File

@ -0,0 +1,128 @@
description: |
This is a template for live vector search testing.
The goals of this test are to:
1. establish basic recall metrics on a knn computed dataset
2.
schema: Install the schema required to run the test
rampup: Measure how long it takes to load a set of embeddings
search_and_index: Measure how the system responds to queries while it
is indexing recently ingested data.
#? await_index: Pause and wait for the system to complete compactions or index processing
search: Run vector search with a set of default (or overridden) parameters
search_and_rewrite: Run the same search operations as above, but while rewriting the data
search_and_invalidate: Run the same search operations as above, but while overwriting the data
with different content using the same vector id.
In all of these phases, it is important to instance the metrics with distinct names.
Also, aggregates of recall should include total aggregate as well as a moving average.
scenarios:
default:
schema: run tags='block:schema' labels='target:astra' threads===1
rampup: run tags='block:rampup' labels='target:astra' threads=100 cycles=TEMPLATE(trainsize)
search_and_index: run tags='block:search_and_index,optype=select' labels='target:astra' cycles=TEMPLATE(testsize)
# one activity or two? data leap-frog? or concurrency separate for both?
# await_index: run tags='block:await_index' # This would need to exit when a condition is met
# stop_search_and_index: stop search_and_index
# only possible if we have a triggering event to indicate when to stop
live_search: run tags='block:search' labels='target:astra'
search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
params:
driver: cqld4
bindings:
id: ToString()
# Query vectors, read from the /test dataset of the HDF5 file
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/distance")
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
blocks:
schema:
params:
cl: TEMPLATE(cl,LOCAL_QUORUM)
prepared: false
ops:
# create-keyspace: |
# CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
# WITH replication = {'class': 'NetworkTopologyStrategy', 'TEMPLATE(region)': '3'};
create-table: |
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
key TEXT,
value vector<float,TEMPLATE(dimensions)>,
PRIMARY KEY (key)
);
create-sai-index: |
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
# WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
rampup:
params:
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
prepared: true
ops:
insert: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist});
# await_index:
# ops:
search_and_index:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
tags:
optype: select
# Registers one relevancy function per metric and per cutoff k; each call to
# relevancy.accept(...) in the verifier then feeds every registered function.
# NOTE(review): the factories visible in RelevancyFunctions take (name, k, labels) and
# newRelevancyMeasures takes (parent, Map) - confirm these call shapes and the
# reciprocal_rank name resolve at runtime.
verifier-init: |
  relevancy=scriptingmetrics.newRelevancyMeasures(_parsed_op,"group","relevancy");
  for (int k in new int[]{1,2,5,10,25,50,100}) {
    relevancy.addFunction(recall("recall",k));
    relevancy.addFunction(precision("precision",k));
    relevancy.addFunction(reciprocal_rank("RR",k));
    relevancy.addFunction(average_precision("AP",k));
    relevancy.addFunction(F1("F1",k));
  }
# Converts the driver result into an int array, then records all registered
# relevancy measures against the expected neighbor indices for this cycle.
# Lines inside the block scalar are script text, so script comments use '//'.
verifier: |
  // driver-specific function
  actual_indices=cqlRowListToIntArray("id",result);
  // driver-agnostic function
  relevancy.accept({relevant_indices},actual_indices);
  // because we are "verifying" although this needs to be reorganized
  return true;
insert_rewrite:
stmt: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist});
tags:
optype: insert
# verifier-init: |
# verifier: |
search_and_rewrite:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
verifier-init: |
scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
# verifier: |
upsert_same:
stmt: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});
search_and_invalidate:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
# verifier-init: |
# verifier: |
upsert_random: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});

View File

@ -38,6 +38,6 @@ public class ComputeFunctionPluginInfo implements ScriptingExtensionPluginInfo<C
@Override
public List<Class<?>> autoImportStaticMethodClasses() {
return List.of(ComputeFunctions.class);
return List.of(ComputeFunctions.class, RelevancyFunctions.class);
}
}

View File

@ -131,30 +131,6 @@ public class ComputeFunctions {
return (double) intersection / (double) actual.length;
}
/**
* Compute the intersection of two long arrays
*/
public static long[] intersection(long[] a, long[] b) {
return Intersections.find(a, b);
}
/**
* Compute the intersection of two int arrays
*/
public static int[] intersection(int[] a, int[] b) {
return Intersections.find(a, b);
}
/**
* Compute the size of the intersection of two int arrays
*/
public static int intersectionSize(int[] a, int[] b) {
return Intersections.count(a, b);
}
public static int intersectionSize(long[] a, long[] b) {
return Intersections.count(a, b);
}
public static double F1(int[] relevant, int[] actual) {
return F1(relevant, actual, relevant.length);
@ -229,6 +205,9 @@ public class ComputeFunctions {
return stats.getAverage();
}
public static double average_precision(long[] relevant, long[] actual) {
return average_precision(relevant, actual, actual.length);
}
public static double average_precision(long[] relevant, long[] actual, int k) {
int maxK = Math.min(k,actual.length);
HashSet<Long> refset = new HashSet<>(relevant.length);

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.relavency.*;
import java.util.Map;
/**
 * Static factory methods for the relevancy function types in the
 * {@code relavency} package. Each factory forwards its arguments unchanged to
 * the matching constructor; the constructor is responsible for recording
 * {@code k} as a label.
 */
public class RelevancyFunctions {

    /**
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link Recall}
     */
    public static Recall recall(String name, int k, Map<String,String> labels) {
        return new Recall(name, k, labels);
    }

    /**
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link Precision}
     */
    public static Precision precision(String name, int k, Map<String,String> labels) {
        return new Precision(name, k, labels);
    }

    /**
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link F1}
     */
    public static F1 F1(String name, int k, Map<String,String> labels) {
        return new F1(name, k, labels);
    }

    /**
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link AveragePrecision}
     */
    public static AveragePrecision average_precision(String name, int k, Map<String,String> labels) {
        return new AveragePrecision(name, k, labels);
    }

    /**
     * Same as {@link #rank_reciprocal(String, int, Map)}, under the name that
     * matches {@code ComputeFunctions.reciprocal_rank}.
     *
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link ReciprocalRank}
     */
    public static ReciprocalRank reciprocal_rank(String name, int k, Map<String,String> labels) {
        return new ReciprocalRank(name, k, labels);
    }

    /**
     * @param name the name given to the function
     * @param k the cutoff handed to the function
     * @param labels additional labels for the function
     * @return a new {@link ReciprocalRank}
     */
    public static ReciprocalRank rank_reciprocal(String name, int k, Map<String,String> labels) {
        // Pass the map straight through like the sibling factories do. Wrapping it in an
        // NBLabels first handed the constructor a single non-Map object, which presumably
        // selected the Object... overload as an odd-length key/value list.
        return new ReciprocalRank(name, k, labels);
    }
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.ComputeFunctions;
import java.util.Map;
/**
 * A relevancy function which delegates to
 * {@code ComputeFunctions.average_precision(relevant, actual, k)} with a fixed
 * {@code k}, and carries {@code k} as a label.
 */
public class AveragePrecision extends BaseRelevancyFunction {
    private final int k;

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels, added after "k"
     */
    public AveragePrecision(String name, int k, Map<String,String> labels) {
        // The "k" label must hold the cutoff itself; the labels map was previously
        // passed as its value, which stringified the whole map into the label.
        // NOTE(review): BaseRelevancyFunction declares (String, Object...) and
        // (String, Map) constructors - confirm an NBLabels argument binds as intended.
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels as alternating keys and values
     */
    public AveragePrecision(String name, int k, Object... labels) {
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the value computed by {@code ComputeFunctions.average_precision} for this {@code k}
     */
    @Override
    public double apply(int[] relevant, int[] actual) {
        return ComputeFunctions.average_precision(relevant,actual,k);
    }
}

View File

@ -14,33 +14,32 @@
* limitations under the License.
*/
package io.nosqlbench.api.engine.metrics.instruments;
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.api.engine.metrics.wrappers.RelevancyFunction;
import java.util.function.DoubleConsumer;
import java.util.Map;
public class CompoundGaugeFunction implements NBMetricGauge<Double>, DoubleConsumer {
private final NBLabels labels;
public abstract class BaseRelevancyFunction implements RelevancyFunction {
private final String name;
public CompoundGaugeFunction(NBLabels labels, String name) {
this.labels = labels;
private final NBLabels labels;
public BaseRelevancyFunction(String name, Object... labeldata) {
this.name = name;
this.labels = NBLabels.forKV(labeldata);
}
@Override
public Double getValue() {
return null;
public BaseRelevancyFunction(String name, Map<String,String> labels) {
this.name = name;
this.labels = NBLabels.forMap(labels);
}
@Override
public NBLabels getLabels() {
return null;
return this.labels;
}
@Override
public void accept(double value) {
public String getName() {
return this.name;
}
}

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.ComputeFunctions;
import java.util.Map;
/**
 * A relevancy function which delegates to
 * {@code ComputeFunctions.F1(relevant, actual, k)} with a fixed {@code k},
 * and carries {@code k} as a label.
 */
public class F1 extends BaseRelevancyFunction {
    private final int k;

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels, added after "k"
     */
    public F1(String name, int k, Map<String,String> labels) {
        // The "k" label must hold the cutoff itself; the labels map was previously
        // passed as its value, which stringified the whole map into the label.
        // NOTE(review): BaseRelevancyFunction declares (String, Object...) and
        // (String, Map) constructors - confirm an NBLabels argument binds as intended.
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels as alternating keys and values
     */
    public F1(String name, int k, Object... labels) {
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the value computed by {@code ComputeFunctions.F1} for this {@code k}
     */
    @Override
    public double apply(int[] relevant, int[] actual) {
        return ComputeFunctions.F1(relevant,actual,k);
    }
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.ComputeFunctions;
import java.util.Map;
/**
 * A relevancy function which delegates to
 * {@code ComputeFunctions.precision(relevant, actual, k)} with a fixed
 * {@code k}, and carries {@code k} as a label.
 */
public class Precision extends BaseRelevancyFunction {
    private final int k;

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels, added after "k"
     */
    public Precision(String name, int k, Map<String,String> labels) {
        // The "k" label must hold the cutoff itself; the labels map was previously
        // passed as its value, which stringified the whole map into the label.
        // NOTE(review): BaseRelevancyFunction declares (String, Object...) and
        // (String, Map) constructors - confirm an NBLabels argument binds as intended.
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels as alternating keys and values
     */
    public Precision(String name, int k, Object... labels) {
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the value computed by {@code ComputeFunctions.precision} for this {@code k}
     */
    @Override
    public double apply(int[] relevant, int[] actual) {
        return ComputeFunctions.precision(relevant, actual, k);
    }
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.ComputeFunctions;
import java.util.Map;
/**
 * A relevancy function which delegates to
 * {@code ComputeFunctions.recall(relevant, actual, k)} with a fixed {@code k},
 * and carries {@code k} as a label.
 */
public class Recall extends BaseRelevancyFunction {
    private final int k;

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labeldata additional labels as alternating keys and values
     */
    public Recall(String name, int k, Object...labeldata) {
        super(name, NBLabels.forKV("k",k).andTypes(labeldata));
        this.k = k;
    }

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels, added after "k"
     */
    public Recall(String name, int k, Map<String,String> labels) {
        // The "k" label must hold the cutoff itself; the labels map was previously
        // passed as its value, which stringified the whole map into the label.
        // NOTE(review): BaseRelevancyFunction declares (String, Object...) and
        // (String, Map) constructors - confirm an NBLabels argument binds as intended.
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the value computed by {@code ComputeFunctions.recall} for this {@code k}
     */
    @Override
    public double apply(int[] relevant, int[] actual) {
        return ComputeFunctions.recall(relevant,actual,k);
    }
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.engine.extensions.computefunctions.relavency;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.engine.extensions.computefunctions.ComputeFunctions;
import java.util.Map;
/**
 * A relevancy function which delegates to
 * {@code ComputeFunctions.reciprocal_rank(relevant, actual, k)} with a fixed
 * {@code k}, and carries {@code k} as a label.
 */
public class ReciprocalRank extends BaseRelevancyFunction {
    private final int k;

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels, added after "k"
     */
    public ReciprocalRank(String name, int k, Map<String,String> labels) {
        // The "k" label must hold the cutoff itself; the labels map was previously
        // passed as its value, which stringified the whole map into the label.
        // NOTE(review): BaseRelevancyFunction declares (String, Object...) and
        // (String, Map) constructors - confirm an NBLabels argument binds as intended.
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param name the name of this function
     * @param k the limit passed to the computation, also recorded under label "k"
     * @param labels additional labels as alternating keys and values
     */
    public ReciprocalRank(String name, int k, Object... labels) {
        super(name, NBLabels.forKV("k",k).andTypes(labels));
        this.k = k;
    }

    /**
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the value computed by {@code ComputeFunctions.reciprocal_rank} for this {@code k}
     */
    @Override
    public double apply(int[] relevant, int[] actual) {
        return ComputeFunctions.reciprocal_rank(relevant,actual,k);
    }
}

View File

@ -21,8 +21,11 @@ import io.nosqlbench.api.config.LabeledScenarioContext;
import io.nosqlbench.api.config.NBLabeledElement;
import io.nosqlbench.api.engine.metrics.ActivityMetrics;
import io.nosqlbench.api.engine.metrics.DoubleSummaryGauge;
import io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures;
import org.apache.logging.log4j.Logger;
import java.util.Map;
public class ScriptingMetrics {
private final Logger logger;
private final MetricRegistry metricRegistry;
@ -53,6 +56,10 @@ public class ScriptingMetrics {
return summaryGauge;
}
public RelevancyMeasures newRelevancyMeasures(NBLabeledElement parent, Map<String,String> labels) {
return new RelevancyMeasures(parent,labels);
}
}

View File

@ -124,7 +124,7 @@ public class MapLabels implements NBLabels {
}
@Override
public MapLabels andTypes(final String... labelsAndValues) {
public MapLabels andTypes(final Object... labelsAndValues) {
final Map<String,String> childLabels = getStringStringMap(labelsAndValues);
return new MapLabels(labels,childLabels);
}
@ -136,7 +136,7 @@ public class MapLabels implements NBLabels {
@Override
public MapLabels andInstances(final String... labelsAndValues) {
public MapLabels andInstances(final Object... labelsAndValues) {
final Map<String,String> childLabels = getStringStringMap(labelsAndValues);
String[] childInstanceFields = getNamesArray(labelsAndValues);
return new MapLabels(this.labels,childLabels,concat(this.instanceFields,getNamesArray(labelsAndValues)));
@ -228,19 +228,19 @@ public class MapLabels implements NBLabels {
return c;
}
private static String[] getNamesArray(final String... labelsAndValues) {
private static String[] getNamesArray(final Object... labelsAndValues) {
String[] keys = new String[labelsAndValues.length>>1];
for (int i = 0; i < keys.length; i++) {
keys[i]=labelsAndValues[i<<1];
keys[i]=labelsAndValues[i<<1].toString();
}
return keys;
}
@NotNull
private static Map<String, String> getStringStringMap(String[] labelsAndValues) {
private static Map<String, String> getStringStringMap(Object[] labelsAndValues) {
if (0 != (labelsAndValues.length % 2))
throw new RuntimeException("Must provide even number of keys and values: " + Arrays.toString(labelsAndValues));
final Map<String, String> childLabels = new LinkedHashMap<>();
for (int i = 0; i < labelsAndValues.length; i+=2) childLabels.put(labelsAndValues[i], labelsAndValues[i + 1]);
for (int i = 0; i < labelsAndValues.length; i+=2) childLabels.put(labelsAndValues[i].toString(), labelsAndValues[i + 1].toString());
return childLabels;
}

View File

@ -105,11 +105,11 @@ public interface NBLabels {
* Keys and values such as "key1", "value1", "key2", "value2", ...
* @return a new NBLabels instance
*/
static NBLabels forKV(final String... keysAndValues) {
static NBLabels forKV(final Object... keysAndValues) {
if (0 != (keysAndValues.length % 2))
throw new RuntimeException("keys and values must be provided in pairs, not as: " + Arrays.toString(keysAndValues));
final LinkedHashMap<String,String> labels = new LinkedHashMap<>(keysAndValues.length >> 1);
for (int i = 0; i < keysAndValues.length; i += 2) labels.put(keysAndValues[i], keysAndValues[i + 1]);
for (int i = 0; i < keysAndValues.length; i += 2) labels.put(keysAndValues[i].toString(), keysAndValues[i + 1].toString());
return new MapLabels(labels);
}
@ -142,7 +142,7 @@ public interface NBLabels {
* Keys and values in "key1", "value1", "key2", "value2", ... form
* @return A new NBLabels instance
*/
NBLabels andTypes(String... typeLabelsAndValues);
NBLabels andTypes(Object... typeLabelsAndValues);
NBLabels and(NBLabels labels);
/**
@ -151,7 +151,7 @@ public interface NBLabels {
* @return A new NBLabels instance
*/
NBLabels andTypes(Map<String, String> typeLabelsAndValues);
NBLabels andInstances(String... instanceLabelsAndValues);
NBLabels andInstances(Object... instanceLabelsAndValues);
NBLabels andInstances(Map<String,String> instanceLabelsAndValues);

View File

@ -0,0 +1,24 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.api.engine.metrics.wrappers;
import io.nosqlbench.api.config.NBLabeledElement;
/**
 * A named, labeled function which reduces two int arrays to a single double.
 * {@link RelevancyMeasures} registers one summary gauge per function under
 * {@link #getName()} and feeds it the result of {@link #apply(int[], int[])}.
 */
public interface RelevancyFunction extends NBLabeledElement {

    /**
     * @return the name under which this function's values are recorded
     */
    String getName();

    /**
     * Evaluate this function for one pair of index arrays.
     *
     * @param relevant the expected indices
     * @param actual the observed indices
     * @return the computed measure
     */
    double apply(int[] relevant, int[] actual);
}

View File

@ -0,0 +1,68 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.api.engine.metrics.wrappers;
import io.nosqlbench.api.config.NBLabeledElement;
import io.nosqlbench.api.config.NBLabels;
import io.nosqlbench.api.engine.metrics.ActivityMetrics;
import io.nosqlbench.api.engine.metrics.DoubleSummaryGauge;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Holds a list of {@link RelevancyFunction}s together with one
 * {@link DoubleSummaryGauge} per function. Each call to
 * {@link #accept(int[], int[])} evaluates every registered function and pushes
 * the result into the gauge registered alongside it.
 *
 * <p>The function and gauge lists are plain {@link ArrayList}s kept in
 * lock-step by index; no synchronization is applied here.</p>
 */
public class RelevancyMeasures implements NBLabeledElement {

    private final NBLabeledElement parent;
    private final NBLabels labels;
    // functions.get(i) reports into gauges.get(i)
    private final List<RelevancyFunction> functions = new ArrayList<>();
    private final List<DoubleSummaryGauge> gauges = new ArrayList<>();

    /**
     * @param parent the element whose labels prefix this element's labels
     */
    public RelevancyMeasures(NBLabeledElement parent) {
        this(parent,NBLabels.forKV());
    }

    /**
     * @param parent the element whose labels prefix this element's labels
     * @param labels labels appended to the parent's labels
     */
    public RelevancyMeasures(NBLabeledElement parent, NBLabels labels) {
        this.parent = parent;
        this.labels = labels;
    }

    /**
     * @param parent the element whose labels prefix this element's labels
     * @param labels labels appended to the parent's labels
     */
    public RelevancyMeasures(NBLabeledElement parent, Map<String,String> labels) {
        this(parent,NBLabels.forMap(labels));
    }

    /**
     * @return the parent's labels combined with this element's own labels
     */
    @Override
    public NBLabels getLabels() {
        return parent.getLabels().and(labels);
    }

    /**
     * Register functions and create a summary gauge for each, named after the function.
     *
     * @param f the functions to add, in the order they should be evaluated
     * @return this instance, for chaining
     */
    public RelevancyMeasures addFunction(RelevancyFunction... f) {
        // Iterate the supplied functions. Looping over this.functions (as before) never
        // registered anything while the list was empty, and would have modified the list
        // during its own iteration otherwise.
        for (RelevancyFunction function : f) {
            this.functions.add(function);
            DoubleSummaryGauge gauge = ActivityMetrics.summaryGauge(function, function.getName());
            this.gauges.add(gauge);
        }
        return this;
    }

    /**
     * Evaluate every registered function for the given arrays and record each result.
     *
     * @param relevant the expected indices
     * @param actual the observed indices
     */
    public void accept(int[] relevant, int[] actual) {
        for (int i = 0; i < functions.size(); i++) {
            double metricValue = functions.get(i).apply(relevant, actual);
            gauges.get(i).accept(metricValue);
        }
    }
}