workload in dev

This commit is contained in:
Jonathan Shook 2023-10-16 22:58:49 -05:00
parent b8c9af6e71
commit 4bb371b276
2 changed files with 175 additions and 0 deletions

View File

@ -0,0 +1,155 @@
# Human-readable overview of the workload: one line per phase, followed by
# general notes on how metrics should be named and aggregated.
description: |
  This is a template for live vector search testing.
  schema: Install the schema required to run the test
  rampup: Measure how long it takes to load a set of embeddings
  search_and_index: Measure how the system responds to queries while it
  is indexing recently ingested data.
  #? await_index: Pause and wait for the system to complete compactions or index processing
  search: Run vector search with a set of default (or overridden) parameters
  search_and_rewrite: Run the same search operations as above, but while rewriting the data
  search_and_invalidate: Run the same search operations as above, but while overwriting the data
  with different content using the same vector id.
  In all of these phases, it is important to instance the metrics with distinct names.
  Also, aggregates of recall should include total aggregate as well as a moving average.
# Named scenarios. Every step is a `run` command that picks op templates out
# of `blocks` through its tags filter (e.g. tags='block:rampup').
# TEMPLATE(name[,default]) values are filled in from command-line parameters.
scenarios:
  cassandra:
    # nb5 cql-vector cassandra.schema host=localhost localdc=datacenter1 dimensions=100
    schema: run tags='block:schema' threads==undef cycles==undef
    # nb5 cql-vector cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
    rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn
    # nb5 cql-vector cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
    search_and_index: >-
      run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
      cycles=TEMPLATE(testsize) stride=100 striderate=7.50
      errors=counter,warn threads=1
  astra_vectors:
    drop: run tags='block:drop' labels='target:astra' threads===1 cycles===2 driverconfig=app.conf
    # NOTE(review): block:schema defines three ops but this step runs only
    # two cycles -- confirm that the index-creating op is actually reached.
    schema: run tags='block:schema' labels='target:astra' threads===1 cycles===2
    rampup: run tags='block:rampup' labels='target:astra' threads=100 cycles=TEMPLATE(trainsize) errors=counter
    # search_and_index_unthrottled: >-
    #   run tags='block:search_and_index,optype=select' labels='target:astra'
    #   cycles=TEMPLATE(testsize) threads=10 stride=500 errors=counter,retry
    # `errors` used to be given twice on this command (count,retry and
    # counter); both intents are now expressed in one handler chain.
    search_and_index: >-
      run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:astra'
      cycles=TEMPLATE(testsize) stride=100 striderate=7.50
      errors=counter,retry threads=500
    # one activity or two? data leap-frog? or concurrency separate for both?
    # await_index: run tags='block:await_index' # This would need to exit when a condition is met
    # stop_search_and_index: stop search_and_index
    # only possible if we have a triggering event to indicate that the condition was met
    # live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
    search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
    search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
# Workload-level parameters: ops use the cqld4 driver adapter and have
# instrumentation switched on.
params:
  driver: cqld4
  instrument: true
# Named value recipes; op templates refer to them as {name}.
# All HDF5-backed recipes read testdata/TEMPLATE(dataset).hdf5.
bindings:
  # Row key: the string form of the recipe's input value.
  id: ToString()
  # Vectors from the "/train" and "/test" datasets, converted to CQL vectors.
  train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
  test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
  # Values from "/neighbors" and "/distance"; relevant_indices feeds the
  # recall verifier in the search_and_index block.
  relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
  distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/distance")
  # Generated vectors of TEMPLATE(dimensions) components (no ToCqlVector step).
  synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
# Op templates grouped by phase. Scenario steps select a group with
# tags='block:<name>'; {name} placeholders resolve against `bindings`.
blocks:
  drop:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      drop_index:
        # create_sai_index does not name its index, so drop the default name
        # derived from table and column rather than an index named like the
        # table. NOTE(review): confirm <table>_value_idx on the target system.
        raw: |
          DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)_value_idx;
      drop_table:
        raw: |
          DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
  schema:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      create_keyspace:
        raw: |
          CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
          WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
        # Nested under `tags` like the sibling ops; it used to be a bare
        # `target` field on the op.
        tags:
          target: cassandra
      create_table:
        raw: |
          CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
          key TEXT,
          value vector<float,TEMPLATE(dimensions)>,
          PRIMARY KEY (key)
          );
        tags:
          target: astra
      create_sai_index:
        raw: |
          CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
          WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
        # WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
        tags:
          target: astra
  rampup:
    params:
      cl: TEMPLATE(write_cl,LOCAL_QUORUM)
      prepared: true
    ops:
      insert: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{train_floatlist});
  # await_index:
  #   ops:
  search_and_index:
    ops:
      # ANN query whose result keys are scored against {relevant_indices}
      # with recall, precision, F1, reciprocal rank and average precision at k.
      select_ann_limit_TEMPLATE(k,100):
        prepared: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        tags:
          optype: select
        verifier-init: |
          k=TEMPLATE(k,100)
          relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
        verifier: |
          actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
          relevancy.accept({relevant_indices},actual_indices);
          return true;
      insert_rewrite:
        prepared: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
        tags:
          optype: insert
  search_and_rewrite:
    ops:
      select_ann_limit:
        # Uses the defined test_floatlist binding; the former {test_vector}
        # placeholder had no matching entry in `bindings`.
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        verifier-init: |
          scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
        # verifier: |
      upsert_same:
        # Re-writes the same key/vector pair that rampup inserts; the former
        # {rw_key} and {train_vector} placeholders were undefined.
        stmt: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
  search_and_invalidate:
    ops:
      select_ann_limit:
        # Uses the defined test_floatlist binding instead of the undefined
        # {test_vector}.
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        # verifier-init: |
        # verifier: |
      # Overwrites an existing key with different content. The former
      # {rw_key} and {train_vector} placeholders were undefined.
      # NOTE(review): synthetic_vectors has no ToCqlVector() step in
      # `bindings` -- confirm the driver accepts it for a vector column.
      upsert_random: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{synthetic_vectors});

View File

@ -0,0 +1,20 @@
#!/bin/bash
# Download ann-benchmarks HDF5 datasets into ./testdata.
#
# DATASETS is a space-separated list of dataset names. The default below can
# be overridden by an optional testdata/_env.sh, which is sourced before the
# downloads start.
# Exit status is non-zero when the directory cannot be entered, DATASETS ends
# up unset or empty, or any download fails.
DATASETS="glove-25-angular glove-50-angular glove-100-angular glove-200-angular deep-image-96-angular lastfm-64-dot"

mkdir -p testdata
# Abort instead of downloading into the wrong directory.
pushd testdata > /dev/null || exit 1

if [ -f _env.sh ]
then
  . _env.sh
fi

# Validate DATASETS itself; the previous form assigned an unused DATASET
# variable and let an empty list pass silently.
: "${DATASETS:?is required}"

status=0
# Unquoted on purpose: DATASETS is split into one name per word.
for dataset in ${DATASETS}
do
  URL="http://ann-benchmarks.com/${dataset}.hdf5"
  # --fail keeps an HTTP error page from being saved as a .hdf5 file.
  if ! curl --fail -OL "${URL}"
  then
    echo "download failed: ${URL}" >&2
    status=1
  fi
done

popd > /dev/null
exit "${status}"