mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2024-12-23 15:40:44 -06:00
workload in dev
This commit is contained in:
parent
b8c9af6e71
commit
4bb371b276
@ -0,0 +1,155 @@
|
|||||||
|
description: |
|
||||||
|
This is a template for live vector search testing.
|
||||||
|
|
||||||
|
schema: Install the schema required to run the test
|
||||||
|
rampup: Measure how long it takes to load a set of embeddings
|
||||||
|
search_and_index: Measure how the system responds to queries while it
|
||||||
|
is indexing recently ingested data.
|
||||||
|
#? await_index: Pause and wait for the system to complete compactions or index processing
|
||||||
|
search: Run vector search with a set of default (or overridden) parameters
|
||||||
|
search_and_rewrite: Run the same search operations as above, but while rewriting the data
|
||||||
|
search_and_invalidate: Run the same search operations as above, but while overwriting the data
|
||||||
|
with different content using the same vector id.
|
||||||
|
In all of these phases, it is important to instantiate the metrics with distinct names.
|
||||||
|
Also, aggregates of recall should include the total aggregate as well as a moving average.
|
||||||
|
|
||||||
|
scenarios:
  # Named scenarios for a self-managed Cassandra target. Each step runs the
  # ops of the workload block selected by its `tags` filter.
  cassandra:
    # nb5 cql-vector cassandra.schema host=localhost localdc=datacenter1 dimensions=100
    schema: run tags='block:schema' threads==undef cycles==undef
    # nb5 cql-vector cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
    rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn
    # nb5 cql-vector cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
    search_and_index: >-
      run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
      cycles=TEMPLATE(testsize) stride=100 striderate=7.50
      errors=counter,warn threads=1
  # Named scenarios for an Astra target; every step is labeled target:astra.
  astra_vectors:
    drop: run tags='block:drop' labels='target:astra' threads===1 cycles===2 driverconfig=app.conf
    schema: run tags='block:schema' labels='target:astra' threads===1 cycles===2
    rampup: run tags='block:rampup' labels='target:astra' threads=100 cycles=TEMPLATE(trainsize) errors=counter
    # search_and_index_unthrottled: >-
    #   run tags='block:search_and_index,optype=select' labels='target:astra'
    #   cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
    # A single errors= spec: this step previously passed errors= twice
    # (`count,retry` and `counter`). The two are merged here, using the
    # `counter` handler name that the other steps in this file use.
    # NOTE(review): confirm that retrying failed ops is intended at 500 threads.
    search_and_index: >-
      run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:astra'
      cycles=TEMPLATE(testsize) stride=100 striderate=7.50
      errors=counter,retry threads=500
    # one activity or two? data leap-frog? or concurrency separate for both?
    # await_index: run tags='block:await_index' # This would need to exit when a condition is met
    # stop_search_and_index: stop search_and_index
    # only possible if we have a triggering event to indicate
    # live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
    search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
    search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
|
||||||
|
|
||||||
|
# Workload-wide activity parameters.
params:
  # Driver adapter that executes the ops in this workload.
  driver: cqld4
  # NOTE(review): presumably turns on per-op metrics instrumentation — confirm.
  instrument: true
|
||||||
|
|
||||||
|
# Named data bindings referenced from op templates as {name}.
bindings:
  # String form of the binding input; used as the row key.
  id: ToString()
  # Vectors and ground truth read from testdata/<dataset>.hdf5, where
  # <dataset> is supplied through the TEMPLATE(dataset) parameter.
  # Query vectors from the "/test" dataset, converted for use in CQL.
  test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
  # Expected neighbor ids from the "/neighbors" dataset.
  relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
  # Values from the "/distance" dataset.
  distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/distance")
  # Vectors to load, from the "/train" dataset, converted for use in CQL.
  train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
  # Generated vectors with TEMPLATE(dimensions) components; not read from the HDF5 file.
  synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
|
||||||
|
|
||||||
|
# Op blocks. Scenario steps select a block with tags='block:<name>'.
blocks:
  # Tear down the vector index and table so a run can start clean.
  drop:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      drop_index:
        # The index is named <table>_value_idx, matching the name given in
        # create_sai_index. Dropping an index named after the table itself
        # would never match and would be silently skipped by IF EXISTS.
        raw: |
          DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)_value_idx;
      drop_table:
        raw: |
          DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
  # Create the keyspace, the vector table and its storage-attached index.
  schema:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      create_keyspace:
        raw: |
          CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
          WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
        # Placed under tags, like the target tag on the sibling ops.
        tags:
          target: cassandra
      create_table:
        raw: |
          CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
            key TEXT,
            value vector<float,TEMPLATE(dimensions)>,
            PRIMARY KEY (key)
          );
        tags:
          target: astra
      create_sai_index:
        # The index name is explicit so that drop_index can refer to it.
        raw: |
          CREATE CUSTOM INDEX IF NOT EXISTS TEMPLATE(table,vectors)_value_idx ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
          WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
        # WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
        tags:
          target: astra
  # Load the training vectors, one row per cycle.
  rampup:
    params:
      cl: TEMPLATE(write_cl,LOCAL_QUORUM)
      prepared: true
    ops:
      insert: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{train_floatlist});
  # await_index:
  #   ops:
  # ANN queries scored for relevancy, alongside inserts of the training data.
  # Ops carry an optype tag so a scenario can select only the selects
  # (tags='block:search_and_index,optype=select').
  search_and_index:
    ops:
      select_ann_limit_TEMPLATE(k,100):
        prepared: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        tags:
          optype: select
        # Registers recall, precision, F1, reciprocal rank and average
        # precision at k as relevancy functions for this op.
        verifier-init: |
          k=TEMPLATE(k,100)
          relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
        # Feeds the expected neighbor ids and the returned "key" column values
        # to the relevancy functions; always returns true.
        verifier: |
          actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
          relevancy.accept({relevant_indices},actual_indices);
          return true;
      insert_rewrite:
        prepared: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
        tags:
          optype: insert

  # ANN queries while the same rows are written again with the same content.
  # The placeholders use bindings that this workload defines (test_floatlist,
  # id, train_floatlist); the earlier names test_vector, rw_key and
  # train_vector have no binding in this file.
  search_and_rewrite:
    ops:
      select_ann_limit:
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        verifier-init: |
          scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
        # verifier: |
      upsert_same:
        stmt: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
  # ANN queries while existing keys are overwritten with different content.
  search_and_invalidate:
    ops:
      select_ann_limit:
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        # verifier-init: |
        # verifier: |
      # NOTE(review): synthetic_vectors is the only defined binding that is not
      # read from the dataset, so it is used here to supply different content.
      # Confirm that its value type is accepted for the vector column, or add
      # a conversion to the binding.
      upsert_random: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{synthetic_vectors});
|
||||||
|
|
||||||
|
|
20
adapter-cqld4/src/main/resources/activities/baselinesv2/get_datasets.sh
Executable file
20
adapter-cqld4/src/main/resources/activities/baselinesv2/get_datasets.sh
Executable file
@ -0,0 +1,20 @@
|
|||||||
|
#!/bin/bash
# Download ANN benchmark datasets (HDF5 files) into ./testdata, relative to
# the directory this script is run from.
#
# The dataset list can be overridden by defining DATASETS in an optional
# testdata/_env.sh file, which is sourced before any download starts.
#
# Exit status: 0 when every dataset downloaded, 1 when the target directory
# could not be prepared or any download failed.

DATASETS="glove-25-angular glove-50-angular glove-100-angular glove-200-angular deep-image-96-angular lastfm-64-dot"

# Stop early rather than downloading into the wrong directory.
mkdir -p testdata || exit 1
cd testdata || exit 1

if [ -f _env.sh ]
then
  # Explicit ./ so the file just tested is the one sourced, not one on PATH.
  . ./_env.sh
fi

# Validate only; _env.sh may have unset or emptied the list.
: "${DATASETS:?is required}"

failed=0
# DATASETS is intentionally unquoted: it is a whitespace-separated list.
for dataset in ${DATASETS}
do
  URL="http://ann-benchmarks.com/${dataset}.hdf5"
  # --fail keeps an HTTP error page from being saved as a .hdf5 file.
  if ! curl --fail -OL "${URL}"
  then
    echo "failed to download ${URL}" >&2
    failed=1
  fi
done

exit "${failed}"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user