add file format variant for vecs vs hdf5

This commit is contained in:
Jonathan Shook 2023-10-26 14:37:37 -05:00
parent ece9618344
commit 7ed01e4671

View File

@ -0,0 +1,154 @@
min_version: 5.21
description: |
This is a template for live vector search testing.
schema: Install the schema required to run the test
rampup: Measure how long it takes to load a set of embeddings
search_and_index: Measure how the system responds to queries while it
is indexing recently ingested data.
#? await_index: Pause and wait for the system to complete compactions or index processing
search: Run vector search with a set of default (or overridden) parameters
search_and_rewrite: Run the same search operations as above, but while rewriting the data
search_and_invalidate: Run the same search operations as above, but while overwriting the data
with different content using the same vector id.
In all of these phases, it is important to instance the metrics with distinct names.
Also, aggregates of recall should include total aggregate as well as a moving average.
scenarios:
cassandra:
drop: run tags='block:drop' threads==undef cycles==undef
# nb5 cql-vector2 cassandra.schema host=localhost localdc=datacenter1 dimensions=100
schema: run tags='op=create_.*' threads==undef cycles==undef
# nb5 cql-vector2 cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn
# nb5 cql-vector2 cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
read_recall: >-
run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
cycles=TEMPLATE(testsize) errors=counter,warn threads=1
astra_vectors:
drop: run tags='block:drop' tags='block:drop' threads==undef cycles==undef
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
# search_and_index_unthrottled: >-
# run tags='block:search_and_index,optype=select' labels='target:astra'
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
testann: >-
run tags='block:testann' cycles=TEMPLATE(testsize) errors=count,retry maxtries=2 threads=auto
# one activity or two? data leap-frog? or concurrency separate for both?
# await_index: run tags='block:await_index' # This would need to exit when a condition is met
# stop_search_and_index: stop search_and_index
# only possible if we have a triggering event to indicated
# live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
params:
driver: cqld4
instrument: true
bindings:
id: ToString()
# filetype=hdf5 for TEMPLATE(filetype,hdf5)
test_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/test"); ToCqlVector();
relevant_indices_hdf5: HdfFileToIntArray("testdata/TEMPLATE(datafile).hdf5", "/neighbors")
distance_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/distance")
train_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/train"); ToCqlVector();
# filetype=vecs for TEMPLATE(filetype,vecs)
test_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec"); ToCqlVector();
relevant_indices_vecs: IVecReader("testdata/TEMPLATE(datafile).ivec");
distance_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec");
train_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec"); ToCqlVector();
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
blocks:
drop:
params:
cl: TEMPLATE(cl,LOCAL_QUORUM)
ops:
drop_index:
raw: |
DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
drop_table:
raw: |
DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
schema:
params:
cl: TEMPLATE(cl,LOCAL_QUORUM)
ops:
create_keyspace:
raw: |
CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
create_table:
raw: |
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
key TEXT,
value vector<float,TEMPLATE(dimensions,set-the-dimensions-template-var)>,
PRIMARY KEY (key)
);
create_sai_index:
raw: |
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
# WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
rampup:
params:
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
prepared: true
ops:
insert: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
# await_index:
# ops:
testann:
ops:
select_ann_limit_TEMPLATE(k,100):
prepared: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
ORDER BY value ANN OF {test_floatlist_TEMPLATE(filetype,hdf5)} LIMIT TEMPLATE(select_limit,100);
tags:
optype: select
verifier-init: |
k=TEMPLATE(k,100)
relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
verifier: |
actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
relevancy.accept({relevant_indices_TEMPLATE(filetype,hdf5)},actual_indices);
return true;
insert_rewrite:
prepared: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
tags:
optype: insert
search_and_rewrite:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
verifier-init: |
scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
# verifier: |
upsert_same:
stmt: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});
search_and_invalidate:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
# verifier-init: |
# verifier: |
upsert_random: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});