mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
update vec workload
This commit is contained in:
parent
093e5c9b93
commit
895d7d6659
@ -1,10 +1,14 @@
|
|||||||
min_version: 5.21
|
min_version: 5.21
|
||||||
description: |
|
description: |
|
||||||
This is a template for live vector search testing.
|
This is a template for live vector search testing.
|
||||||
|
Key parameters:
|
||||||
|
trainsize: TEMPLATE(trainsize)
|
||||||
|
testsize: TEMPLATE(testsize)
|
||||||
|
source_model: TEMPLATE(other)
|
||||||
|
|
||||||
schema: Install the schema required to run the test
|
schema: Install the schema required to run the test
|
||||||
rampup: Measure how long it takes to load a set of embeddings
|
rampup: Measure how long it takes to load a set of embeddings
|
||||||
search_and_index: Measure how the system responds to queries while it
|
search_and_verify: Measure how the system responds to queries while it
|
||||||
is indexing recently ingested data.
|
is indexing recently ingested data.
|
||||||
#? await_index: Pause and wait for the system to complete compactions or index processing
|
#? await_index: Pause and wait for the system to complete compactions or index processing
|
||||||
search: Run vector search with a set of default (or overridden) parameters
|
search: Run vector search with a set of default (or overridden) parameters
|
||||||
@ -15,32 +19,157 @@ description: |
|
|||||||
Also, aggregates of recall should include total aggregate as well as a moving average.
|
Also, aggregates of recall should include total aggregate as well as a moving average.
|
||||||
|
|
||||||
scenarios:
|
scenarios:
|
||||||
cassandra:
|
|
||||||
drop: run tags='block:drop' threads==undef cycles==undef context=cassandra
|
default:
|
||||||
# nb5 cql-vector2 cassandra.schema host=localhost localdc=datacenter1 dimensions=100
|
# Remove any existing data
|
||||||
schema: run tags='op=create_.*' threads==undef cycles==undef context=cassandra
|
drop: >-
|
||||||
# nb5 cql-vector2 cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
|
run tags='block:drop' threads===1 cycles===UNDEF
|
||||||
rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn context=cassandra
|
errors=count
|
||||||
# nb5 cql-vector2 cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
|
|
||||||
read_recall: >-
|
# Install the schema required to run the test
|
||||||
run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
|
schema_ks: >-
|
||||||
cycles=TEMPLATE(testsize) errors=counter,warn threads=1
|
run tags='block:schema_ks' threads===1 cycles===UNDEF
|
||||||
|
schema: >-
|
||||||
|
run tags='block:schema' threads===1 cycles===UNDEF
|
||||||
|
|
||||||
|
# Truncate any data before loading
|
||||||
|
# truncate: run tags='block:truncate' threads===1 cycles===UNDEF
|
||||||
|
|
||||||
|
# Load training data, measure how long it takes to load
|
||||||
|
rampup: >-
|
||||||
|
run tags='block:rampup' threads=TEMPLATE(rampup_threads,auto)
|
||||||
|
cycles===TEMPLATE(rampup_cycles,TEMPLATE(trainsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
# Measure how the system responds to queries under a read only workload
|
||||||
|
search_and_verify: >-
|
||||||
|
run alias=search_and_verify tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
verify_recall: >-
|
||||||
|
run alias=verify_recall tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
astra_vectors:
|
astra_vectors:
|
||||||
drop: run tags='block:drop' tags='block:drop' threads==undef cycles==undef
|
|
||||||
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
|
# Remove any existing data
|
||||||
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
|
drop: >-
|
||||||
# search_and_index_unthrottled: >-
|
run tags='block:drop' threads===1 cycles===UNDEF
|
||||||
# run tags='block:search_and_index,optype=select' labels='target:astra'
|
errors=count
|
||||||
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
|
|
||||||
testann: >-
|
# Install the schema required to run the test
|
||||||
run tags='block:testann' cycles=TEMPLATE(testsize) errors=count,retry maxtries=2 threads=auto
|
schema_ks: >-
|
||||||
# one activity or two? data leap-frog? or concurrency separate for both?
|
run tags='block:schema_ks' threads===1 cycles===UNDEF
|
||||||
# await_index: run tags='block:await_index' # This would need to exit when a condition is met
|
schema: >-
|
||||||
# stop_search_and_index: stop search_and_index
|
run tags='block:schema' threads===1 cycles===UNDEF
|
||||||
# only possible if we have a triggering event to indicate it
|
|
||||||
# live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
|
# Truncate any data before loading
|
||||||
search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
|
# truncate: run tags='block:truncate' threads===1 cycles===UNDEF
|
||||||
search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
|
|
||||||
|
# Load training data, measure how long it takes to load
|
||||||
|
rampup: >-
|
||||||
|
run tags='block:rampup' threads=TEMPLATE(rampup_threads,auto)
|
||||||
|
cycles===TEMPLATE(rampup_cycles,TEMPLATE(trainsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
# Measure how the system responds to queries under a read only workload
|
||||||
|
search_and_verify: >-
|
||||||
|
run alias=search_and_verify tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
verify_recall: >-
|
||||||
|
run alias=verify_recall tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
astra_vectors_with_source_model:
|
||||||
|
|
||||||
|
# Remove any existing data
|
||||||
|
drop: >-
|
||||||
|
run tags='block:drop' threads===1 cycles===UNDEF
|
||||||
|
errors=count
|
||||||
|
|
||||||
|
# Install the schema required to run the test
|
||||||
|
schema: >-
|
||||||
|
run tags='block:schema_with_source_model' threads===1 cycles===UNDEF
|
||||||
|
|
||||||
|
# Truncate any data before loading
|
||||||
|
# truncate: run tags='block:truncate' threads===1 cycles===UNDEF
|
||||||
|
|
||||||
|
# Load training data, measure how long it takes to load
|
||||||
|
rampup: >-
|
||||||
|
run tags='block:rampup' threads=TEMPLATE(rampup_threads,auto)
|
||||||
|
cycles===TEMPLATE(rampup_cycles,TEMPLATE(trainsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
# Measure how the system responds to queries under a read only workload
|
||||||
|
search_and_verify: >-
|
||||||
|
run alias=search_and_verify tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
verify_recall: >-
|
||||||
|
run alias=verify_recall tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
astra_vectors_mixed_workload:
|
||||||
|
# Measure how the system responds to queries while
|
||||||
|
# it is indexing recently ingested data
|
||||||
|
search_and_verify: >-
|
||||||
|
run alias=search_and_verify tags='block:search_and_verify'
|
||||||
|
cycles===TEMPLATE(search_cycles) errors=count,retry stride=100 striderate=7.50
|
||||||
|
errors=counter threads=500
|
||||||
|
|
||||||
|
# search_and_rewrite: run tags='block:search_and_rewrite'
|
||||||
|
# search_and_invalidate: run tags='block:search_and_invalidate'
|
||||||
|
|
||||||
|
optimize:
|
||||||
|
# Remove any existing data
|
||||||
|
drop: >-
|
||||||
|
run tags='block:drop' threads===1 cycles===UNDEF
|
||||||
|
errors=count
|
||||||
|
|
||||||
|
# Install the schema required to run the test
|
||||||
|
schema: >-
|
||||||
|
run tags='block:schema' threads===1 cycles===UNDEF
|
||||||
|
|
||||||
|
# Load training data, measure how long it takes to load
|
||||||
|
rampup: >-
|
||||||
|
run tags='block:rampup' threads=TEMPLATE(rampup_threads,auto)
|
||||||
|
cycles===TEMPLATE(rampup_cycles,TEMPLATE(trainsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
# Start the read-only vector query workload
|
||||||
|
search_and_verify: >-
|
||||||
|
start alias=search_and_verify tags='block:search_and_verify,optype=select'
|
||||||
|
threads=TEMPLATE(search_threads,auto) cycles===TEMPLATE(search_cycles,TEMPLATE(testsize))
|
||||||
|
errors=count,warn
|
||||||
|
|
||||||
|
# Find the optimal rate for the search workload
|
||||||
|
findmax: >-
|
||||||
|
findmax activity=search_and_verify
|
||||||
|
base_value=200
|
||||||
|
step_value=50
|
||||||
|
min_frames=10
|
||||||
|
optimization_type=rate
|
||||||
|
|
||||||
|
# Optimize the search workload
|
||||||
|
optimo: >-
|
||||||
|
optimo activity=search_and_verify
|
||||||
|
startrate=${findmax.rate}
|
||||||
|
sample_time_ms=1000
|
||||||
|
|
||||||
|
# Retest the search workload with the optimized rate and thread count
|
||||||
|
retest: >-
|
||||||
|
reset activity=search_and_verify
|
||||||
|
threads==${optimo.threads}
|
||||||
|
rate==${optimo.rate}
|
||||||
|
|
||||||
params:
|
params:
|
||||||
driver: cqld4
|
driver: cqld4
|
||||||
@ -48,43 +177,70 @@ params:
|
|||||||
|
|
||||||
bindings:
|
bindings:
|
||||||
id: ToString()
|
id: ToString()
|
||||||
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/test"); ToCqlVector();
|
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
|
||||||
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(datafile).hdf5", "/neighbors")
|
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
|
||||||
distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/distance")
|
distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/distances")
|
||||||
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/train"); ToCqlVector();
|
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
|
||||||
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
|
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
|
||||||
|
|
||||||
blocks:
|
blocks:
|
||||||
drop:
|
drop:
|
||||||
params:
|
params:
|
||||||
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||||
|
prepared: false
|
||||||
|
timeout: 600
|
||||||
ops:
|
ops:
|
||||||
drop_index:
|
drop_index: |
|
||||||
raw: |
|
DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)_value_idx;
|
||||||
DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
drop_table: |
|
||||||
drop_table:
|
DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
||||||
raw: |
|
truncate:
|
||||||
DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
params:
|
||||||
|
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||||
|
prepared: false
|
||||||
|
timeout: 600
|
||||||
|
ops:
|
||||||
|
truncate_table: |
|
||||||
|
truncate TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
||||||
|
|
||||||
|
schema_ks:
|
||||||
|
params:
|
||||||
|
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||||
|
prepared: false
|
||||||
|
ops:
|
||||||
|
create_keyspace: |
|
||||||
|
create keyspace if not exists TEMPLATE(keyspace,baselines)
|
||||||
|
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 'TEMPLATE(rf:1)'}
|
||||||
|
AND durable_writes = true;
|
||||||
|
|
||||||
schema:
|
schema:
|
||||||
params:
|
params:
|
||||||
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||||
|
prepared: false
|
||||||
ops:
|
ops:
|
||||||
create_keyspace:
|
create_table: |
|
||||||
raw: |
|
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
|
||||||
CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
|
key TEXT,
|
||||||
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
|
value vector<float,TEMPLATE(dimensions)>,
|
||||||
create_table:
|
PRIMARY KEY (key)
|
||||||
raw: |
|
);
|
||||||
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
|
create_sai_index: |
|
||||||
key TEXT,
|
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
|
||||||
value vector<float,TEMPLATE(dimensions,set-the-dimensions-template-var)>,
|
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
|
||||||
PRIMARY KEY (key)
|
schema_with_source_model:
|
||||||
);
|
params:
|
||||||
create_sai_index:
|
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||||
raw: |
|
prepared: false
|
||||||
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
|
ops:
|
||||||
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
|
create_table: |
|
||||||
# WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
|
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
|
||||||
|
key TEXT,
|
||||||
|
value vector<float,TEMPLATE(dimensions)>,
|
||||||
|
PRIMARY KEY (key)
|
||||||
|
);
|
||||||
|
create_sai_index: |
|
||||||
|
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
|
||||||
|
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)', 'source_model' : 'TEMPLATE(source_model,other)'};
|
||||||
rampup:
|
rampup:
|
||||||
params:
|
params:
|
||||||
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
|
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
|
||||||
@ -93,9 +249,7 @@ blocks:
|
|||||||
insert: |
|
insert: |
|
||||||
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||||
(key, value) VALUES ({id},{train_floatlist});
|
(key, value) VALUES ({id},{train_floatlist});
|
||||||
# await_index:
|
search_and_verify:
|
||||||
# ops:
|
|
||||||
testann:
|
|
||||||
ops:
|
ops:
|
||||||
select_ann_limit_TEMPLATE(k,100):
|
select_ann_limit_TEMPLATE(k,100):
|
||||||
prepared: |
|
prepared: |
|
||||||
@ -105,14 +259,14 @@ blocks:
|
|||||||
optype: select
|
optype: select
|
||||||
verifier-init: |
|
verifier-init: |
|
||||||
k=TEMPLATE(k,100)
|
k=TEMPLATE(k,100)
|
||||||
relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
|
relevancy=new io.nosqlbench.nb.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op)
|
||||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
|
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
|
||||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
|
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
|
||||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
|
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
|
||||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
|
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
|
||||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
|
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
|
||||||
verifier: |
|
verifier: |
|
||||||
actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
|
actual_indices=cql_utils.cqlStringColumnToIntArray("key",result);
|
||||||
relevancy.accept({relevant_indices},actual_indices);
|
relevancy.accept({relevant_indices},actual_indices);
|
||||||
return true;
|
return true;
|
||||||
insert_rewrite:
|
insert_rewrite:
|
||||||
|
Loading…
Reference in New Issue
Block a user