workload in dev

2024-12-24 08:00:00 -06:00 · 2023-10-16 22:58:49 -05:00 · 2023-10-16 22:58:49 -05:00 · 4bb371b276
commit 4bb371b276
parent b8c9af6e71
2 changed files with 175 additions and 0 deletions
--- a/adapter-cqld4/src/main/resources/activities/baselinesv2/cql-vector.yaml
+++ b/adapter-cqld4/src/main/resources/activities/baselinesv2/cql-vector.yaml
@ -0,0 +1,155 @@
 # Human-readable summary of this workload template and of the phases it is meant to cover.
 # Everything indented under `description: |` is literal text, so the `#? await_index` line
 # below is part of the description string, not a YAML comment.
 description: |
  This is a template for live vector search testing.
  schema: Install the schema required to run the test
  rampup: Measure how long it takes to load a set of embeddings
  search_and_index: Measure how the system responds to queries while it
   is indexing recently ingested data.
  #? await_index: Pause and wait for the system to complete compactions or index processing
  search: Run vector search with a set of default (or overridden) parameters
  search_and_rewrite: Run the same search operations as above, but while rewriting the data
  search_and_invalidate: Run the same search operations as above, but while overwriting the data
   with different content using the same vector id.
  In all of these phases, it is important to instance the metrics with distinct names.
  Also, aggregates of recall should include total aggregate as well as a moving average.
 scenarios:
  cassandra:
    # Step 1 - install the schema. Example invocation:
    #   nb5 cql-vector cassandra.schema host=localhost localdc=datacenter1 dimensions=100
    schema: run tags='block:schema' threads==undef cycles==undef
    # Step 2 - load the training embeddings. Example invocation:
    #   nb5 cql-vector cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
    rampup: >-
      run tags='block:rampup' threads=auto
      cycles=TEMPLATE(trainsize,set-the-trainsize)
      errors=counter,warn
    # Step 3 - run the select op of the search_and_index block. Example invocation:
    #   nb5 cql-vector cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
    search_and_index: >-
      run alias=search_and_index
      tags='block:search_and_index,optype=select'
      labels='target:cassandra'
      cycles=TEMPLATE(testsize)
      stride=100 striderate=7.50
      errors=counter,warn
      threads=1
  # Named scenario for the Astra target. Every step passes labels='target:astra'.
  astra_vectors:
    # The drop block has two ops (drop_index, drop_table), matching cycles===2.
    drop: run tags='block:drop' labels='target:astra' threads===1 cycles===2 driverconfig=app.conf
    # Select only the schema ops tagged target:astra (create_table and create_sai_index) so that
    # the two cycles cover both of them. With tags='block:schema' alone the block has three ops,
    # so cycles===2 would run create_keyspace and create_table and never reach the index creation.
    schema: run tags='block:schema,target:astra' labels='target:astra' threads===1 cycles===2
    rampup: run tags='block:rampup' labels='target:astra' threads=100 cycles=TEMPLATE(trainsize) errors=counter
 #    search_and_index_unthrottled: >-
 #      run tags='block:search_and_index,optype=select' labels='target:astra'
 #      cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
    # errors= is given exactly once here; repeating the parameter on one step leaves it unclear
    # which error handler is in effect.
    search_and_index: >-
      run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:astra'
      cycles=TEMPLATE(testsize) stride=100 striderate=7.50
      errors=counter threads=500
    # one activity or two? data leap-frog? or concurrency separate for both?
  #  await_index: run tags='block:await_index' # This would need to exit when a condition is met
  #  stop_search_and_index: stop search_and_index
    # only possible if we have a triggering event to signal that the exit condition was met
    # live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
    search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
    search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
 # Document-level parameters for the ops in this file.
 params:
  # Name of the driver adapter used by these ops.
  driver: cqld4
  # NOTE(review): presumably enables per-op metrics instrumentation - confirm against the NoSQLBench docs.
  instrument: true
 # Data bindings referenced as {name} in the op templates below.
 # The Hdf* bindings read from testdata/<dataset>.hdf5, where <dataset> is the `dataset` template parameter.
 bindings:
  # Row key: ToString() applied to the binding input.
  id: ToString()
  # Query vectors from the "/test" dataset, converted with ToCqlVector().
  test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
  # Expected neighbor indices from the "/neighbors" dataset, used by the search verifier.
  relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
  # The ann-benchmarks HDF5 files name this dataset "distances" (plural).
  distance_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/distances")
  # Vectors to load from the "/train" dataset, converted with ToCqlVector().
  train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
  # Generated vectors sized by the `dimensions` template parameter; not referenced by any op in this file.
  synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
 blocks:
  # Tear-down ops: drop the vector index, then the table.
  drop:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      # The schema block creates the index on the `value` column without an explicit name, so it
      # has to be dropped by its default generated name, <table>_<column>_idx, not by the table name.
      drop_index:
        raw: |
          DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)_value_idx;
      # Removes the table itself; IF EXISTS keeps the op from failing when it is absent.
      drop_table:
        raw: |
          DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
  # Schema ops: keyspace, table and vector index. Each op carries a `target` tag so that a
  # scenario can select ops by target with a tag filter.
  schema:
    params:
      cl: TEMPLATE(cl,LOCAL_QUORUM)
    ops:
      create_keyspace:
        raw: |
          CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
          WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
        # `target` belongs under `tags`, like the other ops in this block; placed directly on
        # the op it is not a tag and cannot be matched by a tag filter.
        tags:
          target: cassandra
      # Table keyed by a text id, holding one float vector of `dimensions` elements per row.
      create_table:
        raw: |
          CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
            key TEXT,
            value vector<float,TEMPLATE(dimensions)>,
            PRIMARY KEY (key)
          );
        tags:
          target: astra
      # Unnamed storage-attached index on the vector column; similarity function defaults to cosine.
      create_sai_index:
        raw: |
          CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
          WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
 #         WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
        tags:
          target: astra
  # Load phase: one insert per cycle, writing the {id} key and the {train_floatlist} vector.
  rampup:
    params:
      # Consistency level comes from the write_cl template parameter (default LOCAL_QUORUM).
      cl: TEMPLATE(write_cl,LOCAL_QUORUM)
      prepared: true
    ops:
      insert: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{train_floatlist});
 #  await_index:
 #    ops:
  # Ops for the search_and_index phase: an ANN query with result verification, plus an insert op.
  # The scenarios in this file run this block with the optype=select tag filter.
  search_and_index:
    ops:
      # ANN query op; the op name embeds the k template parameter (default 100).
      select_ann_limit_TEMPLATE(k,100):
        # Orders by ANN distance to {test_floatlist}. LIMIT comes from select_limit, which is a
        # separate template parameter from k.
        prepared: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        tags:
          optype: select
        # Setup script: defines k, creates a RelevancyMeasures object for this op, and registers
        # recall, precision, F1, reciprocal-rank and average-precision functions on it, all at k.
        verifier-init: |
          k=TEMPLATE(k,100)
          relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
          relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
        # Result script: converts the "key" column of the result to an int array and passes it,
        # together with the {relevant_indices} binding, to relevancy.accept. It always returns
        # true, so it records measurements rather than rejecting results.
        verifier: |
          actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
          relevancy.accept({relevant_indices},actual_indices);
          return true;
      # Insert with the same statement text as the rampup block's insert; tagged optype: insert,
      # so it is excluded by the optype=select filter.
      insert_rewrite:
        prepared: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
        tags:
          optype: insert
  # Search while rewriting rows. The statements use the bindings defined in this file's
  # `bindings` section (test_floatlist, id, train_floatlist); the names test_vector, rw_key and
  # train_vector are not defined anywhere in this file.
  search_and_rewrite:
    ops:
      # Same ANN query as the search_and_index block.
      select_ann_limit:
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
        # Registers a summary gauge named "recall" for this op; no verifier feeds it yet.
        verifier-init: |
          scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
 #        verifier: |
      # Writes the same key/vector pair as the rampup insert, i.e. rewrites rows with unchanged content.
      upsert_same:
        stmt: |
          INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
          (key, value) VALUES ({id},{train_floatlist});
  # Search while overwriting rows with different content under the same key. The statements use
  # bindings defined in this file's `bindings` section; the names test_vector, rw_key and
  # train_vector are not defined anywhere in this file.
  search_and_invalidate:
    ops:
      # Same ANN query as the search_and_index block.
      select_ann_limit:
        stmt: |
          SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_floatlist} LIMIT TEMPLATE(select_limit,100);
 #        verifier-init: |
 #        verifier: |
      # Overwrites the row for {id} with a vector that differs from the one loaded by rampup.
      # NOTE(review): {test_floatlist} is used as the replacement content because it is the only
      # other binding in this file that ends in ToCqlVector(); swap in a dedicated binding if
      # randomly generated content is required.
      upsert_random: |
        INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
        (key, value) VALUES ({id},{test_floatlist});
--- a/adapter-cqld4/src/main/resources/activities/baselinesv2/get_datasets.sh
+++ b/adapter-cqld4/src/main/resources/activities/baselinesv2/get_datasets.sh
@ -0,0 +1,20 @@
 #!/bin/bash
 # Download the ann-benchmarks HDF5 datasets named in DATASETS into ./testdata.
 # If testdata/_env.sh exists it is sourced first and may override DATASETS.
 # Exits non-zero if the directory is unusable or any download fails.
 DATASETS="glove-25-angular glove-50-angular glove-100-angular glove-200-angular deep-image-96-angular lastfm-64-dot"
 mkdir -p testdata
 # Stop here rather than download into the caller's directory if testdata cannot be entered.
 pushd testdata >/dev/null || exit 1
 if [ -f _env.sh ]
 then . _env.sh
 fi
 # Fail fast if _env.sh left DATASETS unset or empty.
 : "${DATASETS:?is required}"
 status=0
 # DATASETS is intentionally unquoted so it splits into one name per iteration.
 for dataset in ${DATASETS}
 do
   URL="http://ann-benchmarks.com/${dataset}.hdf5"
   # -f makes curl fail on an HTTP error instead of saving the error page as a .hdf5 file.
   curl -fOL "${URL}" || status=1
 done
 popd >/dev/null
 exit "${status}"