Merge pull request #1650 from nosqlbench/jshook/ivec_fvec

add support for ivec and fvec formats
This commit is contained in:
Jonathan Shook 2023-10-26 20:33:55 -05:00 committed by GitHub
commit 14b836dd51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
37 changed files with 805 additions and 4 deletions

1
.gitignore vendored
View File

@ -1,6 +1,5 @@
exported_docs.zip
.nosqlbench/**
.run/**
workspaces/**
workshop/**
local/**

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__drop__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels &quot;dimensions:768,dataset=e5_base_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__drop__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels &quot;dimensions:1024,dataset=e5_large_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__drop__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_muli&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__drop__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_q_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__drop__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__schema__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels &quot;dimensions:768,dataset=e5_base_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__schema__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels &quot;dimensions:1024,dataset=e5_large_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__schema__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_muli&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__schema__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_q_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__schema__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__testann__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels &quot;dimensions:768,dataset=e5_base_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__testann__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels &quot;dimensions:1024,dataset=e5_large_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__testann__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_muli&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__testann__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_q_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__testann__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__train__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels &quot;dimensions:768,dataset=e5_base_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__train__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels &quot;dimensions:1024,dataset=e5_large_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__train__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_muli&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__train__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small_q_v2&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,14 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="cql_vector2__train__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels &quot;dimensions:384,dataset=e5_small&quot;" />
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
<method v="2" />
</configuration>
</component>

15
.run/linkedinput.run.xml Normal file
View File

@ -0,0 +1,15 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="linkedinput" type="JarApplication" folderName="nbr integration tests">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nbr/target/nbr.jar" />
<option name="PROGRAM_PARAMETERS" value="script src/test/resources/scripts/examples/linkedinput.js" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/nbr-examples" />
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="true" />
<option name="ALTERNATIVE_JRE_PATH" value="/usr/java/jdk-21" />
<method v="2" />
</configuration>
</component>

View File

@ -0,0 +1,15 @@
<component name="ProjectRunConfigurationManager">
<configuration default="false" name="vectorsearch-consistency-levels" type="JarApplication">
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
<option name="credential" />
<option name="region" />
<option name="useCurrentConnection" value="false" />
</extension>
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
<option name="PROGRAM_PARAMETERS" value="vector-search.yaml reads errors=stop driverconfig=driver-config.json dimensions=128 read_ratio=1 main-cycles=1 keyspace=baselines128 --report-csv-to metrics read_cl=LOCAL_ONE -v --show-stacktraces" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/local/vectors-consistency" />
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="true" />
<option name="ALTERNATIVE_JRE_PATH" value="17" />
<method v="2" />
</configuration>
</component>

View File

@ -28,7 +28,7 @@ scenarios:
astra_vectors:
drop: run tags='block:drop' threads==undef cycles==undef
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
train: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
# search_and_index_unthrottled: >-
# run tags='block:search_and_index,optype=select' labels='target:astra'
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter

View File

@ -0,0 +1,155 @@
min_version: 5.21
description: |
This is a template for live vector search testing.
schema: Install the schema required to run the test
rampup: Measure how long it takes to load a set of embeddings
search_and_index: Measure how the system responds to queries while it
is indexing recently ingested data.
#? await_index: Pause and wait for the system to complete compactions or index processing
search: Run vector search with a set of default (or overridden) parameters
search_and_rewrite: Run the same search operations as above, but while rewriting the data
search_and_invalidate: Run the same search operations as above, but while overwriting the data
with different content using the same vector id.
In all of these phases, it is important to instance the metrics with distinct names.
Also, aggregates of recall should include total aggregate as well as a moving average.
scenarios:
cassandra:
drop: run tags='block:drop' threads==undef cycles==undef
# nb5 cql-vector2 cassandra.schema host=localhost localdc=datacenter1 dimensions=100
schema: run tags='op=create_.*' threads==undef cycles==undef
# nb5 cql-vector2 cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn
# nb5 cql-vector2 cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
read_recall: >-
run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
cycles=TEMPLATE(testsize) errors=counter,warn threads=1
astra_vectors:
drop: run tags='block:drop' threads==undef cycles==undef
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
# search_and_index_unthrottled: >-
# run tags='block:search_and_index,optype=select' labels='target:astra'
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
testann: >-
run tags='block:testann' cycles=TEMPLATE(testsize) errors=count,retry maxtries=2 threads=auto
# one activity or two? data leap-frog? or concurrency separate for both?
# await_index: run tags='block:await_index' # This would need to exit when a condition is met
# stop_search_and_index: stop search_and_index
# only possible if we have a triggering event to indicated
# live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
params:
driver: cqld4
instrument: true
bindings:
id: ToString()
# filetype=hdf5 for TEMPLATE(filetype,hdf5)
test_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/test"); ToCqlVector();
relevant_indices_hdf5: HdfFileToIntArray("testdata/TEMPLATE(datafile).hdf5", "/neighbors")
distance_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/distance")
train_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/train"); ToCqlVector();
# filetype=fvec for TEMPLATE(filetype,fvec)
test_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_query_vectors.fvec"); ToCqlVector();
relevant_indices_fvec: IVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_indices_query.ivec");
distance_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(testsize)_distances_count.fvec",TEMPLATE(dimensions),0);
train_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_base_vectors.fvec",TEMPLATE(dimensions),0); ToCqlVector();
# synthetic
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
blocks:
drop:
params:
cl: TEMPLATE(cl,LOCAL_QUORUM)
ops:
drop_index:
raw: |
DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
drop_table:
raw: |
DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
schema:
params:
cl: TEMPLATE(cl,LOCAL_QUORUM)
ops:
create_keyspace:
raw: |
CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
create_table:
raw: |
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
key TEXT,
value vector<float,TEMPLATE(dimensions,set-the-dimensions-template-var)>,
PRIMARY KEY (key)
);
create_sai_index:
raw: |
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
# WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
rampup:
params:
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
prepared: true
ops:
insert: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
# await_index:
# ops:
testann:
ops:
select_ann_limit_TEMPLATE(k,100):
prepared: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
ORDER BY value ANN OF {test_floatlist_TEMPLATE(filetype,hdf5)} LIMIT TEMPLATE(select_limit,100);
tags:
optype: select
verifier-init: |
k=TEMPLATE(k,100)
relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
verifier: |
actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
relevancy.accept({relevant_indices_TEMPLATE(filetype,hdf5)},actual_indices);
return true;
insert_rewrite:
prepared: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
tags:
optype: insert
search_and_rewrite:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
verifier-init: |
scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
# verifier: |
upsert_same:
stmt: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});
search_and_invalidate:
ops:
select_ann_limit:
stmt: |
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
# verifier-init: |
# verifier: |
upsert_random: |
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
(key, value) VALUES ({rw_key},{train_vector});

View File

@ -31,6 +31,7 @@ import org.apache.logging.log4j.Logger;
import java.util.function.Function;
@Service(value = DriverAdapter.class,selector = "jdbc")
public class JDBCDriverAdapter extends BaseDriverAdapter<JDBCOp, JDBCSpace> {
private final static Logger logger = LogManager.getLogger(JDBCDriverAdapter.class);

View File

@ -537,7 +537,7 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<argLine>-ea ${argLine}</argLine>
<argLine>-ea @{argLine}</argLine>
<systemPropertyVariables>
<Log4jContextSelector>
org.apache.logging.log4j.core.async.AsyncLoggerContextSelector

View File

@ -33,5 +33,6 @@ public enum Category {
experimental,
combinitoric,
vectors,
HOF
HOF,
readers
}

View File

@ -53,6 +53,8 @@
<directory>src/test/resources</directory>
<excludes>
<exclude>h5ex_t_float.h5</exclude>
<exclude>**/*.ivec</exclude>
<exclude>**/*.fvec</exclude>
</excludes>
<filtering>true</filtering>
</testResource>

View File

@ -0,0 +1,94 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import io.nosqlbench.api.content.Content;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.function.LongFunction;
/**
 * Reads fvec files with random access, using the input to specify the record number.
 * An fvec record is a little-endian 4-byte dimension count followed by that many
 * little-endian 4-byte floats. Records must be uniform in dimension so that
 * random access by record index is possible.
 */
@ThreadSafeMapper
@Categories(Category.readers)
public class FVecReader implements LongFunction<float[]> {
    private final MappedByteBuffer bb;
    private final int dimensions;
    private final int reclen;
    private final long filesize;
    private final Path path;
    private final int reclim;

    /**
     * Read the fvec file, determining the record size from the first record.
     * @param pathname The location of the fvec file
     */
    public FVecReader(String pathname) {
        this(pathname,0,0);
    }

    /**
     * Read the fvec file, optionally asserting dimensions and limiting the record count.
     * @param pathname The location of the fvec file
     * @param expectedDimensions The required per-record dimensions, or 0 to auto-detect from the first record
     * @param recordLimit The maximum number of records to expose, or 0 for all records in the file
     * @throws RuntimeException if the file cannot be mapped, if dimensions do not match,
     * if the record limit exceeds the file's record count, or if the file size is not
     * a whole multiple of the record length
     */
    public FVecReader(String pathname, int expectedDimensions, int recordLimit) {
        Content<?> src = NBIO.fs().search(pathname).one();
        this.path = src.asPath();
        // The mapping remains valid after the channel is closed, so close the channel
        // promptly with try-with-resources instead of leaking the file handle.
        // (SPARSE is meaningless for read-only access and was dropped.)
        try (FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ)) {
            this.filesize = channel.size();
            this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // fvec headers are little-endian; the mapped buffer defaults to big-endian
        this.dimensions = Integer.reverseBytes(bb.getInt(0));
        if (expectedDimensions>0 && expectedDimensions!=dimensions) {
            throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
        }
        int datalen = (dimensions * Float.BYTES);
        this.reclen = Integer.BYTES + datalen;
        // Validate the file structure before deriving the record count from it.
        if ((filesize % reclen)!=0) {
            throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
        }
        int totalRecords = (int) (filesize/reclen);
        if (recordLimit > totalRecords) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
        }
        this.reclim = recordLimit==0 ? totalRecords : recordLimit;
    }

    /**
     * @param value the record selector, taken modulo the record limit
     * @return the float vector stored at the selected record
     */
    @Override
    public float[] apply(long value) {
        int recordIdx = (int) (value % reclim);
        int recpos = recordIdx*reclen;
        int recdim = Integer.reverseBytes(bb.getInt(recpos));
        if (recdim!=dimensions) {
            throw new RuntimeException("dimensions are not uniform for fvec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
        }
        var vbuf = new byte[dimensions*Float.BYTES];
        // absolute bulk get does not move the buffer position, keeping reads thread-safe
        bb.get(recpos + Integer.BYTES, vbuf);
        FloatBuffer fbuf = ByteBuffer.wrap(vbuf).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
        var vector = new float[dimensions];
        fbuf.get(vector);
        return vector;
    }
}

View File

@ -0,0 +1,103 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import io.nosqlbench.api.content.Content;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Categories;
import io.nosqlbench.virtdata.api.annotations.Category;
import io.nosqlbench.virtdata.api.annotations.Example;
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.function.LongFunction;
/**
 * Reads ivec files with random access, using the input to specify the record number.
 * This is used for testing with generated KNN test data which is uniform in dimensions and neighborhood size.
 * While it is possible to specify different dimensioned vectors per record, this is not supported, since this
 * function honors the pure-function behavior of other NB binding functions. This requires uniform record structure for random access.
 */
@ThreadSafeMapper
@Categories(Category.readers)
public class IVecReader implements LongFunction<int[]> {
    private final MappedByteBuffer bb;
    private final int dimensions;
    private final int reclen;
    private final long filesize;
    private final Path path;
    private final int reclim;

    /**
     * Read the ivec file, determining the record size from the first record.
     * @param pathname The location of the ivec file
     */
    @Example({"IVecReader('testfile.ivec')","Create a reader for int vectors, detecting the dimensions and dataset size automatically."})
    public IVecReader(String pathname) {
        this(pathname,0,0);
    }

    /**
     * Read the ivec file, optionally asserting dimensions and limiting the record count.
     * @param pathname The location of the ivec file
     * @param expectedDimensions The required per-record dimensions, or 0 to auto-detect from the first record
     * @param recordLimit The maximum number of records to expose, or 0 for all records in the file
     * @throws RuntimeException if the file cannot be mapped, if dimensions do not match,
     * if the record limit exceeds the file's record count, or if the file size is not
     * a whole multiple of the record length
     */
    @Example({"IVecReader('testfile.ivec', 46, 12)","Create a reader for int vectors, asserting 46 dimensions and limit total records to 12."})
    public IVecReader(String pathname, int expectedDimensions, int recordLimit) {
        Content<?> src = NBIO.fs().search(pathname).one();
        this.path = src.asPath();
        // The mapping remains valid after the channel is closed, so close the channel
        // promptly with try-with-resources instead of leaking the file handle.
        // (SPARSE is meaningless for read-only access and was dropped.)
        try (FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ)) {
            this.filesize = channel.size();
            this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        // ivec headers are little-endian; the mapped buffer defaults to big-endian
        this.dimensions = Integer.reverseBytes(bb.getInt(0));
        if (expectedDimensions>0 && expectedDimensions!=dimensions) {
            throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
        }
        int datalen = (dimensions * Integer.BYTES);
        this.reclen = Integer.BYTES + datalen;
        // Validate the file structure before deriving the record count from it.
        if ((filesize % reclen)!=0) {
            throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
        }
        int totalRecords = (int) (filesize/reclen);
        if (recordLimit > totalRecords) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
        }
        this.reclim = recordLimit==0 ? totalRecords : recordLimit;
    }

    /**
     * @param value the record selector, taken modulo the record limit
     * @return the int vector stored at the selected record
     */
    @Override
    public int[] apply(long value) {
        int recordIdx = (int) (value % reclim);
        int recpos = recordIdx*reclen;
        byte[] buf = new byte[reclen];
        // absolute bulk get does not move the buffer position, keeping reads thread-safe
        this.bb.get(recpos,buf);
        ByteBuffer record = ByteBuffer.wrap(buf);
        int recdim = Integer.reverseBytes(record.getInt());
        if (recdim!=dimensions) {
            throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
        }
        // recdim == dimensions here; size and loop bound consistently by dimensions
        int[] data = new int[dimensions];
        for (int i = 0; i < dimensions; i++) {
            data[i]=Integer.reverseBytes(record.getInt());
        }
        return data;
    }
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashSet;
/**
 * Streaming (whole-file) readers for the fvec and ivec formats. Each record is a
 * little-endian 4-byte count followed by that many little-endian 4-byte values
 * (floats for fvec, ints for ivec).
 */
public class IvecFvecMethods {

    // utility class; not meant to be instantiated
    private IvecFvecMethods() {
    }

    /**
     * Read all records of an fvec file into memory.
     * @param filePath the location of the fvec file
     * @return one float[] per record, in file order
     * @throws IOException if the file cannot be read
     */
    public static ArrayList<float[]> readFvecs(String filePath) throws IOException {
        var vectors = new ArrayList<float[]>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
            while (dis.available() > 0) {
                // record headers and values are little-endian
                var dimension = Integer.reverseBytes(dis.readInt());
                assert dimension > 0 : dimension;
                var buffer = new byte[dimension * Float.BYTES];
                dis.readFully(buffer);
                var byteBuffer = ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN);
                var vector = new float[dimension];
                var floatBuffer = byteBuffer.asFloatBuffer();
                floatBuffer.get(vector);
                vectors.add(vector);
            }
        }
        return vectors;
    }

    /**
     * Read all records of an ivec file into memory, one set of neighbor ids per record.
     * @param filename the location of the ivec file
     * @return one HashSet of neighbor indices per record, in file order
     * @throws UncheckedIOException if the file cannot be read; the previous behavior of
     * printing the stack trace and returning partial data silently corrupted results
     */
    public static ArrayList<HashSet<Integer>> readIvecs(String filename) {
        var groundTruthTopK = new ArrayList<HashSet<Integer>>();
        try (var dis = new DataInputStream(new FileInputStream(filename))) {
            while (dis.available() > 0) {
                var numNeighbors = Integer.reverseBytes(dis.readInt());
                var neighbors = new HashSet<Integer>(numNeighbors);
                for (var i = 0; i < numNeighbors; i++) {
                    var neighbor = Integer.reverseBytes(dis.readInt());
                    neighbors.add(neighbor);
                }
                groundTruthTopK.add(neighbors);
            }
        } catch (IOException e) {
            throw new UncheckedIOException("error reading ivec file '" + filename + "'", e);
        }
        return groundTruthTopK;
    }
}

View File

@ -0,0 +1,65 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.HashSet;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.*;
class IVecReaderTest {

    @Test
    public void testReadIvec() {
        ArrayList<HashSet<Integer>> idx_ref = IvecFvecMethods.readIvecs("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
        IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
        for (int i = 0; i < 10; i++) {
            // use the loop index; the original read record 0 ten times
            int[] indices = ir.apply(i);
            HashSet<Integer> ref = idx_ref.get(i);
            for (int j = 0; j < indices.length; j++) {
                assertThat(indices[j]).isGreaterThanOrEqualTo(0);
                assertThat(indices[j]).isLessThanOrEqualTo(10000);
                // the random-access reader must agree with the streaming reader
                // for the same record of the same file
                assertThat(ref).contains(indices[j]);
            }
        }
    }

    @Test
    public void testReadFvec() {
        FVecReader ir = new FVecReader("src/test/resources/ivecfvec/test_ada_002_10000_distances_count.fvec");
        for (int i = 0; i < 10; i++) {
            float[] dist = ir.apply(i);
            for (int j = 1; j < dist.length; j++) {
                // describedAs must be chained BEFORE the assertion to take effect;
                // the original chained it after, where it was a no-op
                assertThat(dist[j])
                    .describedAs("dist[" + j + "]=(" + dist[j] + ") dist[j-1]=(" + dist[j - 1] + ")")
                    .isGreaterThanOrEqualTo(dist[j - 1]);
            }
        }
    }

    @Test
    public void testReadFvecSpecificDims() {
        FVecReader ir = new FVecReader(
            "src/test/resources/ivecfvec/test_ada_002_10000_base_vectors.fvec",
            1536, 0);
        float[] vec0 = ir.apply(0);
        assertThat(vec0.length).isEqualTo(1536);
    }
}