mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2024-12-22 23:23:56 -06:00
Merge pull request #1650 from nosqlbench/jshook/ivec_fvec
add support for ivec and fvec formats
This commit is contained in:
commit
14b836dd51
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,6 +1,5 @@
|
||||
exported_docs.zip
|
||||
.nosqlbench/**
|
||||
.run/**
|
||||
workspaces/**
|
||||
workshop/**
|
||||
local/**
|
||||
|
14
.run/cql_vector2__drop__E5-BASE-V2.run.xml
Normal file
14
.run/cql_vector2__drop__E5-BASE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__drop__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels "dimensions:768,dataset=e5_base_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__drop__E5-LARGE-V2.run.xml
Normal file
14
.run/cql_vector2__drop__E5-LARGE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__drop__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels "dimensions:1024,dataset=e5_large_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__drop__E5-SMALL-MULI.run.xml
Normal file
14
.run/cql_vector2__drop__E5-SMALL-MULI.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__drop__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_muli"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__drop__E5-SMALL-Q.run.xml
Normal file
14
.run/cql_vector2__drop__E5-SMALL-Q.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__drop__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_q_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__drop__E5-SMALL-V2.run.xml
Normal file
14
.run/cql_vector2__drop__E5-SMALL-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__drop__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.drop userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__schema__E5-BASE-V2.run.xml
Normal file
14
.run/cql_vector2__schema__E5-BASE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__schema__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels "dimensions:768,dataset=e5_base_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__schema__E5-LARGE-V2.run.xml
Normal file
14
.run/cql_vector2__schema__E5-LARGE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__schema__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels "dimensions:1024,dataset=e5_large_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__schema__E5-SMALL-MULI.run.xml
Normal file
14
.run/cql_vector2__schema__E5-SMALL-MULI.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__schema__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_muli"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__schema__E5-SMALL-Q.run.xml
Normal file
14
.run/cql_vector2__schema__E5-SMALL-Q.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__schema__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_q_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__schema__E5-SMALL-V2.run.xml
Normal file
14
.run/cql_vector2__schema__E5-SMALL-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__schema__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.schema userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__testann__E5-BASE-V2.run.xml
Normal file
14
.run/cql_vector2__testann__E5-BASE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__testann__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels "dimensions:768,dataset=e5_base_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__testann__E5-LARGE-V2.run.xml
Normal file
14
.run/cql_vector2__testann__E5-LARGE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__testann__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels "dimensions:1024,dataset=e5_large_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__testann__E5-SMALL-MULI.run.xml
Normal file
14
.run/cql_vector2__testann__E5-SMALL-MULI.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__testann__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_muli"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__testann__E5-SMALL-Q.run.xml
Normal file
14
.run/cql_vector2__testann__E5-SMALL-Q.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__testann__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_q_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__testann__E5-SMALL-V2.run.xml
Normal file
14
.run/cql_vector2__testann__E5-SMALL-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__testann__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.testann userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__train__E5-BASE-V2.run.xml
Normal file
14
.run/cql_vector2__train__E5-BASE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__train__E5-BASE-V2" type="JarApplication" folderName="E5-BASE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=768 testsize=10000 trainsize=100000 datafile=intfloat_e5-base-v2 filetype=fvec table=e5_base_v2 similarity_function=cosine --add-labels "dimensions:768,dataset=e5_base_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__train__E5-LARGE-V2.run.xml
Normal file
14
.run/cql_vector2__train__E5-LARGE-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__train__E5-LARGE-V2" type="JarApplication" folderName="E5-LARGE-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=1024 testsize=10000 trainsize=100000 datafile=intfloat_e5-large-v2 filetype=fvec table=e5_large_v2 similarity_function=cosine --add-labels "dimensions:1024,dataset=e5_large_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__train__E5-SMALL-MULI.run.xml
Normal file
14
.run/cql_vector2__train__E5-SMALL-MULI.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__train__E5-SMALL-MULI" type="JarApplication" folderName="E5-SMALL-MULI">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_multilingual-e5-small filetype=fvec table=e5_small_muli similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small_muli"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__train__E5-SMALL-Q.run.xml
Normal file
14
.run/cql_vector2__train__E5-SMALL-Q.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__train__E5-SMALL-Q" type="JarApplication" folderName="E5-SMALL-Q-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-q-v2 filetype=fvec table=e5_small_q_v2 similarity_function=cosine --add-labels "dimensions:384,dataset:e5_small_q_v2"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
14
.run/cql_vector2__train__E5-SMALL-V2.run.xml
Normal file
14
.run/cql_vector2__train__E5-SMALL-V2.run.xml
Normal file
@ -0,0 +1,14 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="cql_vector2__train__E5-SMALL-V2" type="JarApplication" folderName="E5-SMALL-V2">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="cql_vector2_fvec astra_vectors.train userfile=auth/userfile passfile=auth/passfile scb=auth/scb.zip --show-stacktraces dimensions=384 testsize=10000 trainsize=100000 datafile=intfloat_e5-small-v2 filetype=fvec table=e5_small similarity_function=cosine --add-labels "dimensions:384,dataset=e5_small"" />
|
||||
<option name="WORKING_DIRECTORY" value="$ProjectFileDir$/local/jvector" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="jdk21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
15
.run/linkedinput.run.xml
Normal file
15
.run/linkedinput.run.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="linkedinput" type="JarApplication" folderName="nbr integration tests">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nbr/target/nbr.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="script src/test/resources/scripts/examples/linkedinput.js" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/nbr-examples" />
|
||||
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="true" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="/usr/java/jdk-21" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
15
.run/vectorsearch-consistency-levels.run.xml
Normal file
15
.run/vectorsearch-consistency-levels.run.xml
Normal file
@ -0,0 +1,15 @@
|
||||
<component name="ProjectRunConfigurationManager">
|
||||
<configuration default="false" name="vectorsearch-consistency-levels" type="JarApplication">
|
||||
<extension name="software.aws.toolkits.jetbrains.core.execution.JavaAwsConnectionExtension">
|
||||
<option name="credential" />
|
||||
<option name="region" />
|
||||
<option name="useCurrentConnection" value="false" />
|
||||
</extension>
|
||||
<option name="JAR_PATH" value="$PROJECT_DIR$/nb5/target/nb5.jar" />
|
||||
<option name="PROGRAM_PARAMETERS" value="vector-search.yaml reads errors=stop driverconfig=driver-config.json dimensions=128 read_ratio=1 main-cycles=1 keyspace=baselines128 --report-csv-to metrics read_cl=LOCAL_ONE -v --show-stacktraces" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/local/vectors-consistency" />
|
||||
<option name="ALTERNATIVE_JRE_PATH_ENABLED" value="true" />
|
||||
<option name="ALTERNATIVE_JRE_PATH" value="17" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
</component>
|
@ -28,7 +28,7 @@ scenarios:
|
||||
astra_vectors:
|
||||
drop: run tags='block:drop' tags='block:drop' threads==undef cycles==undef
|
||||
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
|
||||
train: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
|
||||
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
|
||||
# search_and_index_unthrottled: >-
|
||||
# run tags='block:search_and_index,optype=select' labels='target:astra'
|
||||
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
|
||||
|
@ -0,0 +1,155 @@
|
||||
min_version: 5.21
|
||||
description: |
|
||||
This is a template for live vector search testing.
|
||||
|
||||
schema: Install the schema required to run the test
|
||||
rampup: Measure how long it takes to load a set of embeddings
|
||||
search_and_index: Measure how the system responds to queries while it
|
||||
is indexing recently ingested data.
|
||||
#? await_index: Pause and wait for the system to complete compactions or index processing
|
||||
search: Run vector search with a set of default (or overridden) parameters
|
||||
search_and_rewrite: Run the same search operations as above, but while rewriting the data
|
||||
search_and_invalidate: Run the same search operations as above, but while overwriting the data
|
||||
with different content using the same vector id.
|
||||
In all of these phases, it is important to instance the metrics with distinct names.
|
||||
Also, aggregates of recall should include total aggregate as well as a moving average.
|
||||
|
||||
scenarios:
|
||||
cassandra:
|
||||
drop: run tags='block:drop' threads==undef cycles==undef
|
||||
# nb5 cql-vector2 cassandra.schema host=localhost localdc=datacenter1 dimensions=100
|
||||
schema: run tags='op=create_.*' threads==undef cycles==undef
|
||||
# nb5 cql-vector2 cassandra.rampup host=localhost localdc=datacenter1 dimensions=100 trainsize=1000000 dataset=glove-100-angular rate=10000
|
||||
rampup: run tags='block:rampup' threads=auto cycles=TEMPLATE(trainsize,set-the-trainsize) errors=counter,warn
|
||||
# nb5 cql-vector2 cassandra.search_and_index testsize=10000 host=localhost localdc=datacenter1 dimensions=100 dataset=glove-100-angular --report-csv-to rmetrics:.*:5s
|
||||
read_recall: >-
|
||||
run alias=search_and_index tags='block:search_and_index,optype=select' labels='target:cassandra'
|
||||
cycles=TEMPLATE(testsize) errors=counter,warn threads=1
|
||||
astra_vectors:
|
||||
drop: run tags='block:drop' tags='block:drop' threads==undef cycles==undef
|
||||
schema: run tags='block:schema' tags='op=create_.*(table|index)' threads==undef cycles==undef dimensions==TEMPLATE(dimensions,25)
|
||||
train: run tags='block:rampup' threads=20x cycles=TEMPLATE(trainsize) errors=counter,warn maxtries=2 dimensions==TEMPLATE(dimensions,25)
|
||||
# search_and_index_unthrottled: >-
|
||||
# run tags='block:search_and_index,optype=select' labels='target:astra'
|
||||
# cycles=TEMPLATE(testsize) threads=10 errors=count,retry stride=500 errors=counter
|
||||
testann: >-
|
||||
run tags='block:testann' cycles=TEMPLATE(testsize) errors=count,retry maxtries=2 threads=auto
|
||||
# one activity or two? data leap-frog? or concurrency separate for both?
|
||||
# await_index: run tags='block:await_index' # This would need to exit when a condition is met
|
||||
# stop_search_and_index: stop search_and_index
|
||||
# only possible if we have a triggering event to indicate when to stop
|
||||
# live_search: run tags='block:search' labels='target:astra' threads=1 cycles=TEMPLATE(testsize,10000)
|
||||
search_and_rewrite: run tags='block:search_and_rewrite' labels='target:astra'
|
||||
search_and_invalidate: run tags='block:search_and_invalidate' labels='target:astra'
|
||||
|
||||
params:
|
||||
driver: cqld4
|
||||
instrument: true
|
||||
|
||||
bindings:
|
||||
id: ToString()
|
||||
# filetype=hdf5 for TEMPLATE(filetype,hdf5)
|
||||
test_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/test"); ToCqlVector();
|
||||
relevant_indices_hdf5: HdfFileToIntArray("testdata/TEMPLATE(datafile).hdf5", "/neighbors")
|
||||
distance_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/distance")
|
||||
train_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/train"); ToCqlVector();
|
||||
# filetype=fvec for TEMPLATE(filetype,fvec)
|
||||
test_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_query_vectors.fvec"); ToCqlVector();
|
||||
relevant_indices_fvec: IVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_indices_query.ivec");
|
||||
distance_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(testsize)_distances_count.fvec",TEMPLATE(dimensions),0);
|
||||
train_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_base_vectors.fvec",TEMPLATE(dimensions),0); ToCqlVector();
|
||||
# synthetic
|
||||
synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions));
|
||||
|
||||
blocks:
|
||||
drop:
|
||||
params:
|
||||
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||
ops:
|
||||
drop_index:
|
||||
raw: |
|
||||
DROP INDEX IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
||||
drop_table:
|
||||
raw: |
|
||||
DROP TABLE IF EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors);
|
||||
schema:
|
||||
params:
|
||||
cl: TEMPLATE(cl,LOCAL_QUORUM)
|
||||
ops:
|
||||
create_keyspace:
|
||||
raw: |
|
||||
CREATE KEYSPACE IF NOT EXISTS TEMPLATE(keyspace,baselines)
|
||||
WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
|
||||
create_table:
|
||||
raw: |
|
||||
CREATE TABLE IF NOT EXISTS TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (
|
||||
key TEXT,
|
||||
value vector<float,TEMPLATE(dimensions,set-the-dimensions-template-var)>,
|
||||
PRIMARY KEY (key)
|
||||
);
|
||||
create_sai_index:
|
||||
raw: |
|
||||
CREATE CUSTOM INDEX IF NOT EXISTS ON TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) (value) USING 'StorageAttachedIndex'
|
||||
WITH OPTIONS = {'similarity_function' : 'TEMPLATE(similarity_function,cosine)'};
|
||||
# WITH OPTIONS = {'maximum_node_connections' : TEMPLATE(M,16), 'construction_beam_width' : TEMPLATE(ef,100), 'similarity_function' : 'TEMPLATE(similarity_function,dot_product)'};
|
||||
rampup:
|
||||
params:
|
||||
cl: TEMPLATE(write_cl,LOCAL_QUORUM)
|
||||
prepared: true
|
||||
ops:
|
||||
insert: |
|
||||
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
|
||||
# await_index:
|
||||
# ops:
|
||||
testann:
|
||||
ops:
|
||||
select_ann_limit_TEMPLATE(k,100):
|
||||
prepared: |
|
||||
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||
ORDER BY value ANN OF {test_floatlist_TEMPLATE(filetype,hdf5)} LIMIT TEMPLATE(select_limit,100);
|
||||
tags:
|
||||
optype: select
|
||||
verifier-init: |
|
||||
k=TEMPLATE(k,100)
|
||||
relevancy= new io.nosqlbench.api.engine.metrics.wrappers.RelevancyMeasures(_parsed_op);
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
|
||||
verifier: |
|
||||
actual_indices=io.nosqlbench.engine.extensions.vectormath.CqlUtils.cqlStringColumnToIntArray("key",result);
|
||||
relevancy.accept({relevant_indices_TEMPLATE(filetype,hdf5)},actual_indices);
|
||||
return true;
|
||||
insert_rewrite:
|
||||
prepared: |
|
||||
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||
(key, value) VALUES ({id},{train_floatlist_TEMPLATE(filetype,hdf5)});
|
||||
tags:
|
||||
optype: insert
|
||||
|
||||
search_and_rewrite:
|
||||
ops:
|
||||
select_ann_limit:
|
||||
stmt: |
|
||||
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
|
||||
verifier-init: |
|
||||
scriptingmetrics.newSummaryGauge(_parsed_op,"recall")
|
||||
# verifier: |
|
||||
upsert_same:
|
||||
stmt: |
|
||||
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||
(key, value) VALUES ({rw_key},{train_vector});
|
||||
search_and_invalidate:
|
||||
ops:
|
||||
select_ann_limit:
|
||||
stmt: |
|
||||
SELECT * FROM TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors) ORDER BY value ANN OF {test_vector} LIMIT TEMPLATE(select_limit,100);
|
||||
# verifier-init: |
|
||||
# verifier: |
|
||||
upsert_random: |
|
||||
INSERT INTO TEMPLATE(keyspace,baselines).TEMPLATE(table,vectors)
|
||||
(key, value) VALUES ({rw_key},{train_vector});
|
||||
|
||||
|
@ -31,6 +31,7 @@ import org.apache.logging.log4j.Logger;
|
||||
|
||||
import java.util.function.Function;
|
||||
|
||||
@Service(value = DriverAdapter.class,selector = "jdbc")
|
||||
public class JDBCDriverAdapter extends BaseDriverAdapter<JDBCOp, JDBCSpace> {
|
||||
private final static Logger logger = LogManager.getLogger(JDBCDriverAdapter.class);
|
||||
|
||||
|
@ -537,7 +537,7 @@
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<argLine>-ea ${argLine}</argLine>
|
||||
<argLine>-ea @{argLine}</argLine>
|
||||
<systemPropertyVariables>
|
||||
<Log4jContextSelector>
|
||||
org.apache.logging.log4j.core.async.AsyncLoggerContextSelector
|
||||
|
@ -33,5 +33,6 @@ public enum Category {
|
||||
experimental,
|
||||
combinitoric,
|
||||
vectors,
|
||||
HOF
|
||||
HOF,
|
||||
readers
|
||||
}
|
||||
|
@ -53,6 +53,8 @@
|
||||
<directory>src/test/resources</directory>
|
||||
<excludes>
|
||||
<exclude>h5ex_t_float.h5</exclude>
|
||||
<exclude>**/*.ivec</exclude>
|
||||
<exclude>**/*.fvec</exclude>
|
||||
</excludes>
|
||||
<filtering>true</filtering>
|
||||
</testResource>
|
||||
|
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||
|
||||
import io.nosqlbench.api.content.Content;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.nio.FloatBuffer;
|
||||
import java.nio.MappedByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* Reads ivec files with random access, using the input to specify the record number.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.readers)
|
||||
public class FVecReader implements LongFunction<float[]> {
|
||||
|
||||
private final MappedByteBuffer bb;
|
||||
private final int dimensions;
|
||||
private final int reclen;
|
||||
private final long filesize;
|
||||
private final Path path;
|
||||
private final int reclim;
|
||||
|
||||
public FVecReader(String pathname) {
|
||||
this(pathname,0,0);
|
||||
}
|
||||
public FVecReader(String pathname, int expectedDimensions, int recordLimit) {
|
||||
Content<?> src = NBIO.fs().search(pathname).one();
|
||||
this.path = src.asPath();
|
||||
try {
|
||||
FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE);
|
||||
this.filesize = channel.size();
|
||||
this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.dimensions = Integer.reverseBytes(bb.getInt(0));
|
||||
if(expectedDimensions>0 && expectedDimensions!=dimensions) {
|
||||
throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
|
||||
}
|
||||
int datalen = (dimensions * Float.BYTES);
|
||||
this.reclen = Integer.BYTES + datalen;
|
||||
int totalRecords = (int) (filesize/reclen);
|
||||
if (recordLimit > totalRecords) {
|
||||
throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
|
||||
}
|
||||
this.reclim = recordLimit==0? totalRecords : recordLimit;
|
||||
if ((filesize % reclen)!=0) {
|
||||
throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public float[] apply(long value) {
|
||||
int recordIdx = (int) (value % reclim);
|
||||
int recpos = recordIdx*reclen;
|
||||
int recdim = Integer.reverseBytes(bb.getInt(recpos));
|
||||
if(recdim!=dimensions) {
|
||||
throw new RuntimeException("dimensions are not uniform for fvec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
|
||||
}
|
||||
var vbuf = new byte[dimensions*Float.BYTES];
|
||||
bb.get(recpos + Integer.BYTES, vbuf);
|
||||
|
||||
FloatBuffer fbuf=ByteBuffer.wrap(vbuf).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer();
|
||||
var vectors = new float[dimensions];
|
||||
fbuf.get(vectors);
|
||||
return vectors;
|
||||
}
|
||||
}
|
@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||
|
||||
import io.nosqlbench.api.content.Content;
|
||||
import io.nosqlbench.api.content.NBIO;
|
||||
import io.nosqlbench.virtdata.api.annotations.Categories;
|
||||
import io.nosqlbench.virtdata.api.annotations.Category;
|
||||
import io.nosqlbench.virtdata.api.annotations.Example;
|
||||
import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.MappedByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.util.function.LongFunction;
|
||||
|
||||
/**
|
||||
* Reads ivec files with random access, using the input to specify the record number.
|
||||
* This is used for testing with generated KNN test data which is uniform in dimensions and neighborhood size.
|
||||
* While it is possible to specify different dimensioned vectors per record, this is not supported, since this
|
||||
* function honors the pure-function behavior of other NB binding functions. This requires uniform record structure for random access.
|
||||
*/
|
||||
@ThreadSafeMapper
|
||||
@Categories(Category.readers)
|
||||
public class IVecReader implements LongFunction<int[]> {
|
||||
|
||||
private final MappedByteBuffer bb;
|
||||
private final int dimensions;
|
||||
private final int reclen;
|
||||
private final long filesize;
|
||||
private final Path path;
|
||||
private final int reclim;
|
||||
|
||||
/**
|
||||
* Read the ivec file, determining the record size from the first record.
|
||||
* @param pathname The location of the ivec file
|
||||
*/
|
||||
@Example({"IvecReader('testfile.ivec')","Create a reader for int vectors, detecting the dimensions and dataset size automatically."})
|
||||
public IVecReader(String pathname) {
|
||||
this(pathname,0,0);
|
||||
}
|
||||
@Example({"IvecReader('testfile.ivec', 46, 12)","Create a reader for int vectors, asserting 46 dimensions and limit total records to 12."})
|
||||
public IVecReader(String pathname, int expectedDimensions, int recordLimit) {
|
||||
Content<?> src = NBIO.fs().search(pathname).one();
|
||||
this.path = src.asPath();
|
||||
try {
|
||||
FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE);
|
||||
this.filesize = channel.size();
|
||||
this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
this.dimensions = Integer.reverseBytes(bb.getInt(0));
|
||||
if(expectedDimensions>0 && expectedDimensions!=dimensions) {
|
||||
throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
|
||||
}
|
||||
int datalen = (dimensions * Integer.BYTES);
|
||||
this.reclen = Integer.BYTES + datalen;
|
||||
int totalRecords = (int) (filesize/reclen);
|
||||
if (recordLimit > totalRecords) {
|
||||
throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
|
||||
}
|
||||
this.reclim = recordLimit==0? totalRecords : recordLimit;
|
||||
if ((filesize % reclen)!=0) {
|
||||
throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int[] apply(long value) {
|
||||
int recordIdx = (int) (value % reclim);
|
||||
int recpos = recordIdx*reclen;
|
||||
byte[] buf = new byte[reclen];
|
||||
this.bb.get(recpos,buf);
|
||||
ByteBuffer record = ByteBuffer.wrap(buf);
|
||||
int recdim = Integer.reverseBytes(record.getInt());
|
||||
if(recdim!=dimensions) {
|
||||
throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
|
||||
}
|
||||
int[] data = new int[recdim];
|
||||
for (int i = 0; i < dimensions; i++) {
|
||||
data[i]=Integer.reverseBytes(record.getInt());
|
||||
}
|
||||
return data;
|
||||
}
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
|
||||
/**
 * Sequential whole-file readers for the little-endian fvec and ivec vector formats.
 * Each record in either format is a 4-byte little-endian element count followed by
 * that many 4-byte little-endian elements (floats for fvec, ints for ivec).
 */
public class IvecFvecMethods {

    /**
     * Read every vector of an fvec file into memory.
     *
     * @param filePath the fvec file to read
     * @return the vectors in file order
     * @throws IOException if the file cannot be read, ends in the middle of a record,
     *     or contains a record with a non-positive dimension
     */
    public static ArrayList<float[]> readFvecs(String filePath) throws IOException {
        var vectors = new ArrayList<float[]>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
            int lowByte;
            // read() returning -1 is the only reliable end-of-file signal; available() is just an estimate.
            while ((lowByte = dis.read()) >= 0) {
                var dimension = finishLittleEndianInt(lowByte, dis);
                // Checked explicitly rather than with assert, since assertions are normally disabled at runtime.
                if (dimension <= 0 || dimension > Integer.MAX_VALUE / Float.BYTES) {
                    throw new IOException("Invalid dimension " + dimension + " found in fvec file '" + filePath + "'");
                }
                var buffer = new byte[dimension * Float.BYTES];
                dis.readFully(buffer);

                var vector = new float[dimension];
                ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(vector);
                vectors.add(vector);
            }
        }
        return vectors;
    }

    /**
     * Read every neighbor set of an ivec file into memory.
     *
     * @param filename the ivec file to read
     * @return one set of neighbor indices per record, in file order
     * @throws java.io.UncheckedIOException if the file cannot be read, ends in the middle of a
     *     record, or contains a record with a negative neighbor count; a partial result is never returned
     */
    public static ArrayList<HashSet<Integer>> readIvecs(String filename) {
        var groundTruthTopK = new ArrayList<HashSet<Integer>>();

        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) {
            int lowByte;
            while ((lowByte = dis.read()) >= 0) {
                var numNeighbors = finishLittleEndianInt(lowByte, dis);
                if (numNeighbors < 0) {
                    throw new IOException("Invalid neighbor count " + numNeighbors + " found in ivec file '" + filename + "'");
                }
                var neighbors = new HashSet<Integer>(numNeighbors);

                for (var i = 0; i < numNeighbors; i++) {
                    var neighbor = Integer.reverseBytes(dis.readInt());
                    neighbors.add(neighbor);
                }

                groundTruthTopK.add(neighbors);
            }
        } catch (IOException e) {
            // Silently returning truncated ground truth would skew every measurement derived from it.
            throw new java.io.UncheckedIOException("Unable to read ivec file '" + filename + "'", e);
        }

        return groundTruthTopK;
    }

    /**
     * Complete a little-endian int whose least significant byte has already been read.
     *
     * @param lowByte the first (least significant) byte, in the range 0-255
     * @param in the stream positioned at the second byte of the int
     * @return the assembled int value
     * @throws IOException if the stream ends before the remaining three bytes are read
     */
    private static int finishLittleEndianInt(int lowByte, DataInputStream in) throws IOException {
        return lowByte
            | (in.readUnsignedByte() << 8)
            | (in.readUnsignedByte() << 16)
            | (in.readUnsignedByte() << 24);
    }

}
|
@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class IVecReaderTest {
|
||||
|
||||
@Test
|
||||
public void testReadIvec() {
|
||||
|
||||
ArrayList<HashSet<Integer>> idx_ref = IvecFvecMethods.readIvecs("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
|
||||
|
||||
IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
|
||||
for (int i = 0; i < 10; i++) {
|
||||
int[] indices = ir.apply(0);
|
||||
HashSet<Integer> ref = idx_ref.get(0);
|
||||
for (int j = 0; j < indices.length; j++) {
|
||||
assertThat(indices[j]).isGreaterThanOrEqualTo(0);
|
||||
assertThat(indices[j]).isLessThanOrEqualTo(10000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadFvec() {
|
||||
FVecReader ir = new FVecReader("src/test/resources/ivecfvec/test_ada_002_10000_distances_count.fvec");
|
||||
for (int i = 0; i < 10; i++) {
|
||||
float[] dist = ir.apply(i);
|
||||
for (int j = 1; j < dist.length; j++) {
|
||||
assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j-1]).describedAs("dist[" + j +"]=(" +dist[j]+") dist[j-1]=(" + dist[j-1] + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadFvecSpecificDims() {
|
||||
FVecReader ir = new FVecReader(
|
||||
"src/test/resources/ivecfvec/test_ada_002_10000_base_vectors.fvec",
|
||||
1536,0);
|
||||
float[] vec0 = ir.apply(0);
|
||||
assertThat(vec0.length).isEqualTo(1536);
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user