Merge pull request #1626 from nosqlbench/ms/mdb_vs

MongoDB Atlas Vector Search workload addition
Jonathan Shook
2023-10-16 14:50:31 -05:00
committed by GitHub
4 changed files with 252 additions and 1 deletion


@@ -42,7 +42,7 @@
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongodb-driver-sync</artifactId>
-<version>4.10.2</version>
+<version>4.11.0</version>
</dependency>
</dependencies>


@@ -0,0 +1,36 @@
package io.nosqlbench.adapter.mongodb;
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.bson.Document;
import java.util.ArrayList;
import java.util.List;
public class MongoDbUtils {
    /**
     * Extract the named field from every document in the first batch of a
     * cursor-style command result (for example, the reply to an aggregate
     * command) and return the values parsed as ints.
     */
    public static int[] getFieldFromResults(String field, Document document) {
        Document cursor = document.get("cursor", Document.class);
        List<Document> firstBatch = cursor.getList("firstBatch", Document.class);
        List<String> keyStrings = new ArrayList<>();
        for (Document matchingVector : firstBatch) {
            keyStrings.add(matchingVector.get(field, String.class));
        }
        return keyStrings.stream().mapToInt(Integer::parseInt).toArray();
    }
}
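
As a usage illustration (a minimal sketch, not part of the change set): the helper expects a command reply shaped like { cursor: { firstBatch: [ { key: "..." }, ... ] } }, which is what the workload's aggregate op returns. The example class name is hypothetical.

import io.nosqlbench.adapter.mongodb.MongoDbUtils;
import org.bson.Document;
import java.util.Arrays;
import java.util.List;

public class MongoDbUtilsExample {
    public static void main(String[] args) {
        // Shape mirrors a command reply: { cursor: { firstBatch: [ { key: "..." }, ... ] } }
        Document result = new Document("cursor",
            new Document("firstBatch", List.of(
                new Document("key", "42"),
                new Document("key", "7"))));
        int[] keys = MongoDbUtils.getFieldFromResults("key", result);
        System.out.println(Arrays.toString(keys)); // prints [42, 7]
    }
}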


@@ -18,8 +18,11 @@ package io.nosqlbench.adapter.mongodb.core;
import com.mongodb.ConnectionString;
import com.mongodb.MongoClientSettings;
import com.mongodb.ServerApi;
import com.mongodb.ServerApiVersion;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoDatabase;
import io.nosqlbench.api.config.NBNamedElement;
import io.nosqlbench.api.config.standard.ConfigModel;
import io.nosqlbench.api.config.standard.NBConfigModel;
@@ -27,6 +30,7 @@ import io.nosqlbench.api.config.standard.NBConfiguration;
import io.nosqlbench.api.config.standard.Param;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.bson.Document;
import org.bson.UuidRepresentation;
import org.bson.codecs.UuidCodec;
import org.bson.codecs.configuration.CodecRegistry;
@@ -80,12 +84,26 @@ public class MongoSpace implements NBNamedElement, AutoCloseable {
MongoClientSettings.getDefaultCodecRegistry()
);
// https://www.mongodb.com/docs/v7.0/reference/stable-api
ServerApi serverApi = ServerApi.builder()
.version(ServerApiVersion.V1)
.deprecationErrors(false)
.strict(false) // Needed because createSearchIndexes is not in the Stable API
.build();
MongoClientSettings settings = MongoClientSettings.builder()
.applyConnectionString(new ConnectionString(connectionURL))
.codecRegistry(codecRegistry)
.serverApi(serverApi)
.uuidRepresentation(UuidRepresentation.STANDARD)
.applicationName("NoSQLBench")
.build();
this.mongoClient = MongoClients.create(settings);
// Send a ping to confirm a successful connection
MongoDatabase mdb = this.mongoClient.getDatabase("admin");
mdb.runCommand(new Document("ping", 1));
logger.info(() -> "Connection ping test to the cluster successful.");
}
public MongoClient getClient() {

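The driver bump to 4.11.0 and the strict(false) setting above are related: the 4.11 sync driver adds the Atlas Search index management helpers, and the createSearchIndexes command they (and the workload's schema block) issue is not part of the Stable API, so a strict Stable API client would reject it. A minimal sketch of that call, assuming the database, collection, and index names used as defaults in the workload below:

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.SearchIndexModel;
import org.bson.Document;
import java.util.List;

public class SearchIndexSketch {
    // Creates the same knnVector search index that the workload's schema block defines.
    static void createVectorIndex(MongoClient client) {
        MongoCollection<Document> coll = client
            .getDatabase("baselines")      // assumed: workload default database
            .getCollection("keyvalue");    // assumed: TEMPLATE(collection,keyvalue) default
        Document definition = new Document("mappings",
            new Document("dynamic", true)
                .append("fields", new Document("value",
                    new Document("type", "knnVector")
                        .append("dimensions", 1536)
                        .append("similarity", "cosine"))));
        coll.createSearchIndexes(List.of(
            new SearchIndexModel("kv_value_vector_search_idx", definition)));
    }
}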

@@ -0,0 +1,197 @@
# Connection Guide: https://www.mongodb.com/docs/drivers/java/sync/current/fundamentals/connection/
# Troubleshoot connection: https://www.mongodb.com/docs/atlas/troubleshoot-connection/#special-characters-in-connection-string-password
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:"schema.*" connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:rampup cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags='block:main-.*' cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
min_version: "5.17.5"
# https://www.mongodb.com/docs/atlas/app-services/data-api/
description: |
This workload is analogous to the cql-keyvalue2 workload, just implemented for MongoDB Atlas Vector Search.
scenarios:
default:
schema: run driver=mongodb tags==block:"schema.*" threads==1 cycles==UNDEF database=baselines
rampup: run driver=mongodb tags==block:rampup cycles===TEMPLATE(trainsize) threads=auto database=baselines errors=counter,retry
main: run driver=mongodb tags==block:'main-.*' cycles===TEMPLATE(main-cycles,100000) threads=auto database=baselines
drop: run driver=mongodb tags==block:drop-entire-collection threads==1 cycles==UNDEF database=baselines
search_and_index: >-
run driver=mongodb alias=search_and_index tags='block:main-read' labels='target:mongodbatlas'
cycles=TEMPLATE(testsize) errors=counter,retry,warn stride=100 striderate=7.50
read_ratio=1 threads=500 database=baselines
params:
instrument: true
bindings:
rw_key: TEMPLATE(keydist,Uniform(0,1000000000)); ToString() -> String
#WRITE
seq_key: ToString();
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
#READ
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
blocks:
schema:
params:
prepared: false
ops:
# https://www.mongodb.com/docs/manual/reference/method/db.createCollection/
# https://www.mongodb.com/docs/manual/core/schema-validation/specify-json-schema/
# `clusteredIndex` only supports creation of an index on the `_id` field (as shown below), so it is optional
create_collection: |
{
create: "TEMPLATE(collection,keyvalue)",
clusteredIndex: {
key: { "_id": 1 },
unique: true,
name: "_id_idx"
},
writeConcern: { w: "majority" },
validator: {
$jsonSchema: {
bsonType: "object",
title: "Key/Value collection schema validation",
required: [ "key" ],
properties: {
key: {
bsonType: "string",
description: "'key' must be a string and is required"
},
value: {
bsonType: "array",
description: "'value' must be an array of numbers of BSON double type and is optional but, recommended"
}
}
}
},
validationLevel: "strict",
validationAction: "error",
comment: "keyvalue collection creation with strict types and a required 'key' field."
}
create_key_index: |
{
createIndexes: "TEMPLATE(collection,keyvalue)",
indexes: [
{
key: {
key: 1,
},
name: "kv_key_idx",
unique: true
}
],
writeConcern: { w: "majority" },
comment: "'key' index creation for keyvalue collection. Values should be unique.",
commitQuorum: "majority"
}
create_vector_search_index: |
{
createSearchIndexes: "TEMPLATE(collection,keyvalue)",
indexes: [
{
name: "kv_value_vector_search_idx",
definition: {
mappings: {
dynamic: true,
fields: {
value: {
type: "knnVector",
dimensions: TEMPLATE(dimensions,1536),
similarity: "TEMPLATE(similarity_function,cosine)"
}
}
}
}
}
]
}
rampup:
ops:
rampup-insert: |
{
insert: "TEMPLATE(collection,keyvalue)",
documents: [
{
key: "{seq_key}",
value: {train_floatlist}
}
],
comment: "Insert documents into keyvalue collection."
}
main-read:
params:
ratio: TEMPLATE(read_ratio,5)
ops:
main_select:
op: |
{
"aggregate": "TEMPLATE(collection,keyvalue)",
"cursor" : {
"batchSize": TEMPLATE(top_k,100)
},
"pipeline": [
{
"$vectorSearch": {
"index": "kv_value_vector_search_idx",
"path": "value",
"queryVector": {test_floatlist},
"numCandidates": TEMPLATE(num_candidates,100),
"limit": TEMPLATE(top_k,100)
}
},
{
"$project": {
"_id": 0,
"key": 1,
"value": 1,
"score": { "$meta": "vectorSearchScore" }
}
}
],
"readConcern": { "level": "local" },
"comment": "Find the results for the given 'value' vector search embedding."
}
verifier-imports:
- io.nosqlbench.adapter.mongodb.MongoDbUtils
verifier-init: |
k=TEMPLATE(top_k,100)
relevancy=scriptingmetrics.newRelevancyMeasures(_parsed_op);
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
verifier: |
actual_indices=MongoDbUtils.getFieldFromResults("key",result);
relevancy.accept({relevant_indices},actual_indices);
return true;
main-write:
params:
ratio: TEMPLATE(write_ratio,5)
ops:
main-insert: |
{
insert: "TEMPLATE(collection,keyvalue)",
documents: [
{
key: "{rw_key}",
value: {train_floatlist}
}
],
writeConcern: { w: "majority" },
comment: "Insert documents into keyvalue collection."
}
drop-entire-collection:
ops:
drop-vsearch-index: |
{
dropSearchIndex: "TEMPLATE(collection,keyvalue)",
name: "kv_value_vector_search_idx"
}
drop-collection: |
{
drop: "TEMPLATE(collection,keyvalue)",
comment: "Drop keyvalue collection to start afresh."
}
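
For comparison, the same two-stage pipeline that main_select sends as a raw aggregate command can also be expressed through the driver's aggregate() helper. A minimal sketch, assuming the workload's default database, collection, index name, and top_k/num_candidates values:

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import org.bson.Document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class VectorSearchSketch {
    // Runs the same two-stage pipeline as main_select: $vectorSearch then $project.
    static List<Document> topK(MongoClient client, List<Double> queryVector) {
        MongoCollection<Document> coll = client
            .getDatabase("baselines")      // assumed: workload default database
            .getCollection("keyvalue");    // assumed: TEMPLATE(collection,keyvalue) default
        List<Document> pipeline = Arrays.asList(
            new Document("$vectorSearch", new Document("index", "kv_value_vector_search_idx")
                .append("path", "value")
                .append("queryVector", queryVector)
                .append("numCandidates", 100)   // TEMPLATE(num_candidates,100)
                .append("limit", 100)),         // TEMPLATE(top_k,100)
            new Document("$project", new Document("_id", 0)
                .append("key", 1)
                .append("value", 1)
                .append("score", new Document("$meta", "vectorSearchScore"))));
        return coll.aggregate(pipeline).into(new ArrayList<>());
    }
}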