mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
Merge pull request #1626 from nosqlbench/ms/mdb_vs
MongoDB Atlas Vector Search workload addition
This commit is contained in:
@@ -42,7 +42,7 @@
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongodb-driver-sync</artifactId>
|
||||
<version>4.10.2</version>
|
||||
<version>4.11.0</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
package io.nosqlbench.adapter.mongodb;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2023 nosqlbench
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.bson.Document;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class MongoDbUtils {
|
||||
public static int[] getFieldFromResults(String field, Document document) {
|
||||
Document _cursor = document.get("cursor", Document.class);
|
||||
List<Document> _firstBatch = _cursor.getList("firstBatch", Document.class);
|
||||
List<String> keyStrings = new ArrayList<>();
|
||||
for (Document matchingVector : _firstBatch) {
|
||||
keyStrings.add(matchingVector.get("key",String.class));
|
||||
}
|
||||
return keyStrings.stream().mapToInt(Integer::parseInt).toArray();
|
||||
}
|
||||
}
|
||||
@@ -18,8 +18,11 @@ package io.nosqlbench.adapter.mongodb.core;
|
||||
|
||||
import com.mongodb.ConnectionString;
|
||||
import com.mongodb.MongoClientSettings;
|
||||
import com.mongodb.ServerApi;
|
||||
import com.mongodb.ServerApiVersion;
|
||||
import com.mongodb.client.MongoClient;
|
||||
import com.mongodb.client.MongoClients;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import io.nosqlbench.api.config.NBNamedElement;
|
||||
import io.nosqlbench.api.config.standard.ConfigModel;
|
||||
import io.nosqlbench.api.config.standard.NBConfigModel;
|
||||
@@ -27,6 +30,7 @@ import io.nosqlbench.api.config.standard.NBConfiguration;
|
||||
import io.nosqlbench.api.config.standard.Param;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.bson.Document;
|
||||
import org.bson.UuidRepresentation;
|
||||
import org.bson.codecs.UuidCodec;
|
||||
import org.bson.codecs.configuration.CodecRegistry;
|
||||
@@ -80,12 +84,26 @@ public class MongoSpace implements NBNamedElement, AutoCloseable {
|
||||
MongoClientSettings.getDefaultCodecRegistry()
|
||||
);
|
||||
|
||||
// https://www.mongodb.com/docs/v7.0/reference/stable-api
|
||||
ServerApi serverApi = ServerApi.builder()
|
||||
.version(ServerApiVersion.V1)
|
||||
.deprecationErrors(false)
|
||||
.strict(false)//Needed because createSearchIndexes is not in stable API
|
||||
.build();
|
||||
|
||||
MongoClientSettings settings = MongoClientSettings.builder()
|
||||
.applyConnectionString(new ConnectionString(connectionURL))
|
||||
.codecRegistry(codecRegistry)
|
||||
.serverApi(serverApi)
|
||||
.uuidRepresentation(UuidRepresentation.STANDARD)
|
||||
.applicationName("NoSQLBench")
|
||||
.build();
|
||||
this.mongoClient = MongoClients.create(settings);
|
||||
|
||||
// Send a ping to confirm a successful connection
|
||||
MongoDatabase mdb = this.mongoClient.getDatabase("admin");
|
||||
mdb.runCommand(new Document("ping", 1));
|
||||
logger.info(() -> "Connection ping test to the cluster successful.");
|
||||
}
|
||||
|
||||
public MongoClient getClient() {
|
||||
|
||||
@@ -0,0 +1,197 @@
|
||||
# Connection Guide: https://www.mongodb.com/docs/drivers/java/sync/current/fundamentals/connection/
|
||||
# Troubleshoot connection: https://www.mongodb.com/docs/atlas/troubleshoot-connection/#special-characters-in-connection-string-password
|
||||
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:"schema.*" connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
|
||||
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:rampup cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
|
||||
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags='block:main-.*' cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
|
||||
min_version: "5.17.5"
|
||||
|
||||
# https://www.mongodb.com/docs/atlas/app-services/data-api/
|
||||
|
||||
description: |
|
||||
This workload is analogous to the cql-keyvalue2 workload, just implemented for MongoDB Atlas Vector Search.
|
||||
|
||||
scenarios:
|
||||
default:
|
||||
schema: run driver=mongodb tags==block:"schema.*" threads==1 cycles==UNDEF database=baselines
|
||||
rampup: run driver=mongodb tags==block:rampup cycles===TEMPLATE(trainsize) threads=auto database=baselines errors=counter,retry
|
||||
main: run driver=mongodb tags==block:'main-.*' cycles===TEMPLATE(main-cycles,100000) threads=auto database=baselines
|
||||
drop: run driver=mongodb tags==block:drop-entire-collection threads==1 cycles==UNDEF database=baselines
|
||||
search_and_index: >-
|
||||
run driver=mongodb alias=search_and_index tags='block:main-read' labels='target:mongodbatlas'
|
||||
cycles=TEMPLATE(testsize) errors=counter,retry,warn stride=100 striderate=7.50
|
||||
read_ratio=1 threads=500 database=baselines
|
||||
|
||||
params:
|
||||
instrument: true
|
||||
bindings:
|
||||
rw_key: TEMPLATE(keydist,Uniform(0,1000000000)); ToString() -> String
|
||||
#WRITE
|
||||
seq_key: ToString();
|
||||
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
|
||||
#READ
|
||||
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
|
||||
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
|
||||
|
||||
blocks:
|
||||
schema:
|
||||
params:
|
||||
prepared: false
|
||||
ops:
|
||||
# https://www.mongodb.com/docs/manual/reference/method/db.createCollection/
|
||||
# https://www.mongodb.com/docs/manual/core/schema-validation/specify-json-schema/
|
||||
# `clusteredIndex` only supports creation of an index on the `_id` field (as shown below), so it is optional
|
||||
create_collection: |
|
||||
{
|
||||
create: "TEMPLATE(collection,keyvalue)",
|
||||
clusteredIndex: {
|
||||
key: { "_id": 1 },
|
||||
unique: true,
|
||||
name: "_id_idx"
|
||||
},
|
||||
writeConcern: { w: "majority" },
|
||||
validator: {
|
||||
$jsonSchema: {
|
||||
bsonType: "object",
|
||||
title: "Key/Value collection schema validation",
|
||||
required: [ "key" ],
|
||||
properties: {
|
||||
key: {
|
||||
bsonType: "string",
|
||||
description: "'key' must be a string and is required"
|
||||
},
|
||||
value: {
|
||||
bsonType: "array",
|
||||
description: "'value' must be an array of numbers of BSON double type and is optional but, recommended"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
validationLevel: "strict",
|
||||
validationAction: "error",
|
||||
comment: "keyvalue collection creation with strict types and a required 'key' field."
|
||||
}
|
||||
create_key_index: |
|
||||
{
|
||||
createIndexes: "TEMPLATE(collection,keyvalue)",
|
||||
indexes: [
|
||||
{
|
||||
key: {
|
||||
key: 1,
|
||||
},
|
||||
name: "kv_key_idx",
|
||||
unique: true
|
||||
}
|
||||
],
|
||||
writeConcern: { w: "majority" },
|
||||
comment: "'key' index creation for keyvalue collection. Values should be unique.",
|
||||
commitQuorum: "majority"
|
||||
}
|
||||
create_vector_search_index: |
|
||||
{
|
||||
createSearchIndexes: "TEMPLATE(collection,keyvalue)",
|
||||
indexes: [
|
||||
{
|
||||
name: "kv_value_vector_search_idx",
|
||||
definition: {
|
||||
mappings: {
|
||||
dynamic: true,
|
||||
fields: {
|
||||
value: {
|
||||
type: "knnVector",
|
||||
dimensions: TEMPLATE(dimensions,1536),
|
||||
similarity: "TEMPLATE(similarity_function,cosine)"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
rampup:
|
||||
ops:
|
||||
rampup-insert: |
|
||||
{
|
||||
insert: "TEMPLATE(collection,keyvalue)",
|
||||
documents: [
|
||||
{
|
||||
key: "{seq_key}",
|
||||
value: {train_floatlist}
|
||||
}
|
||||
],
|
||||
comment: "Insert documents into keyvalue collection."
|
||||
}
|
||||
main-read:
|
||||
params:
|
||||
ratio: TEMPLATE(read_ratio,5)
|
||||
ops:
|
||||
main_select:
|
||||
op: |
|
||||
{
|
||||
"aggregate": "TEMPLATE(collection,keyvalue)",
|
||||
"cursor" : {
|
||||
"batchSize": TEMPLATE(top_k,100)
|
||||
},
|
||||
"pipeline": [
|
||||
{
|
||||
"$vectorSearch": {
|
||||
"index": "kv_value_vector_search_idx",
|
||||
"path": "value",
|
||||
"queryVector": {test_floatlist},
|
||||
"numCandidates": TEMPLATE(num_candidates,100),
|
||||
"limit": TEMPLATE(top_k,100)
|
||||
}
|
||||
},
|
||||
{
|
||||
"$project": {
|
||||
"_id": 0,
|
||||
"key": 1,
|
||||
"value": 1,
|
||||
"score": { "$meta": "vectorSearchScore" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"readConcern": { "level": "local" },
|
||||
"comment": "Find the results for the given 'value' vector search embedding."
|
||||
}
|
||||
verifier-imports:
|
||||
- io.nosqlbench.adapter.mongodb.MongoDbUtils
|
||||
verifier-init: |
|
||||
k=TEMPLATE(top_k,100)
|
||||
relevancy=scriptingmetrics.newRelevancyMeasures(_parsed_op);
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
|
||||
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
|
||||
verifier: |
|
||||
actual_indices=MongoDbUtils.getFieldFromResults("key",result);
|
||||
relevancy.accept({relevant_indices},actual_indices);
|
||||
return true;
|
||||
main-write:
|
||||
params:
|
||||
ratio: TEMPLATE(write_ratio,5)
|
||||
ops:
|
||||
main-insert: |
|
||||
{
|
||||
insert: "TEMPLATE(collection,keyvalue)",
|
||||
documents: [
|
||||
{
|
||||
key: "{rw_key}",
|
||||
value: {train_floatlist}
|
||||
}
|
||||
],
|
||||
writeConcern: { w: "majority" },
|
||||
comment: "Insert documents into keyvalue collection."
|
||||
}
|
||||
drop-entire-collection:
|
||||
ops:
|
||||
drop-vsearch-index: |
|
||||
{
|
||||
dropSearchIndex: "TEMPLATE(collection,keyvalue)",
|
||||
name: "kv_value_vector_search_idx"
|
||||
}
|
||||
drop-collection: |
|
||||
{
|
||||
drop: "TEMPLATE(collection,keyvalue)",
|
||||
comment: "Drop keyvalue collection to start afresh."
|
||||
}
|
||||
Reference in New Issue
Block a user