Merge pull request #1626 from nosqlbench/ms/mdb_vs

MongoDB Atlas Vector Search workload addition
Jonathan Shook
2023-10-16 14:50:31 -05:00
committed by GitHub
4 changed files with 252 additions and 1 deletion


@@ -42,7 +42,7 @@
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongodb-driver-sync</artifactId>
-<version>4.10.2</version>
+<version>4.11.0</version>
</dependency>
</dependencies>


@@ -0,0 +1,36 @@
package io.nosqlbench.adapter.mongodb;
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.bson.Document;
import java.util.ArrayList;
import java.util.List;
public class MongoDbUtils {
    /**
     * Extract the named field from every document in the first batch of a
     * cursor-style command result (for example, the reply to an aggregate
     * command) and return the values parsed as ints.
     */
    public static int[] getFieldFromResults(String field, Document document) {
        Document cursor = document.get("cursor", Document.class);
        List<Document> firstBatch = cursor.getList("firstBatch", Document.class);
        List<String> keyStrings = new ArrayList<>();
        for (Document matchingVector : firstBatch) {
            keyStrings.add(matchingVector.get(field, String.class));
        }
        return keyStrings.stream().mapToInt(Integer::parseInt).toArray();
    }
}
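
As a usage illustration (a minimal sketch, not part of the change set): the helper expects a command reply shaped like { cursor: { firstBatch: [ { key: "..." }, ... ] } }, which is what the workload's aggregate op returns. The example class name is hypothetical.

import io.nosqlbench.adapter.mongodb.MongoDbUtils;
import org.bson.Document;
import java.util.Arrays;
import java.util.List;

public class MongoDbUtilsExample {
    public static void main(String[] args) {
        // Shape mirrors a command reply: { cursor: { firstBatch: [ { key: "..." }, ... ] } }
        Document result = new Document("cursor",
            new Document("firstBatch", List.of(
                new Document("key", "42"),
                new Document("key", "7"))));
        int[] keys = MongoDbUtils.getFieldFromResults("key", result);
        System.out.println(Arrays.toString(keys)); // prints [42, 7]
    }
}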


@@ -18,8 +18,11 @@ package io.nosqlbench.adapter.mongodb.core;
import com.mongodb.ConnectionString;
import com.mongodb.MongoClientSettings;
import com.mongodb.ServerApi;
import com.mongodb.ServerApiVersion;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoDatabase;
import io.nosqlbench.api.config.NBNamedElement;
import io.nosqlbench.api.config.standard.ConfigModel;
import io.nosqlbench.api.config.standard.NBConfigModel;
@@ -27,6 +30,7 @@ import io.nosqlbench.api.config.standard.NBConfiguration;
import io.nosqlbench.api.config.standard.Param;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.bson.Document;
import org.bson.UuidRepresentation;
import org.bson.codecs.UuidCodec;
import org.bson.codecs.configuration.CodecRegistry;
@@ -80,12 +84,26 @@ public class MongoSpace implements NBNamedElement, AutoCloseable {
MongoClientSettings.getDefaultCodecRegistry()
);
// https://www.mongodb.com/docs/v7.0/reference/stable-api
ServerApi serverApi = ServerApi.builder()
.version(ServerApiVersion.V1)
.deprecationErrors(false)
.strict(false) // Needed because createSearchIndexes is not in the Stable API
.build();
MongoClientSettings settings = MongoClientSettings.builder()
.applyConnectionString(new ConnectionString(connectionURL))
.codecRegistry(codecRegistry)
.serverApi(serverApi)
.uuidRepresentation(UuidRepresentation.STANDARD)
.applicationName("NoSQLBench")
.build();
this.mongoClient = MongoClients.create(settings);
// Send a ping to confirm a successful connection
MongoDatabase mdb = this.mongoClient.getDatabase("admin");
mdb.runCommand(new Document("ping", 1));
logger.info(() -> "Connection ping test to the cluster successful.");
}
public MongoClient getClient() {

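The driver bump to 4.11.0 and the strict(false) setting above are related: the 4.11 sync driver adds the Atlas Search index management helpers, and the createSearchIndexes command they (and the workload's schema block) issue is not part of the Stable API, so a strict Stable API client would reject it. A minimal sketch of that call, assuming the database, collection, and index names used as defaults in the workload below:

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.model.SearchIndexModel;
import org.bson.Document;
import java.util.List;

public class SearchIndexSketch {
    // Creates the same knnVector search index that the workload's schema block defines.
    static void createVectorIndex(MongoClient client) {
        MongoCollection<Document> coll = client
            .getDatabase("baselines")      // assumed: workload default database
            .getCollection("keyvalue");    // assumed: TEMPLATE(collection,keyvalue) default
        Document definition = new Document("mappings",
            new Document("dynamic", true)
                .append("fields", new Document("value",
                    new Document("type", "knnVector")
                        .append("dimensions", 1536)
                        .append("similarity", "cosine"))));
        coll.createSearchIndexes(List.of(
            new SearchIndexModel("kv_value_vector_search_idx", definition)));
    }
}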

@@ -0,0 +1,197 @@
# Connection Guide: https://www.mongodb.com/docs/drivers/java/sync/current/fundamentals/connection/
# Troubleshoot connection: https://www.mongodb.com/docs/atlas/troubleshoot-connection/#special-characters-in-connection-string-password
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:"schema.*" connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags=block:rampup cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
# nb5 run driver=mongodb workload=/path/to/mongodb_vector_search.yaml tags='block:main-.*' cycles=25 connection='mongodb+srv://user:pass@sample-db.host.mongodb.net/?retryWrites=true&w=majority' database=baselines -vv --show-stacktraces
min_version: "5.17.5"
# https://www.mongodb.com/docs/atlas/app-services/data-api/
description: |
This workload is analogous to the cql-keyvalue2 workload, just implemented for MongoDB Atlas Vector Search.
scenarios:
default:
schema: run driver=mongodb tags==block:"schema.*" threads==1 cycles==UNDEF database=baselines
rampup: run driver=mongodb tags==block:rampup cycles===TEMPLATE(trainsize) threads=auto database=baselines errors=counter,retry
main: run driver=mongodb tags==block:'main-.*' cycles===TEMPLATE(main-cycles,100000) threads=auto database=baselines
drop: run driver=mongodb tags==block:drop-entire-collection threads==1 cycles==UNDEF database=baselines
search_and_index: >-
run driver=mongodb alias=search_and_index tags='block:main-read' labels='target:mongodbatlas'
cycles=TEMPLATE(testsize) errors=counter,retry,warn stride=100 striderate=7.50
read_ratio=1 threads=500 database=baselines
params:
instrument: true
bindings:
rw_key: TEMPLATE(keydist,Uniform(0,1000000000)); ToString() -> String
#WRITE
seq_key: ToString();
train_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/train"); ToCqlVector();
#READ
test_floatlist: HdfFileToFloatList("testdata/TEMPLATE(dataset).hdf5", "/test"); ToCqlVector();
relevant_indices: HdfFileToIntArray("testdata/TEMPLATE(dataset).hdf5", "/neighbors")
blocks:
schema:
params:
prepared: false
ops:
# https://www.mongodb.com/docs/manual/reference/method/db.createCollection/
# https://www.mongodb.com/docs/manual/core/schema-validation/specify-json-schema/
# `clusteredIndex` only supports creation of an index on the `_id` field (as shown below), so it is optional
create_collection: |
{
create: "TEMPLATE(collection,keyvalue)",
clusteredIndex: {
key: { "_id": 1 },
unique: true,
name: "_id_idx"
},
writeConcern: { w: "majority" },
validator: {
$jsonSchema: {
bsonType: "object",
title: "Key/Value collection schema validation",
required: [ "key" ],
properties: {
key: {
bsonType: "string",
description: "'key' must be a string and is required"
},
value: {
bsonType: "array",
description: "'value' must be an array of numbers of BSON double type and is optional but, recommended"
}
}
}
},
validationLevel: "strict",
validationAction: "error",
comment: "keyvalue collection creation with strict types and a required 'key' field."
}
create_key_index: |
{
createIndexes: "TEMPLATE(collection,keyvalue)",
indexes: [
{
key: {
key: 1,
},
name: "kv_key_idx",
unique: true
}
],
writeConcern: { w: "majority" },
comment: "'key' index creation for keyvalue collection. Values should be unique.",
commitQuorum: "majority"
}
create_vector_search_index: |
{
createSearchIndexes: "TEMPLATE(collection,keyvalue)",
indexes: [
{
name: "kv_value_vector_search_idx",
definition: {
mappings: {
dynamic: true,
fields: {
value: {
type: "knnVector",
dimensions: TEMPLATE(dimensions,1536),
similarity: "TEMPLATE(similarity_function,cosine)"
}
}
}
}
}
]
}
rampup:
ops:
rampup-insert: |
{
insert: "TEMPLATE(collection,keyvalue)",
documents: [
{
key: "{seq_key}",
value: {train_floatlist}
}
],
comment: "Insert documents into keyvalue collection."
}
main-read:
params:
ratio: TEMPLATE(read_ratio,5)
ops:
main_select:
op: |
{
"aggregate": "TEMPLATE(collection,keyvalue)",
"cursor" : {
"batchSize": TEMPLATE(top_k,100)
},
"pipeline": [
{
"$vectorSearch": {
"index": "kv_value_vector_search_idx",
"path": "value",
"queryVector": {test_floatlist},
"numCandidates": TEMPLATE(num_candidates,100),
"limit": TEMPLATE(top_k,100)
}
},
{
"$project": {
"_id": 0,
"key": 1,
"value": 1,
"score": { "$meta": "vectorSearchScore" }
}
}
],
"readConcern": { "level": "local" },
"comment": "Find the results for the given 'value' vector search embedding."
}
verifier-imports:
- io.nosqlbench.adapter.mongodb.MongoDbUtils
verifier-init: |
k=TEMPLATE(top_k,100)
relevancy=scriptingmetrics.newRelevancyMeasures(_parsed_op);
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.recall("recall",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.precision("precision",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.F1("F1",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.reciprocal_rank("RR",k));
relevancy.addFunction(io.nosqlbench.engine.extensions.computefunctions.RelevancyFunctions.average_precision("AP",k));
verifier: |
actual_indices=MongoDbUtils.getFieldFromResults("key",result);
relevancy.accept({relevant_indices},actual_indices);
return true;
main-write:
params:
ratio: TEMPLATE(write_ratio,5)
ops:
main-insert: |
{
insert: "TEMPLATE(collection,keyvalue)",
documents: [
{
key: "{rw_key}",
value: {train_floatlist}
}
],
writeConcern: { w: "majority" },
comment: "Insert documents into keyvalue collection."
}
drop-entire-collection:
ops:
drop-vsearch-index: |
{
dropSearchIndex: "TEMPLATE(collection,keyvalue)",
name: "kv_value_vector_search_idx"
}
drop-collection: |
{
drop: "TEMPLATE(collection,keyvalue)",
comment: "Drop keyvalue collection to start afresh."
}
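
For comparison, the same two-stage pipeline that main_select sends as a raw aggregate command can also be expressed through the driver's aggregate() helper. A minimal sketch, assuming the workload's default database, collection, index name, and top_k/num_candidates values:

import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoCollection;
import org.bson.Document;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class VectorSearchSketch {
    // Runs the same two-stage pipeline as main_select: $vectorSearch then $project.
    static List<Document> topK(MongoClient client, List<Double> queryVector) {
        MongoCollection<Document> coll = client
            .getDatabase("baselines")      // assumed: workload default database
            .getCollection("keyvalue");    // assumed: TEMPLATE(collection,keyvalue) default
        List<Document> pipeline = Arrays.asList(
            new Document("$vectorSearch", new Document("index", "kv_value_vector_search_idx")
                .append("path", "value")
                .append("queryVector", queryVector)
                .append("numCandidates", 100)   // TEMPLATE(num_candidates,100)
                .append("limit", 100)),         // TEMPLATE(top_k,100)
            new Document("$project", new Document("_id", 0)
                .append("key", 1)
                .append("value", 1)
                .append("score", new Document("$meta", "vectorSearchScore"))));
        return coll.aggregate(pipeline).into(new ArrayList<>());
    }
}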