From 43703e16c1082e20a473be86fa3564e1f14fd803 Mon Sep 17 00:00:00 2001
From: Mark Wolters
Date: Mon, 27 Jan 2025 11:06:21 -0500
Subject: [PATCH] add script for hdf5 generation

---
 .../resources/generatePredicateDataset.py | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 nb-virtdata/virtdata-lib-hdf5/src/test/resources/generatePredicateDataset.py

diff --git a/nb-virtdata/virtdata-lib-hdf5/src/test/resources/generatePredicateDataset.py b/nb-virtdata/virtdata-lib-hdf5/src/test/resources/generatePredicateDataset.py
new file mode 100644
index 000000000..48d0e9a3f
--- /dev/null
+++ b/nb-virtdata/virtdata-lib-hdf5/src/test/resources/generatePredicateDataset.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2025 nosqlbench
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+import h5py
+
+def generate_knn_dataset(n, p, x, output_file):
+    # Each id covers 100 contiguous vectors and each test query samples up
+    # to 5 distinct ids, so n must be a multiple of 100 and at least 500.
+    if n % 100 != 0 or n < 500:
+        raise ValueError("n must be a multiple of 100 and at least 500")
+    num_ids_total = n // 100
+
+    # Step 1: Generate 'train' data (n vectors of size p) with associated ids
+    train_data = np.random.rand(n, p).astype(np.float32)
+    train_ids = np.repeat(np.arange(1, num_ids_total + 1), 100)  # Assign 100 contiguous vectors per id
+
+    # Step 2: Generate 'test' data (x vectors of size p) with associated ids
+    test_data = np.random.rand(x, p).astype(np.float32)
+    test_ids = []
+    for _ in range(x):
+        num_ids = np.random.randint(1, 6)  # Each test query is associated with 1 to 5 training ids
+        associated_ids = np.random.choice(np.arange(1, num_ids_total + 1), size=num_ids, replace=False)
+        test_ids.append(associated_ids)
+
+    # Step 3: Compute KNN for 'test' data using 'train' data filtered by associated ids
+    neighbors_list = []
+    for i in range(x):
+        query_vector = test_data[i]
+        query_ids = test_ids[i]
+
+        # Filter train data down to the vectors whose ids match the query's ids
+        mask = np.isin(train_ids, query_ids)
+        filtered_train_data = train_data[mask]
+        global_indices = np.where(mask)[0]  # Global indices of the filtered train data
+
+        # Each id contributes 100 vectors, so the filtered set always holds
+        # at least the 100 neighbors requested here.
+        knn = NearestNeighbors(n_neighbors=100, algorithm='auto')
+        knn.fit(filtered_train_data)
+        neighbors = knn.kneighbors(query_vector.reshape(1, -1), return_distance=False)
+
+        # Map local indices within the filtered set back to global indices
+        global_neighbors = global_indices[neighbors[0]]
+        neighbors_list.append(global_neighbors)
+
+    # Step 4: Write data to HDF5 file
+    with h5py.File(output_file, 'w') as h5f:
+        h5f.create_dataset('train', data=train_data)
+        h5f.create_dataset('train_ids', data=train_ids)
+        h5f.create_dataset('test', data=test_data)
+
+        # 'test_ids' rows vary in length, so store them as a variable-length
+        # integer dataset and assign row by row; this stays correct even when
+        # all rows happen to have the same length, where converting through
+        # np.array(..., dtype=object) can misbehave.
+        vlen_int = h5py.vlen_dtype(np.int32)
+        test_ids_ds = h5f.create_dataset('test_ids', (x,), dtype=vlen_int)
+        for i, ids in enumerate(test_ids):
+            test_ids_ds[i] = ids.astype(np.int32)
+
+        h5f.create_dataset('neighbors', data=np.array(neighbors_list, dtype=np.int32))
+
+    print(f"Dataset saved to {output_file}")
+
+# Example usage
+if __name__ == "__main__":
+    n = int(input("Enter the number of train vectors (n): "))
+    p = int(input("Enter the dimensionality of each vector (p): "))
+    x = int(input("Enter the number of test vectors (x): "))
+    output_file = input("Enter the output HDF5 file name: ")
+
+    generate_knn_dataset(n, p, x, output_file)
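
Reviewer note: below is a minimal sketch of how a consumer could read the
generated file back and sanity-check the ground truth. It is illustrative
only; it assumes h5py is installed and that the script was run with the
output name "predicate_dataset.hdf5" (a hypothetical name chosen here, not
anything the script itself produces):

    import h5py
    import numpy as np

    with h5py.File("predicate_dataset.hdf5", "r") as h5f:
        train_ids = h5f["train_ids"][:]   # (n,) id of each train vector
        test = h5f["test"][:]             # (x, p) float32 query vectors
        test_ids = h5f["test_ids"][:]     # ragged: 1 to 5 ids per query
        neighbors = h5f["neighbors"][:]   # (x, 100) global train indices

        # Every ground-truth neighbor of a query must carry one of that
        # query's associated ids, since KNN ran on the filtered train set.
        for q in range(test.shape[0]):
            assert np.isin(train_ids[neighbors[q]], test_ids[q]).all()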