diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java new file mode 100644 index 000000000..45ff82e48 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.ivecfvec; + +import io.nosqlbench.api.content.Content; +import io.nosqlbench.api.content.NBIO; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.function.LongFunction; + +/** + * Reads fvec files with random access, using the input to specify the record number. 
+ */ +public class FVecReader implements LongFunction { + + private final MappedByteBuffer bb; + private final int dimensions; + private final int reclen; + private final long filesize; + private final Path path; + private final int reclim; + + public FVecReader(String pathname) { + this(pathname,0,0); + } + public FVecReader(String pathname, int expectedDimensions, int recordLimit) { + Content src = NBIO.fs().search(pathname).one(); + this.path = src.asPath(); + try { + FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE); + this.filesize = channel.size(); + this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize); + } catch (IOException e) { + throw new RuntimeException(e); + } + this.dimensions = Integer.reverseBytes(bb.getInt(0)); + if(expectedDimensions>0 && expectedDimensions!=dimensions) { + throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions); + } + int datalen = (dimensions * Float.BYTES); + this.reclen = Integer.BYTES + datalen; + int totalRecords = (int) (filesize/reclen); + if (recordLimit > totalRecords) { + throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total"); + } + this.reclim = recordLimit==0? 
totalRecords : recordLimit; + if ((filesize % reclen)!=0) { + throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")"); + } + } + + @Override + public float[] apply(long value) { + int recordIdx = (int) (value % reclim); + long offset = value * recordIdx; + int recpos = (int) (offset %filesize) ; + byte[] buf = new byte[reclen]; + ByteBuffer record = this.bb.get(recpos,buf).order(ByteOrder.LITTLE_ENDIAN); + int recdim = record.getInt(); + if(recdim!=dimensions) { + throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value); + } + float[] data = new float[recdim]; + for (int i = 0; i < dimensions; i++) { + data[i]=bb.getFloat(); + } + return data; + } +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java new file mode 100644 index 000000000..fb1069871 --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.nosqlbench.virtdata.library.ivecfvec; + +import io.nosqlbench.api.config.standard.ConfigModel; +import io.nosqlbench.api.content.Content; +import io.nosqlbench.api.content.NBIO; +import io.nosqlbench.virtdata.api.annotations.Example; + +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.function.LongFunction; +import java.util.function.LongToIntFunction; + +/** + * Reads ivec files with random access, using the input to specify the record number. + * This is used for testing with generated KNN test data which is uniform in dimensions and neighborhood size. + * While it is possible to specify different dimensioned vectors per record, this is not supported, since this + * function honors the pure-function behavior of other NB binding functions. This requires uniform record structure for random access. + */ +public class IVecReader implements LongFunction { + + private final MappedByteBuffer bb; + private final int dimensions; + private final int reclen; + private final long filesize; + private final Path path; + private final int reclim; + + /** + * Read the ivec file, determining the record size from the first record. 
+ * @param pathname The location of the ivec file + */ + @Example({"IvecReader('testfile.ivec')","Create a reader for int vectors, detecting the dimensions and dataset size automatically."}) + public IVecReader(String pathname) { + this(pathname,0,0); + } + @Example({"IvecReader('testfile.ivec', 46, 12)","Create a reader for int vectors, asserting 46 dimensions and limit total records to 12."}) + public IVecReader(String pathname, int expectedDimensions, int recordLimit) { + Content src = NBIO.fs().search(pathname).one(); + this.path = src.asPath(); + try { + FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE); + this.filesize = channel.size(); + this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize); + } catch (IOException e) { + throw new RuntimeException(e); + } + this.dimensions = Integer.reverseBytes(bb.getInt(0)); + if(expectedDimensions>0 && expectedDimensions!=dimensions) { + throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions); + } + int datalen = (dimensions * Integer.BYTES); + this.reclen = Integer.BYTES + datalen; + int totalRecords = (int) (filesize/reclen); + if (recordLimit > totalRecords) { + throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total"); + } + this.reclim = recordLimit==0? 
totalRecords : recordLimit; + if ((filesize % reclen)!=0) { + throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")"); + } + } + + @Override + public int[] apply(long value) { + int recordIdx = (int) (value % reclim); + long offset = value * recordIdx; + int recpos = (int) (offset %filesize) ; + byte[] buf = new byte[reclen]; + ByteBuffer record = this.bb.get(recpos,buf); + int recdim = Integer.reverseBytes(record.getInt()); + if(recdim!=dimensions) { + throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value); + } + int[] data = new int[recdim]; + for (int i = 0; i < dimensions; i++) { + data[i]=Integer.reverseBytes(bb.getInt()); + } + return data; + } +} diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IvecFvecMethods.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IvecFvecMethods.java new file mode 100644 index 000000000..552c24c2b --- /dev/null +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IvecFvecMethods.java @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023 nosqlbench + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Static helpers that load a whole fvec or ivec file into memory.
 *
 * <p>Both formats are a plain sequence of records, each consisting of a little-endian 32-bit
 * element count followed by that many little-endian 32-bit values (floats for fvec, ints for
 * ivec). Unlike the random-access readers, records here may differ in length.
 */
public class IvecFvecMethods {

    /**
     * Reads every vector of an fvec file, in file order.
     *
     * @param filePath The location of the fvec file
     * @return one {@code float[]} per record
     * @throws IOException if the file cannot be read, ends in the middle of a record, or
     *                     contains a record with a non-positive or oversized dimension
     */
    public static ArrayList<float[]> readFvecs(String filePath) throws IOException {
        var vectors = new ArrayList<float[]>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
            while (dis.available() > 0) {
                // DataInputStream reads big-endian; the file is little-endian.
                var dimension = Integer.reverseBytes(dis.readInt());
                if (dimension <= 0 || dimension > Integer.MAX_VALUE / Float.BYTES) {
                    throw new IOException("Invalid vector dimension " + dimension + " in '" + filePath + "'");
                }
                var buffer = new byte[dimension * Float.BYTES];
                dis.readFully(buffer);

                var vector = new float[dimension];
                ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(vector);
                vectors.add(vector);
            }
        }
        return vectors;
    }

    /**
     * Reads every record of an ivec file, in file order, collecting each record's values into a set.
     *
     * <p>Because the values of a record are stored in a {@link HashSet}, their order within the
     * record and any duplicates are not preserved.
     *
     * @param filename The location of the ivec file
     * @return one set of ints per record
     * @throws java.io.UncheckedIOException if the file cannot be read, ends in the middle of a
     *                                      record, or contains a negative element count; a partial
     *                                      result is never returned
     */
    public static ArrayList<HashSet<Integer>> readIvecs(String filename) {
        var groundTruthTopK = new ArrayList<HashSet<Integer>>();

        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) {
            while (dis.available() > 0) {
                var numNeighbors = Integer.reverseBytes(dis.readInt());
                if (numNeighbors < 0) {
                    throw new IOException("Invalid neighbor count " + numNeighbors + " in '" + filename + "'");
                }
                var neighbors = new HashSet<Integer>(numNeighbors);

                for (var i = 0; i < numNeighbors; i++) {
                    neighbors.add(Integer.reverseBytes(dis.readInt()));
                }

                groundTruthTopK.add(neighbors);
            }
        } catch (IOException e) {
            // Failing loudly beats handing back silently truncated data.
            throw new java.io.UncheckedIOException("Unable to read ivec file '" + filename + "'", e);
        }

        return groundTruthTopK;
    }

}
Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.nosqlbench.virtdata.library.ivecfvec; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.jupiter.api.Assertions.*; + +class IVecReaderTest { + + @Test + public void testReadIvec() { + IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec"); + for (int i = 0; i < 10; i++) { + int[] indices = ir.apply(0); + for (int j = 0; j < indices.length; j++) { + assertThat(indices[j]).isGreaterThanOrEqualTo(0); + assertThat(indices[j]).isLessThanOrEqualTo(10000); + } + } + } + + @Test + public void testReadFvec() { + FVecReader ir = new FVecReader("src/test/resources/ivecfvec/test_ada_002_10000_distances_count.fvec"); + for (int i = 0; i < 10; i++) { + float[] dist = ir.apply(i); + for (int j = 1; j < dist.length; j++) { + assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j-1]); + } + } + } + + @Test + public void testReadFvecSpecificDims() { + FVecReader ir = new FVecReader( + "src/test/resources/ivecfvec/test_ada_002_10000_base_vectors.fvec", + 1536,0); + float[] vec0 = ir.apply(0); + assertThat(vec0.length).isEqualTo(1536); + } + +}