From e99196c36e9a82aad2d417b348b423e5e10a465a Mon Sep 17 00:00:00 2001 From: Jonathan Shook Date: Thu, 26 Oct 2023 20:20:37 -0500 Subject: [PATCH] fix readers --- .../baselinesv2/cql_vector2_fvec.yaml | 11 +++++----- .../virtdata/library/ivecfvec/FVecReader.java | 20 +++++++++---------- .../virtdata/library/ivecfvec/IVecReader.java | 9 +++------ .../library/ivecfvec/IVecReaderTest.java | 9 ++++++++- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/adapter-cqld4/src/main/resources/activities/baselinesv2/cql_vector2_fvec.yaml b/adapter-cqld4/src/main/resources/activities/baselinesv2/cql_vector2_fvec.yaml index 4d3f8b00a..e09963c11 100644 --- a/adapter-cqld4/src/main/resources/activities/baselinesv2/cql_vector2_fvec.yaml +++ b/adapter-cqld4/src/main/resources/activities/baselinesv2/cql_vector2_fvec.yaml @@ -53,11 +53,12 @@ bindings: relevant_indices_hdf5: HdfFileToIntArray("testdata/TEMPLATE(datafile).hdf5", "/neighbors") distance_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/distance") train_floatlist_hdf5: HdfFileToFloatList("testdata/TEMPLATE(datafile).hdf5", "/train"); ToCqlVector(); - # filetype=vecs for TEMPLATE(filetype,vecs) - test_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec"); ToCqlVector(); - relevant_indices_vecs: IVecReader("testdata/TEMPLATE(datafile).ivec"); - distance_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec"); - train_floatlist_vecs: FVecReader("testdata/TEMPLATE(datafile).fvec"); ToCqlVector(); + # filetype=fvec for TEMPLATE(filetype,fvec) + test_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_query_vectors.fvec"); ToCqlVector(); + relevant_indices_fvec: IVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_indices_query.ivec"); + distance_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(testsize)_distances_count.fvec",TEMPLATE(dimensions),0); + train_floatlist_fvec: FVecReader("testdata/TEMPLATE(datafile)_TEMPLATE(trainsize)_base_vectors.fvec",TEMPLATE(dimensions),0); ToCqlVector(); + # synthetic synthetic_vectors: HashedFloatVectors(TEMPLATE(dimensions)); blocks: diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java index c17d93d11..0fa26fb14 100644 --- a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/FVecReader.java @@ -25,6 +25,7 @@ import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.FloatBuffer; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Path; @@ -77,18 +78,17 @@ public class FVecReader implements LongFunction { @Override public float[] apply(long value) { int recordIdx = (int) (value % reclim); - long offset = value * recordIdx; - int recpos = (int) (offset %filesize) ; - byte[] buf = new byte[reclen]; - ByteBuffer record = this.bb.get(recpos,buf).order(ByteOrder.LITTLE_ENDIAN); - int recdim = record.getInt(); + int recpos = recordIdx*reclen; + int recdim = Integer.reverseBytes(bb.getInt(recpos)); if(recdim!=dimensions) { throw new RuntimeException("dimensions are not uniform for fvec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value); } - float[] data = new float[recdim]; - for (int i = 0; i < dimensions; i++) { - data[i]=record.getFloat(); - } - return data; + var vbuf = new byte[dimensions*Float.BYTES]; + bb.get(recpos + Integer.BYTES, vbuf); + + FloatBuffer fbuf=ByteBuffer.wrap(vbuf).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer(); + var vectors = new float[dimensions]; + fbuf.get(vectors); + return vectors; } } diff --git a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java index 6e912c55e..6a46f779c 100644 --- a/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java +++ b/virtdata-lib-hdf5/src/main/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReader.java @@ -16,7 +16,6 @@ package io.nosqlbench.virtdata.library.ivecfvec; -import io.nosqlbench.api.config.standard.ConfigModel; import io.nosqlbench.api.content.Content; import io.nosqlbench.api.content.NBIO; import io.nosqlbench.virtdata.api.annotations.Categories; @@ -25,14 +24,12 @@ import io.nosqlbench.virtdata.api.annotations.Example; import io.nosqlbench.virtdata.api.annotations.ThreadSafeMapper; import java.io.IOException; -import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Path; import java.nio.file.StandardOpenOption; import java.util.function.LongFunction; -import java.util.function.LongToIntFunction; /** * Reads ivec files with random access, using the input to specify the record number. @@ -89,10 +86,10 @@ public class IVecReader implements LongFunction { @Override public int[] apply(long value) { int recordIdx = (int) (value % reclim); - long offset = value * recordIdx; - int recpos = (int) (offset %filesize) ; + int recpos = recordIdx*reclen; byte[] buf = new byte[reclen]; - ByteBuffer record = this.bb.get(recpos,buf); + this.bb.get(recpos,buf); + ByteBuffer record = ByteBuffer.wrap(buf); int recdim = Integer.reverseBytes(record.getInt()); if(recdim!=dimensions) { throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value); diff --git a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReaderTest.java b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReaderTest.java index 17d66531c..9c8bf645a 100644 --- a/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReaderTest.java +++ b/virtdata-lib-hdf5/src/test/java/io/nosqlbench/virtdata/library/ivecfvec/IVecReaderTest.java @@ -18,6 +18,9 @@ package io.nosqlbench.virtdata.library.ivecfvec; import org.junit.jupiter.api.Test; +import java.util.ArrayList; +import java.util.HashSet; + import static org.assertj.core.api.Assertions.assertThat; import static org.junit.jupiter.api.Assertions.*; @@ -25,9 +28,13 @@ class IVecReaderTest { @Test public void testReadIvec() { + + ArrayList> idx_ref = IvecFvecMethods.readIvecs("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec"); + IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec"); for (int i = 0; i < 10; i++) { int[] indices = ir.apply(0); + HashSet ref = idx_ref.get(0); for (int j = 0; j < indices.length; j++) { assertThat(indices[j]).isGreaterThanOrEqualTo(0); assertThat(indices[j]).isLessThanOrEqualTo(10000); @@ -41,7 +48,7 @@ class IVecReaderTest { for (int i = 0; i < 10; i++) { float[] dist = ir.apply(i); for (int j = 1; j < dist.length; j++) { - assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j-1]); + assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j-1]).describedAs("dist[" + j +"]=(" +dist[j]+") dist[j-1]=(" + dist[j-1] + ")"); } } }