add support for ivec and fvec formats

This commit is contained in:
Jonathan Shook 2023-10-26 14:00:14 -05:00
parent a54d51ee62
commit f68bc37dc7
4 changed files with 319 additions and 0 deletions

View File

@ -0,0 +1,89 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import io.nosqlbench.api.content.Content;
import io.nosqlbench.api.content.NBIO;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.function.LongFunction;
/**
 * Reads fvec files with random access, using the input to specify the record number.
 *
 * <p>Each record consists of a 4-byte little-endian dimension count followed by that many
 * 4-byte little-endian floats. All records are required to have the same dimension count as
 * the first record, so that the byte offset of a record can be computed from its index.</p>
 */
public class FVecReader implements LongFunction<float[]> {

    private final MappedByteBuffer bb;
    private final int dimensions;
    private final int reclen;
    private final long filesize;
    private final Path path;
    private final int reclim;

    /**
     * Read the fvec file, determining the record size from the first record and using all records.
     *
     * @param pathname The location of the fvec file
     */
    public FVecReader(String pathname) {
        this(pathname, 0, 0);
    }

    /**
     * Read the fvec file, optionally asserting its dimensions and limiting the records used.
     *
     * @param pathname           The location of the fvec file
     * @param expectedDimensions The dimensions each record must have, or 0 to accept whatever the file declares
     * @param recordLimit        The number of leading records to cycle over, or 0 to use all records in the file
     * @throws RuntimeException if the file can not be read or mapped, if its size is not a whole number of
     *                          records, if the dimensions do not match, or if the record limit is out of range
     */
    public FVecReader(String pathname, int expectedDimensions, int recordLimit) {
        Content<?> src = NBIO.fs().search(pathname).one();
        this.path = src.asPath();
        // The mapping stays valid after the channel is closed, so the channel is released right away.
        try (FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ)) {
            this.filesize = channel.size();
            if (filesize < Integer.BYTES) {
                throw new RuntimeException("The file '" + pathname + "' is too small (" + filesize + " bytes) to contain a record header");
            }
            if (filesize > Integer.MAX_VALUE) {
                throw new RuntimeException("The file '" + pathname + "' is too large (" + filesize + " bytes) to be mapped as a single buffer");
            }
            this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // The mapped buffer is left in its default big-endian order, so the little-endian header is byte-swapped.
        this.dimensions = Integer.reverseBytes(bb.getInt(0));
        if (dimensions <= 0) {
            throw new RuntimeException("Invalid dimensions found in first record of '" + pathname + "': " + dimensions);
        }
        if (expectedDimensions > 0 && expectedDimensions != dimensions) {
            throw new RuntimeException("Invalid dimensions specified for '" + pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
        }

        // Computed as a long so that a corrupt, very large dimension count can not overflow.
        long recordBytes = Integer.BYTES + ((long) dimensions * Float.BYTES);
        if ((filesize % recordBytes) != 0) {
            throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + recordBytes + ")");
        }
        // filesize fits in an int and is a non-zero multiple of recordBytes, so this cast is lossless.
        this.reclen = (int) recordBytes;

        int totalRecords = (int) (filesize / reclen);
        if (recordLimit < 0) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but it must not be negative");
        }
        if (recordLimit > totalRecords) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
        }
        this.reclim = recordLimit == 0 ? totalRecords : recordLimit;
    }

    /**
     * Return the vector stored in record {@code value mod recordLimit}.
     *
     * @param value The input value; it is wrapped into the range of usable records
     * @return a newly allocated array holding the floats of the selected record
     * @throws RuntimeException if the selected record declares a different dimension count than the first record
     */
    @Override
    public float[] apply(long value) {
        // floorMod keeps the index non-negative for negative inputs.
        int recordIdx = Math.floorMod(value, reclim);
        // recordIdx < total records, so this product is below filesize and can not overflow.
        int recpos = recordIdx * reclen;

        // Absolute bulk read into a private array: the shared buffer's position and byte order are untouched.
        byte[] buf = new byte[reclen];
        this.bb.get(recpos, buf);
        ByteBuffer record = ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN);

        int recdim = record.getInt();
        if (recdim != dimensions) {
            throw new RuntimeException("dimensions are not uniform for fvec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
        }

        float[] data = new float[dimensions];
        record.asFloatBuffer().get(data);
        return data;
    }
}

View File

@ -0,0 +1,101 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import io.nosqlbench.api.config.standard.ConfigModel;
import io.nosqlbench.api.content.Content;
import io.nosqlbench.api.content.NBIO;
import io.nosqlbench.virtdata.api.annotations.Example;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.function.LongFunction;
import java.util.function.LongToIntFunction;
/**
 * Reads ivec files with random access, using the input to specify the record number.
 * This is used for testing with generated KNN test data which is uniform in dimensions and neighborhood size.
 * While it is possible to specify different dimensioned vectors per record, this is not supported, since this
 * function honors the pure-function behavior of other NB binding functions. This requires uniform record structure for random access.
 *
 * <p>Each record consists of a 4-byte little-endian dimension count followed by that many
 * 4-byte little-endian ints.</p>
 */
public class IVecReader implements LongFunction<int[]> {

    private final MappedByteBuffer bb;
    private final int dimensions;
    private final int reclen;
    private final long filesize;
    private final Path path;
    private final int reclim;

    /**
     * Read the ivec file, determining the record size from the first record.
     * @param pathname The location of the ivec file
     */
    @Example({"IVecReader('testfile.ivec')","Create a reader for int vectors, detecting the dimensions and dataset size automatically."})
    public IVecReader(String pathname) {
        this(pathname, 0, 0);
    }

    /**
     * Read the ivec file, optionally asserting its dimensions and limiting the records used.
     *
     * @param pathname           The location of the ivec file
     * @param expectedDimensions The dimensions each record must have, or 0 to accept whatever the file declares
     * @param recordLimit        The number of leading records to cycle over, or 0 to use all records in the file
     * @throws RuntimeException if the file can not be read or mapped, if its size is not a whole number of
     *                          records, if the dimensions do not match, or if the record limit is out of range
     */
    @Example({"IVecReader('testfile.ivec', 46, 12)","Create a reader for int vectors, asserting 46 dimensions and limit total records to 12."})
    public IVecReader(String pathname, int expectedDimensions, int recordLimit) {
        Content<?> src = NBIO.fs().search(pathname).one();
        this.path = src.asPath();
        // The mapping stays valid after the channel is closed, so the channel is released right away.
        try (FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ)) {
            this.filesize = channel.size();
            if (filesize < Integer.BYTES) {
                throw new RuntimeException("The file '" + pathname + "' is too small (" + filesize + " bytes) to contain a record header");
            }
            if (filesize > Integer.MAX_VALUE) {
                throw new RuntimeException("The file '" + pathname + "' is too large (" + filesize + " bytes) to be mapped as a single buffer");
            }
            this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // The mapped buffer is left in its default big-endian order, so the little-endian header is byte-swapped.
        this.dimensions = Integer.reverseBytes(bb.getInt(0));
        if (dimensions <= 0) {
            throw new RuntimeException("Invalid dimensions found in first record of '" + pathname + "': " + dimensions);
        }
        if (expectedDimensions > 0 && expectedDimensions != dimensions) {
            throw new RuntimeException("Invalid dimensions specified for '" + pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
        }

        // Computed as a long so that a corrupt, very large dimension count can not overflow.
        long recordBytes = Integer.BYTES + ((long) dimensions * Integer.BYTES);
        if ((filesize % recordBytes) != 0) {
            throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + recordBytes + ")");
        }
        // filesize fits in an int and is a non-zero multiple of recordBytes, so this cast is lossless.
        this.reclen = (int) recordBytes;

        int totalRecords = (int) (filesize / reclen);
        if (recordLimit < 0) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but it must not be negative");
        }
        if (recordLimit > totalRecords) {
            throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
        }
        this.reclim = recordLimit == 0 ? totalRecords : recordLimit;
    }

    /**
     * Return the int vector stored in record {@code value mod recordLimit}.
     *
     * @param value The input value; it is wrapped into the range of usable records
     * @return a newly allocated array holding the ints of the selected record
     * @throws RuntimeException if the selected record declares a different dimension count than the first record
     */
    @Override
    public int[] apply(long value) {
        // floorMod keeps the index non-negative for negative inputs.
        int recordIdx = Math.floorMod(value, reclim);
        // recordIdx < total records, so this product is below filesize and can not overflow.
        int recpos = recordIdx * reclen;

        // Absolute bulk read into a private array: the shared buffer's position is untouched.
        byte[] buf = new byte[reclen];
        this.bb.get(recpos, buf);
        // The wrapper is big-endian by default, so each little-endian value is byte-swapped on read.
        ByteBuffer record = ByteBuffer.wrap(buf);

        int recdim = Integer.reverseBytes(record.getInt());
        if (recdim != dimensions) {
            throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
        }

        int[] data = new int[dimensions];
        for (int i = 0; i < dimensions; i++) {
            data[i] = Integer.reverseBytes(record.getInt());
        }
        return data;
    }
}

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.HashSet;
/**
 * Whole-file readers for the fvec and ivec vector formats.
 *
 * <p>Both formats are a sequence of records, each made of a 4-byte little-endian element count
 * followed by that many 4-byte little-endian values (floats for fvec, ints for ivec).</p>
 */
public class IvecFvecMethods {

    /**
     * Read every record of an fvec file into memory.
     *
     * @param filePath the location of the fvec file
     * @return one float array per record, in file order
     * @throws IOException if the file can not be read, if a record is truncated, or if a record
     *                     declares a dimension that is not positive or is too large to allocate
     */
    public static ArrayList<float[]> readFvecs(String filePath) throws IOException {
        var vectors = new ArrayList<float[]>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
            var header = new byte[Integer.BYTES];
            while (readHeader(dis, header)) {
                var dimension = decodeCount(header);
                // Checked explicitly instead of with assert, which is a no-op unless -ea is set.
                if (dimension <= 0 || dimension > Integer.MAX_VALUE / Float.BYTES) {
                    throw new IOException("Invalid dimension " + dimension + " at record " + vectors.size() + " of fvec file '" + filePath + "'");
                }
                var buffer = new byte[dimension * Float.BYTES];
                dis.readFully(buffer);
                var vector = new float[dimension];
                ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(vector);
                vectors.add(vector);
            }
        }
        return vectors;
    }

    /**
     * Read every record of an ivec file into memory as a set of neighbor indices.
     *
     * @param filename the location of the ivec file
     * @return one set of ints per record, in file order
     * @throws RuntimeException if the file can not be read, if a record is truncated, or if a
     *                          record declares a negative neighbor count; the cause is the underlying IOException
     */
    public static ArrayList<HashSet<Integer>> readIvecs(String filename) {
        var groundTruthTopK = new ArrayList<HashSet<Integer>>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) {
            var header = new byte[Integer.BYTES];
            while (readHeader(dis, header)) {
                var numNeighbors = decodeCount(header);
                if (numNeighbors < 0) {
                    throw new IOException("Invalid neighbor count " + numNeighbors + " at record " + groundTruthTopK.size() + " of ivec file '" + filename + "'");
                }
                var neighbors = new HashSet<Integer>(numNeighbors);
                for (var i = 0; i < numNeighbors; i++) {
                    var neighbor = Integer.reverseBytes(dis.readInt());
                    neighbors.add(neighbor);
                }
                groundTruthTopK.add(neighbors);
            }
        } catch (IOException e) {
            // A partially read ground-truth set would silently skew results, so the failure is surfaced.
            throw new RuntimeException("Unable to read ivec file '" + filename + "'", e);
        }
        return groundTruthTopK;
    }

    /**
     * Fill {@code header} with the next record header, distinguishing a clean end of file from a truncated one.
     *
     * @return true if a full header was read, false if the stream ended exactly on a record boundary
     * @throws IOException if the stream ends part way through a header, or on any read failure
     */
    private static boolean readHeader(DataInputStream in, byte[] header) throws IOException {
        int read = in.readNBytes(header, 0, header.length);
        if (read == 0) {
            return false;
        }
        if (read < header.length) {
            throw new IOException("Truncated record header: expected " + header.length + " bytes but found " + read);
        }
        return true;
    }

    /** Decode the little-endian element count held in a record header. */
    private static int decodeCount(byte[] header) {
        return ByteBuffer.wrap(header).order(ByteOrder.LITTLE_ENDIAN).getInt();
    }
}

View File

@ -0,0 +1,58 @@
/*
* Copyright (c) 2023 nosqlbench
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.nosqlbench.virtdata.library.ivecfvec;
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Exercises {@link IVecReader} and {@link FVecReader} against sample files under src/test/resources/ivecfvec.
 */
class IVecReaderTest {

    /**
     * Reads the first ten records of the sample ivec file and checks that every value is within [0, 10000].
     */
    @Test
    public void testReadIvec() {
        IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
        for (int i = 0; i < 10; i++) {
            // Use the loop index so that ten different records are checked, not record 0 ten times.
            int[] indices = ir.apply(i);
            for (int j = 0; j < indices.length; j++) {
                assertThat(indices[j]).isGreaterThanOrEqualTo(0);
                assertThat(indices[j]).isLessThanOrEqualTo(10000);
            }
        }
    }

    /**
     * Reads the first ten records of the sample distances file and checks that each record is in non-decreasing order.
     */
    @Test
    public void testReadFvec() {
        FVecReader fr = new FVecReader("src/test/resources/ivecfvec/test_ada_002_10000_distances_count.fvec");
        for (int i = 0; i < 10; i++) {
            float[] dist = fr.apply(i);
            for (int j = 1; j < dist.length; j++) {
                assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j - 1]);
            }
        }
    }

    /**
     * Opens the sample base-vector file while asserting 1536 dimensions, and checks the length of the first vector.
     */
    @Test
    public void testReadFvecSpecificDims() {
        FVecReader fr = new FVecReader(
            "src/test/resources/ivecfvec/test_ada_002_10000_base_vectors.fvec",
            1536, 0);
        float[] vec0 = fr.apply(0);
        assertThat(vec0.length).isEqualTo(1536);
    }
}