mirror of
https://github.com/nosqlbench/nosqlbench.git
synced 2025-02-25 18:55:28 -06:00
add support for ivec and fvec formats
This commit is contained in:
parent
a54d51ee62
commit
f68bc37dc7
@ -0,0 +1,89 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||||
|
|
||||||
|
import io.nosqlbench.api.content.Content;
|
||||||
|
import io.nosqlbench.api.content.NBIO;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.nio.MappedByteBuffer;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads ivec files with random access, using the input to specify the record number.
|
||||||
|
*/
|
||||||
|
public class FVecReader implements LongFunction<float[]> {
|
||||||
|
|
||||||
|
private final MappedByteBuffer bb;
|
||||||
|
private final int dimensions;
|
||||||
|
private final int reclen;
|
||||||
|
private final long filesize;
|
||||||
|
private final Path path;
|
||||||
|
private final int reclim;
|
||||||
|
|
||||||
|
public FVecReader(String pathname) {
|
||||||
|
this(pathname,0,0);
|
||||||
|
}
|
||||||
|
public FVecReader(String pathname, int expectedDimensions, int recordLimit) {
|
||||||
|
Content<?> src = NBIO.fs().search(pathname).one();
|
||||||
|
this.path = src.asPath();
|
||||||
|
try {
|
||||||
|
FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE);
|
||||||
|
this.filesize = channel.size();
|
||||||
|
this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
this.dimensions = Integer.reverseBytes(bb.getInt(0));
|
||||||
|
if(expectedDimensions>0 && expectedDimensions!=dimensions) {
|
||||||
|
throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
|
||||||
|
}
|
||||||
|
int datalen = (dimensions * Float.BYTES);
|
||||||
|
this.reclen = Integer.BYTES + datalen;
|
||||||
|
int totalRecords = (int) (filesize/reclen);
|
||||||
|
if (recordLimit > totalRecords) {
|
||||||
|
throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
|
||||||
|
}
|
||||||
|
this.reclim = recordLimit==0? totalRecords : recordLimit;
|
||||||
|
if ((filesize % reclen)!=0) {
|
||||||
|
throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] apply(long value) {
|
||||||
|
int recordIdx = (int) (value % reclim);
|
||||||
|
long offset = value * recordIdx;
|
||||||
|
int recpos = (int) (offset %filesize) ;
|
||||||
|
byte[] buf = new byte[reclen];
|
||||||
|
ByteBuffer record = this.bb.get(recpos,buf).order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
int recdim = record.getInt();
|
||||||
|
if(recdim!=dimensions) {
|
||||||
|
throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
|
||||||
|
}
|
||||||
|
float[] data = new float[recdim];
|
||||||
|
for (int i = 0; i < dimensions; i++) {
|
||||||
|
data[i]=bb.getFloat();
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,101 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||||
|
|
||||||
|
import io.nosqlbench.api.config.standard.ConfigModel;
|
||||||
|
import io.nosqlbench.api.content.Content;
|
||||||
|
import io.nosqlbench.api.content.NBIO;
|
||||||
|
import io.nosqlbench.virtdata.api.annotations.Example;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.RandomAccessFile;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.MappedByteBuffer;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.function.LongFunction;
|
||||||
|
import java.util.function.LongToIntFunction;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads ivec files with random access, using the input to specify the record number.
|
||||||
|
* This is used for testing with generated KNN test data which is uniform in dimensions and neighborhood size.
|
||||||
|
* While it is possible to specify different dimensioned vectors per record, this is not supported, since this
|
||||||
|
* function honors the pure-function behavior of other NB binding functions. This requires uniform record structure for random access.
|
||||||
|
*/
|
||||||
|
public class IVecReader implements LongFunction<int[]> {
|
||||||
|
|
||||||
|
private final MappedByteBuffer bb;
|
||||||
|
private final int dimensions;
|
||||||
|
private final int reclen;
|
||||||
|
private final long filesize;
|
||||||
|
private final Path path;
|
||||||
|
private final int reclim;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read the ivec file, determining the record size from the first record.
|
||||||
|
* @param pathname The location of the ivec file
|
||||||
|
*/
|
||||||
|
@Example({"IvecReader('testfile.ivec')","Create a reader for int vectors, detecting the dimensions and dataset size automatically."})
|
||||||
|
public IVecReader(String pathname) {
|
||||||
|
this(pathname,0,0);
|
||||||
|
}
|
||||||
|
@Example({"IvecReader('testfile.ivec', 46, 12)","Create a reader for int vectors, asserting 46 dimensions and limit total records to 12."})
|
||||||
|
public IVecReader(String pathname, int expectedDimensions, int recordLimit) {
|
||||||
|
Content<?> src = NBIO.fs().search(pathname).one();
|
||||||
|
this.path = src.asPath();
|
||||||
|
try {
|
||||||
|
FileChannel channel = FileChannel.open(this.path, StandardOpenOption.READ, StandardOpenOption.SPARSE);
|
||||||
|
this.filesize = channel.size();
|
||||||
|
this.bb = channel.map(FileChannel.MapMode.READ_ONLY, 0, filesize);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
this.dimensions = Integer.reverseBytes(bb.getInt(0));
|
||||||
|
if(expectedDimensions>0 && expectedDimensions!=dimensions) {
|
||||||
|
throw new RuntimeException("Invalid dimensions specified for '" +pathname + "', found " + dimensions + ", but expected " + expectedDimensions);
|
||||||
|
}
|
||||||
|
int datalen = (dimensions * Integer.BYTES);
|
||||||
|
this.reclen = Integer.BYTES + datalen;
|
||||||
|
int totalRecords = (int) (filesize/reclen);
|
||||||
|
if (recordLimit > totalRecords) {
|
||||||
|
throw new RuntimeException("Specified record range of " + recordLimit + ", but file only contained " + totalRecords + " total");
|
||||||
|
}
|
||||||
|
this.reclim = recordLimit==0? totalRecords : recordLimit;
|
||||||
|
if ((filesize % reclen)!=0) {
|
||||||
|
throw new RuntimeException("The filesize (" + filesize + ") for '" + pathname + "' must be a multiple of the reclen (" + reclen + ")");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int[] apply(long value) {
|
||||||
|
int recordIdx = (int) (value % reclim);
|
||||||
|
long offset = value * recordIdx;
|
||||||
|
int recpos = (int) (offset %filesize) ;
|
||||||
|
byte[] buf = new byte[reclen];
|
||||||
|
ByteBuffer record = this.bb.get(recpos,buf);
|
||||||
|
int recdim = Integer.reverseBytes(record.getInt());
|
||||||
|
if(recdim!=dimensions) {
|
||||||
|
throw new RuntimeException("dimensions are not uniform for ivec file '" + this.path.toString() + "', found dim " + recdim + " at record " + value);
|
||||||
|
}
|
||||||
|
int[] data = new int[recdim];
|
||||||
|
for (int i = 0; i < dimensions; i++) {
|
||||||
|
data[i]=Integer.reverseBytes(bb.getInt());
|
||||||
|
}
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,71 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||||
|
|
||||||
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
/**
 * Static helpers which load whole fvec and ivec files into memory.
 * Both formats are read as a sequence of records, each a little-endian int count
 * followed by that many little-endian 4-byte values.
 */
public class IvecFvecMethods {

    /**
     * Read every vector from an fvec file.
     *
     * @param filePath the location of the fvec file
     * @return the vectors, in file order
     * @throws IOException if the file cannot be read, ends in the middle of a record,
     *                     or contains a record with a non-positive dimension
     */
    public static ArrayList<float[]> readFvecs(String filePath) throws IOException {
        var vectors = new ArrayList<float[]>();
        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filePath)))) {
            while (dis.available() > 0) {
                // DataInputStream reads big-endian, so the little-endian count is byte-swapped.
                var dimension = Integer.reverseBytes(dis.readInt());
                // Checked explicitly rather than with assert, which is skipped unless -ea is set.
                if (dimension <= 0 || dimension > Integer.MAX_VALUE / Float.BYTES) {
                    throw new IOException("Invalid dimension " + dimension + " for record " + vectors.size() + " in '" + filePath + "'");
                }
                var buffer = new byte[dimension * Float.BYTES];
                dis.readFully(buffer);

                var vector = new float[dimension];
                ByteBuffer.wrap(buffer).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer().get(vector);
                vectors.add(vector);
            }
        }
        return vectors;
    }

    /**
     * Read every record from an ivec file as a set of neighbor indices.
     *
     * @param filename the location of the ivec file
     * @return one set per record, in file order; duplicate values within a record are collapsed
     * @throws java.io.UncheckedIOException if the file cannot be read, ends in the middle of a record,
     *                                      or contains a record with a negative count
     */
    public static ArrayList<HashSet<Integer>> readIvecs(String filename) {
        var groundTruthTopK = new ArrayList<HashSet<Integer>>();

        try (var dis = new DataInputStream(new BufferedInputStream(new FileInputStream(filename)))) {
            while (dis.available() > 0) {
                var numNeighbors = Integer.reverseBytes(dis.readInt());
                if (numNeighbors < 0) {
                    throw new IOException("Invalid neighbor count " + numNeighbors + " for record " + groundTruthTopK.size());
                }
                var neighbors = new HashSet<Integer>(numNeighbors);

                for (var i = 0; i < numNeighbors; i++) {
                    neighbors.add(Integer.reverseBytes(dis.readInt()));
                }

                groundTruthTopK.add(neighbors);
            }
        } catch (IOException e) {
            // Fail loudly: a partially read ground-truth set would silently skew any comparison made with it.
            throw new java.io.UncheckedIOException("Unable to read ivec file '" + filename + "'", e);
        }

        return groundTruthTopK;
    }

}
|
@ -0,0 +1,58 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2023 nosqlbench
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package io.nosqlbench.virtdata.library.ivecfvec;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class IVecReaderTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReadIvec() {
|
||||||
|
IVecReader ir = new IVecReader("src/test/resources/ivecfvec/test_ada_002_10000_indices_query_10000.ivec");
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
int[] indices = ir.apply(0);
|
||||||
|
for (int j = 0; j < indices.length; j++) {
|
||||||
|
assertThat(indices[j]).isGreaterThanOrEqualTo(0);
|
||||||
|
assertThat(indices[j]).isLessThanOrEqualTo(10000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReadFvec() {
|
||||||
|
FVecReader ir = new FVecReader("src/test/resources/ivecfvec/test_ada_002_10000_distances_count.fvec");
|
||||||
|
for (int i = 0; i < 10; i++) {
|
||||||
|
float[] dist = ir.apply(i);
|
||||||
|
for (int j = 1; j < dist.length; j++) {
|
||||||
|
assertThat(dist[j]).isGreaterThanOrEqualTo(dist[j-1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReadFvecSpecificDims() {
|
||||||
|
FVecReader ir = new FVecReader(
|
||||||
|
"src/test/resources/ivecfvec/test_ada_002_10000_base_vectors.fvec",
|
||||||
|
1536,0);
|
||||||
|
float[] vec0 = ir.apply(0);
|
||||||
|
assertThat(vec0.length).isEqualTo(1536);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user