Fix the issue that causes the vector relevancy score verification test to fail.

This commit is contained in:
yabinmeng 2023-10-16 14:44:54 -05:00
parent 439180ca9f
commit 1242ff2dd0
6 changed files with 49 additions and 93 deletions

View File

@ -66,8 +66,7 @@ public class JDBCSpace implements AutoCloseable {
public JDBCSpace(String spaceName, NBConfiguration cfg) {
this.spaceName = spaceName;
this.totalCycleNum = NumberUtils.toLong(cfg.get("cycles"));
this.totalCycleNum = NumberUtils.toLong(cfg.getOptional("cycles").orElse("1"));
int totalThreads = NumberUtils.toInt(cfg.getOptional("threads").orElse("1"));
int numConnInput = NumberUtils.toInt(cfg.getOptional("num_conn").orElse("10"));
this.maxNumConn = Math.min(totalThreads, numConnInput);

View File

@ -36,10 +36,12 @@ public class JDBCDMLOpDispenser extends JDBCBaseOpDispenser {
private static final Logger logger = LogManager.getLogger(JDBCDMLOpDispenser.class);
private final boolean isReadStatement;
private final LongFunction<String> pStmtSqlStrFunc;
private final LongFunction<List<Object>> pStmtValListFunc;
// Only for Vector relevancy score testing (Vector read statement)
private final String verifierKeyName;
public JDBCDMLOpDispenser(DriverAdapter<JDBCOp, JDBCSpace> adapter,
JDBCSpace jdbcSpace,
ParsedOp op,
@ -66,6 +68,8 @@ public class JDBCDMLOpDispenser extends JDBCBaseOpDispenser {
}
return pStmtValListObj;
};
this.verifierKeyName = op.getStaticConfigOr("verifier-key", "");
}
@Override
@ -77,7 +81,8 @@ public class JDBCDMLOpDispenser extends JDBCBaseOpDispenser {
jdbcSpace,
true,
pStmtSqlStrFunc.apply(cycle),
pStmtValListFunc.apply(cycle));
pStmtValListFunc.apply(cycle),
this.verifierKeyName);
}
else {
int ddlStmtBatchNum = jdbcSpace.getDmlBatchNum();

View File

@ -17,8 +17,7 @@ package io.nosqlbench.adapter.jdbc.optypes;
import io.nosqlbench.adapter.jdbc.JDBCSpace;
import io.nosqlbench.adapter.jdbc.exceptions.JDBCAdapterUnexpectedException;
import io.nosqlbench.adapter.jdbc.utils.JDBCPgVector;
import io.nosqlbench.engine.extensions.vectormath.PgvecUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -29,11 +28,15 @@ import java.util.List;
public class JDBCDMLReadOp extends JDBCDMLOp {
private static final Logger LOGGER = LogManager.getLogger(JDBCDMLReadOp.class);
private String verifierKeyName;
public JDBCDMLReadOp(JDBCSpace jdbcSpace,
boolean isReadStmt,
String pStmtSqlStr,
List<Object> pStmtValList) {
List<Object> pStmtValList,
String verifierKeyName) {
super(jdbcSpace, isReadStmt, pStmtSqlStr, pStmtValList);
this.verifierKeyName = verifierKeyName;
}
@Override
@ -44,14 +47,17 @@ public class JDBCDMLReadOp extends JDBCDMLOp {
}
try {
int resultFetched = 0;
List<ResultSet> resultSetList = new ArrayList<>();
// key string list to be used in the "Vector" relevancy score verification
List<String> verifierValueList = new ArrayList<>();
ResultSet rs;
if (!isPreparedStmt) {
rs = stmt.executeQuery(pStmtSqlStr);
do {
resultSetList.add(rs);
String keyVal = rs.getString(this.verifierKeyName);
if (StringUtils.isNotBlank(keyVal)) {
verifierValueList.add(keyVal);
}
} while (rs.next());
closeStatement(stmt);
}
@ -63,8 +69,10 @@ public class JDBCDMLReadOp extends JDBCDMLOp {
if(isResultSet) {
rs = stmt.getResultSet();
while(rs.next()) {
resultSetList.add(rs);
resultFetched++;
String keyVal = rs.getString(this.verifierKeyName);
if (StringUtils.isNotBlank(keyVal)) {
verifierValueList.add(keyVal);
}
}
rs.close();
} else {
@ -78,11 +86,7 @@ public class JDBCDMLReadOp extends JDBCDMLOp {
closeStatement(stmt);
}
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Total {} of results have been returned.", resultFetched);
}
return resultSetList;
return verifierValueList;
}
catch (SQLException sqlException) {
throw new JDBCAdapterUnexpectedException(

View File

@ -42,13 +42,10 @@ public abstract class JDBCOp implements CycleOp {
}
protected void closeStatement(Statement stmt) throws SQLException {
/*
* NO-op for now
* ------------------
if (! (stmt instanceof PreparedStatement)) {
stmt.close();
} else if (jdbcSpace.isShuttingDown()) {
stmt.close();
}*/
}
}
}

View File

@ -16,75 +16,15 @@
package io.nosqlbench.engine.extensions.vectormath;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
public class PgvecUtils {
public static long[] sqlResultSetFieldsToLongArray(String fieldName, List<ResultSet> resultSets) {
return resultSets.stream().filter(r -> {
try {
return ((r!=null) && !r.isClosed());
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).mapToLong(r -> {
try {
return r.getLong(fieldName);
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).toArray();
}
public static String[] sqlResultSetFieldsToStringArray(String fieldName, List<ResultSet> resultSets) {
return resultSets.stream().filter(r -> {
try {
return ((r!=null) && !r.isClosed());
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).map(r -> {
try {
return r.getString(fieldName);
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).toArray(String[]::new);
}
public static int[] sqlResultSetListToIntArray(String fieldName, List<ResultSet> resultSets) {
return resultSets.stream().filter(r -> {
try {
return ((r!=null) && !r.isClosed());
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).mapToInt(r -> {
try {
return r.getInt(fieldName);
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).toArray();
}
public static int[] sqlStringColumnToIntArray(String fieldName, List<ResultSet> resultSets) {
return resultSets.stream().filter(r -> {
try {
return ((r!=null) && !r.isClosed());
} catch (SQLException e) {
throw new RuntimeException(e);
}
}).mapToInt(r -> {
try {
return Integer.parseInt(Objects.requireNonNull(r.getString(fieldName)));
} catch (SQLException e) {
throw new RuntimeException(e);
}
/**
 * Converts the verifier-key values fetched by a vector read op (e.g. primary-key
 * strings) into an {@code int[]} for relevancy-score comparison.
 *
 * @param values key strings collected from the result set; each must be parseable as an int
 * @return the parsed int values, in input order
 * @throws NullPointerException  if an entry is null (requireNonNull preserves the fast-fail)
 * @throws NumberFormatException if an entry is not a valid integer
 */
public static int[] getValueListForVerifierKey(List<String> values) {
    // Expression lambda replaces the former block lambda + redundant local array.
    return values.stream()
        .mapToInt(v -> Integer.parseInt(Objects.requireNonNull(v)))
        .toArray();
}
}

View File

@ -15,23 +15,23 @@ scenarios:
###
## For DDL workload, turn on 'AutoCommit'. Turning it off will cause errors.
###
drop-tbl: run driver=jdbc tags==block:drop-tbl threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
drop-tbl: run driver=jdbc tags==block:drop-tbl threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
# The following CLI parameters is needed for 'create-tbl' named scenario:
# - dimensions: vector dimension size (MUST match the actual ANN benchmark data)
create-tbl: run driver=jdbc tags==block:create-tbl threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
create-tbl: run driver=jdbc tags==block:create-tbl threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
#
# Vectors with up to 2,000 dimensions can be indexed.
#
# The following extra CLI parameter is needed for both 'create-vec-idx' and 'drop-vec-idx' named scenarios:
# - indexName: index name
drop-vec-idx: run driver=jdbc tags==block:drop-vec-idx threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
drop-vec-idx: run driver=jdbc tags==block:drop-vec-idx threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
# The following extra CLI parameters are needed for 'create-vec-idx' named scenario:
# - indexType: index type; valid values: 'ivfflat' or 'hnsw' (see: https://github.com/pgvector/pgvector#indexing)
# - indexOpt: index options
# * for 'ivfflat' index type, the option is like: "lists=<number>"
# * for 'hnsw' index type, the option is like: "m=<number>,ef_construction=<number>"
# - relFunc: relevancy function; valid values: 'l2' (L2 distance), 'ip' (Inner product), or 'cosine' (Cosine distance)
create-vec-idx: run driver=jdbc tags==block:create-vec-idx threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
create-vec-idx: run driver=jdbc tags==block:create-vec-idx threads==1 cycles==UNDEF url="jdbc:postgresql://host:port/" databaseName="defaultdb" portNumber=5432 user="newuser" password="CHANGE_ME" sslmode="prefer" serverName="pgsql" sslrootcert="/path/to/postgresql_certs/root.crt" autoCommit="true"
###
## For DML workload, 'AutoCommit' can be off or on
@ -103,7 +103,7 @@ blocks:
# Using PostgreSQl upsert (INSERT ON CONFLICT statement)
vec-write:
params:
# DML statement MUST be prepared
# DML write statement MUST be prepared
prepared: true
ops:
main-insert:
@ -115,14 +115,25 @@ blocks:
vec-read:
ops:
params:
# DML READ statement can be prepared or not
prepared: true
main-select:
dmlread: |
SELECT key, (value <-> ?) as relevancy, value
FROM TEMPLATE(schema,public).TEMPLATE(table,pgvec)
ORDER BY value <-> ?
LIMIT TEMPLATE(queryLimit,10);
LIMIT TEMPLATE(queryLimit,100);
prep_stmt_val_arr: |
{test_vector},{test_vector}
#################################
## NOTE:
# 1). The script blocks below are ONLY relevant with Vector relevancy score verification
# 2). The "verifier-key" must match the Vector data identifier column name (e.g. primary key name)
# right now the identifier must be a type that can be converted to int.
verifier-key: "key"
verifier-imports:
- io.nosqlbench.adapter.mongodb.MongoDbUtils
verifier-init: |
relevancy=scriptingmetrics.newRelevancyMeasures(_parsed_op);
for (int k in List.of(100)) {
@ -134,7 +145,7 @@ blocks:
}
verifier: |
// driver-specific function
actual_indices=pgvec_utils.sqlStringColumnToIntArray("key",result);
actual_indices=pgvec_utils.getValueListForVerifierKey(result);
// driver-agnostic function
relevancy.accept({validation_set},actual_indices);
// because we are "verifying" although this needs to be reorganized