Allow cusparseSolver to use jacMatrix
@@ -104,10 +104,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
         // apply ilu0
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-            operation, Nb, nnzb, &one, \
+            operation, Nb, nnzbs_prec, &one, \
             descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-            operation, Nb, nnzb, &one, \
+            operation, Nb, nnzbs_prec, &one, \
             descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
 
         // spmv
@@ -135,10 +135,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
         // apply ilu0
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-            operation, Nb, nnzb, &one, \
+            operation, Nb, nnzbs_prec, &one, \
             descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
         cusparseDbsrsv2_solve(cusparseHandle, order, \
-            operation, Nb, nnzb, &one, \
+            operation, Nb, nnzbs_prec, &one, \
             descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
 
         // spmv
@@ -188,14 +188,24 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
 
 
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::initialize(int Nb_, int nnzb) {
-    this->Nb = Nb_;
+void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
+    this->Nb = matrix->Nb;
     this->N = Nb * block_size;
-    this->nnzb = nnzb;
+    this->nnzb = matrix->nnzbs;
     this->nnz = nnzb * block_size * block_size;
 
+    if (jacMatrix) {
+        useJacMatrix = true;
+        nnzbs_prec = jacMatrix->nnzbs;
+    } else {
+        nnzbs_prec = nnzb;
+    }
+
     std::ostringstream out;
     out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
+    if (useJacMatrix) {
+        out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
+    }
     out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
     OpmLog::info(out.str());
 
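The hunk above carries the core of the change: the preconditioner matrix now has its own nonzero-block count, nnzbs_prec, taken from jacMatrix when one is supplied and falling back to the spmv matrix otherwise. Below is a minimal, self-contained sketch of that selection; BlockedMatrix is reduced to the members this diff actually touches, and choose_prec_nnzbs is a hypothetical helper, not part of the backend.

```cpp
#include <cassert>
#include <iostream>
#include <memory>

// Reduced stand-in for the real BlockedMatrix: only the members the diff relies on.
struct BlockedMatrix {
    int Nb = 0;                  // number of block rows
    int nnzbs = 0;               // number of nonzero blocks
    double* nnzValues = nullptr; // block values, stored row-wise per block
    int* rowPointers = nullptr;  // Nb + 1 entries
    int* colIndices = nullptr;   // nnzbs entries
};

// Mirrors the choice added to initialize(): the ILU0 preconditioner is built
// from jacMatrix when one is supplied, otherwise from the spmv matrix itself.
int choose_prec_nnzbs(const std::shared_ptr<BlockedMatrix>& matrix,
                      const std::shared_ptr<BlockedMatrix>& jacMatrix,
                      bool& useJacMatrix)
{
    useJacMatrix = static_cast<bool>(jacMatrix);
    return useJacMatrix ? jacMatrix->nnzbs : matrix->nnzbs;
}

int main()
{
    auto A = std::make_shared<BlockedMatrix>();
    A->Nb = 100; A->nnzbs = 700;

    auto jac = std::make_shared<BlockedMatrix>();
    jac->Nb = 100; jac->nnzbs = 400;   // typically sparser than A

    bool useJac = false;
    std::cout << choose_prec_nnzbs(A, nullptr, useJac) << "\n";  // 700, useJac stays false
    std::cout << choose_prec_nnzbs(A, jac, useJac) << "\n";      // 400, useJac becomes true
    assert(useJac);
    return 0;
}
```

The useJacMatrix flag drives the rest of the patch: separate allocation and uploads for the preconditioner arrays, and the matching conditional frees in finalize().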
@@ -230,7 +240,15 @@ void cusparseSolverBackend<block_size>::initialize(int Nb_, int nnzb) {
     cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
     cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
     cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
-    cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+    if (useJacMatrix) {
+        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
+        cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
+        cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
+    } else {
+        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+        d_mCols = d_bCols;
+        d_mRows = d_bRows;
+    }
     cudaCheckLastError("Could not allocate enough memory on GPU");
 
     cublasSetStream(cublasHandle, stream);
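A compilable sketch of the alias-or-allocate pattern introduced here, paired with the matching conditional cudaFree that finalize() uses: without a separate jacMatrix the preconditioner sparsity pointers simply alias the spmv arrays, so they must not be freed twice. Error checking is omitted and the sizes are made up.

```cpp
#include <cuda_runtime.h>

int main()
{
    const int Nb = 4, nnzb = 10, nnzbs_prec = 6, block_size = 3;
    const bool useJacMatrix = true;   // pretend a jacMatrix was supplied

    double *d_bVals = nullptr, *d_mVals = nullptr;
    int *d_bCols = nullptr, *d_bRows = nullptr, *d_mCols = nullptr, *d_mRows = nullptr;

    cudaMalloc((void**)&d_bVals, sizeof(double) * nnzb * block_size * block_size);
    cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
    cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));

    if (useJacMatrix) {
        // preconditioner matrix has its own (usually smaller) sparsity pattern
        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
        cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
        cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
    } else {
        cudaMalloc((void**)&d_mVals, sizeof(double) * nnzb * block_size * block_size);
        d_mCols = d_bCols;   // alias: do not free separately
        d_mRows = d_bRows;
    }

    // teardown mirrors finalize(): only free what was allocated above
    cudaFree(d_mVals);
    if (useJacMatrix) {
        cudaFree(d_mCols);
        cudaFree(d_mRows);
    }
    cudaFree(d_bVals);
    cudaFree(d_bCols);
    cudaFree(d_bRows);
    return 0;
}
```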
@@ -259,6 +277,10 @@ void cusparseSolverBackend<block_size>::finalize() {
     cudaFree(d_t);
     cudaFree(d_v);
     cudaFree(d_mVals);
+    if (useJacMatrix) {
+        cudaFree(d_mCols);
+        cudaFree(d_mRows);
+    }
     cudaFree(d_bVals);
     cudaFree(d_bCols);
     cudaFree(d_bRows);
@@ -281,23 +303,32 @@ void cusparseSolverBackend<block_size>::finalize() {
 
 
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
+void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rows[i + 1] - rows[i];
-        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+    } else {
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    }
 #endif
 
-    cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
-    cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
+        cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+    }
     cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
     cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 
@@ -312,19 +343,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
 
 // don't copy rowpointers and colindices, they stay the same
 template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
+void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
     Timer t;
 
 #if COPY_ROW_BY_ROW
     int sum = 0;
     for (int i = 0; i < Nb; ++i) {
-        int size_row = rows[i + 1] - rows[i];
-        memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+        int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
+        memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
         sum += size_row * block_size * block_size;
     }
     cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
 #else
-    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+    if (useJacMatrix) {
+        cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
+    } else {
+        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+    }
 #endif
 
     cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
@@ -339,12 +375,6 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
 } // end update_system_on_gpu()
 
 
-template <unsigned int block_size>
-void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
-    cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
-}
-
-
 template <unsigned int block_size>
 bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
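reset_prec_on_gpu() can be dropped because the value refresh it performed now happens inside copy_system_to_gpu()/update_system_on_gpu(): the preconditioner values are either uploaded from the host-side jacMatrix or mirrored on the device from the freshly copied spmv values. A small sketch of that choice follows; refresh_prec_values is an illustrative stand-in, not a function in the backend, and the sizes are made up.

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Either upload the jacMatrix values (host-to-device) or mirror the spmv
// values already on the GPU (device-to-device), as the diff does in the
// #else branch of copy_system_to_gpu()/update_system_on_gpu().
static void refresh_prec_values(double* d_mVals, const double* d_bVals,
                                const double* jac_host_vals, // nullptr: no jacMatrix
                                size_t prec_vals, size_t spmv_vals,
                                cudaStream_t stream)
{
    if (jac_host_vals) {
        cudaMemcpyAsync(d_mVals, jac_host_vals, prec_vals * sizeof(double),
                        cudaMemcpyHostToDevice, stream);
    } else {
        cudaMemcpyAsync(d_mVals, d_bVals, spmv_vals * sizeof(double),
                        cudaMemcpyDeviceToDevice, stream);
    }
}

int main()
{
    const size_t n = 9;                 // one 3x3 block, made-up size
    std::vector<double> hostVals(n, 1.0);

    double *d_bVals = nullptr, *d_mVals = nullptr;
    cudaMalloc((void**)&d_bVals, n * sizeof(double));
    cudaMalloc((void**)&d_mVals, n * sizeof(double));
    cudaMemcpy(d_bVals, hostVals.data(), n * sizeof(double), cudaMemcpyHostToDevice);

    refresh_prec_values(d_mVals, d_bVals, nullptr, n, n, 0);  // no jacMatrix: D2D copy
    cudaDeviceSynchronize();

    cudaFree(d_bVals);
    cudaFree(d_mVals);
    return 0;
}
```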
@@ -378,12 +408,12 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
     cusparseCreateBsrsv2Info(&info_U);
     cudaCheckLastError("Could not create analysis info");
 
-    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-        descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
-    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-        descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
-    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-        descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
+    cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
+        descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
+        descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L);
+    cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
+        descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U);
     cudaCheckLastError();
     d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
 
@@ -391,7 +421,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
     // analysis of ilu LU decomposition
     cusparseDbsrilu02_analysis(cusparseHandle, order, \
-        Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
+        Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \
         block_size, info_M, policy, d_buffer);
 
     int structural_zero;
@@ -402,11 +432,11 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
 
     // analysis of ilu apply
     cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-        Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
+        Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \
         block_size, info_L, policy, d_buffer);
 
     cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
-        Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
+        Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \
         block_size, info_U, policy, d_buffer);
     cudaCheckLastError("Could not analyse level information");
 
@@ -426,10 +456,8 @@ template <unsigned int block_size>
 bool cusparseSolverBackend<block_size>::create_preconditioner() {
     Timer t;
 
-    d_mCols = d_bCols;
-    d_mRows = d_bRows;
     cusparseDbsrilu02(cusparseHandle, order, \
-        Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
+        Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \
         block_size, info_M, policy, d_buffer);
     cudaCheckLastError("Could not perform ilu decomposition");
 
@@ -480,22 +508,21 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
 template <unsigned int block_size>
 SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
                                                              double *b,
-                                                             [[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
+                                                             std::shared_ptr<BlockedMatrix> jacMatrix,
                                                              WellContributions& wellContribs,
                                                              BdaResult &res)
 {
     if (initialized == false) {
-        initialize(matrix->Nb, matrix->nnzbs);
-        copy_system_to_gpu(matrix->nnzValues, matrix->rowPointers, matrix->colIndices, b);
+        initialize(matrix, jacMatrix);
+        copy_system_to_gpu(matrix, b, jacMatrix);
     } else {
-        update_system_on_gpu(matrix->nnzValues, matrix->rowPointers, b);
+        update_system_on_gpu(matrix, b, jacMatrix);
     }
     if (analysis_done == false) {
         if (!analyse_matrix()) {
             return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
         }
     }
-    reset_prec_on_gpu();
     if (create_preconditioner()) {
         solve_system(wellContribs, res);
     } else {
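For context, a stripped-down sketch of the caller-facing flow of the new solve_system(): the first call sets up the GPU from both matrices, later calls only re-upload values, and jacMatrix may be left empty, in which case the backend reuses the spmv matrix for the ILU0 decomposition. All types and helpers below are stand-ins, not the real backend API.

```cpp
#include <memory>

struct BlockedMatrix { int Nb = 0; int nnzbs = 0; };   // reduced stand-in

struct SolverSketch {
    bool initialized = false;

    void initialize(std::shared_ptr<BlockedMatrix>, std::shared_ptr<BlockedMatrix>) { initialized = true; }
    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix>, double*, std::shared_ptr<BlockedMatrix>) {}
    void update_system_on_gpu(std::shared_ptr<BlockedMatrix>, double*, std::shared_ptr<BlockedMatrix>) {}

    // jacMatrix may be empty; the backend then builds ILU0 from the spmv matrix
    void solve(std::shared_ptr<BlockedMatrix> matrix, double* b,
               std::shared_ptr<BlockedMatrix> jacMatrix)
    {
        if (!initialized) {
            initialize(matrix, jacMatrix);            // allocate + full copy on first call
            copy_system_to_gpu(matrix, b, jacMatrix);
        } else {
            update_system_on_gpu(matrix, b, jacMatrix);  // values only afterwards
        }
    }
};

int main()
{
    auto A = std::make_shared<BlockedMatrix>();
    std::shared_ptr<BlockedMatrix> jac;   // left empty on purpose
    double b[1] = {0.0};

    SolverSketch s;
    s.solve(A, b, jac);   // first call: initialize + copy_system_to_gpu
    s.solve(A, b, jac);   // later calls: update_system_on_gpu
    return 0;
}
```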
@@ -68,6 +68,9 @@ private:
 
     bool analysis_done = false;
 
+    bool useJacMatrix = false;
+    int nnzbs_prec;      // number of nonzero blocks in the matrix for preconditioner
+                         // could be jacMatrix or matrix
 
     /// Solve linear system using ilu0-bicgstab
     /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
@@ -75,28 +78,26 @@ private:
     void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
-    /// \param[in] Nb number of blockrows
-    /// \param[in] nnzbs number of blocks
-    void initialize(int Nb, int nnzbs);
+    /// \param[in] matrix matrix for spmv
+    /// \param[in] jacMatrix matrix for preconditioner
+    void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
-    /// \param[in] rows array of rowPointers, contains N/dim+1 values
-    /// \param[in] cols array of columnIndices, contains nnzb values
-    /// \param[in] b input vector, contains N values
-    void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
+    /// also copy matrix for preconditioner if needed
+    /// \param[in] matrix matrix for spmv
+    /// \param[in] b input vector, contains N values
+    /// \param[in] jacMatrix matrix for preconditioner
+    void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
 
-    // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
-    /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
-    /// \param[in] b input vector, contains N values
-    void update_system_on_gpu(double *vals, int *rows, double *b);
-
-    /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
-    void reset_prec_on_gpu();
+    /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
+    /// also copy matrix for preconditioner if needed
+    /// \param[in] matrix matrix for spmv
+    /// \param[in] b input vector, contains N values
+    /// \param[in] jacMatrix matrix for preconditioner
+    void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
 
     /// Analyse sparsity pattern to extract parallelism
     /// \return true iff analysis was successful