Replaced tabs with 4 spaces

T.D. (Tongdong) Qiu 2019-12-18 15:50:09 +01:00
parent b543947e66
commit 950d1c92c1
6 changed files with 590 additions and 592 deletions

View File

@@ -37,9 +37,9 @@ namespace Opm
BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED) : use_gpu(use_gpu_) {
#if HAVE_CUDA
if(use_gpu){
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
}
#endif
}
@@ -48,41 +48,41 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int
#if HAVE_CUDA
template <class BridgeMatrix>
int checkZeroDiagonal(BridgeMatrix& mat) {
static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
int numZeros = 0;
const int dim = 3;
const double zero_replace = 1e-15;
if(diag_indices.size() == 0){
int N = mat.N();
diag_indices.reserve(N);
int roff = 0;
for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
auto diag = r->find(r.index()); // diag is an iterator
assert(diag.index() == r.index());
for(int rr = 0; rr < dim; ++rr){
auto& val = (*diag)[rr][rr]; // reference to easily change the value
if (val == 0.0){ // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
diag_indices.emplace_back(diag.offset());
}
}else{
for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
for(int rr = 0; rr < dim; ++rr){
auto& val = diag_block[rr][rr];
if(val == 0.0){ // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
}
}
return numZeros;
}
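Why this guard matters: ILU0 (cusparseDbsrilu02 further down) divides by the diagonal pivots of each diagonal block, so an exact 0.0 pivot would make the factorization fail. A minimal standalone sketch of the same replacement, assuming a single plain 3x3 block rather than the Dune matrix used above:

#include <cstdio>

int main() {
    double block[3][3] = {{4.0, 1.0, 0.0},
                          {1.0, 0.0, 2.0},   // zero pivot at [1][1]
                          {0.0, 2.0, 5.0}};
    const double zero_replace = 1e-15;       // same constant as checkZeroDiagonal()
    for (int rr = 0; rr < 3; ++rr) {
        if (block[rr][rr] == 0.0) {          // could be '< 1e-30' or similar, as noted above
            block[rr][rr] = zero_replace;
        }
    }
    std::printf("pivot [1][1] is now %g\n", block[1][1]);
    return 0;
}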
@@ -91,24 +91,24 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
int sum_nnzs = 0;
// convert colIndices and rowPointers
if(h_rows.size() == 0){
h_rows.emplace_back(0);
for(typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r){
int size_row = 0;
for(auto c = r->begin(); c != r->end(); ++c){
h_cols.emplace_back(c.index());
size_row++;
}
sum_nnzs += size_row;
h_rows.emplace_back(sum_nnzs);
}
if(h_rows[mat.N()] != mat.nonzeroes()){
OPM_THROW(std::logic_error, "Error: row sizes do not sum to the number of nonzeroes in BdaBridge::getSparsityPattern()");
}
}
} // end getSparsityPattern()
#endif
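For a concrete picture of the arrays getSparsityPattern() builds, take a hypothetical 3x3 block matrix with nonzero blocks at (0,0), (0,1), (1,1), (2,0) and (2,2); the resulting BSR pattern arrays would be:

// row 0: X X .    h_rows[i+1] - h_rows[i] = number of blocks in block row i
// row 1: . X .    h_cols holds the block column of each nonzero, row by row
// row 2: X . X
std::vector<int> h_rows = {0, 2, 3, 5};
std::vector<int> h_cols = {0, 1, 1, 0, 2};
// h_rows[mat.N()] == mat.nonzeroes() is exactly the consistency check above.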
@@ -118,73 +118,73 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
{
#if HAVE_CUDA
if(use_gpu){
BdaResult result;
result.converged = false;
static std::vector<int> h_rows;
static std::vector<int> h_cols;
int dim = (*mat)[0][0].N();
int N = mat->N()*dim;
int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
if(dim != 3){
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
use_gpu = false;
}
if(h_rows.capacity() == 0){
h_rows.reserve(N+1);
h_cols.reserve(nnz);
#if PRINT_TIMERS_BRIDGE
Dune::Timer t;
#endif
getSparsityPattern(*mat, h_rows, h_cols);
#if PRINT_TIMERS_BRIDGE
std::ostringstream out;
out << "getSparsityPattern() took: " << t.stop() << " s";
OpmLog::info(out.str());
#endif
}
#if PRINT_TIMERS_BRIDGE
Dune::Timer t_zeros;
int numZeros = checkZeroDiagonal(*mat);
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
OpmLog::info(out.str());
#else
checkZeroDiagonal(*mat);
#endif
/////////////////////////
// actually solve
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
// assumes the nonzero values of mat (Dune::BCRSMatrix) are stored contiguously; if they are not, cusparseSolver will exhibit undefined behaviour
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
switch(status){
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
//OpmLog::info("cusparseSolver converged");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
default:
OpmLog::warning("cusparseSolver returned unknown status code");
}
res.iterations = result.iterations;
res.reduction = result.reduction;
res.converged = result.converged;
res.conv_rate = result.conv_rate;
res.elapsed = result.elapsed;
}else{
res.converged = false;
}
#endif // HAVE_CUDA
}
@@ -192,9 +192,9 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
template <class BridgeVector>
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED){
#if HAVE_CUDA
if(use_gpu){
backend->post_process(static_cast<double*>(&(x[0][0])));
}
#endif
}
@@ -202,22 +202,22 @@ template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::get_result< \

View File

@@ -40,20 +40,18 @@ class BdaBridge
{
private:
#if HAVE_CUDA
std::unique_ptr<cusparseSolverBackend> backend;
#endif
bool use_gpu;
public:
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
~BdaBridge();
template <class BridgeMatrix, class BridgeVector>
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
template <class BridgeVector>
void get_result(BridgeVector &x);
}; // end class BdaBridge
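For orientation, a hedged sketch of how a caller drives this interface; the parameter values are illustrative, and mat, b and x stand for a Dune::BCRSMatrix and Dune::BlockVectors matching the explicit instantiations in BdaBridge.cpp:

// Sketch only: construct once, then reuse for every linear solve.
BdaBridge bridge(/*use_gpu=*/true, /*linear_solver_verbosity=*/1,
                 /*maxit=*/200, /*tolerance=*/1e-2);

InverseOperatorResult result;
bridge.solve_system(&mat, b, result);   // sets result.converged = false when the GPU path is unavailable
if (result.converged) {
    bridge.get_result(x);               // copy the GPU solution back into x
}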

View File

@@ -28,13 +28,13 @@ class BdaResult
{
public:
int iterations = 0; // number of iterations
double reduction = 0.0; // reduction of norm, norm_final / norm_start
bool converged = false; // true iff the linear solver reached the desired norm within maxit iterations
double conv_rate = 0.0; // average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
double elapsed = 0.0; // time in seconds to run the linear solver
// Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult
}; // end class BdaResult
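conv_rate is thus the geometric mean of the per-iteration residual reduction. A quick standalone check of the quoted expression:

#include <cmath>
#include <cstdio>

int main() {
    double reduction = 1e-4;                            // norm_final / norm_start
    double it = 20.0;                                   // iterations used
    double conv_rate = std::pow(reduction, 1.0 / it);
    std::printf("conv_rate = %f\n", conv_rate);         // ~0.631: the norm shrinks ~37% per iteration
    return 0;
}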

View File

@@ -25,12 +25,12 @@
#define cudaCheckLastError(msg) __cudaCheckError( __FILE__, __LINE__, #msg )
inline void __cudaCheckError(const char *file, const int line, const char *msg){
cudaError err = cudaGetLastError();
if (cudaSuccess != err){
std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
std::cerr << "BDA error message: " << msg << std::endl;
exit(1);
}
}
#endif
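One subtlety: the macro stringizes its argument with #msg, so the argument becomes a string literal at expansion time. Both spellings below compile; the quoted form (the one used throughout cusparseSolverBackend.cu) simply prints the surrounding quotes as part of the message:

cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaCheckLastError("Could not allocate x");   // BDA error message: "Could not allocate x"
cudaCheckLastError(Could not allocate x);     // BDA error message: Could not allocate x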

View File

@@ -18,7 +18,7 @@
*/
#ifndef __NVCC__
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#endif
#include <cstdio>
@@ -41,456 +41,456 @@
namespace Opm
{
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
double second(void){
struct timeval tv;
gettimeofday(&tv, nullptr);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0){
}
cusparseSolverBackend::~cusparseSolverBackend(){
finalize();
}
void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res){
double t_total1, t_total2;
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
float it;
t_total1 = second();
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
cublasDscal(cublasHandle, n, &mone, d_r, 1);
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
if(verbosity > 1){
std::ostringstream out;
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
OpmLog::info(out.str());
}
for(it = 0.5; it < maxit; it+=0.5){
rhop = rho;
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
if(it > 1){
beta = (rho/rhop) * (alpha/omega);
nomega = -omega;
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
cublasDscal(cublasHandle, n, &beta, d_p, 1);
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
}
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, \
&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
alpha = rho / tmp1;
nalpha = -alpha;
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if(norm < tolerance * norm_0 && it > minit){
break;
}
it += 0.5;
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, &one, descr_M, \
d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
omega = tmp1 / tmp2;
nomega = -omega;
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if(norm < tolerance * norm_0 && it > minit){
break;
}
if(verbosity > 1){
std::ostringstream out;
out << "it: " << it << std::scientific << ", norm: " << norm;
OpmLog::info(out.str());
}
}
t_total2 = second();
res.iterations = std::min(it, (float)maxit);
res.reduction = norm/norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
res.elapsed = t_total2 - t_total1;
res.converged = (it != (maxit + 0.5));
if(verbosity > 0){
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed/it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
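For readers tracing the cuBLAS/cuSPARSE calls above, this is the structure of one full preconditioned BiCGSTAB iteration the loop implements. Each half-step performs one ILU0 application (an L-solve followed by a U-solve) and one SpMV, which is why 'it' advances by 0.5 and why convergence is tested after either half:

// Pseudocode sketch (M = ILU0 factors in d_mVals, A = matrix in d_bVals):
//   rho  = <rw, r>;  beta = (rho / rho_old) * (alpha / omega)   // skipped on the first pass
//   p    = r + beta * (p - omega * v)
//   pw   = M^-1 p                                               // first half:  it += 0.5
//   v    = A pw;  alpha = rho / <rw, v>
//   x   += alpha * pw;  r -= alpha * v;  test ||r||
//   s    = M^-1 r                                               // second half: it += 0.5
//   t    = A s;  omega = <t, r> / <t, t>
//   x   += omega * s;  r -= omega * t;  test ||r||
// If the loop exhausts maxit, 'it' ends at maxit + 0.5, which is exactly the
// value res.converged tests against.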
void cusparseSolverBackend::initialize(int N, int nnz, int dim){
this->N = N;
this->nnz = nnz;
this->BLOCK_SIZE = dim;
this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
OpmLog::info(out.str());
out.str("");
out.clear();
out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(out.str());
int deviceID = 0;
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
struct cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
out.str("");
out.clear();
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
cudaCheckLastError("Could not create stream");
cublasCreate(&cublasHandle);
cudaCheckLastError("Could not create cublasHandle");
cusparseCreate(&cusparseHandle);
cudaCheckLastError("Could not create cusparseHandle");
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw,sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw,sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnz); // column indices and row pointers are ints, not doubles
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb+1));
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
cudaCheckLastError("Could not set stream to cublas");
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
cudaMallocHost((void**)&x, sizeof(double) * N);
cudaCheckLastError("Could not allocate pinned host memory");
initialized = true;
} // end initialize()
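The size bookkeeping at the top of initialize() is easiest to read with concrete (illustrative) numbers for blocksize 3:

// dim  = 3                          // block size coming from Flow
// N    = 3000                       // scalar rows (1000 cells x 3 unknowns)
// Nb   = (3000 + 3 - 1) / 3 = 1000  // block rows
// nnz  = 45000                      // scalar nonzeroes
// nnzb = 45000 / 3 / 3 = 5000       // nonzero 3x3 blocks stored in BSR form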
void cusparseSolverBackend::finalize(){
cudaFree(d_x);
cudaFree(d_b);
cudaFree(d_r);
cudaFree(d_rw);
cudaFree(d_p);
cudaFree(d_pw);
cudaFree(d_s);
cudaFree(d_t);
cudaFree(d_v);
cudaFree(d_mVals);
cudaFree(d_bVals);
cudaFree(d_bCols);
cudaFree(d_bRows);
cudaFree(d_buffer);
cusparseDestroyBsrilu02Info(info_M);
cusparseDestroyBsrsv2Info(info_L);
cusparseDestroyBsrsv2Info(info_U);
cusparseDestroyMatDescr(descr_B);
cusparseDestroyMatDescr(descr_M);
cusparseDestroyMatDescr(descr_L);
cusparseDestroyMatDescr(descr_U);
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaHostUnregister(vals);
cudaHostUnregister(cols);
cudaHostUnregister(rows);
cudaStreamDestroy(stream);
cudaFreeHost(x);
} // end finalize()
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
this->vals = vals;
this->cols = cols;
this->rows = rows;
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()
void cusparseSolverBackend::reset_prec_on_gpu(){
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
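reset_prec_on_gpu() is needed because bsrilu02 factorizes in place: create_preconditioner() overwrites d_mVals with the ILU0 factors, so the values must be refreshed from d_bVals before each refactorization. The resulting per-solve flow on reuse, summarizing solve_system() further down rather than adding new logic:

// update_system_on_gpu(vals, b);   // new matrix values and rhs, same sparsity pattern
// reset_prec_on_gpu();             // d_mVals <- d_bVals
// create_preconditioner();         // ILU0 refactorization, reusing info_M
// solve_system(res);               // BiCGSTAB with the fresh factors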
bool cusparseSolverBackend::analyse_matrix(){
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cusparseCreateMatDescr(&descr_B);
cusparseCreateMatDescr(&descr_M);
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
cusparseSetMatIndexBase(descr_B, base_type);
cusparseSetMatIndexBase(descr_M, base_type);
cusparseCreateMatDescr(&descr_L);
cusparseSetMatIndexBase(descr_L, base_type);
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
cusparseCreateMatDescr(&descr_U);
cusparseSetMatIndexBase(descr_U, base_type);
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
cudaCheckLastError("Could not initialize matrix descriptions");
cusparseCreateBsrilu02Info(&info_M);
cusparseCreateBsrsv2Info(&info_L);
cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
cudaMalloc((void**)&d_buffer, d_bufferSize);
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if(CUSPARSE_STATUS_ZERO_PIVOT == status){
return false;
}
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner(){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if(CUSPARSE_STATUS_ZERO_PIVOT == status){
return false;
}
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end create_preconditioner()
void cusparseSolverBackend::solve_system(BdaResult &res){
// actually solve
gpu_pbicgstab(res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
void cusparseSolverBackend::post_process(double *x){
if(!initialized){
cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
}
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if(verbosity > 2){
t2 = second();
std::ostringstream out;
out << "cusparseSolver::post_process(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end post_process()
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res){
if(initialized == false){
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
}else{
update_system_on_gpu(vals, b);
}
if(analysis_done == false){
if(!analyse_matrix()){
return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
}
}
reset_prec_on_gpu();
if(create_preconditioner()){
solve_system(res);
}else{
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
}

View File

@@ -91,10 +91,10 @@ private:
public:
enum class cusparseSolverStatus {
CUSPARSE_SOLVER_SUCCESS,
CUSPARSE_SOLVER_ANALYSIS_FAILED,
CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
CUSPARSE_SOLVER_UNKNOWN_ERROR
};
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);