mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00

Replaced tabs with 4 spaces

This commit is contained in:
parent b543947e66
commit 950d1c92c1
@@ -37,9 +37,9 @@ namespace Opm

BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED) : use_gpu(use_gpu_) {
#if HAVE_CUDA
    if(use_gpu){
        backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
    }
#endif
}
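For orientation, a minimal sketch of how the bridge is driven, based only on the interface shown in this commit; the function name solveWithBridge, the chosen verbosity/maxit/tolerance values, and the include set are illustrative assumptions, not part of the commit.

#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/bvector.hh>
#include <dune/common/fvector.hh>
// BdaBridge.hpp / MatrixBlock.hpp includes omitted; their paths depend on the source tree.

using Block3  = Opm::MatrixBlock<double, 3, 3>;
using Matrix3 = Dune::BCRSMatrix<Block3, std::allocator<Block3> >;
using Vector3 = Dune::BlockVector<Dune::FieldVector<double, 3> >;

void solveWithBridge(Matrix3& mat, Vector3& b, Vector3& x)
{
    // placeholder solver settings, not defaults from the commit
    Opm::BdaBridge bridge(/*use_gpu=*/true, /*linear_solver_verbosity=*/1,
                          /*maxit=*/200, /*tolerance=*/1e-2);

    Dune::InverseOperatorResult res;
    bridge.solve_system(&mat, b, res);   // copies or updates the system on the GPU, then runs BiCGSTAB
    if (res.converged) {
        bridge.get_result(x);            // copy the GPU solution back into x
    }
    // when use_gpu is false (or the blocksize is not 3), res.converged stays
    // false and the caller falls back to the Dune solver
}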
@@ -48,41 +48,41 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int

#if HAVE_CUDA
template <class BridgeMatrix>
int checkZeroDiagonal(BridgeMatrix& mat) {
    static std::vector<typename BridgeMatrix::size_type> diag_indices;   // contains offsets of the diagonal nnzs
    int numZeros = 0;
    const int dim = 3;
    const double zero_replace = 1e-15;
    if(diag_indices.size() == 0){
        int N = mat.N();
        diag_indices.reserve(N);
        int roff = 0;
        for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
            auto diag = r->find(r.index());   // diag is an iterator
            assert(diag.index() == r.index());
            for(int rr = 0; rr < dim; ++rr){
                auto& val = (*diag)[rr][rr];   // reference to easily change the value
                if (val == 0.0){              // could be replaced by '< 1e-30' or similar
                    val = zero_replace;
                    ++numZeros;
                }
            }
            diag_indices.emplace_back(diag.offset());
        }
    }else{
        for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
            typename BridgeMatrix::size_type offset = diag_indices[r.index()];
            auto& diag_block = r->getptr()[offset];   // diag_block is a reference to MatrixBlock, located on column r of row r
            for(int rr = 0; rr < dim; ++rr){
                auto& val = diag_block[rr][rr];
                if(val == 0.0){   // could be replaced by '< 1e-30' or similar
                    val = zero_replace;
                    ++numZeros;
                }
            }
        }
    }

    return numZeros;
}
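The patch above depends on Dune iterator internals; the standalone sketch below shows the same replace-exact-zeros idea on a plain 3x3 block (patchDiagonal is a hypothetical name, not code from the commit).

// Replace exact 0.0 diagonal entries of one block with a tiny value so the
// ILU0 factorization does not hit a zero pivot.
int patchDiagonal(double block[3][3], double zero_replace = 1e-15)
{
    int numZeros = 0;
    for (int rr = 0; rr < 3; ++rr) {
        if (block[rr][rr] == 0.0) {   // same exact-zero test as checkZeroDiagonal()
            block[rr][rr] = zero_replace;
            ++numZeros;
        }
    }
    return numZeros;
}

// Example: for {{1,2,0},{0,0,3},{4,5,6}} only block[1][1] is patched, so the
// function returns 1 and sets block[1][1] = 1e-15.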
@@ -91,24 +91,24 @@ int checkZeroDiagonal(BridgeMatrix& mat) {

// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
    int sum_nnzs = 0;

    // convert colIndices and rowPointers
    if(h_rows.size() == 0){
        h_rows.emplace_back(0);
        for(typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r){
            int size_row = 0;
            for(auto c = r->begin(); c != r->end(); ++c){
                h_cols.emplace_back(c.index());
                size_row++;
            }
            sum_nnzs += size_row;
            h_rows.emplace_back(sum_nnzs);
        }
        if(h_rows[mat.N()] != mat.nonzeroes()){
            OPM_THROW(std::logic_error, "Error: row sizes do not sum to the number of nonzeroes in BdaBridge::getSparsityPattern()");
        }
    }
} // end getSparsityPattern()

#endif
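getSparsityPattern() walks the Dune matrix once and records CSR-style row pointers and column indices of the block pattern. A self-contained sketch of the same conversion, with the per-row column lists given directly instead of via a BCRSMatrix (buildCsrPattern is a hypothetical name):

#include <vector>

void buildCsrPattern(const std::vector<std::vector<int> >& rowsOfCols,
                     std::vector<int>& h_rows, std::vector<int>& h_cols)
{
    int sum_nnzs = 0;
    h_rows.push_back(0);                  // row 0 starts at offset 0
    for (const auto& cols : rowsOfCols) {
        for (int c : cols) {
            h_cols.push_back(c);
        }
        sum_nnzs += static_cast<int>(cols.size());
        h_rows.push_back(sum_nnzs);       // prefix sum of row sizes
    }
}

// For {{0,1},{0,1,2},{2}} this yields h_rows = {0,2,5,6} and h_cols = {0,1,0,1,2,2}.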
@@ -118,73 +118,73 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U

{
#if HAVE_CUDA
    if(use_gpu){
        BdaResult result;
        result.converged = false;
        static std::vector<int> h_rows;
        static std::vector<int> h_cols;
        int dim = (*mat)[0][0].N();
        int N = mat->N()*dim;
        int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;

        if(dim != 3){
            OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
            use_gpu = false;
        }

        if(h_rows.capacity() == 0){
            h_rows.reserve(N+1);
            h_cols.reserve(nnz);
#if PRINT_TIMERS_BRIDGE
            Dune::Timer t;
#endif
            getSparsityPattern(*mat, h_rows, h_cols);
#if PRINT_TIMERS_BRIDGE
            std::ostringstream out;
            out << "getSparsityPattern() took: " << t.stop() << " s";
            OpmLog::info(out.str());
#endif
        }

#if PRINT_TIMERS_BRIDGE
        Dune::Timer t_zeros;
        int numZeros = checkZeroDiagonal(*mat);
        std::ostringstream out;
        out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
        OpmLog::info(out.str());
#else
        checkZeroDiagonal(*mat);
#endif


        /////////////////////////
        // actually solve

        typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
        switch(status){
        case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
            //OpmLog::info("cusparseSolver converged");
            break;
        case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
            OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
        case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
            OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
            break;
        default:
            OpmLog::warning("cusparseSolver returned unknown status code");
        }

        res.iterations = result.iterations;
        res.reduction = result.reduction;
        res.converged = result.converged;
        res.conv_rate = result.conv_rate;
        res.elapsed = result.elapsed;
    }else{
        res.converged = false;
    }
#endif // HAVE_CUDA
}
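The call above hands &((*mat)[0][0][0][0]) to the backend as one flat array, i.e. it assumes the nnzb blocks of dim*dim doubles are stored back to back in row-major order (matching CUSPARSE_DIRECTION_ROW in the backend). A small sketch of that addressing assumption (blockEntry is illustrative):

#include <cstddef>

// Entry (r,c) of block k in a contiguous BSR value array.
inline double blockEntry(const double* vals, std::size_t k, int dim, int r, int c)
{
    return vals[k * dim * dim + r * dim + c];
}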
@@ -192,9 +192,9 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U

template <class BridgeVector>
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED){
#if HAVE_CUDA
    if(use_gpu){
        backend->post_process(static_cast<double*>(&(x[0][0])));
    }
#endif
}
@@ -202,22 +202,22 @@ template void BdaBridge::solve_system< \

Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
    Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
    InverseOperatorResult &res);

template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
    Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
    InverseOperatorResult &res);

template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
    Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
    InverseOperatorResult &res);


template void BdaBridge::get_result< \
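The blocks above are explicit template instantiations: solve_system() is defined in the .cpp file, so every matrix/vector combination used by Flow must be instantiated there. A generic sketch of the mechanism, deliberately unrelated to the commit's types:

// Definition lives in one translation unit; the 'template' declarations force
// the named specializations to be compiled and exported from it.
template <class T>
T triple(T v) { return v * T(3); }

template int    triple<int>(int);       // emits triple<int>
template double triple<double>(double); // emits triple<double>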
@@ -40,20 +40,18 @@ class BdaBridge

{
private:
#if HAVE_CUDA
    std::unique_ptr<cusparseSolverBackend> backend;
#endif
    bool use_gpu;

public:
    BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);

    ~BdaBridge();

    template <class BridgeMatrix, class BridgeVector>
    void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);

    template <class BridgeVector>
    void get_result(BridgeVector &x);

}; // end class BdaBridge
@@ -28,13 +28,13 @@ class BdaResult

{

public:
    int iterations = 0;        // number of iterations
    double reduction = 0.0;    // reduction of norm, norm_start / norm_final
    bool converged = false;    // true iff the linear solver reached the desired norm within maxit iterations
    double conv_rate = 0.0;    // average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
    double elapsed = 0.0;      // time in seconds to run the linear solver

    // Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult

}; // end class BdaResult
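A quick standalone check of the conv_rate formula quoted above (a sketch, not code from the commit):

#include <cmath>

// With reduction = 1e-6 reached after it = 100 iterations, the average
// per-iteration reduction is pow(1e-6, 1.0/100) ≈ 0.871.
double convRate(double reduction, double it)
{
    return std::pow(reduction, 1.0 / it);
}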
@@ -25,12 +25,12 @@

#define cudaCheckLastError(msg) __cudaCheckError( __FILE__, __LINE__, #msg )

inline void __cudaCheckError(const char *file, const int line, const char *msg){
    cudaError err = cudaGetLastError();
    if (cudaSuccess != err){
        std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
        std::cerr << "BDA error message: " << msg << std::endl;
        exit(1);
    }
}

#endif
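Typical use of the macro, in the style seen throughout cusparseSolverBackend.cu; the allocation below is only an illustration:

#include <cuda_runtime.h>
// plus this header, for cudaCheckLastError()

void allocateExample()
{
    double* d_data = nullptr;
    cudaMalloc((void**)&d_data, 1024 * sizeof(double));
    cudaCheckLastError("Could not allocate d_data");   // prints file:line and the message, then exits
    cudaFree(d_data);
}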
@@ -18,7 +18,7 @@

*/

#ifndef __NVCC__
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#endif

#include <cstdio>
@@ -41,456 +41,456 @@

namespace Opm
{

    const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
    const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
    const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;

    double second(void){
        struct timeval tv;
        gettimeofday(&tv, nullptr);
        return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
    }

    cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0){
    }

    cusparseSolverBackend::~cusparseSolverBackend(){
        finalize();
    }

    void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res){
        double t_total1, t_total2;
        int n = N;
        double rho = 1.0, rhop;
        double alpha, nalpha, beta;
        double omega, nomega, tmp1, tmp2;
        double norm, norm_0;
        double zero = 0.0;
        double one = 1.0;
        double mone = -1.0;
        float it;

        t_total1 = second();

        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);

        cublasDscal(cublasHandle, n, &mone, d_r, 1);
        cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
        cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
        cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
        cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);

        if(verbosity > 1){
            std::ostringstream out;
            out << std::scientific << "cusparseSolver initial norm: " << norm_0;
            OpmLog::info(out.str());
        }

        for(it = 0.5; it < maxit; it+=0.5){
            rhop = rho;
            cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);

            if(it > 1){
                beta = (rho/rhop) * (alpha/omega);
                nomega = -omega;
                cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
                cublasDscal(cublasHandle, n, &beta, d_p, 1);
                cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
            }

            // apply ilu0
            cusparseDbsrsv2_solve(cusparseHandle, order, \
                operation, Nb, nnzb, &one, \
                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
            cusparseDbsrsv2_solve(cusparseHandle, order, \
                operation, Nb, nnzb, &one, \
                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);

            // spmv
            cusparseDbsrmv(cusparseHandle, order, \
                operation, Nb, Nb, nnzb, \
                &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);

            cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
            alpha = rho / tmp1;
            nalpha = -alpha;
            cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
            cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);

            if(norm < tolerance * norm_0 && it > minit){
                break;
            }

            it += 0.5;

            // apply ilu0
            cusparseDbsrsv2_solve(cusparseHandle, order, \
                operation, Nb, nnzb, &one, \
                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
            cusparseDbsrsv2_solve(cusparseHandle, order, \
                operation, Nb, nnzb, &one, \
                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);

            // spmv
            cusparseDbsrmv(cusparseHandle, order, \
                operation, Nb, Nb, nnzb, &one, descr_M, \
                d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);

            cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
            cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
            omega = tmp1 / tmp2;
            nomega = -omega;
            cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
            cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);

            cublasDnrm2(cublasHandle, n, d_r, 1, &norm);

            if(norm < tolerance * norm_0 && it > minit){
                break;
            }

            if(verbosity > 1){
                std::ostringstream out;
                out << "it: " << it << std::scientific << ", norm: " << norm;
                OpmLog::info(out.str());
            }
        }

        t_total2 = second();

        res.iterations = std::min(it, (float)maxit);
        res.reduction = norm/norm_0;
        res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
        res.elapsed = t_total2 - t_total1;
        res.converged = (it != (maxit + 0.5));

        if(verbosity > 0){
            std::ostringstream out;
            out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
                ", time per iteration: " << res.elapsed/it << ", iterations: " << it;
            OpmLog::info(out.str());
        }
    }
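
    // Illustrative sketch, not part of this file: a minimal model of the
    // half-iteration bookkeeping used by gpu_pbicgstab(). 'it' advances by 1.0
    // per full BiCGSTAB iteration (0.5 in mid-loop, 0.5 in the for-increment);
    // only exhaustion, never a break, ends with it == maxit + 0.5, which is
    // exactly what the converged test above relies on.
    static bool runsToExhaustion(int maxit)
    {
        float it;
        for (it = 0.5f; it < maxit; it += 0.5f) {
            // first half-sweep; convergence may break here (it = k + 0.5)
            it += 0.5f;
            // second half-sweep; convergence may break here (it = k + 1.0)
        }
        return it == maxit + 0.5f;   // true iff no break happened
    }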

    void cusparseSolverBackend::initialize(int N, int nnz, int dim){
        this->N = N;
        this->nnz = nnz;
        this->BLOCK_SIZE = dim;
        this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
        Nb = (N + dim - 1) / dim;
        std::ostringstream out;
        out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
        OpmLog::info(out.str());
        out.str("");
        out.clear();
        out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
        OpmLog::info(out.str());

        int deviceID = 0;
        cudaSetDevice(deviceID);
        cudaCheckLastError("Could not get device");
        struct cudaDeviceProp props;
        cudaGetDeviceProperties(&props, deviceID);
        cudaCheckLastError("Could not get device properties");
        out.str("");
        out.clear();
        out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
        OpmLog::info(out.str());

        cudaStreamCreate(&stream);
        cudaCheckLastError("Could not create stream");

        cublasCreate(&cublasHandle);
        cudaCheckLastError("Could not create cublasHandle");

        cusparseCreate(&cusparseHandle);
        cudaCheckLastError("Could not create cusparseHandle");

        cudaMalloc((void**)&d_x, sizeof(double) * N);
        cudaMalloc((void**)&d_b, sizeof(double) * N);
        cudaMalloc((void**)&d_r, sizeof(double) * N);
        cudaMalloc((void**)&d_rw,sizeof(double) * N);
        cudaMalloc((void**)&d_p, sizeof(double) * N);
        cudaMalloc((void**)&d_pw,sizeof(double) * N);
        cudaMalloc((void**)&d_s, sizeof(double) * N);
        cudaMalloc((void**)&d_t, sizeof(double) * N);
        cudaMalloc((void**)&d_v, sizeof(double) * N);
        cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
        cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
        cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
        cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
        cudaCheckLastError("Could not allocate enough memory on GPU");

        cublasSetStream(cublasHandle, stream);
        cudaCheckLastError("Could not set stream to cublas");
        cusparseSetStream(cusparseHandle, stream);
        cudaCheckLastError("Could not set stream to cusparse");

        cudaMallocHost((void**)&x, sizeof(double) * N);
        cudaCheckLastError("Could not allocate pinned host memory");

        initialized = true;
    } // end initialize()
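
    // Illustrative sketch, not part of this file: the blocked sizes computed
    // by initialize(). E.g. N = 12 scalar rows and nnz = 90 scalar nonzeros
    // with dim = 3 give Nb = 4 block rows and nnzb = 10 blocks.
    static void blockedSizes(int N, int nnz, int dim, int& Nb, int& nnzb)
    {
        Nb   = (N + dim - 1) / dim;   // ceil(N / dim)
        nnzb = nnz / dim / dim;       // dim*dim scalars per block
    }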

    void cusparseSolverBackend::finalize(){
        cudaFree(d_x);
        cudaFree(d_b);
        cudaFree(d_r);
        cudaFree(d_rw);
        cudaFree(d_p);
        cudaFree(d_pw);
        cudaFree(d_s);
        cudaFree(d_t);
        cudaFree(d_v);
        cudaFree(d_mVals);
        cudaFree(d_bVals);
        cudaFree(d_bCols);
        cudaFree(d_bRows);
        cudaFree(d_buffer);
        cusparseDestroyBsrilu02Info(info_M);
        cusparseDestroyBsrsv2Info(info_L);
        cusparseDestroyBsrsv2Info(info_U);
        cusparseDestroyMatDescr(descr_B);
        cusparseDestroyMatDescr(descr_M);
        cusparseDestroyMatDescr(descr_L);
        cusparseDestroyMatDescr(descr_U);
        cusparseDestroy(cusparseHandle);
        cublasDestroy(cublasHandle);
        cudaHostUnregister(vals);
        cudaHostUnregister(cols);
        cudaHostUnregister(rows);
        cudaStreamDestroy(stream);
        cudaFreeHost(x);
    } // end finalize()


    void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b){

        double t1, t2;
        if(verbosity > 2){
            t1 = second();
        }

        // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
        // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
        cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
        cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
        cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
        cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);

        this->vals = vals;
        this->cols = cols;
        this->rows = rows;

        if(verbosity > 2){
            cudaStreamSynchronize(stream);
            t2 = second();
            std::ostringstream out;
            out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
            OpmLog::info(out.str());
        }
    } // end copy_system_to_gpu()


    // don't copy rowpointers and colindices, they stay the same
    void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b){

        double t1, t2;
        if(verbosity > 2){
            t1 = second();
        }

        cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
        cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
        cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);

        if(verbosity > 2){
            cudaStreamSynchronize(stream);
            t2 = second();
            std::ostringstream out;
            out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
            OpmLog::info(out.str());
        }
    } // end update_system_on_gpu()


    void cusparseSolverBackend::reset_prec_on_gpu(){
        cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
    }

    bool cusparseSolverBackend::analyse_matrix(){

        int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
        double t1, t2;

        if(verbosity > 2){
            t1 = second();
        }

        cusparseCreateMatDescr(&descr_B);
        cusparseCreateMatDescr(&descr_M);
        cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
        const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;   // matrices from Flow are base0

        cusparseSetMatIndexBase(descr_B, base_type);
        cusparseSetMatIndexBase(descr_M, base_type);

        cusparseCreateMatDescr(&descr_L);
        cusparseSetMatIndexBase(descr_L, base_type);
        cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
        cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);

        cusparseCreateMatDescr(&descr_U);
        cusparseSetMatIndexBase(descr_U, base_type);
        cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
        cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
        cudaCheckLastError("Could not initialize matrix descriptions");

        cusparseCreateBsrilu02Info(&info_M);
        cusparseCreateBsrsv2Info(&info_L);
        cusparseCreateBsrsv2Info(&info_U);
        cudaCheckLastError("Could not create analysis info");

        cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
            descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
            descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
        cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
            descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
        cudaCheckLastError();
        d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));

        cudaMalloc((void**)&d_buffer, d_bufferSize);

        // analysis of ilu LU decomposition
        cusparseDbsrilu02_analysis(cusparseHandle, order, \
            Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
            BLOCK_SIZE, info_M, policy, d_buffer);

        int structural_zero;
        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
        if(CUSPARSE_STATUS_ZERO_PIVOT == status){
            return false;
        }

        // analysis of ilu apply
        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
            Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
            BLOCK_SIZE, info_L, policy, d_buffer);

        cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
            Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
            BLOCK_SIZE, info_U, policy, d_buffer);
        cudaCheckLastError("Could not analyse level information");

        if(verbosity > 2){
            cudaStreamSynchronize(stream);
            t2 = second();
            std::ostringstream out;
            out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
            OpmLog::info(out.str());
        }

        return true;
    } // end analyse_matrix()
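
    // Note (illustrative, not from the commit): the descriptors set up above
    // encode an ILU(0) factorization M = L*U ~ A on A's own block-sparsity
    // pattern, with L unit lower triangular (descr_L) and U upper triangular
    // with a non-unit diagonal (descr_U). gpu_pbicgstab() applies M^{-1} to a
    // vector p via the two bsrsv2 triangular solves:
    //     L t = p       (info_L)
    //     U p_hat = t   (info_U)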

    bool cusparseSolverBackend::create_preconditioner(){

        double t1, t2;
        if(verbosity > 2){
            t1 = second();
        }

        d_mCols = d_bCols;
        d_mRows = d_bRows;
        cusparseDbsrilu02(cusparseHandle, order, \
            Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
            BLOCK_SIZE, info_M, policy, d_buffer);

        int structural_zero;
        // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
        cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
        if(CUSPARSE_STATUS_ZERO_PIVOT == status){
            return false;
        }

        if(verbosity > 2){
            cudaStreamSynchronize(stream);
            t2 = second();
            std::ostringstream out;
            out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
            OpmLog::info(out.str());
        }
        return true;
    } // end create_preconditioner()


    void cusparseSolverBackend::solve_system(BdaResult &res){
        // actually solve
        gpu_pbicgstab(res);
        cudaStreamSynchronize(stream);
        cudaCheckLastError("Something went wrong during the GPU solve");
    } // end solve_system()


    // copy result to host memory
    // caller must be sure that x is a valid array
    void cusparseSolverBackend::post_process(double *x){

        if(!initialized){
            cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
        }

        double t1, t2;
        if(verbosity > 2){
            t1 = second();
        }

        cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

        if(verbosity > 2){
            t2 = second();
            std::ostringstream out;
            out << "cusparseSolver::post_process(): " << t2-t1 << " s";
            OpmLog::info(out.str());
        }
    } // end post_process()

    typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;

    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res){
        if(initialized == false){
            initialize(N, nnz, dim);
            copy_system_to_gpu(vals, rows, cols, b);
        }else{
            update_system_on_gpu(vals, b);
        }
        if(analysis_done == false){
            if(!analyse_matrix()){
                return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
            }
        }
        reset_prec_on_gpu();
        if(create_preconditioner()){
            solve_system(res);
        }else{
            return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
        }
        return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
    }
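
    // Illustrative call pattern against the entry point above, not part of
    // this file: the first call initializes the GPU and copies the full
    // system; later calls with the same sparsity pattern only update the
    // values and right-hand side.
    //
    //     BdaResult res;
    //     auto s1 = backend.solve_system(N, nnz, dim, vals, rows, cols, b, res);  // initialize + copy_system_to_gpu
    //     // ... assemble the next system into vals/b, pattern unchanged ...
    //     auto s2 = backend.solve_system(N, nnz, dim, vals, rows, cols, b, res);  // update_system_on_gpu only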

@@ -91,10 +91,10 @@ private:

public:

    enum class cusparseSolverStatus {
        CUSPARSE_SOLVER_SUCCESS,
        CUSPARSE_SOLVER_ANALYSIS_FAILED,
        CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
        CUSPARSE_SOLVER_UNKNOWN_ERROR
    };

    cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);