Merge pull request #2475 from Tongdongq/separate_wellcontributions_for_gpu

Separate wellcontributions for gpu
Markus Blatt, 2020-03-20 12:14:06 +01:00 (committed by GitHub)
commit c0a82ff335
13 changed files with 649 additions and 161 deletions

View File

@@ -46,7 +46,8 @@ if(NOT CMAKE_DISABLE_FIND_PACKAGE_CUDA AND
 # While the documentation says that it is deprecated, FindCUDA seems the
 # only easy way to determine the cublas and cusparse libraries.
 # Hence we call it unconditionally
-find_package(CUDA)
+# The WellContributions kernel uses __shfl_down_sync, which was introduced in CUDA 9.0
+find_package(CUDA 9.0)
 endif()
 if(CUDA_FOUND)
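
For context: __shfl_down_sync is the warp-level shuffle primitive the new kernel relies on; the _sync variants, which take an explicit lane-participation mask, only exist since CUDA 9.0, hence the version bump. A minimal standalone sketch of a warp sum reduction with it (illustration only, not code from this PR):

#include <cuda_runtime.h>

// Warp-level sum reduction with __shfl_down_sync (requires CUDA >= 9.0).
// Each of the 32 lanes contributes one value; lane 0 ends up with the total.
__global__ void warp_sum(const double *in, double *out)
{
    double v = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset /= 2) {
        // 0xffffffff: all 32 lanes of the warp participate
        v += __shfl_down_sync(0xffffffff, v, offset);
    }
    if (threadIdx.x == 0) {
        *out = v;
    }
}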

View File

@@ -29,7 +29,6 @@ list (APPEND MAIN_SOURCE_FILES
 opm/simulators/flow/MissingFeatures.cpp
 opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
 opm/simulators/linalg/setupPropertyTree.cpp
-opm/simulators/linalg/bda/BdaBridge.cpp
 opm/simulators/timestepping/TimeStepControl.cpp
 opm/simulators/timestepping/AdaptiveSimulatorTimer.cpp
 opm/simulators/timestepping/SimulatorTimer.cpp
@@ -44,6 +43,8 @@ list (APPEND MAIN_SOURCE_FILES
 if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(MPI_FOUND)
   list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
@@ -142,6 +143,7 @@ list (APPEND PUBLIC_HEADER_FILES
 opm/simulators/linalg/bda/BdaResult.hpp
 opm/simulators/linalg/bda/cuda_header.hpp
 opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+opm/simulators/linalg/bda/WellContributions.hpp
 opm/simulators/linalg/BlackoilAmg.hpp
 opm/simulators/linalg/BlackoilAmgCpr.hpp
 opm/simulators/linalg/amgcpr.hh

View File

@@ -45,7 +45,9 @@
 #include <opm/common/utility/platform_dependent/reenable_warnings.h>

+#if HAVE_CUDA
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#endif

 BEGIN_PROPERTIES
@@ -338,15 +340,16 @@ protected:
         converged_(false)
     {
         parameters_.template init<TypeTag>();
+        const auto& gridForConn = simulator_.vanguard().grid();
 #if HAVE_CUDA
-        const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        if (gridForConn.comm().size() > 1 && use_gpu) {
+            OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
+            use_gpu = false;
+        }
         const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
         const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
-        const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
         const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-        if (use_gpu && !matrix_add_well_contributions) {
-            OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
-        }
         bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
 #else
         const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
@@ -355,7 +358,6 @@ protected:
         }
 #endif
         extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
-        const auto& gridForConn = simulator_.vanguard().grid();
         const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
         const bool useWellConn = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
@@ -569,22 +571,33 @@ protected:
 #endif
         {
             // tries to solve linear system
-            // solve_system() does nothing if Dune is selected
 #if HAVE_CUDA
-            bdaBridge->solve_system(matrix_.get(), istlb, result);
-            if (result.converged) {
-                // get result vector x from non-Dune backend, iff solve was successful
-                bdaBridge->get_result(x);
-            } else {
-                // CPU fallback, or default case for Dune
-                const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-                if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+            bool use_gpu = bdaBridge->getUseGpu();
+            if (use_gpu) {
+                WellContributions wellContribs;
+                const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+                if (!addWellContribs) {
+                    simulator_.problem().wellModel().getWellContributions(wellContribs);
                 }
+                bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);
+                if (result.converged) {
+                    // get result vector x from non-Dune backend, iff solve was successful
+                    bdaBridge->get_result(x);
+                } else {
+                    // CPU fallback
+                    use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
+                    if (use_gpu) {
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+                    // call Dune
+                    auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+                    solve(linearOperator, x, istlb, *sp, *precond, result);
+                }
+            } else { // gpu is not selected or disabled
                 auto precond = constructPrecond(linearOperator, parallelInformation_arg);
                 solve(linearOperator, x, istlb, *sp, *precond, result);
-            } // end Dune call
+            }
 #else
             // Construct preconditioner.
             auto precond = constructPrecond(linearOperator, parallelInformation_arg);

View File

@@ -35,7 +35,6 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 namespace Opm
 {

-#if HAVE_CUDA
 BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
 : use_gpu(use_gpu_)
 {
@@ -43,15 +42,9 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub
         backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
     }
 }
-#else
-BdaBridge::BdaBridge(bool use_gpu_ OPM_UNUSED, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED)
-{
-}
-#endif

-#if HAVE_CUDA
 template <class BridgeMatrix>
 int checkZeroDiagonal(BridgeMatrix& mat) {
     static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
@@ -118,13 +111,11 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector
     }
 } // end getSparsityPattern()
-#endif

 template <class BridgeMatrix, class BridgeVector>
-void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
+void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {
-#if HAVE_CUDA
     if (use_gpu) {
         BdaResult result;
         result.converged = false;
@@ -137,6 +128,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
         if (dim != 3) {
             OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
             use_gpu = false;
+            return;
         }

         if (h_rows.capacity() == 0) {
@@ -169,7 +161,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
         typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
         // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
+        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
         switch(status) {
         case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
             //OpmLog::info("cusparseSolver converged");
@@ -192,61 +184,62 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
     }else{
         res.converged = false;
     }
-#endif // HAVE_CUDA
 }

 template <class BridgeVector>
 void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
-#if HAVE_CUDA
     if (use_gpu) {
         backend->post_process(static_cast<double*>(&(x[0][0])));
     }
-#endif
 }

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::solve_system<
     Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > ,
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
     (Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat,
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b,
+    WellContributions& wellContribs,
     InverseOperatorResult &res);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);

 template void BdaBridge::get_result<
     Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
     (Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);

View File

@@ -21,14 +21,19 @@
 #define BDABRIDGE_HEADER_INCLUDED

 #include <config.h>
+
+#if ! HAVE_CUDA
+#error "This file should only be included if CUDA is found"
+#endif
+
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult
 #include "dune/istl/bcrsmatrix.hh"
 #include <opm/simulators/linalg/matrixblock.hh>
-#if HAVE_CUDA
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
-#endif

 namespace Opm
 {
@@ -40,10 +45,8 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 class BdaBridge
 {
 private:
-#if HAVE_CUDA
     std::unique_ptr<cusparseSolverBackend> backend;
     bool use_gpu;
-#endif

 public:
     /// Construct a BdaBridge
@@ -55,17 +58,24 @@ public:
     /// Solve linear system, A*x = b
     /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
     /// \param[in] b vector b, should be of type Dune::BlockVector
-    /// \param[in] result summary of solver result
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] result summary of solver result
     template <class BridgeMatrix, class BridgeVector>
-    void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
+    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);

     /// Get the resulting x vector
     /// \param[inout] x vector x, should be of type Dune::BlockVector
     template <class BridgeVector>
     void get_result(BridgeVector &x);

+    /// Return whether the BdaBridge will use the GPU or not
+    /// \return whether the BdaBridge will use the GPU or not
+    bool getUseGpu(){
+        return use_gpu;
+    }

 }; // end class BdaBridge

 }
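
Taken together with the ISTLSolverEbos changes above, the bridge is driven roughly like this (a condensed illustrative sketch, not verbatim code from the PR; matrix, istlb, x, result and wellModel stand in for the actual solver members):

    BdaBridge bridge(/*use_gpu=*/true, /*linear_solver_verbosity=*/0, /*maxit=*/200, /*tolerance=*/1e-2);
    if (bridge.getUseGpu()) {
        WellContributions wellContribs;
        wellModel.getWellContributions(wellContribs); // gather C, D^-1 and B of every StandardWell
        bridge.solve_system(&matrix, istlb, wellContribs, result);
        if (result.converged) {
            bridge.get_result(x); // copy the solution vector back from the GPU
        } else {
            // fall back to the Dune solver on the CPU
        }
    }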

View File

@ -0,0 +1,218 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <cstring>
#include <config.h> // CMake
#ifdef __NVCC__
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#endif
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include "opm/simulators/linalg/bda/WellContributions.hpp"
namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
__global__ void apply_well_contributions(
    const double * __restrict__ Cnnzs,
    const double * __restrict__ Dnnzs,
    const double * __restrict__ Bnnzs,
    const int * __restrict__ Ccols,
    const int * __restrict__ Bcols,
    const double * __restrict__ x,
    double * __restrict__ y,
    const int dim,
    const int dim_wells,
    const unsigned int * __restrict__ val_pointers
    )
{
    const int idx_b = blockIdx.x;
    const int idx_t = threadIdx.x;
    const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];

    const int vals_per_block = dim * dim_wells;                        // 12
    const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
    const int num_blocks_per_warp = 32/vals_per_block;                 // 2
    const int lane = idx_t % 32;
    const int c = lane % dim;                // col in block
    const int r = (lane / dim) % dim_wells;  // row in block

    extern __shared__ double smem[];
    double * __restrict__ z1 = smem;
    double * __restrict__ z2 = z1 + dim_wells;

    if(idx_t < dim_wells){
        z1[idx_t] = 0.0;
    }

    __syncthreads();

    // z1 = B * x
    if(idx_t < num_active_threads){
        // multiply all blocks with x
        double temp = 0.0;
        int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
        while(b < val_size + val_pointers[idx_b]){
            int colIdx = Bcols[b];
            temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
            b += num_blocks_per_warp;
        }

        // merge all blocks into 1 dim*dim_wells block
        // since NORNE has only 2 parallel blocks, do not use a loop
        temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);

        b = idx_t / vals_per_block + val_pointers[idx_b];

        // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
        if(idx_t < vals_per_block){
            // should be a loop as well, now only works for dim == 3
            if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
            if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
        }

        // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
        if(c == 0 && idx_t < vals_per_block){
            z1[r] = temp;
        }
    }

    __syncthreads();

    // z2 = D^-1 * B * x = D^-1 * z1
    if(idx_t < dim_wells){
        double temp = 0.0;
        for(int c = 0; c < dim_wells; ++c){
            temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
        }
        z2[idx_t] = temp;
    }

    __syncthreads();

    // y -= C^T * D^-1 * B * x
    // use dim * val_size threads, each block is assigned 'dim' threads
    if(idx_t < dim * val_size){
        double temp = 0.0;
        int b = idx_t / dim + val_pointers[idx_b];
        int cc = idx_t % dim;
        int colIdx = Ccols[b];
        for(unsigned int c = 0; c < dim_wells; ++c){
            temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
        }
        y[colIdx * dim + cc] -= temp;
    }
}

void WellContributions::alloc()
{
    cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
    cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
    cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
    cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
    cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
    val_pointers = new unsigned int[num_wells + 1];
    cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
    cudaCheckLastError("apply_gpu malloc failed");
    allocated = true;
}

WellContributions::~WellContributions()
{
    cudaFree(d_Cnnzs);
    cudaFree(d_Dnnzs);
    cudaFree(d_Bnnzs);
    cudaFree(d_Ccols);
    cudaFree(d_Bcols);
    delete[] val_pointers;
    cudaFree(d_val_pointers);
}

// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributions::apply(double *d_x, double *d_y)
{
    int smem_size = 2 * sizeof(double) * dim_wells;
    apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
}

void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
{
    if (!allocated) {
        OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
    }

    switch (type) {
    case MatrixType::C:
        cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
        cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
        break;
    case MatrixType::D:
        cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
        break;
    case MatrixType::B:
        cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
        cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
        val_pointers[num_wells_so_far] = num_blocks_so_far;
        if(num_wells_so_far == num_wells - 1){
            val_pointers[num_wells] = num_blocks;
        }
        cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
        break;
    default:
        OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
    }
    cudaCheckLastError("WellContributions::addMatrix() failed");

    if (MatrixType::B == type) {
        num_blocks_so_far += val_size;
        num_wells_so_far++;
    }
}

void WellContributions::setCudaStream(cudaStream_t stream_)
{
    this->stream = stream_;
}

void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
    dim = dim_;
    dim_wells = dim_wells_;
}

void WellContributions::addNumBlocks(unsigned int nnz)
{
    if (allocated) {
        OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
    }
    num_blocks += nnz;
    num_wells++;
}

} //namespace Opm
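
The kernel's operation, written as plain sequential loops for one well, may clarify the data layout it assumes (an illustrative reference, not part of this commit; it takes one well's blocks directly instead of indexing through val_pointers):

    #include <vector>

    // Sequential reference for one well: y -= C^T * (D^-1 * (B * x)).
    // B and C hold val_size blocks of dim_wells x dim values (row-major),
    // Dinv holds one dim_wells x dim_wells block, already inverted.
    void apply_well_reference(const double *Cnnzs, const double *Dinv, const double *Bnnzs,
                              const int *Ccols, const int *Bcols,
                              const double *x, double *y,
                              int dim, int dim_wells, int val_size)
    {
        std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);
        // z1 = B * x, summed over all blocks of this well
        for (int b = 0; b < val_size; ++b)
            for (int r = 0; r < dim_wells; ++r)
                for (int c = 0; c < dim; ++c)
                    z1[r] += Bnnzs[b * dim * dim_wells + r * dim + c] * x[Bcols[b] * dim + c];
        // z2 = D^-1 * z1
        for (int r = 0; r < dim_wells; ++r)
            for (int c = 0; c < dim_wells; ++c)
                z2[r] += Dinv[r * dim_wells + c] * z1[c];
        // y -= C^T * z2
        for (int b = 0; b < val_size; ++b)
            for (int c = 0; c < dim; ++c)
                for (int r = 0; r < dim_wells; ++r)
                    y[Ccols[b] * dim + c] -= Cnnzs[b * dim * dim_wells + r * dim + c] * z2[r];
    }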

View File

@ -0,0 +1,123 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_HEADER_INCLUDED
#include <config.h>
#if ! HAVE_CUDA
#error "This file should only be included if CUDA is found"
#endif
#include <vector>
#include <cuda_runtime.h>
namespace Opm
{
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
/// B*x and D*B*x are a vector with numStaticWellEq doubles
/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
///
/// This class is used in 3 phases:
/// - get total size of all wellcontributions that must be stored here
/// - allocate memory
/// - copy data of wellcontributions
class WellContributions
{
private:
    unsigned int num_blocks = 0;          // total number of blocks in all wells
    unsigned int dim;                     // number of columns of blocks in B and C, equal to StandardWell::numEq
    unsigned int dim_wells;               // number of rows of blocks in B and C, equal to StandardWell::numStaticWellEq
    unsigned int num_wells = 0;           // number of wellcontributions in this object
    unsigned int num_blocks_so_far = 0;   // keep track of where next data is written
    unsigned int num_wells_so_far = 0;    // keep track of where next data is written
    unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
    bool allocated = false;

    double *d_Cnnzs = nullptr;
    double *d_Dnnzs = nullptr;
    double *d_Bnnzs = nullptr;
    int *d_Ccols = nullptr;
    int *d_Bcols = nullptr;
    double *d_z1 = nullptr;
    double *d_z2 = nullptr;
    unsigned int *d_val_pointers = nullptr;
    cudaStream_t stream;

public:
    /// StandardWell has C, D and B matrices that need to be copied
    enum class MatrixType {
        C,
        D,
        B
    };

    /// Set a cudaStream to be used
    /// \param[in] stream the cudaStream that is used to launch the kernel in
    void setCudaStream(cudaStream_t stream);

    /// Create a new WellContributions, implementation is empty
    WellContributions(){};

    /// Destroy a WellContributions, and free memory
    ~WellContributions();

    /// Apply all wellcontributions in this object
    /// performs y -= (C^T * (D^-1 * (B*x))) for StandardWell
    /// \param[in] x vector x
    /// \param[inout] y vector y
    void apply(double *x, double *y);

    /// Allocate memory for the wellcontributions
    void alloc();

    /// Indicate how large the blocks of the wellcontributions (C and B) are
    /// \param[in] dim number of columns
    /// \param[in] dim_wells number of rows
    void setBlockSize(unsigned int dim, unsigned int dim_wells);

    /// Indicate how large the next wellcontribution is, this function cannot be called after alloc() is called
    /// \param[in] numBlocks number of blocks in C and B of next wellcontribution
    void addNumBlocks(unsigned int numBlocks);

    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
    /// \param[in] type indicate if C, D or B is sent
    /// \param[in] colIndices columnindices of blocks in C or B, ignored for D
    /// \param[in] values array of nonzeroes
    /// \param[in] val_size number of blocks in C or B, ignored for D
    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);

    /// Return the number of wells added to this object
    /// \return the number of wells added to this object
    unsigned int getNumWells(){
        return num_wells;
    }
};

} //namespace Opm
#endif
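
The three-phase protocol the class documents (count sizes, allocate once, copy data) looks like this in use; this mirrors BlackoilWellModel::getWellContributions further down, but with hypothetical per-well accessors (wells, numBlocks(), Ccols(), Cvals(), ...) standing in for the real StandardWell plumbing:

    WellContributions wellContribs;
    wellContribs.setBlockSize(3, 4); // StandardWell::numEq, StandardWell::numStaticWellEq (example values)

    // phase 1: announce the number of C/B blocks of every well
    for (const auto& well : wells)
        wellContribs.addNumBlocks(well.numBlocks());

    // phase 2: allocate GPU memory once, for all wells together
    wellContribs.alloc();

    // phase 3: copy each well's data; B must be added last, it advances the internal write offsets
    for (const auto& well : wells) {
        wellContribs.addMatrix(WellContributions::MatrixType::C, well.Ccols(), well.Cvals(), well.numBlocks());
        wellContribs.addMatrix(WellContributions::MatrixType::D, nullptr, well.Dvals(), 1); // colIndices ignored for D
        wellContribs.addMatrix(WellContributions::MatrixType::B, well.Bcols(), well.Bvals(), well.numBlocks());
    }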

View File

@@ -38,6 +38,10 @@
 #include "cusparse_v2.h"
 // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html

+// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
+// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
+#define COPY_ROW_BY_ROW 0
+
 namespace Opm
 {
@@ -58,7 +62,7 @@ namespace Opm
         finalize();
     }

-    void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
+    void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         double t_total1, t_total2;
         int n = N;
         double rho = 1.0, rhop;
@@ -72,7 +76,11 @@ namespace Opm
         t_total1 = second();

-        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
+        if(wellContribs.getNumWells() > 0){
+            wellContribs.setCudaStream(stream);
+        }
+
+        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
         cublasDscal(cublasHandle, n, &mone, d_r, 1);
         cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
@@ -101,15 +109,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);

             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, \
-                &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
+                &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_pw, d_v);
+            }

             cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
             alpha = rho / tmp1;
@@ -127,15 +140,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);

             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, &one, descr_M, \
-                d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
+                d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_s, d_t);
+            }

             cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
             cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
@@ -178,8 +196,8 @@ namespace Opm
     void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
         this->N = N;
         this->nnz = nnz;
-        this->BLOCK_SIZE = dim;
-        this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
+        this->block_size = dim;
+        this->nnzb = nnz/block_size/block_size;
         Nb = (N + dim - 1) / dim;
         std::ostringstream out;
         out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
@@ -229,41 +247,44 @@ namespace Opm
         cusparseSetStream(cusparseHandle, stream);
         cudaCheckLastError("Could not set stream to cusparse");

-        cudaMallocHost((void**)&x, sizeof(double) * N);
-        cudaCheckLastError("Could not allocate pinned host memory");
+#if COPY_ROW_BY_ROW
+        cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+        cudaCheckLastError("Could not allocate pinned memory");
+#endif

         initialized = true;
     } // end initialize()

     void cusparseSolverBackend::finalize() {
-        cudaFree(d_x);
-        cudaFree(d_b);
-        cudaFree(d_r);
-        cudaFree(d_rw);
-        cudaFree(d_p);
-        cudaFree(d_pw);
-        cudaFree(d_s);
-        cudaFree(d_t);
-        cudaFree(d_v);
-        cudaFree(d_mVals);
-        cudaFree(d_bVals);
-        cudaFree(d_bCols);
-        cudaFree(d_bRows);
-        cudaFree(d_buffer);
-        cusparseDestroyBsrilu02Info(info_M);
-        cusparseDestroyBsrsv2Info(info_L);
-        cusparseDestroyBsrsv2Info(info_U);
-        cusparseDestroyMatDescr(descr_B);
-        cusparseDestroyMatDescr(descr_M);
-        cusparseDestroyMatDescr(descr_L);
-        cusparseDestroyMatDescr(descr_U);
-        cusparseDestroy(cusparseHandle);
-        cublasDestroy(cublasHandle);
-        cudaHostUnregister(vals);
-        cudaHostUnregister(cols);
-        cudaHostUnregister(rows);
-        cudaStreamDestroy(stream);
-        cudaFreeHost(x);
+        if (initialized) {
+            cudaFree(d_x);
+            cudaFree(d_b);
+            cudaFree(d_r);
+            cudaFree(d_rw);
+            cudaFree(d_p);
+            cudaFree(d_pw);
+            cudaFree(d_s);
+            cudaFree(d_t);
+            cudaFree(d_v);
+            cudaFree(d_mVals);
+            cudaFree(d_bVals);
+            cudaFree(d_bCols);
+            cudaFree(d_bRows);
+            cudaFree(d_buffer);
+            cusparseDestroyBsrilu02Info(info_M);
+            cusparseDestroyBsrsv2Info(info_L);
+            cusparseDestroyBsrsv2Info(info_U);
+            cusparseDestroyMatDescr(descr_B);
+            cusparseDestroyMatDescr(descr_M);
+            cusparseDestroyMatDescr(descr_L);
+            cusparseDestroyMatDescr(descr_U);
+            cusparseDestroy(cusparseHandle);
+            cublasDestroy(cublasHandle);
+#if COPY_ROW_BY_ROW
+            cudaFreeHost(vals_contiguous);
+#endif
+            cudaStreamDestroy(stream);
+        }
     } // end finalize()
@@ -274,22 +295,23 @@ namespace Opm
             t1 = second();
         }

-        // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
-        // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
-        cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
-        cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
-        cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
-        cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
+#if COPY_ROW_BY_ROW
+        int sum = 0;
+        for(int i = 0; i < Nb; ++i){
+            int size_row = rows[i+1] - rows[i];
+            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+            sum += size_row * block_size * block_size;
+        }
+        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
         cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
         cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
         cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
         cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
-        this->vals = vals;
-        this->cols = cols;
-        this->rows = rows;

         if (verbosity > 2) {
             cudaStreamSynchronize(stream);
             t2 = second();
@@ -301,14 +323,25 @@ namespace Opm
     // don't copy rowpointers and colindices, they stay the same
-    void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
+    void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
         double t1, t2;
         if (verbosity > 2) {
             t1 = second();
         }

+#if COPY_ROW_BY_ROW
+        int sum = 0;
+        for(int i = 0; i < Nb; ++i){
+            int size_row = rows[i+1] - rows[i];
+            memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+            sum += size_row * block_size * block_size;
+        }
+        cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
         cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
         cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
         cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
@@ -364,11 +397,11 @@ namespace Opm
         cudaCheckLastError("Could not create analysis info");

         cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-            descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
+            descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
         cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
+            descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
         cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-            descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
+            descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
         cudaCheckLastError();

         d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
@@ -377,7 +410,7 @@ namespace Opm
         // analysis of ilu LU decomposition
         cusparseDbsrilu02_analysis(cusparseHandle, order, \
             Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_M, policy, d_buffer);
+            block_size, info_M, policy, d_buffer);

         int structural_zero;
         cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
@@ -388,11 +421,11 @@ namespace Opm
         // analysis of ilu apply
         cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
             Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_L, policy, d_buffer);
+            block_size, info_L, policy, d_buffer);
         cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
             Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
-            BLOCK_SIZE, info_U, policy, d_buffer);
+            block_size, info_U, policy, d_buffer);
         cudaCheckLastError("Could not analyse level information");

         if (verbosity > 2) {
@@ -403,6 +436,8 @@ namespace Opm
             OpmLog::info(out.str());
         }

+        analysis_done = true;
+
         return true;
     } // end analyse_matrix()
@@ -417,7 +452,8 @@ namespace Opm
         d_mRows = d_bRows;
         cusparseDbsrilu02(cusparseHandle, order, \
             Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
-            BLOCK_SIZE, info_M, policy, d_buffer);
+            block_size, info_M, policy, d_buffer);
+        cudaCheckLastError("Could not perform ilu decomposition");

         int structural_zero;
         // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
@@ -437,9 +473,9 @@ namespace Opm
     } // end create_preconditioner()

-    void cusparseSolverBackend::solve_system(BdaResult &res) {
+    void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
         // actually solve
-        gpu_pbicgstab(res);
+        gpu_pbicgstab(wellContribs, res);
         cudaStreamSynchronize(stream);
         cudaCheckLastError("Something went wrong during the GPU solve");
     } // end solve_system()
@@ -449,10 +485,6 @@ namespace Opm
     // caller must be sure that x is a valid array
     void cusparseSolverBackend::post_process(double *x) {
-        if (!initialized) {
-            cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
-        }
-
         double t1, t2;
         if (verbosity > 2) {
             t1 = second();
@@ -472,12 +504,12 @@ namespace Opm
     typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;

-    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
+    cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
         if (initialized == false) {
             initialize(N, nnz, dim);
             copy_system_to_gpu(vals, rows, cols, b);
         }else{
-            update_system_on_gpu(vals, b);
+            update_system_on_gpu(vals, rows, b);
         }
         if (analysis_done == false) {
             if (!analyse_matrix()) {
@@ -486,7 +518,7 @@ namespace Opm
         }
         reset_prec_on_gpu();
         if (create_preconditioner()) {
-            solve_system(res);
+            solve_system(wellContribs, res);
         }else{
             return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
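
The COPY_ROW_BY_ROW path stages the nonzeros in pinned (page-locked) host memory because cudaMemcpyAsync only overlaps with other work on the stream when the host buffer is pinned; the extra CPU-side copy buys that overlap. A minimal standalone sketch of the pattern (illustration, not PR code):

    #include <cstring>
    #include <cuda_runtime.h>

    // Stage host data in a pinned buffer, then issue one async H2D transfer.
    // The transfer is only truly asynchronous because 'pinned' is page-locked,
    // e.g. allocated once with cudaMallocHost((void**)&pinned, n * sizeof(double)).
    void stage_and_upload(const double *vals, double *d_vals, size_t n,
                          double *pinned, cudaStream_t stream)
    {
        std::memcpy(pinned, vals, n * sizeof(double)); // CPU-side staging copy
        cudaMemcpyAsync(d_vals, pinned, n * sizeof(double), cudaMemcpyHostToDevice, stream);
    }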

View File

@@ -25,6 +25,7 @@
 #include "cusparse_v2.h"

 #include "opm/simulators/linalg/bda/BdaResult.hpp"
+#include <opm/simulators/linalg/bda/WellContributions.hpp>

 namespace Opm
 {
@@ -50,13 +51,11 @@ private:
     int *d_bRows, *d_mRows;
     double *d_x, *d_b, *d_r, *d_rw, *d_p;
     double *d_pw, *d_s, *d_t, *d_v;
-    double *vals;
-    int *cols, *rows;
-    double *x, *b;
     void *d_buffer;
     int N, Nb, nnz, nnzb;
+    double *vals_contiguous;

-    int BLOCK_SIZE;
+    int block_size;

     bool initialized = false;
     bool analysis_done = false;
@@ -70,8 +69,9 @@ private:
     int verbosity = 0;

     /// Solve linear system using ilu0-bicgstab
-    /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(BdaResult& res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);

     /// Initialize GPU and allocate memory
     /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@@ -83,16 +83,17 @@ private:
     void finalize();

     /// Copy linear system to GPU
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
     void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);

     // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
+    /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
     /// \param[in] b input vector, contains N values
-    void update_system_on_gpu(double *vals, double *b);
+    void update_system_on_gpu(double *vals, int *rows, double *b);

     /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
     void reset_prec_on_gpu();
@@ -106,8 +107,9 @@ private:
     bool create_preconditioner();

     /// Solve linear system
-    /// \param[inout] res summary of solver result
-    void solve_system(BdaResult &res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void solve_system(WellContributions& wellContribs, BdaResult &res);

 public:
@@ -128,16 +130,17 @@ public:
     ~cusparseSolverBackend();

     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
     /// \param[in] N number of rows, divide by dim to get number of blockrows
     /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
     /// \param[in] dim size of block
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
+    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);

     /// Post processing after linear solve, now only copies resulting x vector back
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array

View File

@@ -221,6 +221,11 @@ namespace Opm {
         // subtract B*inv(D)*C * x from A*x
         void apply(const BVector& x, BVector& Ax) const;

+#if HAVE_CUDA
+        // accumulate the contributions of all Wells in the WellContributions object
+        void getWellContributions(WellContributions& x) const;
+#endif

         // apply well model with scaling of alpha
         void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;

View File

@@ -918,9 +918,32 @@ namespace Opm {
         }
     }

+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    BlackoilWellModel<TypeTag>::
+    getWellContributions(WellContributions& wellContribs) const
+    {
+        wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            unsigned int numBlocks;
+            derived->getNumBlocks(numBlocks);
+            wellContribs.addNumBlocks(numBlocks);
+        }
+        wellContribs.alloc();
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            if (derived) {
+                derived->addWellContribution(wellContribs);
+            } else {
+                OpmLog::warning("Warning only StandardWell is supported by WellContributions for GPU");
+            }
+        }
+    }
+#endif

     // Ax = Ax - alpha * C D^-1 B x
     template<typename TypeTag>

View File

@@ -184,6 +184,14 @@ namespace Opm
     /// r = r - C D^-1 Rw
     virtual void apply(BVector& r) const override;

+#if HAVE_CUDA
+    /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
+    void addWellContribution(WellContributions& wellContribs) const;
+
+    /// get the number of blocks of the C and B matrices, used to allocate memory in a WellContributions object
+    void getNumBlocks(unsigned int& _nnzs) const;
+#endif

     /// using the solution x to recover the solution xw for wells and applying
     /// xw to update Well State
     virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,

View File

@@ -2735,8 +2735,65 @@ namespace Opm
         duneC_.mmtv(invDrw_, r);
     }

+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        std::vector<int> colIndices;
+        std::vector<double> nnzValues;
+        colIndices.reserve(duneB_.nonzeroes());
+        nnzValues.reserve(duneB_.nonzeroes()*numStaticWellEq * numEq);
+
+        // duneC
+        for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
+        {
+            colIndices.emplace_back(colC.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colC)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::C, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
+
+        // invDuneD
+        colIndices.clear();
+        nnzValues.clear();
+        colIndices.emplace_back(0);
+        for (int i = 0; i < numStaticWellEq; ++i)
+        {
+            for (int j = 0; j < numStaticWellEq; ++j) {
+                nnzValues.emplace_back(invDuneD_[0][0][i][j]);
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::D, colIndices.data(), nnzValues.data(), 1);
+
+        // duneB
+        colIndices.clear();
+        nnzValues.clear();
+        for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
+        {
+            colIndices.emplace_back(colB.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colB)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::B, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
+    }
+
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    getNumBlocks(unsigned int& numBlocks) const
+    {
+        numBlocks = duneB_.nonzeroes();
+    }
+#endif

     template<typename TypeTag>