Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)

commit c0a82ff335
Merge pull request #2475 from Tongdongq/separate_wellcontributions_for_gpu

    Separate wellcontributions for gpu
CMakeLists.txt:
@@ -46,7 +46,8 @@ if(NOT CMAKE_DISABLE_FIND_PACKAGE_CUDA AND
   # While the documentation says that it is deprecated, FindCUDA seems the
   # only easy way to determine the cublas and cusparse libraries.
   # Hence we call it unconditionally
-  find_package(CUDA)
+  # The WellContributions kernel uses __shfl_down_sync, which was introduced in CUDA 9.0
+  find_package(CUDA 9.0)
 endif()
 
 if(CUDA_FOUND)
CMakeLists_files.cmake:
@@ -29,7 +29,6 @@ list (APPEND MAIN_SOURCE_FILES
   opm/simulators/flow/MissingFeatures.cpp
   opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
   opm/simulators/linalg/setupPropertyTree.cpp
-  opm/simulators/linalg/bda/BdaBridge.cpp
   opm/simulators/timestepping/TimeStepControl.cpp
   opm/simulators/timestepping/AdaptiveSimulatorTimer.cpp
   opm/simulators/timestepping/SimulatorTimer.cpp
@@ -44,6 +43,8 @@ list (APPEND MAIN_SOURCE_FILES
 
 if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(MPI_FOUND)
   list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
@@ -142,6 +143,7 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/BdaResult.hpp
   opm/simulators/linalg/bda/cuda_header.hpp
   opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+  opm/simulators/linalg/bda/WellContributions.hpp
   opm/simulators/linalg/BlackoilAmg.hpp
   opm/simulators/linalg/BlackoilAmgCpr.hpp
   opm/simulators/linalg/amgcpr.hh
opm/simulators/linalg/ISTLSolverEbos.hpp:
@@ -45,7 +45,9 @@
 
 #include <opm/common/utility/platform_dependent/reenable_warnings.h>
 
+#if HAVE_CUDA
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#endif
 
 BEGIN_PROPERTIES
 
@@ -338,15 +340,16 @@ protected:
         converged_(false)
     {
         parameters_.template init<TypeTag>();
+        const auto& gridForConn = simulator_.vanguard().grid();
 #if HAVE_CUDA
-        const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+        if (gridForConn.comm().size() > 1 && use_gpu) {
+            OpmLog::warning("Warning cannot use GPU with MPI, GPU is disabled");
+            use_gpu = false;
+        }
         const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
         const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
-        const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
         const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-        if (use_gpu && !matrix_add_well_contributions) {
-            OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
-        }
         bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
 #else
         const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
@@ -355,7 +358,6 @@ protected:
         }
 #endif
         extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
-        const auto& gridForConn = simulator_.vanguard().grid();
         const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
         const bool useWellConn = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
 
@@ -569,22 +571,33 @@ protected:
 #endif
         {
             // tries to solve linear system
-            // solve_system() does nothing if Dune is selected
 #if HAVE_CUDA
-            bdaBridge->solve_system(matrix_.get(), istlb, result);
-            if (result.converged) {
-                // get result vector x from non-Dune backend, iff solve was successful
-                bdaBridge->get_result(x);
-            } else {
-                // CPU fallback, or default case for Dune
-                const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
-                if (use_gpu) {
-                    OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
-                }
+            bool use_gpu = bdaBridge->getUseGpu();
+            if (use_gpu) {
+                WellContributions wellContribs;
+                const bool addWellContribs = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+                if (!addWellContribs) {
+                    simulator_.problem().wellModel().getWellContributions(wellContribs);
+                }
+                bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);
+                if (result.converged) {
+                    // get result vector x from non-Dune backend, iff solve was successful
+                    bdaBridge->get_result(x);
+                } else {
+                    // CPU fallback
+                    use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
+                    if (use_gpu) {
+                        OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    }
+
+                    // call Dune
+                    auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+                    solve(linearOperator, x, istlb, *sp, *precond, result);
+                }
+            } else { // gpu is not selected or disabled
                 auto precond = constructPrecond(linearOperator, parallelInformation_arg);
                 solve(linearOperator, x, istlb, *sp, *precond, result);
-            } // end Dune call
+            }
 #else
             // Construct preconditioner.
             auto precond = constructPrecond(linearOperator, parallelInformation_arg);
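Note on the hunk above: when --matrix-add-well-contributions=true the well terms are already assembled into matrix A, so wellContribs stays default-constructed and empty; getNumWells() then returns 0 and cusparseSolver skips the well kernel entirely. A minimal sketch of the resulting dispatch, assuming a hypothetical helper solveWithDune() that stands in for the constructPrecond()/solve() pair:

    // Sketch only -- mirrors the hunk above; solveWithDune() is hypothetical.
    WellContributions wellContribs;    // stays empty => GPU well kernel is skipped
    if (!EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions)) {
        // wells are NOT in A, so collect them for separate application on the GPU
        simulator_.problem().wellModel().getWellContributions(wellContribs);
    }
    bdaBridge->solve_system(matrix_.get(), istlb, wellContribs, result);
    if (result.converged) {
        bdaBridge->get_result(x);      // copy x back from the GPU backend
    } else {
        solveWithDune();               // CPU fallback
    }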
opm/simulators/linalg/bda/BdaBridge.cpp:
@@ -35,7 +35,6 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 namespace Opm
 {
 
-#if HAVE_CUDA
 BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, double tolerance)
 : use_gpu(use_gpu_)
 {
@@ -43,15 +42,9 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity, int maxit, doub
         backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
     }
 }
-#else
-BdaBridge::BdaBridge(bool use_gpu_ OPM_UNUSED, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED)
-{
-}
-#endif
 
 
 
-#if HAVE_CUDA
 template <class BridgeMatrix>
 int checkZeroDiagonal(BridgeMatrix& mat) {
     static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
@@ -118,13 +111,11 @@ void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector
     }
 } // end getSparsityPattern()
 
-#endif
-
 template <class BridgeMatrix, class BridgeVector>
-void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
+void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {
 
-#if HAVE_CUDA
     if (use_gpu) {
         BdaResult result;
         result.converged = false;
@@ -137,6 +128,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
         if (dim != 3) {
             OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
             use_gpu = false;
+            return;
         }
 
         if (h_rows.capacity() == 0) {
@@ -169,7 +161,7 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
 
         typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
         // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
-        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
+        cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
         switch(status) {
         case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
             //OpmLog::info("cusparseSolver converged");
@@ -192,61 +184,62 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
     }else{
         res.converged = false;
     }
-#endif // HAVE_CUDA
 }
 
 
 template <class BridgeVector>
 void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
-#if HAVE_CUDA
     if (use_gpu) {
         backend->post_process(static_cast<double*>(&(x[0][0])));
     }
-#endif
 }
 
-template void BdaBridge::solve_system< \
-Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > , \
-Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > > \
-(Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > *mat, \
-Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &b, \
+template void BdaBridge::solve_system<
+Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > ,
+Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
+(Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>, std::allocator<Opm::MatrixBlock<double, 1, 1> > > *mat,
+Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &b,
+WellContributions& wellContribs,
 InverseOperatorResult &res);
 
-template void BdaBridge::solve_system< \
-Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
-Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
-(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
-Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
+template void BdaBridge::solve_system<
+Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > ,
+Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
+(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat,
+Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b,
+WellContributions& wellContribs,
 InverseOperatorResult &res);
 
-template void BdaBridge::solve_system< \
-Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
-Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
-(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
-Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
+template void BdaBridge::solve_system<
+Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > ,
+Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
+(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat,
+Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b,
+WellContributions& wellContribs,
 InverseOperatorResult &res);
 
-template void BdaBridge::solve_system< \
-Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
-Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
-(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
-Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
+template void BdaBridge::solve_system<
+Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > ,
+Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
+(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat,
+Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b,
+WellContributions& wellContribs,
 InverseOperatorResult &res);
 
-template void BdaBridge::get_result< \
-Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > > \
+template void BdaBridge::get_result<
+Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > >
 (Dune::BlockVector<Dune::FieldVector<double, 1>, std::allocator<Dune::FieldVector<double, 1> > > &x);
 
-template void BdaBridge::get_result< \
-Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
+template void BdaBridge::get_result<
+Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > >
 (Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);
 
-template void BdaBridge::get_result< \
-Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
+template void BdaBridge::get_result<
+Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > >
 (Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);
 
-template void BdaBridge::get_result< \
-Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
+template void BdaBridge::get_result<
+Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > >
 (Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);
 
opm/simulators/linalg/bda/BdaBridge.hpp:
@@ -21,14 +21,19 @@
 #define BDABRIDGE_HEADER_INCLUDED
 
 #include <config.h>
 
+#if ! HAVE_CUDA
+#error "This file should only be included if CUDA is found"
+#endif
+
 #include "dune/istl/solver.hh" // for struct InverseOperatorResult
 
 #include "dune/istl/bcrsmatrix.hh"
 #include <opm/simulators/linalg/matrixblock.hh>
 
-#if HAVE_CUDA
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
 
 #include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
-#endif
 
 namespace Opm
 {
@@ -40,10 +45,8 @@ typedef Dune::InverseOperatorResult InverseOperatorResult;
 class BdaBridge
 {
 private:
-#if HAVE_CUDA
     std::unique_ptr<cusparseSolverBackend> backend;
     bool use_gpu;
-#endif
 
 public:
     /// Construct a BdaBridge
@@ -55,17 +58,24 @@ public:
 
 
     /// Solve linear system, A*x = b
     /// \param[in] mat          matrix A, should be of type Dune::BCRSMatrix
     /// \param[in] b            vector b, should be of type Dune::BlockVector
-    /// \param[in] result       summary of solver result
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] result    summary of solver result
     template <class BridgeMatrix, class BridgeVector>
-    void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
+    void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
 
     /// Get the resulting x vector
     /// \param[inout] x         vector x, should be of type Dune::BlockVector
     template <class BridgeVector>
     void get_result(BridgeVector &x);
 
+    /// Return whether the BdaBridge will use the GPU or not
+    /// \return whether the BdaBridge will use the GPU or not
+    bool getUseGpu(){
+        return use_gpu;
+    }
+
 }; // end class BdaBridge
 
 }
 
opm/simulators/linalg/bda/WellContributions.cu (new file, 218 lines):
@@ -0,0 +1,218 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include <cstdlib>
+#include <cstring>
+#include <config.h> // CMake
+
+#ifdef __NVCC__
+#include "opm/simulators/linalg/bda/cuda_header.hpp"
+#include <cuda_runtime.h>
+#endif
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include "opm/simulators/linalg/bda/WellContributions.hpp"
+
+namespace Opm
+{
+
+    // apply WellContributions using y -= C^T * (D^-1 * (B * x))
+    __global__ void apply_well_contributions(
+        const double * __restrict__ Cnnzs,
+        const double * __restrict__ Dnnzs,
+        const double * __restrict__ Bnnzs,
+        const int * __restrict__ Ccols,
+        const int * __restrict__ Bcols,
+        const double * __restrict__ x,
+        double * __restrict__ y,
+        const int dim,
+        const int dim_wells,
+        const unsigned int * __restrict__ val_pointers
+        )
+    {
+        const int idx_b = blockIdx.x;
+        const int idx_t = threadIdx.x;
+        const unsigned int val_size = val_pointers[idx_b+1] - val_pointers[idx_b];
+
+        const int vals_per_block = dim * dim_wells;                        // 12
+        const int num_active_threads = (32/vals_per_block)*vals_per_block; // 24
+        const int num_blocks_per_warp = 32/vals_per_block;                 // 2
+        const int lane = idx_t % 32;
+        const int c = lane % dim;                                          // col in block
+        const int r = (lane / dim) % dim_wells;                            // row in block
+
+        extern __shared__ double smem[];
+        double * __restrict__ z1 = smem;
+        double * __restrict__ z2 = z1 + dim_wells;
+
+        if(idx_t < dim_wells){
+            z1[idx_t] = 0.0;
+        }
+
+        __syncthreads();
+
+        // z1 = B * x
+        if(idx_t < num_active_threads){
+            // multiply all blocks with x
+            double temp = 0.0;
+            int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
+            while(b < val_size + val_pointers[idx_b]){
+                int colIdx = Bcols[b];
+                temp += Bnnzs[b * dim * dim_wells + r * dim + c] * x[colIdx * dim + c];
+                b += num_blocks_per_warp;
+            }
+
+            // merge all blocks into 1 dim*dim_wells block
+            // since NORNE has only 2 parallel blocks, do not use a loop
+            temp += __shfl_down_sync(0x00ffffff, temp, dim * dim_wells);
+
+            b = idx_t / vals_per_block + val_pointers[idx_b];
+
+            // merge all (dim) columns of 1 block, results in a single 1*dim_wells vector, which is used to multiply with invD
+            if(idx_t < vals_per_block){
+                // should be a loop as well, now only works for dim == 3
+                if(c == 0 || c == 2){temp += __shfl_down_sync(0x00000B6D, temp, 2);} // add col 2 to col 0
+                if(c == 0 || c == 1){temp += __shfl_down_sync(0x000006DB, temp, 1);} // add col 1 to col 0
+            }
+
+            // write 1*dim_wells vector to gmem, could be replaced with shfl broadcast to remove z1 altogether
+            if(c == 0 && idx_t < vals_per_block){
+                z1[r] = temp;
+            }
+        }
+
+        __syncthreads();
+
+        // z2 = D^-1 * B * x = D^-1 * z1
+        if(idx_t < dim_wells){
+            double temp = 0.0;
+            for(int c = 0; c < dim_wells; ++c){
+                temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
+            }
+            z2[idx_t] = temp;
+        }
+
+        __syncthreads();
+
+        // y -= C^T * D^-1 * B * x
+        // use dim * val_size threads, each block is assigned 'dim' threads
+        if(idx_t < dim * val_size){
+            double temp = 0.0;
+            int b = idx_t / dim + val_pointers[idx_b];
+            int cc = idx_t % dim;
+            int colIdx = Ccols[b];
+            for(unsigned int c = 0; c < dim_wells; ++c){
+                temp += Cnnzs[b * dim * dim_wells + c * dim + cc] * z2[c];
+            }
+            y[colIdx * dim + cc] -= temp;
+        }
+
+    }
+
+
+    void WellContributions::alloc(){
+        cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_wells * dim_wells * dim_wells);
+        cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
+        cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
+        cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
+        val_pointers = new unsigned int[num_wells + 1];
+        cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_wells + 1));
+        cudaCheckLastError("apply_gpu malloc failed");
+        allocated = true;
+    }
+
+    WellContributions::~WellContributions()
+    {
+        cudaFree(d_Cnnzs);
+        cudaFree(d_Dnnzs);
+        cudaFree(d_Bnnzs);
+        cudaFree(d_Ccols);
+        cudaFree(d_Bcols);
+        delete[] val_pointers;
+        cudaFree(d_val_pointers);
+    }
+
+
+    // Apply the WellContributions, similar to StandardWell::apply()
+    // y -= (C^T *(D^-1*( B*x)))
+    void WellContributions::apply(double *d_x, double *d_y)
+    {
+        int smem_size = 2 * sizeof(double) * dim_wells;
+        apply_well_contributions<<<num_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
+    }
+
+
+    void WellContributions::addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
+    {
+        if (!allocated) {
+            OPM_THROW(std::logic_error,"Error cannot add wellcontribution before allocating memory in WellContributions");
+        }
+
+        switch (type) {
+        case MatrixType::C:
+            cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+            cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+            break;
+        case MatrixType::D:
+            cudaMemcpy(d_Dnnzs + num_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
+            break;
+        case MatrixType::B:
+            cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
+            cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
+            val_pointers[num_wells_so_far] = num_blocks_so_far;
+            if(num_wells_so_far == num_wells - 1){
+                val_pointers[num_wells] = num_blocks;
+            }
+            cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_wells+1), cudaMemcpyHostToDevice);
+            break;
+        default:
+            OPM_THROW(std::logic_error,"Error unsupported matrix ID for WellContributions::addMatrix()");
+        }
+        cudaCheckLastError("WellContributions::addMatrix() failed");
+        if (MatrixType::B == type) {
+            num_blocks_so_far += val_size;
+            num_wells_so_far++;
+        }
+    }
+
+    void WellContributions::setCudaStream(cudaStream_t stream_)
+    {
+        this->stream = stream_;
+    }
+
+    void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
+    {
+        dim = dim_;
+        dim_wells = dim_wells_;
+    }
+
+    void WellContributions::addNumBlocks(unsigned int nnz)
+    {
+        if (allocated) {
+            OPM_THROW(std::logic_error,"Error cannot add more sizes after allocated in WellContributions");
+        }
+        num_blocks += nnz;
+        num_wells++;
+    }
+
+} //namespace Opm
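For reference, the operation the warp-shuffle kernel above implements, y -= C^T * (D^-1 * (B * x)) per well, can be written as a plain CPU loop. The sketch below is illustrative only (it is not part of this PR) and assumes the blocked layout documented in WellContributions.hpp: B and C hold val_pointers[w+1]-val_pointers[w] blocks of dim_wells x dim values per well, and D holds one dim_wells x dim_wells block per well, already stored as its inverse:

    #include <vector>
    #include <algorithm>

    // Reference sketch (not in the PR): per-well y -= C^T * (D^-1 * (B * x)),
    // using the same data layout as WellContributions.
    void apply_reference(const double *Cnnzs, const double *Dnnzs, const double *Bnnzs,
                         const int *Ccols, const int *Bcols,
                         const double *x, double *y,
                         int dim, int dim_wells, int num_wells,
                         const unsigned int *val_pointers)
    {
        std::vector<double> z1(dim_wells), z2(dim_wells);
        for (int w = 0; w < num_wells; ++w) {
            // z1 = B * x, one dim_wells*dim block per perforated cell
            std::fill(z1.begin(), z1.end(), 0.0);
            for (unsigned int b = val_pointers[w]; b < val_pointers[w+1]; ++b) {
                int col = Bcols[b];
                for (int r = 0; r < dim_wells; ++r)
                    for (int c = 0; c < dim; ++c)
                        z1[r] += Bnnzs[b*dim*dim_wells + r*dim + c] * x[col*dim + c];
            }
            // z2 = D^-1 * z1 (Dnnzs already stores the inverse)
            for (int r = 0; r < dim_wells; ++r) {
                z2[r] = 0.0;
                for (int c = 0; c < dim_wells; ++c)
                    z2[r] += Dnnzs[w*dim_wells*dim_wells + r*dim_wells + c] * z1[c];
            }
            // y -= C^T * z2
            for (unsigned int b = val_pointers[w]; b < val_pointers[w+1]; ++b) {
                int col = Ccols[b];
                for (int c = 0; c < dim; ++c) {
                    double t = 0.0;
                    for (int r = 0; r < dim_wells; ++r)
                        t += Cnnzs[b*dim*dim_wells + r*dim + c] * z2[r];
                    y[col*dim + c] -= t;
                }
            }
        }
    }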
opm/simulators/linalg/bda/WellContributions.hpp (new file, 123 lines):
@@ -0,0 +1,123 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_HEADER_INCLUDED
+
+#include <config.h>
+
+#if ! HAVE_CUDA
+#error "This file should only be included if CUDA is found"
+#endif
+
+#include <vector>
+
+#include <cuda_runtime.h>
+
+namespace Opm
+{
+
+/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
+/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
+/// A StandardWell uses C, D and B and performs y -= (C^T * (D^-1 * (B*x)))
+/// B and C are vectors, disguised as matrices and contain blocks of StandardWell::numEq by StandardWell::numStaticWellEq
+/// D is a block, disguised as matrix, the square block has size StandardWell::numStaticWellEq. D is actually stored as D^-1
+/// B*x and D*B*x are a vector with numStaticWellEq doubles
+/// C*D*B*x is a blocked matrix with a symmetric sparsity pattern, contains square blocks with size numEq. For every columnindex i, j in StandardWell::duneB_, there is a block on (i, j) in C*D*B*x.
+///
+/// This class is used in 3 phases:
+/// - get total size of all wellcontributions that must be stored here
+/// - allocate memory
+/// - copy data of wellcontributions
+class WellContributions
+{
+
+private:
+    unsigned int num_blocks = 0;          // total number of blocks in all wells
+    unsigned int dim;                     // number of columns of blocks in B and C, equal to StandardWell::numEq
+    unsigned int dim_wells;               // number of rows of blocks in B and C, equal to StandardWell::numStaticWellEq
+    unsigned int num_wells = 0;           // number of wellcontributions in this object
+    unsigned int num_blocks_so_far = 0;   // keep track of where next data is written
+    unsigned int num_wells_so_far = 0;    // keep track of where next data is written
+    unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
+    bool allocated = false;
+
+    double *d_Cnnzs = nullptr;
+    double *d_Dnnzs = nullptr;
+    double *d_Bnnzs = nullptr;
+    int *d_Ccols = nullptr;
+    int *d_Bcols = nullptr;
+    double *d_z1 = nullptr;
+    double *d_z2 = nullptr;
+    unsigned int *d_val_pointers = nullptr;
+    cudaStream_t stream;
+
+public:
+
+    /// StandardWell has C, D and B matrices that need to be copied
+    enum class MatrixType {
+        C,
+        D,
+        B
+    };
+
+    /// Set a cudaStream to be used
+    /// \param[in] stream      the cudaStream that is used to launch the kernel in
+    void setCudaStream(cudaStream_t stream);
+
+    /// Create a new WellContributions, implementation is empty
+    WellContributions(){};
+
+    /// Destroy a WellContributions, and free memory
+    ~WellContributions();
+
+    /// Apply all wellcontributions in this object
+    /// performs y -= (C^T * (D^-1 * (B*x))) for StandardWell
+    /// \param[in] x           vector x
+    /// \param[inout] y        vector y
+    void apply(double *x, double *y);
+
+    /// Allocate memory for the wellcontributions
+    void alloc();
+
+    /// Indicate how large the blocks of the wellcontributions (C and B) are
+    /// \param[in] dim         number of columns
+    /// \param[in] dim_wells   number of rows
+    void setBlockSize(unsigned int dim, unsigned int dim_wells);
+
+    /// Indicate how large the next wellcontribution is, this function cannot be called after alloc() is called
+    /// \param[in] numBlocks   number of blocks in C and B of next wellcontribution
+    void addNumBlocks(unsigned int numBlocks);
+
+    /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
+    /// \param[in] type        indicate if C, D or B is sent
+    /// \param[in] colIndices  columnindices of blocks in C or B, ignored for D
+    /// \param[in] values      array of nonzeroes
+    /// \param[in] val_size    number of blocks in C or B, ignored for D
+    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
+
+    /// Return the number of wells added to this object
+    /// \return the number of wells added to this object
+    unsigned int getNumWells(){
+        return num_wells;
+    }
+};
+
+} //namespace Opm
+
+#endif
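The header above implies a strict three-phase protocol: announce sizes, allocate, then copy each well's blocks. A hedged usage sketch follows; the actual producer, BlackoilWellModel::getWellContributions(), is referenced by this PR but not shown in the diff, so the per-well fields used below (wells, numBlocks, Ccols, Cvals, invDvals, Bcols, Bvals) are hypothetical stand-ins:

    // Sketch only: how a well model is expected to fill a WellContributions.
    Opm::WellContributions wellContribs;
    wellContribs.setBlockSize(3, 4);               // dim = numEq, dim_wells = numStaticWellEq

    // phase 1: total sizes, one call per well, must happen before alloc()
    for (const auto& well : wells)
        wellContribs.addNumBlocks(well.numBlocks); // number of B/C blocks of this well

    // phase 2: allocate the GPU buffers
    wellContribs.alloc();

    // phase 3: copy data; B must come last for each well, it advances the write offsets
    using MatrixType = Opm::WellContributions::MatrixType;
    for (const auto& well : wells) {
        wellContribs.addMatrix(MatrixType::C, well.Ccols, well.Cvals, well.numBlocks);
        wellContribs.addMatrix(MatrixType::D, nullptr, well.invDvals, 1); // colIndices ignored for D
        wellContribs.addMatrix(MatrixType::B, well.Bcols, well.Bvals, well.numBlocks);
    }
    // the solver later calls setCudaStream(...) and apply(d_x, d_y)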
opm/simulators/linalg/bda/cusparseSolverBackend.cu:
@@ -38,6 +38,10 @@
 #include "cusparse_v2.h"
 // For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
 
+// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
+// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
+#define COPY_ROW_BY_ROW 0
+
 namespace Opm
 {
 
@@ -58,7 +62,7 @@ namespace Opm
         finalize();
     }
 
-    void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
+    void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
         double t_total1, t_total2;
         int n = N;
         double rho = 1.0, rhop;
@@ -72,7 +76,11 @@ namespace Opm
 
         t_total1 = second();
 
-        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
+        if(wellContribs.getNumWells() > 0){
+            wellContribs.setCudaStream(stream);
+        }
+
+        cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
+
         cublasDscal(cublasHandle, n, &mone, d_r, 1);
         cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
@@ -101,15 +109,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
 
             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, \
-                &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
+                &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_pw, &zero, d_v);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_pw, d_v);
+            }
 
             cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
             alpha = rho / tmp1;
@@ -127,15 +140,20 @@ namespace Opm
             // apply ilu0
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
+                descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
             cusparseDbsrsv2_solve(cusparseHandle, order, \
                 operation, Nb, nnzb, &one, \
-                descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
+                descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
 
             // spmv
             cusparseDbsrmv(cusparseHandle, order, \
                 operation, Nb, Nb, nnzb, &one, descr_M, \
-                d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
+                d_bVals, d_bRows, d_bCols, block_size, d_s, &zero, d_t);
+
+            // apply wellContributions
+            if(wellContribs.getNumWells() > 0){
+                wellContribs.apply(d_s, d_t);
+            }
 
             cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
             cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
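With the two apply() hooks above, every sparse matrix-vector product in the bicgstab iteration is immediately followed by the well kernel, so the Krylov method effectively iterates with the operator

    A_eff = A - sum_w C_w^T * D_w^{-1} * B_w

which is the same reduced system that --matrix-add-well-contributions=true would have assembled into A on the CPU; the two configurations are therefore expected to converge to the same solution, up to rounding.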
@@ -178,8 +196,8 @@ namespace Opm
         void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
             this->N = N;
             this->nnz = nnz;
-            this->BLOCK_SIZE = dim;
-            this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
+            this->block_size = dim;
+            this->nnzb = nnz/block_size/block_size;
             Nb = (N + dim - 1) / dim;
             std::ostringstream out;
             out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
@@ -229,41 +247,44 @@ namespace Opm
             cusparseSetStream(cusparseHandle, stream);
             cudaCheckLastError("Could not set stream to cusparse");
 
-            cudaMallocHost((void**)&x, sizeof(double) * N);
-            cudaCheckLastError("Could not allocate pinned host memory");
+#if COPY_ROW_BY_ROW
+            cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
+            cudaCheckLastError("Could not allocate pinned memory");
+#endif
 
             initialized = true;
         } // end initialize()
 
         void cusparseSolverBackend::finalize() {
+            if (initialized) {
             cudaFree(d_x);
             cudaFree(d_b);
             cudaFree(d_r);
             cudaFree(d_rw);
             cudaFree(d_p);
             cudaFree(d_pw);
             cudaFree(d_s);
             cudaFree(d_t);
             cudaFree(d_v);
             cudaFree(d_mVals);
             cudaFree(d_bVals);
             cudaFree(d_bCols);
             cudaFree(d_bRows);
             cudaFree(d_buffer);
             cusparseDestroyBsrilu02Info(info_M);
             cusparseDestroyBsrsv2Info(info_L);
             cusparseDestroyBsrsv2Info(info_U);
             cusparseDestroyMatDescr(descr_B);
             cusparseDestroyMatDescr(descr_M);
             cusparseDestroyMatDescr(descr_L);
             cusparseDestroyMatDescr(descr_U);
             cusparseDestroy(cusparseHandle);
             cublasDestroy(cublasHandle);
-            cudaHostUnregister(vals);
-            cudaHostUnregister(cols);
-            cudaHostUnregister(rows);
+#if COPY_ROW_BY_ROW
+            cudaFreeHost(vals_contiguous);
+#endif
             cudaStreamDestroy(stream);
-            cudaFreeHost(x);
+            }
         } // end finalize()
 
 
@@ -274,22 +295,23 @@ namespace Opm
                 t1 = second();
             }
 
-            // information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
-            // possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
-            cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
-            cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
-            cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
-            cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
+#if COPY_ROW_BY_ROW
+            int sum = 0;
+            for(int i = 0; i < Nb; ++i){
+                int size_row = rows[i+1] - rows[i];
+                memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+                sum += size_row * block_size * block_size;
+            }
+            cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
             cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
             cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
             cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
             cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
             cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 
-            this->vals = vals;
-            this->cols = cols;
-            this->rows = rows;
-
             if (verbosity > 2) {
                 cudaStreamSynchronize(stream);
                 t2 = second();
@@ -301,14 +323,25 @@ namespace Opm
 
 
         // don't copy rowpointers and colindices, they stay the same
-        void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
+        void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) {
             double t1, t2;
             if (verbosity > 2) {
                 t1 = second();
             }
 
+#if COPY_ROW_BY_ROW
+            int sum = 0;
+            for(int i = 0; i < Nb; ++i){
+                int size_row = rows[i+1] - rows[i];
+                memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
+                sum += size_row * block_size * block_size;
+            }
+            cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#else
             cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+#endif
+
             cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
             cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
 
@@ -364,11 +397,11 @@ namespace Opm
             cudaCheckLastError("Could not create analysis info");
 
             cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
-                descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
+                descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
             cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-                descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
+                descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
             cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
-                descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
+                descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
             cudaCheckLastError();
             d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
 
@@ -377,7 +410,7 @@ namespace Opm
             // analysis of ilu LU decomposition
             cusparseDbsrilu02_analysis(cusparseHandle, order, \
                 Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
-                BLOCK_SIZE, info_M, policy, d_buffer);
+                block_size, info_M, policy, d_buffer);
 
             int structural_zero;
             cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
@@ -388,11 +421,11 @@ namespace Opm
             // analysis of ilu apply
             cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
                 Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
-                BLOCK_SIZE, info_L, policy, d_buffer);
+                block_size, info_L, policy, d_buffer);
 
             cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
                 Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
-                BLOCK_SIZE, info_U, policy, d_buffer);
+                block_size, info_U, policy, d_buffer);
             cudaCheckLastError("Could not analyse level information");
 
             if (verbosity > 2) {
@@ -403,6 +436,8 @@ namespace Opm
                 OpmLog::info(out.str());
             }
 
+            analysis_done = true;
+
             return true;
         } // end analyse_matrix()
 
@@ -417,7 +452,8 @@ namespace Opm
             d_mRows = d_bRows;
             cusparseDbsrilu02(cusparseHandle, order, \
                 Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
-                BLOCK_SIZE, info_M, policy, d_buffer);
+                block_size, info_M, policy, d_buffer);
+            cudaCheckLastError("Could not perform ilu decomposition");
 
             int structural_zero;
             // cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
@@ -437,9 +473,9 @@ namespace Opm
         } // end create_preconditioner()
 
 
-        void cusparseSolverBackend::solve_system(BdaResult &res) {
+        void cusparseSolverBackend::solve_system(WellContributions& wellContribs, BdaResult &res) {
             // actually solve
-            gpu_pbicgstab(res);
+            gpu_pbicgstab(wellContribs, res);
             cudaStreamSynchronize(stream);
             cudaCheckLastError("Something went wrong during the GPU solve");
         } // end solve_system()
@@ -449,10 +485,6 @@ namespace Opm
         // caller must be sure that x is a valid array
         void cusparseSolverBackend::post_process(double *x) {
 
-            if (!initialized) {
-                cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
-            }
-
             double t1, t2;
             if (verbosity > 2) {
                 t1 = second();
@@ -472,12 +504,12 @@ namespace Opm
 
         typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
 
-        cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
+        cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
             if (initialized == false) {
                 initialize(N, nnz, dim);
                 copy_system_to_gpu(vals, rows, cols, b);
             }else{
-                update_system_on_gpu(vals, b);
+                update_system_on_gpu(vals, rows, b);
             }
             if (analysis_done == false) {
                 if (!analyse_matrix()) {
@@ -486,7 +518,7 @@ namespace Opm
             }
             reset_prec_on_gpu();
             if (create_preconditioner()) {
-                solve_system(res);
+                solve_system(wellContribs, res);
             }else{
                 return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
             }
opm/simulators/linalg/bda/cusparseSolverBackend.hpp:
@@ -25,6 +25,7 @@
 #include "cusparse_v2.h"
 
 #include "opm/simulators/linalg/bda/BdaResult.hpp"
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
 
 namespace Opm
 {
@@ -50,13 +51,11 @@ private:
     int *d_bRows, *d_mRows;
     double *d_x, *d_b, *d_r, *d_rw, *d_p;
     double *d_pw, *d_s, *d_t, *d_v;
-    double *vals;
-    int *cols, *rows;
-    double *x, *b;
     void *d_buffer;
     int N, Nb, nnz, nnzb;
+    double *vals_contiguous;
 
-    int BLOCK_SIZE;
+    int block_size;
 
     bool initialized = false;
     bool analysis_done = false;
@@ -70,8 +69,9 @@ private:
     int verbosity = 0;
 
     /// Solve linear system using ilu0-bicgstab
-    /// \param[inout] res summary of solver result
-    void gpu_pbicgstab(BdaResult& res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
 
     /// Initialize GPU and allocate memory
     /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@@ -83,16 +83,17 @@ private:
     void finalize();
 
     /// Copy linear system to GPU
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
     void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
 
     // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
-    /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
+    /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
     /// \param[in] b input vector, contains N values
-    void update_system_on_gpu(double *vals, double *b);
+    void update_system_on_gpu(double *vals, int *rows, double *b);
 
     /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
     void reset_prec_on_gpu();
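Note: the new rows parameter and the vals_contiguous member exist for the COPY_ROW_BY_ROW compile-time option named in the doc comment above: the blocks of each row are first packed contiguously on the host, then sent to the GPU in a single transfer. A sketch under that assumption; d_bVals, stream, Nb, nnz and block_size match members in this diff, the loop body is illustrative and may differ from the real kernel:

    // Hypothetical host-side packing; 'rows' are the BSR row pointers.
    #if COPY_ROW_BY_ROW
    for (int i = 0; i < Nb; ++i) {
        int size_row = rows[i + 1] - rows[i];   // number of blocks in row i
        memcpy(vals_contiguous + rows[i] * block_size * block_size,
               vals + rows[i] * block_size * block_size,
               size_row * block_size * block_size * sizeof(double));
    }
    cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double),
                    cudaMemcpyHostToDevice, stream);
    #else
    cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double),
                    cudaMemcpyHostToDevice, stream);
    #endif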
@@ -106,8 +107,9 @@ private:
     bool create_preconditioner();
 
     /// Solve linear system
-    /// \param[inout] res summary of solver result
-    void solve_system(BdaResult &res);
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res summary of solver result
+    void solve_system(WellContributions& wellContribs, BdaResult &res);
 
 public:
 
@@ -128,16 +130,17 @@ public:
     ~cusparseSolverBackend();
 
     /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
     /// \param[in] N number of rows, divide by dim to get number of blockrows
     /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
     /// \param[in] dim size of block
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
     /// \param[in] b input vector, contains N values
+    /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
     /// \param[inout] res summary of solver result
     /// \return status code
-    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
+    cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res);
 
     /// Post processing after linear solve, now only copies resulting x vector back
     /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
@@ -221,6 +221,11 @@ namespace Opm {
         // subtract B*inv(D)*C * x from A*x
         void apply(const BVector& x, BVector& Ax) const;
 
+#if HAVE_CUDA
+        // accumulate the contributions of all Wells in the WellContributions object
+        void getWellContributions(WellContributions& x) const;
+#endif
+
         // apply well model with scaling of alpha
         void applyScaleAdd(const Scalar alpha, const BVector& x, BVector& Ax) const;
 
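Note: this declaration is only compiled when CUDA is available, matching the CMake change that adds WellContributions.cu. A hedged sketch of how a caller on the simulator side would use it; the real call site in this PR lives in the ISTL solver wrapper, and the names simulator and bdaBridge below are illustrative assumptions:

    #if HAVE_CUDA
    // Collect all well matrices into one object and hand it to the GPU backend.
    WellContributions wellContribs;
    simulator.problem().wellModel().getWellContributions(wellContribs);
    bdaBridge->solve_system(matrix, rhs, wellContribs, result);   // hypothetical bridge call
    #endif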
@@ -918,9 +918,32 @@ namespace Opm {
            }
        }
 
+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    BlackoilWellModel<TypeTag>::
+    getWellContributions(WellContributions& wellContribs) const
+    {
+        wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            unsigned int numBlocks;
+            derived->getNumBlocks(numBlocks);
+            wellContribs.addNumBlocks(numBlocks);
+        }
+        wellContribs.alloc();
+        for(unsigned int i = 0; i < well_container_.size(); i++){
+            auto& well = well_container_[i];
+            std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
+            if (derived) {
+                derived->addWellContribution(wellContribs);
+            } else {
+                OpmLog::warning("Warning only StandardWell is supported by WellContributions for GPU");
+            }
+        }
+    }
+#endif
+
    // Ax = Ax - alpha * C D^-1 B x
    template<typename TypeTag>
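Note: the implementation above uses a two-pass protocol: first every well reports its block count (addNumBlocks), then one allocation is made (alloc), then the wells write their C, D and B blocks (addMatrix). A minimal sketch of the consumer side of that contract; the interface is reconstructed from the call sites visible in this diff, so the exact signatures are assumptions:

    // Assumed interface of WellContributions, inferred from its call sites:
    class WellContributions {
    public:
        void setBlockSize(unsigned int dim, unsigned int dim_wells);
        void addNumBlocks(unsigned int numBlocks);  // pass 1: size bookkeeping
        void alloc();                               // one GPU allocation for all wells
        enum class MatrixType { C, D, B };
        void addMatrix(MatrixType type, int* colIndices, double* values,
                       unsigned int numBlocks);     // pass 2: copy blocks in
    };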
@@ -184,6 +184,14 @@ namespace Opm
         /// r = r - C D^-1 Rw
         virtual void apply(BVector& r) const override;
 
+#if HAVE_CUDA
+        /// add the contribution (C, D^-1, B matrices) of this Well to the WellContributions object
+        void addWellContribution(WellContributions& wellContribs) const;
+
+        /// get the number of blocks of the C and B matrices, used to allocate memory in a WellContributions object
+        void getNumBlocks(unsigned int& _nnzs) const;
+#endif
+
         /// using the solution x to recover the solution xw for wells and applying
         /// xw to update Well State
         virtual void recoverWellSolutionAndUpdateWellState(const BVector& x,
@@ -2735,8 +2735,65 @@ namespace Opm
         duneC_.mmtv(invDrw_, r);
     }
 
+#if HAVE_CUDA
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    addWellContribution(WellContributions& wellContribs) const
+    {
+        std::vector<int> colIndices;
+        std::vector<double> nnzValues;
+        colIndices.reserve(duneB_.nonzeroes());
+        nnzValues.reserve(duneB_.nonzeroes()*numStaticWellEq * numEq);
+
+        // duneC
+        for ( auto colC = duneC_[0].begin(), endC = duneC_[0].end(); colC != endC; ++colC )
+        {
+            colIndices.emplace_back(colC.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colC)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::C, colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
+
+        // invDuneD
+        colIndices.clear();
+        nnzValues.clear();
+        colIndices.emplace_back(0);
+        for (int i = 0; i < numStaticWellEq; ++i)
+        {
+            for (int j = 0; j < numStaticWellEq; ++j) {
+                nnzValues.emplace_back(invDuneD_[0][0][i][j]);
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::D, colIndices.data(), nnzValues.data(), 1);
+
+        // duneB
+        colIndices.clear();
+        nnzValues.clear();
+        for ( auto colB = duneB_[0].begin(), endB = duneB_[0].end(); colB != endB; ++colB )
+        {
+            colIndices.emplace_back(colB.index());
+            for (int i = 0; i < numStaticWellEq; ++i) {
+                for (int j = 0; j < numEq; ++j) {
+                    nnzValues.emplace_back((*colB)[i][j]);
+                }
+            }
+        }
+        wellContribs.addMatrix(WellContributions::MatrixType::B, colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
+    }
+
+
+    template<typename TypeTag>
+    void
+    StandardWell<TypeTag>::
+    getNumBlocks(unsigned int& numBlocks) const
+    {
+        numBlocks = duneB_.nonzeroes();
+    }
+#endif
+
+
     template<typename TypeTag>
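Note: the blocks exported above let the GPU apply each well's Schur complement directly, i.e. y -= C^T * D^-1 * (B * x), matching the "Ax = Ax - alpha * C D^-1 B x" comment in this diff. A hedged CPU reference of that product for one well, using the layout produced by addWellContribution (C and B blocks are dim_wells x dim, stored row-wise; D is a single dim_wells x dim_wells block, assumed pre-inverted as the invDuneD_ name suggests); the real CUDA kernel in WellContributions.cu parallelizes this per block, and all names below are illustrative:

    #include <vector>

    // Hypothetical CPU reference for one well; every array name is an assumption.
    void applyWell(unsigned numBlocks, unsigned dim, unsigned dim_wells,
                   const std::vector<int>& Bcols, const std::vector<double>& Bvals,
                   const std::vector<double>& Dinv,
                   const std::vector<int>& Ccols, const std::vector<double>& Cvals,
                   const std::vector<double>& x, std::vector<double>& y)
    {
        std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);
        for (unsigned k = 0; k < numBlocks; ++k)        // z1 = B * x
            for (unsigned i = 0; i < dim_wells; ++i)
                for (unsigned j = 0; j < dim; ++j)
                    z1[i] += Bvals[(k*dim_wells + i)*dim + j] * x[Bcols[k]*dim + j];
        for (unsigned i = 0; i < dim_wells; ++i)        // z2 = D^-1 * z1
            for (unsigned j = 0; j < dim_wells; ++j)
                z2[i] += Dinv[i*dim_wells + j] * z1[j];
        for (unsigned k = 0; k < numBlocks; ++k)        // y -= C^T * z2
            for (unsigned j = 0; j < dim; ++j)
                for (unsigned i = 0; i < dim_wells; ++i)
                    y[Ccols[k]*dim + j] -= Cvals[(k*dim_wells + i)*dim + j] * z2[i];
    }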