SolverAdapter is now more explicit on its communicator/MPI dependence. This fixes the MPI issues with CUDA/HIP/GPUISTL that were introduced with the GhostLastMatrixAdapter.

This commit is contained in:
Kjetil Olsen Lye
2024-09-24 10:03:23 +02:00
parent 9c74c4d638
commit 3161a986a2
4 changed files with 140 additions and 112 deletions

View File

@@ -86,7 +86,8 @@ private:
const std::function<VectorType()> weightsCalculator, const Dune::Amg::SequentialInformation&,
std::size_t pressureIndex);
void initSolver(const Opm::PropertyTree& prm, const bool is_iorank);
template <class Comm>
void initSolver(const Opm::PropertyTree& prm, const Comm& comm);
void recreateDirectSolver();

View File

@@ -153,10 +153,12 @@ namespace Dune
}
template <class Operator>
template <class Comm>
void
FlexibleSolver<Operator>::
initSolver(const Opm::PropertyTree& prm, const bool is_iorank)
initSolver(const Opm::PropertyTree& prm, const Comm& comm)
{
const bool is_iorank = comm.communicator().rank() == 0;
const double tol = prm.get<double>("tol", 1e-2);
const int maxiter = prm.get<int>("maxiter", 200);
const int verbosity = is_iorank ? prm.get<int>("verbosity", 0) : 0;
@@ -211,7 +213,8 @@ namespace Dune
preconditioner_,
tol, // desired residual reduction factor
maxiter, // maximum number of iterations
verbosity));
verbosity,
comm));
#endif
} else {
OPM_THROW(std::invalid_argument,
@@ -256,7 +259,7 @@ namespace Dune
std::size_t pressureIndex)
{
initOpPrecSp(op, prm, weightsCalculator, comm, pressureIndex);
initSolver(prm, comm.communicator().rank() == 0);
initSolver(prm, comm);
}
} // namespace Dune

View File

@@ -23,6 +23,7 @@
#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/operators.hh>
#include <dune/istl/paamg/pinfo.hh>
#include <dune/istl/schwarz.hh>
#include <dune/istl/solver.hh>
#include <opm/common/ErrorMacros.hpp>
@@ -58,18 +59,35 @@ public:
static constexpr auto block_size = domain_type::block_type::dimension;
using XGPU = Opm::gpuistl::GpuVector<real_type>;
// TODO: Use a std::forward
//! @brief constructor
//!
//! @param op the linear operator (assumed CPU, the output (matrix) of which will be converted to a GPU variant)
//! @param sp the scalar product (assumed CPU, this will be converted to a GPU variant)
//! @param reduction the reduction factor passed to the iterative solver
//! @param maxit maximum number of iterations for the linear solver
//! @param verbose verbosity level
//! @param comm the communication object. If this is Dune::Amg::SequentialInformation, we assume a serial setup
//!
//! @todo Use a std::forward in this function
template<class Comm>
SolverAdapter(Operator& op,
Dune::ScalarProduct<X>& sp,
std::shared_ptr<Dune::Preconditioner<X, X>> prec,
scalar_real_type reduction,
int maxit,
int verbose)
int verbose,
const Comm& comm)
: Dune::IterativeSolver<X, X>(op, sp, *prec, reduction, maxit, verbose)
, m_opOnCPUWithMatrix(op)
, m_matrix(GpuSparseMatrix<real_type>::fromMatrix(op.getmat()))
, m_underlyingSolver(constructSolver(prec, reduction, maxit, verbose))
, m_underlyingSolver(constructSolver(prec, reduction, maxit, verbose, comm))
{
OPM_ERROR_IF(
detail::is_a_well_operator<Operator>::value,
"Currently we only support operators of type MatrixAdapter in the CUDA/HIP solver. "
"Use --matrix-add-well-contributions=true. "
"Using WellModelMatrixAdapter with SolverAdapter is not well-defined so it will not work well, or at all.");
}
virtual void apply(X& x, X& b, double reduction, Dune::InverseOperatorResult& res) override
@@ -122,35 +140,25 @@ private:
// TODO: Use a std::forward
// This is the MPI parallel case (general communication object)
template <class Comm>
UnderlyingSolver<XGPU> constructSolver(std::shared_ptr<Dune::Preconditioner<X, X>> prec,
scalar_real_type reduction,
int maxit,
int verbose)
int verbose,
const Comm& communication)
{
OPM_ERROR_IF(
detail::is_a_well_operator<Operator>::value,
"Currently we only support operators of type MatrixAdapter in the CUDA solver. "
"Use --matrix-add-well-contributions=true. "
"Using WellModelMatrixAdapter with SolverAdapter is not well-defined so it will not work well, or at all.");
if constexpr (detail::has_communication<Operator>::value) {
// TODO: See the below TODO over the definition of precHolder in the other branch
// TODO: See the below TODO over the definition of precHolder in the other overload of constructSolver
// TODO: We are currently double wrapping preconditioners in the preconditioner factory to be extra
// compatible
// with CPU. Probably a cleaner way eventually would be to do more modifications to the flexible
// solver to accomodate the pure GPU better.
// compatible with CPU. Probably a cleaner way eventually would be to do more modifications to the
// flexible solver to accomodate the pure GPU better.
auto precAsHolder = std::dynamic_pointer_cast<PreconditionerHolder<X, X>>(prec);
if (!precAsHolder) {
OPM_THROW(std::invalid_argument,
"The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
"The preconditioner needs to be a CUDA preconditioner (eg. GPUDILU) wrapped in a "
"Opm::gpuistl::PreconditionerAdapter wrapped in a "
"Opm::gpuistl::GpuBlockPreconditioner. If you are unsure what this means, set "
"preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
"preconditioner to 'GPUDILU'");
}
auto preconditionerAdapter = precAsHolder->getUnderlyingPreconditioner();
@@ -158,14 +166,13 @@ private:
= std::dynamic_pointer_cast<PreconditionerHolder<XGPU, XGPU>>(preconditionerAdapter);
if (!preconditionerAdapterAsHolder) {
OPM_THROW(std::invalid_argument,
"The preconditioner needs to be a CUDA preconditioner (eg. CuILU0) wrapped in a "
"The preconditioner needs to be a CUDA preconditioner (eg. GPUDILU) wrapped in a "
"Opm::gpuistl::PreconditionerAdapter wrapped in a "
"Opm::gpuistl::GpuBlockPreconditioner. If you are unsure what this means, set "
"preconditioner to 'CUILU0'"); // TODO: Suggest a better preconditioner
"preconditioner to 'GPUDILU'");
}
// We need to get the underlying preconditioner:
auto preconditionerReallyOnGPU = preconditionerAdapterAsHolder->getUnderlyingPreconditioner();
const auto& communication = m_opOnCPUWithMatrix.getCommunication();
// Temporary solution use the GPU Direct communication solely based on these prepcrosessor statements
bool mpiSupportsCudaAwareAtCompileTime = false;
@@ -183,15 +190,18 @@ private:
// TODO add typename Operator communication type as a named type with using
std::shared_ptr<Opm::gpuistl::GPUSender<real_type, typename Operator::communication_type>> gpuComm;
if (mpiSupportsCudaAwareAtCompileTime && mpiSupportsCudaAwareAtRunTime){
gpuComm = std::make_shared<Opm::gpuistl::GPUAwareMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
}
else{
gpuComm = std::make_shared<Opm::gpuistl::GPUObliviousMPISender<real_type, block_size, typename Operator::communication_type>>(communication);
std::shared_ptr<Opm::gpuistl::GPUSender<real_type, Comm>> gpuComm;
if (mpiSupportsCudaAwareAtCompileTime && mpiSupportsCudaAwareAtRunTime) {
gpuComm = std::make_shared<
Opm::gpuistl::GPUAwareMPISender<real_type, block_size, Comm>>(
communication);
} else {
gpuComm = std::make_shared<
Opm::gpuistl::GPUObliviousMPISender<real_type, block_size, Comm>>(
communication);
}
using CudaCommunication = GpuOwnerOverlapCopy<real_type, block_size, typename Operator::communication_type>;
using CudaCommunication = GpuOwnerOverlapCopy<real_type, block_size, Comm>;
using SchwarzOperator
= Dune::OverlappingSchwarzOperator<GpuSparseMatrix<real_type>, XGPU, XGPU, CudaCommunication>;
auto cudaCommunication = std::make_shared<CudaCommunication>(gpuComm);
@@ -203,17 +213,34 @@ private:
cudaCommunication, m_opOnCPUWithMatrix.category());
// NOTE: Ownsership of cudaCommunication is handled by mpiPreconditioner. However, just to make sure we
// remember
// this, we add this check. So remember that we hold one count in this scope, and one in the
// GpuBlockPreconditioner instance. We accomedate for the fact that it could be passed around in
// NOTE: Ownership of cudaCommunication is handled by mpiPreconditioner. However, just to make sure we
// remember this, we add this check. So remember that we hold one count in this scope, and one in the
// GpuBlockPreconditioner instance. We accommodate for the fact that it could be passed around in
// GpuBlockPreconditioner, hence we do not test for != 2
OPM_ERROR_IF(cudaCommunication.use_count() < 2, "Internal error. Shared pointer not owned properly.");
auto overlappingCudaOperator = std::make_shared<SchwarzOperator>(m_matrix, *cudaCommunication);
return UnderlyingSolver<XGPU>(
overlappingCudaOperator, scalarProduct, mpiPreconditioner, reduction, maxit, verbose);
} else {
}
// This is the serial case (specific overload for Dune::Amg::SequentialInformation)
UnderlyingSolver<XGPU> constructSolver(std::shared_ptr<Dune::Preconditioner<X, X>> prec,
scalar_real_type reduction,
int maxit,
int verbose,
[[maybe_unused]] const Dune::Amg::SequentialInformation& communication)
{
// Dune::Amg::SequentialInformation is the serial case
return constructSolver(prec, reduction, maxit, verbose);
}
// TODO: Use a std::forward
UnderlyingSolver<XGPU> constructSolver(std::shared_ptr<Dune::Preconditioner<X, X>> prec,
scalar_real_type reduction,
int maxit,
int verbose)
{
// TODO: Fix the reliance on casting here. This is a code smell to a certain degree, and assumes
// a certain setup beforehand. The only reason we do it this way is to minimize edits to the
// flexible solver. We could design it differently, but keep this for the time being until
@@ -222,16 +249,13 @@ private:
if (!precAsHolder) {
OPM_THROW(std::invalid_argument,
"The preconditioner needs to be a CUDA preconditioner wrapped in a "
"Opm::gpuistl::PreconditionerHolder (eg. CuILU0).");
"Opm::gpuistl::PreconditionerHolder (eg. GPUDILU).");
}
auto preconditionerOnGPU = precAsHolder->getUnderlyingPreconditioner();
auto matrixOperator
= std::make_shared<Dune::MatrixAdapter<GpuSparseMatrix<real_type>, XGPU, XGPU>>(m_matrix);
auto matrixOperator = std::make_shared<Dune::MatrixAdapter<GpuSparseMatrix<real_type>, XGPU, XGPU>>(m_matrix);
auto scalarProduct = std::make_shared<Dune::SeqScalarProduct<XGPU>>();
return UnderlyingSolver<XGPU>(
matrixOperator, scalarProduct, preconditionerOnGPU, reduction, maxit, verbose);
}
return UnderlyingSolver<XGPU>(matrixOperator, scalarProduct, preconditionerOnGPU, reduction, maxit, verbose);
}
std::unique_ptr<XGPU> m_inputBuffer;

View File

@@ -79,7 +79,7 @@ createSolverAdapterWithMatrix(const size_t N = 10)
prm.put<double>("relaxation", 1.0);
prm.put<std::string>("type", "CUILU0");
auto prec = PrecondFactory::create(*op, prm);
auto solverAdapter = std::make_shared<SolverAdapter>(*op, *sp, prec, 1.0, 10, 0);
auto solverAdapter = std::make_shared<SolverAdapter>(*op, *sp, prec, 1.0, 10, 0, Dune::Amg::SequentialInformation());
return std::make_tuple(matrixPtr, solverAdapter, op, sp);
}