Merge pull request #5380 from akva2/linalg_template_scalar

LinAlg classes: template Scalar type
Arne Morten Kvarving 2024-05-31 08:40:11 +02:00 committed by GitHub
commit bcbac79486
57 changed files with 2346 additions and 1863 deletions
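The change throughout this PR follows one pattern: classes that previously hard-coded double gain a Scalar template parameter, and the source files instantiate them explicitly for double (float can be added later). A minimal sketch of that pattern, using a hypothetical class that is not part of the OPM sources:

#include <vector>

template<class Scalar>
class DiagonalScaler
{
public:
    explicit DiagonalScaler(Scalar factor) : factor_(factor) {}

    // Scale every entry of a vector in place.
    void apply(std::vector<Scalar>& x) const
    {
        for (auto& v : x) {
            v *= factor_;
        }
    }

private:
    Scalar factor_;
};

// Explicit instantiation in the .cpp keeps the template definitions out of the
// headers; this commit does the same via INSTANTIATE_TYPE(double) macros.
template class DiagonalScaler<double>;
template class DiagonalScaler<float>;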

View File

@ -31,7 +31,7 @@ endif()
foreach(CL ${CL_LIST})
get_filename_component(FNAME ${CL} NAME_WE)
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
file(READ "${CL}" CL_CONTENT)
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")
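The generated source file now defines each kernel string as an explicit specialization of a templated OpenclKernels class. Roughly the shape of the emitted code, with a made-up kernel name and body (the real class declaration lives in an OPM header, not in the generated file):

#include <string>

// Stand-in declaration; in OPM the class is declared elsewhere.
template<class Scalar>
class OpenclKernels
{
public:
    static const std::string example_kernel_str;
};

// What the file(APPEND ...) lines above emit per kernel, specialized for double.
template<> const std::string OpenclKernels<double>::example_kernel_str = R"(
__kernel void example_kernel(__global double* x, const int N)
{
    const int i = get_global_id(0);
    if (i < N) { x[i] *= 2.0; }
}
)";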

View File

@ -50,15 +50,14 @@
std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm {
namespace detail {
namespace Opm::detail {
template<class Matrix, class Vector>
BdaSolverInfo<Matrix,Vector>::
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const double tolerance,
const Scalar tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
@ -104,7 +103,7 @@ apply(Vector& rhs,
{
bool use_gpu = bridge_->getUseGpu();
if (use_gpu) {
auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
auto wellContribs = WellContributions<Scalar>::create(accelerator_mode_, useWellConn);
bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
// the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
@ -179,8 +178,9 @@ blockJacobiAdjacency(const Grid& grid,
const auto& gridView = grid.leafGridView();
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
//Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
// Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin();
row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
{
const auto& elem = *elemIt;
size_type idx = lid.id(elem);
@ -221,25 +221,26 @@ copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
auto outerCol = (*outerRow).begin();
for (auto col = (*row).begin(); col != (*row).end(); ++col) {
// outerRow is guaranteed to have all column entries that row has!
while(outerCol.index() < col.index()) ++outerCol;
while (outerCol.index() < col.index()) {
++outerCol;
}
assert(outerCol.index() == col.index());
*col = *outerCol; // copy nonzero block
}
}
}
template<int Dim>
using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
template<int Dim>
using BV = Dune::BlockVector<Dune::FieldVector<double,Dim>>;
template<class Scalar, int Dim>
using BM = Dune::BCRSMatrix<MatrixBlock<Scalar,Dim,Dim>>;
template<class Scalar, int Dim>
using BV = Dune::BlockVector<Dune::FieldVector<Scalar,Dim>>;
#define INSTANCE_GRID(Dim, Grid) \
template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
prepare(const Grid&, \
const Dune::CartesianIndexMapper<Grid>&, \
const std::vector<Well>&, \
const std::vector<int>&, \
#define INSTANTIATE_GRID(T, Dim, Grid) \
template void BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>:: \
prepare(const Grid&, \
const Dune::CartesianIndexMapper<Grid>&, \
const std::vector<Well>&, \
const std::vector<int>&, \
const std::size_t, const bool);
using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
#if HAVE_DUNE_ALUGRID
@ -248,23 +249,26 @@ using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
#else
using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridNoComm>;
#endif //HAVE_MPI
#define INSTANCE(Dim) \
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
INSTANCE_GRID(Dim,Dune::CpGrid) \
INSTANCE_GRID(Dim,ALUGrid3CN) \
INSTANCE_GRID(Dim,PolyHedralGrid3D)
#define INSTANTIATE(T,Dim) \
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
INSTANTIATE_GRID(T,Dim,ALUGrid3CN) \
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
#else
#define INSTANCE(Dim) \
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
INSTANCE_GRID(Dim,Dune::CpGrid) \
INSTANCE_GRID(Dim,PolyHedralGrid3D)
#define INSTANTIATE(T,Dim) \
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
#endif
INSTANCE(1)
INSTANCE(2)
INSTANCE(3)
INSTANCE(4)
INSTANCE(5)
INSTANCE(6)
} // namespace detail
} // namespace Opm
#define INSTANTIATE_TYPE(T) \
INSTANTIATE(T,1) \
INSTANTIATE(T,2) \
INSTANTIATE(T,3) \
INSTANTIATE(T,4) \
INSTANTIATE(T,5) \
INSTANTIATE(T,6)
INSTANTIATE_TYPE(double)
} // namespace Opm::detail
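INSTANTIATE_GRID above explicitly instantiates the member function template prepare<Grid>() of the class template BdaSolverInfo, once per (type, dimension, grid) combination. A stripped-down sketch of that C++ mechanism with illustrative names:

template<class Matrix>
struct Info
{
    template<class Grid>
    void prepare(const Grid& grid);
};

template<class Matrix>
template<class Grid>
void Info<Matrix>::prepare(const Grid&)
{
    // real code would build per-grid data here
}

struct GridA {};

// Explicit instantiation of the member template for one (Matrix, Grid) pair,
// which is what each INSTANTIATE_GRID(T, Dim, Grid) line expands to.
template void Info<int>::prepare(const GridA&);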

View File

@ -35,60 +35,61 @@ namespace Opm {
class Well;
template<class Matrix, class Vector, int block_size> class BdaBridge;
class WellContributions;
template<class Scalar> class WellContributions;
namespace detail {
template<class Matrix, class Vector>
struct BdaSolverInfo
{
using WellContribFunc = std::function<void(WellContributions&)>;
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
using Scalar = typename Vector::field_type;
using WellContribFunc = std::function<void(WellContributions<Scalar>&)>;
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const double tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
const std::string& linsolver);
BdaSolverInfo(const std::string& accelerator_mode,
const int linear_solver_verbosity,
const int maxit,
const Scalar tolerance,
const int platformID,
const int deviceID,
const bool opencl_ilu_parallel,
const std::string& linsolver);
~BdaSolverInfo();
~BdaSolverInfo();
template<class Grid>
void prepare(const Grid& grid,
const Dune::CartesianIndexMapper<Grid>& cartMapper,
const std::vector<Well>& wellsForConn,
const std::vector<int>& cellPartition,
const std::size_t nonzeroes,
const bool useWellConn);
template<class Grid>
void prepare(const Grid& grid,
const Dune::CartesianIndexMapper<Grid>& cartMapper,
const std::vector<Well>& wellsForConn,
const std::vector<int>& cellPartition,
const std::size_t nonzeroes,
const bool useWellConn);
bool apply(Vector& rhs,
const bool useWellConn,
WellContribFunc getContribs,
const int rank,
Matrix& matrix,
Vector& x,
Dune::InverseOperatorResult& result);
bool apply(Vector& rhs,
const bool useWellConn,
WellContribFunc getContribs,
const int rank,
Matrix& matrix,
Vector& x,
Dune::InverseOperatorResult& result);
bool gpuActive();
bool gpuActive();
int numJacobiBlocks_ = 0;
int numJacobiBlocks_ = 0;
private:
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
template<class Grid>
void blockJacobiAdjacency(const Grid& grid,
const std::vector<int>& cell_part,
std::size_t nonzeroes);
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
template<class Grid>
void blockJacobiAdjacency(const Grid& grid,
const std::vector<int>& cell_part,
std::size_t nonzeroes);
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
std::unique_ptr<Bridge> bridge_;
std::string accelerator_mode_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::vector<std::set<int>> wellConnectionsGraph_;
std::unique_ptr<Bridge> bridge_;
std::string accelerator_mode_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::vector<std::set<int>> wellConnectionsGraph_;
};
}
@ -249,8 +250,8 @@ public:
// Solve system.
Dune::InverseOperatorResult result;
std::function<void(WellContributions&)> getContribs =
[this](WellContributions& w)
std::function<void(WellContributions<Scalar>&)> getContribs =
[this](WellContributions<Scalar>& w)
{
this->simulator_.problem().wellModel().getWellContributions(w);
};
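The header now derives Scalar from the vector type (using Scalar = typename Vector::field_type) and threads it through the well-contributions callback. A self-contained sketch of that wiring; the Vec and WellContribs types below are simplified stand-ins, not the OPM classes:

#include <functional>

struct Vec { using field_type = double; };  // stand-in for Dune::BlockVector<...>

template<class Scalar>
struct WellContribs { Scalar sum = 0; };

template<class Vector>
void solveWithWells(const std::function<void(WellContribs<typename Vector::field_type>&)>& getContribs)
{
    WellContribs<typename Vector::field_type> w;
    getContribs(w);  // caller adds its well terms before the accelerated solve
}

int main()
{
    solveWithWells<Vec>([](WellContribs<double>& w) { w.sum += 1.0; });
}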

View File

@ -50,11 +50,11 @@
#include <opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp>
namespace Opm
{
namespace Opm {
template <class Smoother>
struct AMGSmootherArgsHelper {
struct AMGSmootherArgsHelper
{
static auto args(const PropertyTree& prm)
{
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
@ -69,10 +69,11 @@ struct AMGSmootherArgsHelper {
};
template <class M, class V, class C>
struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
struct AMGSmootherArgsHelper<ParallelOverlappingILU0<M, V, V, C>>
{
static auto args(const PropertyTree& prm)
{
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
SmootherArgs smootherArgs;
smootherArgs.iterations = prm.get<int>("iterations", 1);
@ -88,7 +89,6 @@ struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
}
};
// trailing return type with decltype is used to detect the existence of the setUseFixedOrder member function via overload resolution
template <typename C>
auto setUseFixedOrder(C criterion, bool booleanValue) -> decltype(criterion.setUseFixedOrder(booleanValue))
@ -209,7 +209,7 @@ struct StandardPreconditioners {
const std::string smoother = prm.get<std::string>("smoother", "ParOverILU0");
// TODO: merge this with ILUn, and possibly simplify the factory to only work with ILU?
if (smoother == "ILU0" || smoother == "ParOverILU0") {
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
auto sargs = AMGSmootherArgsHelper<Smoother>::args(prm);
PrecPtr prec = std::make_shared<Dune::Amg::AMGCPR<O, V, Smoother, C>>(op, crit, sargs, comm);
@ -279,7 +279,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -294,7 +295,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, true>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, true>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -311,7 +313,8 @@ struct StandardPreconditioners {
OPM_THROW(std::logic_error,
"Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureBhpTransferPolicy<O, Comm, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureBhpTransferPolicy<O, Comm, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
op, prm, weightsCalculator, pressureIndex, comm);
});
@ -321,12 +324,12 @@ struct StandardPreconditioners {
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
using CuILU0 = typename cuistl::
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuILU0 = std::make_shared<CuILU0>(op.getmat(), w);
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
@ -334,21 +337,21 @@ struct StandardPreconditioners {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuJac =
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuJac = std::make_shared<CuJac>(op.getmat(), w);
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
using field_type = typename V::field_type;
using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuDILU = std::make_shared<CuDILU>(op.getmat());
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
#endif
@ -368,11 +371,11 @@ struct StandardPreconditioners {
// Already a parallel preconditioner. Need to pass comm, but no need to wrap it in a BlockPreconditioner.
if (ilulevel == 0) {
const std::size_t num_interior = interiorIfGhostLast(comm);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, w, Opm::MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, w, MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
} else {
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, ilulevel, w, Opm::MILU_VARIANT::ILU, redblack, reorder_spheres);
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
op.getmat(), comm, ilulevel, w, MILU_VARIANT::ILU, redblack, reorder_spheres);
}
}
@ -412,8 +415,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using P = PropertyTree;
F::addCreator("ILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), 0, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), 0, w, MILU_VARIANT::ILU);
});
F::addCreator("DuneILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
@ -424,14 +427,14 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("ParOverILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
const int n = prm.get<int>("ilulevel", 0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, MILU_VARIANT::ILU);
});
F::addCreator("ILUn", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const int n = prm.get<int>("ilulevel", 0);
const double w = prm.get<double>("relaxation", 1.0);
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
op.getmat(), n, w, MILU_VARIANT::ILU);
});
F::addCreator("DILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
DUNE_UNUSED_PARAMETER(prm);
@ -513,11 +516,16 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
}
});
F::addCreator("famg", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
Dune::Amg::Parameters parms;
parms.setNoPreSmoothSteps(1);
parms.setNoPostSmoothSteps(1);
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
if constexpr (std::is_same_v<typename V::field_type, float>) {
OPM_THROW(std::logic_error, "famg requires UMFPack which is not available for floats");
return nullptr;
} else {
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
Dune::Amg::Parameters parms;
parms.setNoPreSmoothSteps(1);
parms.setNoPostSmoothSteps(1);
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
}
});
}
if constexpr (std::is_same_v<O, WellModelMatrixAdapter<M, V, V, false>>) {
@ -527,8 +535,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using Scalar = typename V::field_type;
using LevelTransferPolicy
= Opm::PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
= PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -540,7 +549,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -550,7 +560,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
}
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, true>;
using Scalar = typename V::field_type;
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, true>;
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
op, prm, weightsCalculator, pressureIndex);
});
@ -559,9 +570,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(
using CuILU0 = typename cuistl::
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(
std::make_shared<CuILU0>(op.getmat(), w));
});
@ -571,10 +582,10 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to =
typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using CuILU0 = typename Opm::cuistl::
CuSeqILU0<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
using CuILU0 = typename cuistl::
CuSeqILU0<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<CuILU0>(converted->getConvertedMatrix(), w));
converted->setUnderlyingPreconditioner(adapted);
@ -585,24 +596,24 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
const double w = prm.get<double>("relaxation", 1.0);
using field_type = typename V::field_type;
using CUJac =
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUJac>>(
std::make_shared<CUJac>(op.getmat(), w));
});
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using field_type = typename V::field_type;
using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
using CUDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
});
F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
using block_type = typename V::block_type;
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
using CuDILU = typename cuistl::CuDILU<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
auto converted = std::make_shared<Converter>(op.getmat());
auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
converted->setUnderlyingPreconditioner(adapted);
@ -744,7 +755,7 @@ using OpFSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Dune::FieldMatrix<double, Di
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
template <int Dim>
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Opm::MatrixBlock<double, Dim, Dim>>,
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<MatrixBlock<double, Dim, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
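The famg creator above guards against float builds with if constexpr, since that solver path relies on UMFPack. A minimal sketch of the compile-time guard, using stand-in types rather than the factory API:

#include <memory>
#include <stdexcept>
#include <type_traits>

struct DoubleOnlyPrec {};  // stand-in for a preconditioner that only supports double

template<class Vector>
std::shared_ptr<DoubleOnlyPrec> makeFamg()
{
    if constexpr (std::is_same_v<typename Vector::field_type, float>) {
        // The float branch never instantiates the unsupported solver.
        throw std::logic_error("famg requires UMFPack which is not available for floats");
        return nullptr;  // unreachable, keeps every branch returning a value
    } else {
        return std::make_shared<DoubleOnlyPrec>();
    }
}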

View File

@ -76,31 +76,36 @@ namespace Opm
namespace Details
{
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
template <class Comm>
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>>;
template<class Scalar, class Comm>
using ParCoarseOperatorType
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
template <class Comm>
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>,
Comm>;
template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
SeqCoarseOperatorType,
ParCoarseOperatorType<Comm>>;
SeqCoarseOperatorType<Scalar>,
ParCoarseOperatorType<Scalar,Comm>>;
} // namespace Details
template <class FineOperator, class Communication, bool transpose = false>
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
template<class FineOperator, class Communication, class Scalar, bool transpose = false>
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
{
public:
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
typedef Communication ParallelInformation;
typedef typename FineOperator::domain_type FineVectorType;
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
using ParallelInformation = Communication;
using FineVectorType = typename FineOperator::domain_type;
public:
PressureBhpTransferPolicy(const Communication& comm,
const FineVectorType& weights,
const Opm::PropertyTree& prm,
const PropertyTree& prm,
const std::size_t pressureIndex)
: communication_(&const_cast<Communication&>(comm))
, weights_(weights)
@ -109,7 +114,7 @@ namespace Opm
{
}
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
void createCoarseLevelSystem(const FineOperator& fineOperator) override
{
OPM_TIMEBLOCK(createCoarseLevelSystem);
using CoarseMatrix = typename CoarseOperator::matrix_type;
@ -164,7 +169,7 @@ namespace Opm
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
}
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
void calculateCoarseEntries(const FineOperator& fineOperator) override
{
OPM_TIMEBLOCK(calculateCoarseEntries);
const auto& fineMatrix = fineOperator.getmat();
@ -175,7 +180,7 @@ namespace Opm
auto entryCoarse = rowCoarse->begin();
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
assert(entry.index() == entryCoarse.index());
double matrix_el = 0;
Scalar matrix_el = 0;
if (transpose) {
const auto& bw = weights_[entry.index()];
for (std::size_t i = 0; i < bw.size(); ++i) {
@ -203,7 +208,7 @@ namespace Opm
}
}
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
{
OPM_TIMEBLOCK(moveToCoarseLevel);
// NB: we iterate over fine assuming the well dofs are at the end
@ -214,7 +219,7 @@ namespace Opm
for (auto block = begin; block != end; ++block) {
const auto& bw = weights_[block.index()];
double rhs_el = 0.0;
Scalar rhs_el = 0.0;
if (transpose) {
rhs_el = (*block)[pressure_var_index_];
} else {
@ -228,7 +233,7 @@ namespace Opm
this->lhs_ = 0;
}
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
{
OPM_TIMEBLOCK(moveToFineLevel);
// NB: we iterate over fine assuming the well dofs are at the end
@ -246,7 +251,7 @@ namespace Opm
}
}
virtual PressureBhpTransferPolicy* clone() const override
PressureBhpTransferPolicy* clone() const override
{
return new PressureBhpTransferPolicy(*this);
}
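The Details namespace above picks the coarse operator type at compile time from the communication type. The same selection in isolation, with stand-ins for the Dune operator types:

#include <type_traits>

struct SequentialInformation {};                           // stand-in for Dune::Amg::SequentialInformation
template<class Scalar> struct SeqCoarseOp {};              // stand-in for Dune::MatrixAdapter<...>
template<class Scalar, class Comm> struct ParCoarseOp {};  // stand-in for Dune::OverlappingSchwarzOperator<...>

template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same_v<Comm, SequentialInformation>,
                                              SeqCoarseOp<Scalar>,
                                              ParCoarseOp<Scalar, Comm>>;

static_assert(std::is_same_v<CoarseOperatorType<double, SequentialInformation>,
                             SeqCoarseOp<double>>, "sequential comm picks the sequential operator");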

View File

@ -28,39 +28,40 @@
#include <cstddef>
namespace Opm
{
namespace Details
{
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
template <class Comm>
namespace Opm { namespace Details {
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>>;
template<class Scalar, class Comm>
using ParCoarseOperatorType
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
template <class Comm>
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
PressureVectorType<Scalar>,
PressureVectorType<Scalar>,
Comm>;
template<class Scalar, class Comm>
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
SeqCoarseOperatorType,
ParCoarseOperatorType<Comm>>;
SeqCoarseOperatorType<Scalar>,
ParCoarseOperatorType<Scalar,Comm>>;
} // namespace Details
template <class FineOperator, class Communication, bool transpose = false>
template <class FineOperator, class Communication, class Scalar, bool transpose = false>
class PressureTransferPolicy
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
{
public:
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
typedef Communication ParallelInformation;
typedef typename FineOperator::domain_type FineVectorType;
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
using ParallelInformation = Communication;
using FineVectorType = typename FineOperator::domain_type;
public:
PressureTransferPolicy(const Communication& comm,
const FineVectorType& weights,
const Opm::PropertyTree& /*prm*/,
const PropertyTree& /*prm*/,
int pressure_var_index)
: communication_(&const_cast<Communication&>(comm))
, weights_(weights)
@ -68,7 +69,7 @@ public:
{
}
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
void createCoarseLevelSystem(const FineOperator& fineOperator) override
{
using CoarseMatrix = typename CoarseOperator::matrix_type;
const auto& fineLevelMatrix = fineOperator.getmat();
@ -92,7 +93,7 @@ public:
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
}
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
void calculateCoarseEntries(const FineOperator& fineOperator) override
{
const auto& fineMatrix = fineOperator.getmat();
*coarseLevelMatrix_ = 0;
@ -102,7 +103,7 @@ public:
auto entryCoarse = rowCoarse->begin();
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
assert(entry.index() == entryCoarse.index());
double matrix_el = 0;
Scalar matrix_el = 0;
if (transpose) {
const auto& bw = weights_[entry.index()];
for (std::size_t i = 0; i < bw.size(); ++i) {
@ -120,7 +121,7 @@ public:
assert(rowCoarse == coarseLevelMatrix_->end());
}
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
{
// Set coarse vector to zero
this->rhs_ = 0;
@ -129,7 +130,7 @@ public:
for (auto block = begin; block != end; ++block) {
const auto& bw = weights_[block.index()];
double rhs_el = 0.0;
Scalar rhs_el = 0.0;
if (transpose) {
rhs_el = (*block)[pressure_var_index_];
} else {
@ -143,7 +144,7 @@ public:
this->lhs_ = 0;
}
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
{
auto end = fine.end(), begin = fine.begin();
@ -159,7 +160,7 @@ public:
}
}
virtual PressureTransferPolicy* clone() const override
PressureTransferPolicy* clone() const override
{
return new PressureTransferPolicy(*this);
}
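moveToCoarseLevel() above collapses every fine block to a single coarse (pressure) value, either by picking the pressure component directly (transpose) or by a weighted sum over the block. A free-standing sketch of that reduction on flat arrays of blocks, with made-up function and parameter names:

#include <array>
#include <cstddef>
#include <vector>

template<class Scalar, int BlockSize>
std::vector<Scalar> restrictToPressure(const std::vector<std::array<Scalar, BlockSize>>& fine,
                                       const std::vector<std::array<Scalar, BlockSize>>& weights,
                                       int pressureIndex,
                                       bool transpose)
{
    std::vector<Scalar> coarse(fine.size(), Scalar{0});
    for (std::size_t row = 0; row < fine.size(); ++row) {
        if (transpose) {
            coarse[row] = fine[row][pressureIndex];             // restriction by P^T picks the pressure entry
        } else {
            for (int i = 0; i < BlockSize; ++i) {
                coarse[row] += fine[row][i] * weights[row][i];  // weighted sum over the block
            }
        }
    }
    return coarse;
}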

View File

@ -52,56 +52,70 @@
typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
namespace Opm {
using Opm::Accelerator::BdaResult;
using Opm::Accelerator::BdaSolver;
using Opm::Accelerator::SolverStatus;
using Accelerator::BdaResult;
using Accelerator::BdaSolver;
using Accelerator::SolverStatus;
template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
int linear_solver_verbosity,
[[maybe_unused]] int maxit,
[[maybe_unused]] double tolerance,
[[maybe_unused]] unsigned int platformID,
[[maybe_unused]] unsigned int deviceID,
[[maybe_unused]] bool opencl_ilu_parallel,
[[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
template<class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::
BdaBridge(std::string accelerator_mode_,
int linear_solver_verbosity,
[[maybe_unused]] int maxit,
[[maybe_unused]] Scalar tolerance,
[[maybe_unused]] unsigned int platformID,
[[maybe_unused]] unsigned int deviceID,
[[maybe_unused]] bool opencl_ilu_parallel,
[[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity)
, accelerator_mode(accelerator_mode_)
{
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
use_gpu = true;
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
using CU = Accelerator::cusparseSolverBackend<Scalar,block_size>;
backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
#else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif
} else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
use_gpu = true;
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
backend = std::make_unique<OCL>(linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
} else if (accelerator_mode.compare("amgcl") == 0) {
#if HAVE_AMGCL
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
using AMGCL = Accelerator::amgclSolverBackend<Scalar,block_size>;
backend = std::make_unique<AMGCL>(linear_solver_verbosity, maxit,
tolerance, platformID, deviceID);
#else
OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
#endif
} else if (accelerator_mode.compare("rocalution") == 0) {
#if HAVE_ROCALUTION
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::rocalutionSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
using ROCA = Accelerator::rocalutionSolverBackend<Scalar,block_size>;
backend = std::make_unique<ROCA>(linear_solver_verbosity, maxit, tolerance);
#else
OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake");
#endif
} else if (accelerator_mode.compare("rocsparse") == 0) {
#if HAVE_ROCSPARSE
use_gpu = true; // should be replaced by a 'use_bridge' boolean
backend.reset(new Opm::Accelerator::rocsparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
tolerance, platformID, deviceID);
#else
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
#endif
@ -112,13 +126,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
}
}
template <class BridgeMatrix>
int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::size_type>& diag_indices) {
int replaceZeroDiagonal(BridgeMatrix& mat,
std::vector<typename BridgeMatrix::size_type>& diag_indices)
{
using Scalar = typename BridgeMatrix::field_type;
int numZeros = 0;
const int dim = mat[0][0].N(); // might be replaced with BridgeMatrix::block_type::size()
const double zero_replace = 1e-15;
const Scalar zero_replace = 1e-15;
if (diag_indices.empty()) {
int Nb = mat.N();
diag_indices.reserve(Nb);
@ -134,7 +149,7 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
}
diag_indices.emplace_back(diag.offset());
}
}else{
} else {
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
@ -151,13 +166,15 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
return numZeros;
}
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
// sparsity pattern should stay the same
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
copySparsityPatternFromISTL(const BridgeMatrix& mat,
std::vector<int>& h_rows,
std::vector<int>& h_cols)
{
h_rows.clear();
h_cols.clear();
@ -172,17 +189,19 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
OPM_THROW(std::logic_error,
"Error size of rows do not sum to number of nonzeroes "
"in BdaBridge::copySparsityPatternFromISTL()");
}
}
// check if the nnz values of the matrix are in contiguous memory
// this is done by checking if the distance between the last value of the last block of row 0 and
// the first value of the first block of row 1 is equal to 1
// if the matrix only has 1 row, it is always contiguous
template <class BridgeMatrix>
void checkMemoryContiguous(const BridgeMatrix& mat) {
void checkMemoryContiguous(const BridgeMatrix& mat)
{
auto block_size = mat[0][0].N();
auto row = mat.begin();
auto last_of_row0 = row->begin();
@ -199,14 +218,14 @@ void checkMemoryContiguous(const BridgeMatrix& mat) {
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions& wellContribs,
InverseOperatorResult& res)
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions<Scalar>& wellContribs,
InverseOperatorResult& res)
{
if (use_gpu) {
BdaResult result;
@ -221,38 +240,48 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
return;
}
using Mat = Accelerator::BlockedMatrix<Scalar>;
if (!matrix) {
h_rows.reserve(Nb+1);
h_cols.reserve(nnzb);
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
checkMemoryContiguous(*bridgeMat);
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
matrix = std::make_unique<Mat>(Nb, nnzb, block_size,
static_cast<Scalar*>(&(((*bridgeMat)[0][0][0][0]))),
h_cols.data(),
h_rows.data());
}
Dune::Timer t_zeros;
int numZeros = replaceZeroDiagonal(*bridgeMat, diagIndices);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
out << "Checking zeros took: " << t_zeros.stop() << " s, found "
<< numZeros << " zeros";
OpmLog::info(out.str());
}
if (numJacobiBlocks >= 2) {
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes()
: h_jacRows.back();
if (!jacMatrix) {
h_jacRows.reserve(Nb+1);
h_jacCols.reserve(jacNnzb);
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
checkMemoryContiguous(*jacMat);
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
jacMatrix = std::make_unique<Mat>(Nb, jacNnzb, block_size,
static_cast<Scalar*>(&(((*jacMat)[0][0][0][0]))),
h_jacCols.data(),
h_jacRows.data());
}
Dune::Timer t_zeros2;
int jacNumZeros = replaceZeroDiagonal(*jacMat, jacDiagIndices);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
out << "Checking zeros for jacMat took: " << t_zeros2.stop()
<< " s, found " << jacNumZeros << " zeros";
OpmLog::info(out.str());
}
}
@ -260,17 +289,23 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
/////////////////////////
// actually solve
// assume that the underlying data (nonzeroes) of b (Dune::BlockVector) are contiguous; if not, the chosen BdaSolver will exhibit undefined behaviour
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
SolverStatus status = backend->solve_system(matrix,
static_cast<Scalar*>(&(b[0][0])),
jacMatrix, wellContribs, result);
switch(status) {
switch (status) {
case SolverStatus::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged");
break;
case SolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
OpmLog::warning("BdaSolver could not analyse level information of matrix, "
"perhaps there is still a 0.0 on the diagonal of a "
"block on the diagonal");
break;
case SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
OpmLog::warning("BdaSolver could not create preconditioner, "
"perhaps there is still a 0.0 on the diagonal "
"of a block on the diagonal");
break;
default:
OpmLog::warning("BdaSolver returned unknown status code");
@ -286,21 +321,27 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unused]] BridgeVector& x) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
get_result([[maybe_unused]] BridgeVector& x)
{
if (use_gpu) {
backend->get_result(static_cast<double*>(&(x[0][0])));
backend->get_result(static_cast<Scalar*>(&(x[0][0])));
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] unsigned N) {
if(accelerator_mode.compare("opencl") == 0){
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
initWellContributions([[maybe_unused]] WellContributions<Scalar>& wellContribs,
[[maybe_unused]] unsigned N)
{
if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
const auto openclBackend = static_cast<const OCL*>(backend.get());
using WCOCL = WellContributionsOCL<Scalar>;
static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@ -309,23 +350,20 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
}
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>; \
\
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<double, n, n>, std::allocator<Dune::FieldMatrix<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>;
#define INSTANTIATE_BDA_FUNCTIONS(T,n) \
template class BdaBridge<Dune::BCRSMatrix<MatrixBlock<T,n,n>>, \
Dune::BlockVector<Dune::FieldVector<T,n>>,n>; \
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<T,n,n>>, \
Dune::BlockVector<Dune::FieldVector<T,n>>,n>;
#define INSTANTIATE_TYPE(T) \
INSTANTIATE_BDA_FUNCTIONS(T,1) \
INSTANTIATE_BDA_FUNCTIONS(T,2) \
INSTANTIATE_BDA_FUNCTIONS(T,3) \
INSTANTIATE_BDA_FUNCTIONS(T,4) \
INSTANTIATE_BDA_FUNCTIONS(T,5) \
INSTANTIATE_BDA_FUNCTIONS(T,6)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
INSTANTIATE_TYPE(double)
} // namespace Opm
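replaceZeroDiagonal() above swaps exact zeros on each diagonal block's own diagonal for a tiny value (1e-15) so the GPU ILU setup does not divide by zero. A simplified sketch of the same idea on a flat array of diagonal blocks, not the OPM function itself:

#include <vector>

template<class Scalar>
int replaceZeroDiagonal(std::vector<Scalar>& diagBlocks, int blockSize)
{
    const Scalar zeroReplace = 1e-15;
    int numZeros = 0;
    const int nBlocks = static_cast<int>(diagBlocks.size()) / (blockSize * blockSize);
    for (int b = 0; b < nBlocks; ++b) {
        for (int i = 0; i < blockSize; ++i) {
            // element (i,i) of block b, stored row-major
            Scalar& d = diagBlocks[b * blockSize * blockSize + i * blockSize + i];
            if (d == 0.0) {
                d = zeroReplace;
                ++numZeros;
            }
        }
    }
    return numZeros;
}

template int replaceZeroDiagonal(std::vector<double>&, int);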

View File

@ -27,7 +27,7 @@
namespace Opm
{
class WellContributions;
template<class Scalar> class WellContributions;
typedef Dune::InverseOperatorResult InverseOperatorResult;
@ -36,12 +36,13 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
class BdaBridge
{
private:
using Scalar = typename BridgeVector::field_type;
int verbosity = 0;
bool use_gpu = false;
std::string accelerator_mode;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::unique_ptr<Accelerator::BdaSolver<Scalar,block_size>> backend;
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
std::vector<typename BridgeMatrix::size_type> diagIndices; // contains offsets of the diagonal blocks wrt start of the row, used for replaceZeroDiagonal()
@ -57,8 +58,14 @@ public:
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
BdaBridge(std::string accelerator_mode, int linear_solver_verbosity, int maxit, double tolerance,
unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
BdaBridge(std::string accelerator_mode,
int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID,
bool opencl_ilu_parallel,
std::string linsolver);
/// Solve linear system, A*x = b
@ -69,7 +76,12 @@ public:
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
void solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions<Scalar>& wellContribs,
InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector
@ -77,7 +89,8 @@ public:
/// Return whether the BdaBridge will use the GPU or not
bool getUseGpu(){
bool getUseGpu()
{
return use_gpu;
}
@ -85,19 +98,21 @@ public:
/// \param[in] mat input matrix, probably BCRSMatrix
/// \param[out] h_rows rowpointers
/// \param[out] h_cols columnindices
static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
static void copySparsityPatternFromISTL(const BridgeMatrix& mat,
std::vector<int>& h_rows,
std::vector<int>& h_cols);
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
void initWellContributions(WellContributions& wellContribs, unsigned N);
void initWellContributions(WellContributions<Scalar>& wellContribs, unsigned N);
/// Return the selected accelerator mode, this is input via the command-line
std::string getAccleratorName(){
std::string getAccleratorName()
{
return accelerator_mode;
}
}; // end class BdaBridge
}
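copySparsityPatternFromISTL() fills h_rows and h_cols with a block-CSR pattern: Nb+1 row pointers and one column index per nonzero block. A toy example of what those arrays look like for a 2x2 block pattern:

#include <cassert>
#include <vector>

int main()
{
    // Block pattern: row 0 has blocks in columns {0, 1}, row 1 only in column {1}.
    std::vector<int> h_rows = {0, 2, 3};  // Nb + 1 entries; h_rows[i+1] - h_rows[i] = blocks in row i
    std::vector<int> h_cols = {0, 1, 1};  // one entry per nonzero block

    assert(static_cast<int>(h_cols.size()) == h_rows.back());  // nnzb equals the last row pointer
}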

View File

@ -25,70 +25,86 @@
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <memory>
#include <string>
namespace Opm {
class WellContributions;
template<class Scalar> class WellContributions;
namespace Accelerator {
enum class SolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations of it can be made, only of its children
template <unsigned int block_size>
class BdaSolver
{
enum class SolverStatus {
BDA_SOLVER_SUCCESS,
BDA_SOLVER_ANALYSIS_FAILED,
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
BDA_SOLVER_UNKNOWN_ERROR
};
protected:
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
/// This class is abstract, no instantiations of it can be made, only of its children
template<class Scalar, unsigned int block_size>
class BdaSolver
{
protected:
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print number of iterations and final norm
// 2: also print norm each iteration
// 3: also print timings of different backend functions
int verbosity = 0;
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print number of iterations and final norm
// 2: also print norm each iteration
// 3: also print timings of different backend functions
int maxit = 200;
Scalar tolerance = 1e-2;
int verbosity = 0;
int N; // number of rows
int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
int maxit = 200;
double tolerance = 1e-2;
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
unsigned int deviceID = 0; // ID of the device to be used
int N; // number of rows
int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars)
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
bool initialized = false;
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
unsigned int deviceID = 0; // ID of the device to be used
public:
/// Construct a BdaSolver
/// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver
/// \param[in] tolerance required relative tolerance for solver
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
/// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, Scalar tolerance_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
{}
BdaSolver(int linear_solver_verbosity, int max_it,
Scalar tolerance_, unsigned int deviceID_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
, deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it,
double tolerance_, unsigned int platformID_,
unsigned int deviceID_)
: verbosity(linear_solver_verbosity)
, maxit(max_it)
, tolerance(tolerance_)
, platformID(platformID_)
, deviceID(deviceID_)
{}
bool initialized = false;
/// Define a virtual destructor, so that the derived-class destructor will be called
virtual ~BdaSolver() = default;
public:
/// Construct a BdaSolver
/// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver
/// \param[in] tolerance required relative tolerance for solver
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
/// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
/// Defined as pure virtual functions, so derived classes must implement them
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) = 0;
/// Define a virtual destructor, so that the derived-class destructor will be called
virtual ~BdaSolver() {};
/// Defined as pure virtual functions, so derived classes must implement them
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
virtual void get_result(double *x) = 0;
}; // end class BdaSolver
virtual void get_result(Scalar* x) = 0;
}; // end class BdaSolver
} // namespace Accelerator
} // namespace Opm
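BdaSolver above is the templated abstract base the bridge holds; each accelerator backend overrides the pure virtuals. A reduced sketch of that pattern with stand-in names, not the OPM interfaces:

#include <memory>

template<class Scalar>
class SolverBase
{
public:
    virtual ~SolverBase() = default;
    virtual void solve(const Scalar* b) = 0;
    virtual void getResult(Scalar* x) const = 0;
};

template<class Scalar>
class CpuBackend : public SolverBase<Scalar>
{
public:
    void solve(const Scalar* b) override { last_ = b[0]; }     // placeholder "solve"
    void getResult(Scalar* x) const override { x[0] = last_; }
private:
    Scalar last_ = 0;
};

int main()
{
    std::unique_ptr<SolverBase<double>> backend = std::make_unique<CpuBackend<double>>();
    const double b[1] = {1.0};
    double x[1] = {0.0};
    backend->solve(b);
    backend->getResult(x);
}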

View File

@ -17,9 +17,6 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstring>
#include <cmath>
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
@ -29,16 +26,10 @@
#include <opm/simulators/linalg/bda/Matrix.hpp>
namespace Opm
namespace Opm::Accelerator {
void sortRow(int *colIndices, int *data, int left, int right)
{
namespace Accelerator
{
using Opm::OpmLog;
void sortRow(int *colIndices, int *data, int left, int right) {
int l = left;
int r = right;
int middle = colIndices[(l + r) >> 1];
@ -67,14 +58,14 @@ void sortRow(int *colIndices, int *data, int left, int right) {
sortRow(colIndices, data, l, right);
}
// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
// a = a - (b * c)
void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
template<class Scalar>
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < block_size; k++) {
temp += b[block_size * row + k] * c[block_size * k + col];
}
@ -84,11 +75,12 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
}
/*Perform a 3x3 matrix-matrix multiplication on two blocks*/
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
template<class Scalar>
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0;
Scalar temp = 0;
for (unsigned int k = 0; k < block_size; k++) {
temp += mat1[block_size * row + k] * mat2[block_size * k + col];
}
@ -97,5 +89,10 @@ void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_si
}
}
} // namespace Accelerator
} // namespace Opm
#define INSTANCE_TYPE(T) \
template void blockMultSub(T*, T*, T*, unsigned int); \
template void blockMult(T*, T*, T*, unsigned int);
INSTANCE_TYPE(double)
} // namespace Opm::Accelerator
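Annotation, a small self-contained illustration that is not from the commit, of the row-major dense block layout these helpers operate on; it assumes the declarations in BlockedMatrix.hpp shown further below.
#include <array>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>   // assumed header location

void blockHelpersDemo()
{
    constexpr unsigned int bs = 3;
    // each block stores its entries row-wise: element (r,c) sits at index r*bs + c
    std::array<double, bs*bs> a{}, b{}, c{}, r{};
    b.fill(1.0);
    c.fill(2.0);

    Opm::Accelerator::blockMult(b.data(), c.data(), r.data(), bs);    // r = b * c
    Opm::Accelerator::blockMultSub(a.data(), b.data(), c.data(), bs); // a = a - b * c
}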

View File

@ -20,44 +20,40 @@
#ifndef OPM_BLOCKED_MATRIX_HPP
#define OPM_BLOCKED_MATRIX_HPP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
/// The data is stored in contiguous memory, such that it can be copied to a device in one transfer.
template<class Scalar>
class BlockedMatrix
{
public:
/// Allocate BlockedMatrix and data arrays with given sizes
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks
/// \param[in] block_size the number of rows and columns for each block
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
: nnzValues(new double[nnzbs_*block_size_*block_size_]),
colIndices(new int[nnzbs_*block_size_*block_size_]),
rowPointers(new int[Nb_+1]),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(true),
deleteSparsity(true)
: nnzValues(new Scalar[nnzbs_*block_size_*block_size_])
, colIndices(new int[nnzbs_*block_size_*block_size_])
, rowPointers(new int[Nb_+1])
, Nb(Nb_)
, nnzbs(nnzbs_)
, block_size(block_size_)
, deleteNnzs(true)
, deleteSparsity(true)
{}
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
/// \param[in] M matrix to be copied
BlockedMatrix(const BlockedMatrix& M)
: nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
colIndices(M.colIndices),
rowPointers(M.rowPointers),
Nb(M.Nb),
nnzbs(M.nnzbs),
block_size(M.block_size),
deleteNnzs(true),
deleteSparsity(false)
: nnzValues(new Scalar[M.nnzbs*M.block_size*M.block_size])
, colIndices(M.colIndices)
, rowPointers(M.rowPointers)
, Nb(M.Nb)
, nnzbs(M.nnzbs)
, block_size(M.block_size)
, deleteNnzs(true)
, deleteSparsity(false)
{}
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
@ -67,18 +63,20 @@ public:
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
/// \param[in] colIndices array of column indices, contains nnzb entries
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
: nnzValues(nnzValues_),
colIndices(colIndices_),
rowPointers(rowPointers_),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(false),
deleteSparsity(false)
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_,
Scalar* nnzValues_, int *colIndices_, int *rowPointers_)
: nnzValues(nnzValues_)
, colIndices(colIndices_)
, rowPointers(rowPointers_)
, Nb(Nb_)
, nnzbs(nnzbs_)
, block_size(block_size_)
, deleteNnzs(false)
, deleteSparsity(false)
{}
~BlockedMatrix(){
~BlockedMatrix()
{
if (deleteNnzs) {
delete[] nnzValues;
}
@ -88,8 +86,7 @@ public:
}
}
double *nnzValues;
Scalar* nnzValues;
int *colIndices;
int *rowPointers;
int Nb;
@ -99,14 +96,13 @@ public:
bool deleteSparsity;
};
/// Sort a row of matrix elements in CSR format, where the nonzero entries are ints
/// These ints are not actual nonzero values, but represent a mapping used later
/// \param[inout] colIndices represent keys in sorting
/// \param[inout] data sorted according to the colIndices
/// \param[in] left lower index of data of row
/// \param[in] right upper index of data of row
void sortRow(int *colIndices, int *data, int left, int right);
void sortRow(int* colIndices, int* data, int left, int right);
/// Multiply and subtract blocks
/// a = a - (b * c)
@ -114,7 +110,8 @@ void sortRow(int *colIndices, int *data, int left, int right);
/// \param[in] b input block
/// \param[in] c input block
/// \param[in] block_size size of block
void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
template<class Scalar>
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size);
/// Perform a matrix-matrix multiplication on two blocks
/// resMat = mat1 * mat2
@ -122,9 +119,9 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
/// \param[in] mat2 input block 2
/// \param[out] resMat output block
/// \param[in] block_size size of block
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
template<class Scalar>
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size);
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
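Annotation, a usage sketch that is not from the commit: wrapping existing BCSR arrays in a BlockedMatrix<double> without transferring ownership, via the third constructor above. The tiny 2x2-blocked matrix is made up.
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>   // assumed header location

void wrapExistingBcsr()
{
    constexpr int Nb = 2;            // block rows
    constexpr int nnzbs = 2;         // nonzero blocks
    constexpr unsigned int bs = 2;   // block size
    double nnzValues[nnzbs*bs*bs] = {1, 0, 0, 1,   1, 0, 0, 1};
    int colIndices[nnzbs] = {0, 1};
    int rowPointers[Nb+1] = {0, 1, 2};

    // deleteNnzs and deleteSparsity stay false, so the caller keeps ownership
    Opm::Accelerator::BlockedMatrix<double> A(Nb, nnzbs, bs,
                                              nnzValues, colIndices, rowPointers);
    (void)A;
}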

View File

@ -29,17 +29,17 @@ namespace Accelerator
/// This struct resembles a csr matrix, only doubles are supported
/// The data is stored in contiguous memory, such that it can be copied to a device in one transfer.
class Matrix {
template<class Scalar>
class Matrix
{
public:
/// Allocate square Matrix and data arrays with given sizes
/// \param[in] N number of rows
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int nnzs_)
: N(N_),
M(N_),
nnzs(nnzs_)
: N(N_)
, M(N_)
, nnzs(nnzs_)
{
nnzValues.resize(nnzs);
colIndices.resize(nnzs);
@ -51,12 +51,12 @@ public:
/// \param[in] M number of columns
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int M_, int nnzs_)
: Matrix(N_, nnzs_)
: Matrix(N_, nnzs_)
{
M = M_;
}
std::vector<double> nnzValues;
std::vector<Scalar> nnzValues;
std::vector<int> colIndices;
std::vector<int> rowPointers;
int N, M;
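Annotation, a usage sketch that is not from the commit (the hunk above is truncated): building a small unblocked CSR matrix with the templated Matrix class; the values are made up and the members are assumed public as shown.
#include <opm/simulators/linalg/bda/Matrix.hpp>   // assumed header location

void buildSmallCsr()
{
    const int N = 3;     // rows (square-matrix constructor)
    const int nnzs = 5;  // nonzeros
    Opm::Accelerator::Matrix<double> A(N, nnzs);

    A.rowPointers = {0, 2, 3, 5};
    A.colIndices  = {0, 2, 1, 0, 2};
    A.nnzValues   = {4.0, -1.0, 3.0, -1.0, 2.0};
}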

View File

@ -29,21 +29,27 @@
namespace Opm
{
MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Mb_,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks_, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues)
:
dim(dim_), // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
dim_wells(dim_wells_), // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
M(Mb_ * dim_wells), // number of rows, M == dim_wells*Mb
Mb(Mb_), // number of blockrows in C, D and B
DnumBlocks(DnumBlocks_), // number of blocks in D
template<class Scalar>
MultisegmentWellContribution<Scalar>::
MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Mb_,
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks_,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues)
: dim(dim_) // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
, dim_wells(dim_wells_) // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
, M(Mb_ * dim_wells) // number of rows, M == dim_wells*Mb
, Mb(Mb_) // number of blockrows in C, D and B
, DnumBlocks(DnumBlocks_) // number of blocks in D
// copy data for matrix D into vectors to prevent it going out of scope
Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
Dcols(DcolPointers, DcolPointers + M + 1),
Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
, Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells)
, Dcols(DcolPointers, DcolPointers + M + 1)
, Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
{
Cvals = std::move(Cvalues);
Bvals = std::move(Bvalues);
@ -57,17 +63,18 @@ MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, un
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
}
MultisegmentWellContribution::~MultisegmentWellContribution()
template<class Scalar>
MultisegmentWellContribution<Scalar>::~MultisegmentWellContribution()
{
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
umfpack_di_free_numeric(&UMFPACK_Numeric);
}
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
// h_x and h_y reside on host
// y -= (C^T * (D^-1 * (B * x)))
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
template<class Scalar>
void MultisegmentWellContribution<Scalar>::apply(Scalar* h_x, Scalar* h_y)
{
OPM_TIMEBLOCK(apply);
// reset z1 and z2
@ -80,7 +87,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim_wells; ++j) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < dim; ++k) {
temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
}
@ -100,7 +107,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim; ++j) {
double temp = 0.0;
Scalar temp = 0.0;
for (unsigned int k = 0; k < dim_wells; ++k) {
temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
}
@ -111,11 +118,14 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
}
#if HAVE_CUDA
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void MultisegmentWellContribution<Scalar>::setCudaStream(cudaStream_t stream_)
{
stream = stream_;
}
#endif
template class MultisegmentWellContribution<double>;
} //namespace Opm
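Annotation, a size-bookkeeping sketch that is not from the commit, using the meanings documented in the constructor above (dim = numEq per cell, dim_wells = numWellEq, Mb = block rows in B, C and D); the numbers are made up.
#include <cstddef>

void msWellSizes()
{
    const unsigned int dim = 3;        // block-vector size in x and y
    const unsigned int dim_wells = 4;  // block size in B, C and D
    const unsigned int Mb = 2;         // block rows in B, C and D
    const unsigned int M = Mb * dim_wells;        // rows of D, here 8

    // apply() computes y -= C^T * (D^-1 * (B * x)):
    // z1 = B*x and z2 = D^-1*z1 each hold M scalars,
    // every block of B and C stores dim_wells*dim scalars,
    // D stores DnumBlocks*dim_wells*dim_wells scalars.
    const std::size_t z_len   = M;                // 8 scalars
    const std::size_t bcBlock = dim_wells * dim;  // 12 scalars per B/C block
    (void)z_len; (void)bcBlock;
}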

View File

@ -41,6 +41,7 @@ namespace Opm
/// B*x and D*B*x are vectors with M*numWellEq doubles
/// C*D*B*x is a vector with N*numEq doubles.
template<class Scalar>
class MultisegmentWellContribution
{
@ -57,15 +58,15 @@ private:
// C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
// Sparsity pattern for C is not stored, since it is the same as B
unsigned int DnumBlocks; // number of blocks in D
std::vector<double> Cvals;
std::vector<double> Dvals;
std::vector<double> Bvals;
std::vector<Scalar> Cvals;
std::vector<Scalar> Dvals;
std::vector<Scalar> Bvals;
std::vector<int> Dcols; // Column pointers, contains M+1 entries
std::vector<unsigned int> Bcols;
std::vector<int> Drows; // Row indices, contains DnumBlocks*dim*dim_wells entries
std::vector<unsigned int> Brows;
std::vector<double> z1; // z1 = B * x
std::vector<double> z2; // z2 = D^-1 * B * x
std::vector<Scalar> z1; // z1 = B * x
std::vector<Scalar> z2; // z2 = D^-1 * B * x
void *UMFPACK_Symbolic, *UMFPACK_Numeric;
/// Translate the columnIndex if needed
@ -97,9 +98,14 @@ public:
/// \param[in] Cvalues nonzero values of matrix C
MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
unsigned int Mb,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers,
UMFPackIndex *DrowIndices, std::vector<double> &Cvalues);
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues);
/// Destroy a MultisegmentWellContribution, and free memory
~MultisegmentWellContribution();
@ -108,7 +114,7 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
/// \param[in] h_x vector x, must be on CPU
/// \param[inout] h_y vector y, must be on CPU
void apply(double *h_x, double *h_y);
void apply(Scalar* h_x, Scalar* h_y);
};
} //namespace Opm

View File

@ -39,35 +39,36 @@
namespace Opm {
WellContributions::~WellContributions() = default;
template<class Scalar>
WellContributions<Scalar>::~WellContributions() = default;
std::unique_ptr<WellContributions>
WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
template<class Scalar>
std::unique_ptr<WellContributions<Scalar>>
WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useWellConn)
{
if(accelerator_mode.compare("cusparse") == 0){
if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA
return std::make_unique<WellContributionsCuda>();
return std::make_unique<WellContributionsCuda<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
#endif
}
else if(accelerator_mode.compare("opencl") == 0){
else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL
return std::make_unique<WellContributionsOCL>();
return std::make_unique<WellContributionsOCL<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("rocsparse") == 0){
else if (accelerator_mode.compare("rocsparse") == 0) {
if (!useWellConn) {
#if HAVE_ROCSPARSE
return std::make_unique<WellContributionsRocsparse>();
return std::make_unique<WellContributionsRocsparse<Scalar>>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
#endif
}
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {
@ -86,10 +87,12 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
}
}
void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] double* values,
[[maybe_unused]] unsigned int val_size)
template<class Scalar>
void WellContributions<Scalar>::
addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] Scalar* values,
[[maybe_unused]] unsigned int val_size)
{
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
@ -107,7 +110,8 @@ void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
}
}
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
template<class Scalar>
void WellContributions<Scalar>::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
dim = dim_;
dim_wells = dim_wells_;
@ -121,11 +125,14 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
}
}
void WellContributions::setVectorSize(unsigned N_) {
template<class Scalar>
void WellContributions<Scalar>::setVectorSize(unsigned N_)
{
N = N_;
}
void WellContributions::addNumBlocks(unsigned int numBlocks)
template<class Scalar>
void WellContributions<Scalar>::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {
OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
@ -134,7 +141,8 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
num_std_wells++;
}
void WellContributions::alloc()
template<class Scalar>
void WellContributions<Scalar>::alloc()
{
if (num_std_wells > 0) {
val_pointers.resize(num_std_wells+1);
@ -144,31 +152,36 @@ void WellContributions::alloc()
}
}
void WellContributions::addMultisegmentWellContribution(unsigned int dim_,
unsigned int dim_wells_,
unsigned int Mb,
std::vector<double>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
double* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<double>& Cvalues)
template<class Scalar>
void WellContributions<Scalar>::
addMultisegmentWellContribution(unsigned int dim_,
unsigned int dim_wells_,
unsigned int Mb,
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues)
{
assert(dim==dim_);
multisegments.push_back(std::make_unique<MultisegmentWellContribution>(dim_,
dim_wells_,
Mb,
Bvalues,
BcolIndices,
BrowPointers,
DnumBlocks,
Dvalues,
DcolPointers,
DrowIndices,
Cvalues));
using MSW = MultisegmentWellContribution<Scalar>;
multisegments.push_back(std::make_unique<MSW>(dim_,
dim_wells_,
Mb,
Bvalues,
BcolIndices,
BrowPointers,
DnumBlocks,
Dvalues,
DcolPointers,
DrowIndices,
Cvalues));
++num_ms_wells;
}
template class WellContributions<double>;
} //namespace Opm
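Annotation, a factory-usage sketch that is not from the commit: selecting a GPU well-contributions implementation at runtime. The accelerator_mode strings are the ones handled by create() above; error handling is omitted.
#include <memory>
#include <string>
#include <opm/simulators/linalg/bda/WellContributions.hpp>   // assumed header location

std::unique_ptr<Opm::WellContributions<double>>
makeWellContribs(const std::string& accelerator_mode, bool useWellConn)
{
    // e.g. WellContributionsCuda<double> for "cusparse", WellContributionsOCL<double>
    // for "opencl", or a plain (empty) WellContributions<double> when the well
    // contributions are already added to the matrix
    return Opm::WellContributions<double>::create(accelerator_mode, useWellConn);
}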

View File

@ -30,7 +30,7 @@
namespace Opm {
class MultisegmentWellContribution;
template<class Scalar> class MultisegmentWellContribution;
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver or openclSolver.
/// If the --matrix-add-well-contributions command-line parameter is true, this class should still be used, but it will be empty.
@ -48,6 +48,7 @@ class MultisegmentWellContribution;
/// - get total size of all wellcontributions that must be stored here
/// - allocate memory
/// - copy data of wellcontributions
template<class Scalar>
class WellContributions
{
public:
@ -74,7 +75,7 @@ protected:
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
std::vector<unsigned int> val_pointers; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;
std::vector<std::unique_ptr<MultisegmentWellContribution<Scalar>>> multisegments;
public:
unsigned int getNumWells(){
@ -105,7 +106,7 @@ public:
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
void addMatrix(MatrixType type, int* colIndices, Scalar* values, unsigned int val_size);
/// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
/// Matrices C and B are passed in Blocked CSR, matrix D in CSC
@ -120,19 +121,25 @@ public:
/// \param[in] DcolPointers columnpointers of matrix D
/// \param[in] DrowIndices rowindices of matrix D
/// \param[in] Cvalues nonzero values of matrix C
void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
void addMultisegmentWellContribution(unsigned int dim,
unsigned int dim_wells,
unsigned int Mb,
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks, double *Dvalues,
UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues);
std::vector<Scalar>& Bvalues,
std::vector<unsigned int>& BcolIndices,
std::vector<unsigned int>& BrowPointers,
unsigned int DnumBlocks,
Scalar* Dvalues,
UMFPackIndex* DcolPointers,
UMFPackIndex* DrowIndices,
std::vector<Scalar>& Cvalues);
protected:
//! \brief API specific allocation.
virtual void APIalloc() {}
/// Api specific upload of matrix.
virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
virtual void APIaddMatrix(MatrixType, int*, Scalar*, unsigned int) {}
};
} //namespace Opm
#endif
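Annotation, a fill-order sketch that is not from the commit, following the sequence described in the class comment (sizes first, then allocation, then data). The well sizes are made up, one possible per-well order (C, D, B) is assumed, and error handling is omitted.
#include <opm/simulators/linalg/bda/WellContributions.hpp>   // assumed header location

void fillStandardWell(Opm::WellContributions<double>& wc)
{
    using MT = Opm::WellContributions<double>::MatrixType;

    wc.setBlockSize(/*dim=*/3, /*dim_wells=*/4);
    wc.setVectorSize(/*N=*/3 * 100);   // unblocked length of x and y
    wc.addNumBlocks(2);                // one StandardWell with 2 blocks in B and C
    wc.alloc();

    int cols[2] = {10, 42};
    double Cvals[2*3*4] = {};          // dim*dim_wells scalars per block (values elided)
    double Dvals[4*4]   = {};          // dim_wells*dim_wells scalars per well
    double Bvals[2*3*4] = {};
    wc.addMatrix(MT::C, cols, Cvals, 2);
    wc.addMatrix(MT::D, cols, Dvals, 2);   // colIndices and val_size are ignored for D
    wc.addMatrix(MT::B, cols, Bvals, 2);   // B last: finalizes val_pointers for this well
}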

View File

@ -46,36 +46,35 @@
#include <tuple>
#include <vector>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
amgclSolverBackend<block_size>::amgclSolverBackend(const int verbosity_,
const int maxit_,
const double tolerance_,
const unsigned int platformID_,
const unsigned int deviceID_)
: BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
template<class Scalar, unsigned int block_size>
amgclSolverBackend<Scalar,block_size>::
amgclSolverBackend(const int verbosity_,
const int maxit_,
const Scalar tolerance_,
const unsigned int platformID_,
const unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
{}
template <unsigned int block_size>
amgclSolverBackend<block_size>::~amgclSolverBackend() {}
template<class Scalar, unsigned int block_size>
amgclSolverBackend<Scalar,block_size>::~amgclSolverBackend()
{}
template <unsigned int block_size>
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::initialize(int Nb_, int nnzbs)
{
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
std::ostringstream out;
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Initializing amgclSolverBackend, matrix size: " << Nb
<< " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "DeviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@ -118,7 +117,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
prm.put("solver.maxiter", t3);
bool t4 = prm.get("solver.verbose", verbosity >= 2);
prm.put("solver.verbose", t4);
out << "Using parameters from " << filename << " (with default values for omitted parameters):\n";
out << "Using parameters from " << filename
<< " (with default values for omitted parameters):\n";
} else { // otherwise use default parameters, same as Dune
prm.put("backend_type", "cpu"); // put it in the tree so it gets printed
prm.put("precond.class", "relaxation");
@ -142,7 +142,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
} else if (backend_type_string == "vexcl") {
backend_type = Amgcl_backend_type::vexcl;
} else {
OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
OPM_THROW(std::logic_error,
"Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
}
if (backend_type == Amgcl_backend_type::cuda) {
@ -160,9 +161,10 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
initialized = true;
} // end initialize()
template <unsigned int block_size>
void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *cols) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
convert_sparsity_pattern(int* rows, int* cols)
{
Timer t;
const unsigned int bs = block_size;
int idx = 0; // indicates the unblocked write index
@ -189,9 +191,10 @@ void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *co
}
} // end convert_sparsity_pattern()
template <unsigned int block_size>
void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
convert_data(Scalar* vals, int* rows)
{
Timer t;
const unsigned int bs = block_size;
int idx = 0; // indicates the unblocked write index
@ -217,7 +220,9 @@ void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
} // end convert_data()
#if HAVE_VEXCL
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformID, unsigned int deviceID) {
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx,
unsigned int platformID, unsigned int deviceID)
{
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Platform::get(&platforms);
@ -245,19 +250,20 @@ void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformI
OpmLog::info(out.str());
}
template <typename vexcl_matrix_type, typename vexcl_vector_type, unsigned int block_size, typename AIJInfo>
void solve_vexcl(
const AIJInfo& A,
const boost::property_tree::ptree prm,
const std::vector<cl::CommandQueue>& ctx,
double *b,
std::vector<double>& x,
const int N,
int& iters,
double& error)
template <typename vexcl_matrix_type, typename vexcl_vector_type,
unsigned int block_size, typename Scalar, typename AIJInfo>
void solve_vexcl(const AIJInfo& A,
const boost::property_tree::ptree prm,
const std::vector<cl::CommandQueue>& ctx,
Scalar* b,
std::vector<Scalar>& x,
const int N,
int& iters,
Scalar& error)
{
typedef amgcl::backend::vexcl<vexcl_matrix_type> Backend;
typedef amgcl::make_solver<amgcl::runtime::preconditioner<Backend>, amgcl::runtime::solver::wrapper<Backend> > Solver;
using Backend = amgcl::backend::vexcl<vexcl_matrix_type>;
using Solver = amgcl::make_solver<amgcl::runtime::preconditioner<Backend>,
amgcl::runtime::solver::wrapper<Backend>>;
typename Solver::backend_params bprm;
bprm.q = ctx; // set vexcl context
@ -275,8 +281,10 @@ void solve_vexcl(
}
#endif
template <unsigned int block_size>
void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::
solve_system(Scalar* b, BdaResult& res)
{
Timer t;
try {
@ -306,7 +314,7 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
// reset x vector
std::fill(x.begin(), x.end(), 0.0);
std::vector<double> b_(b, b + N);
std::vector<Scalar> b_(b, b + N);
// create numa vectors
typename CPU_Backend::params bprm;
@ -349,10 +357,11 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
if constexpr(block_size == 1){
auto A = std::tie(N, A_rows, A_cols, A_vals);
solve_vexcl<double, double, block_size>(A, prm, ctx, b, x, N, iters, error);
solve_vexcl<Scalar, Scalar, block_size>(A, prm, ctx, b, x, N, iters, error);
} else {
// allow vexcl to use blocked matrices
vex::scoped_program_header h1(ctx, amgcl::backend::vexcl_static_matrix_declaration<double, block_size>());
vex::scoped_program_header h1(ctx,
amgcl::backend::vexcl_static_matrix_declaration<Scalar, block_size>());
auto Atmp = std::tie(N, A_rows, A_cols, A_vals);
auto A = amgcl::adapter::block_matrix<dmat_type>(Atmp);
@ -375,8 +384,8 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
out << "=== converged: " << res.converged << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
@ -384,14 +393,13 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
out << "amgclSolverBackend::solve_system(): " << time_elapsed << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void amgclSolverBackend<block_size>::get_result(double *x_) {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::get_result(Scalar* x_)
{
Timer t;
std::copy(x.begin(), x.end(), x_);
@ -403,13 +411,13 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus amgclSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix->Nb, matrix->nnzbs);
@ -420,15 +428,14 @@ SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<Blocke
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class amgclSolverBackend<1>; \
template class amgclSolverBackend<2>; \
template class amgclSolverBackend<3>; \
template class amgclSolverBackend<4>; \
template class amgclSolverBackend<5>; \
template class amgclSolverBackend<6>;
#define INSTANTIATE_TYPE(T) \
template class amgclSolverBackend<T,1>; \
template class amgclSolverBackend<T,2>; \
template class amgclSolverBackend<T,3>; \
template class amgclSolverBackend<T,4>; \
template class amgclSolverBackend<T,5>; \
template class amgclSolverBackend<T,6>;
INSTANTIATE_TYPE(double)
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
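Annotation, a self-contained index-mapping sketch that is not from the commit, showing how a blocked (BCSR) sparsity pattern expands to the unblocked CSR pattern that amgcl consumes; the real convert_sparsity_pattern() may differ in details.
#include <vector>

void bcsrToCsrPattern(const std::vector<int>& brows,  // Nb+1 block row pointers
                      const std::vector<int>& bcols,  // block column indices
                      unsigned int bs,                // block size
                      std::vector<unsigned>& rows,    // CSR row pointers (out)
                      std::vector<unsigned>& cols)    // CSR column indices (out)
{
    const int Nb = static_cast<int>(brows.size()) - 1;
    rows.assign(1, 0u);
    cols.clear();
    for (int ib = 0; ib < Nb; ++ib) {                    // block row
        // each of the bs scalar rows in this block row sees the same column set
        for (unsigned int r = 0; r < bs; ++r) {
            for (int b = brows[ib]; b < brows[ib+1]; ++b) {
                for (unsigned int c = 0; c < bs; ++c) {  // scalar column within the block
                    cols.push_back(bcols[b] * bs + c);
                }
            }
            rows.push_back(static_cast<unsigned int>(cols.size()));
        }
    }
}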

View File

@ -41,17 +41,14 @@
#include <type_traits>
#include <vector>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
/// Note amgcl also implements blocked solvers, but looks like it needs unblocked input data
template <unsigned int block_size>
class amgclSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class amgclSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -64,17 +61,16 @@ class amgclSolverBackend : public BdaSolver<block_size>
using Base::tolerance;
using Base::initialized;
using dmat_type = amgcl::static_matrix<double, block_size, block_size>; // matrix value type in double precision
using dvec_type = amgcl::static_matrix<double, block_size, 1>; // the corresponding vector value type
using dmat_type = amgcl::static_matrix<Scalar, block_size, block_size>; // matrix value type, one block of Scalars
using dvec_type = amgcl::static_matrix<Scalar, block_size, 1>; // the corresponding vector value type
using CPU_Backend = std::conditional_t<block_size == 1,
amgcl::backend::builtin<double>,
amgcl::backend::builtin<Scalar>,
amgcl::backend::builtin<dmat_type>>;
using CPU_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CPU_Backend>,
amgcl::runtime::solver::wrapper<CPU_Backend>>;
private:
// amgcl can use different backends, this lets the user choose
enum Amgcl_backend_type {
cpu,
@ -84,18 +80,18 @@ private:
// store matrix in CSR format
std::vector<unsigned> A_rows, A_cols;
std::vector<double> A_vals, rhs;
std::vector<double> x;
std::vector<Scalar> A_vals, rhs;
std::vector<Scalar> x;
std::once_flag print_info;
Amgcl_backend_type backend_type = cpu;
boost::property_tree::ptree prm; // amgcl parameters
int iters = 0;
double error = 0.0;
Scalar error = 0.0;
#if HAVE_CUDA
std::once_flag cuda_initialize;
void solve_cuda(double *b);
void solve_cuda(Scalar* b);
#endif
#if HAVE_VEXCL
@ -114,21 +110,23 @@ private:
/// Convert the BCSR nonzero data to a CSR format
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
void convert_data(double *vals, int *rows);
void convert_data(Scalar* vals, int* rows);
/// Solve linear system
/// \param[in] b pointer to b vector
/// \param[inout] res summary of solver result
void solve_system(double *b, BdaResult &res);
void solve_system(Scalar* b, BdaResult& res);
public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// Construct an amgcl solver
/// \param[in] linear_solver_verbosity verbosity of amgclSolver
/// \param[in] maxit maximum number of iterations for amgclSolver
/// \param[in] tolerance required relative tolerance for amgclSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
amgclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
amgclSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int platformID,
unsigned int deviceID);
/// Destroy an amgclSolver, and free memory
~amgclSolverBackend();
@ -140,18 +138,18 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class amgclSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
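Annotation, a construction sketch that is not from the commit: instantiating the amgcl backend for double precision with 3x3 blocks; the parameter values are made up.
#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>   // assumed header location

void makeAmgclBackend()
{
    Opm::Accelerator::amgclSolverBackend<double, 3>
        solver(/*linear_solver_verbosity=*/1, /*maxit=*/200, /*tolerance=*/1e-2,
               /*platformID=*/0, /*deviceID=*/0);
    (void)solver;
}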

View File

@ -28,18 +28,14 @@
/// This file is only compiled when both amgcl and CUDA are found by CMake
namespace Opm
namespace Opm::Accelerator {
template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::solve_cuda(Scalar* b)
{
namespace Accelerator
{
using Opm::OpmLog;
template <unsigned int block_size>
void amgclSolverBackend<block_size>::solve_cuda(double *b) {
typedef amgcl::backend::cuda<double> CUDA_Backend;
typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
using CUDA_Backend = amgcl::backend::cuda<Scalar>;
using CUDA_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>,
amgcl::runtime::solver::wrapper<CUDA_Backend>>;
static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle
@ -67,8 +63,8 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
OpmLog::info(out.str());
});
thrust::device_vector<double> B(b, b + N);
thrust::device_vector<double> X(N, 0.0);
thrust::device_vector<Scalar> B(b, b + N);
thrust::device_vector<Scalar> X(N, 0.0);
// actually solve
std::tie(iters, error) = solve(B, X);
@ -76,19 +72,15 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
thrust::copy(X.begin(), X.end(), x.begin());
}
#define INSTANTIATE_TYPE(T) \
template void amgclSolverBackend<T,1>::solve_cuda(T*); \
template void amgclSolverBackend<T,2>::solve_cuda(T*); \
template void amgclSolverBackend<T,3>::solve_cuda(T*); \
template void amgclSolverBackend<T,4>::solve_cuda(T*); \
template void amgclSolverBackend<T,5>::solve_cuda(T*); \
template void amgclSolverBackend<T,6>::solve_cuda(T*);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void amgclSolverBackend<n>::solve_cuda(double*); \
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -33,18 +33,17 @@ namespace Opm
{
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
__global__ void apply_well_contributions(
const double * __restrict__ Cnnzs,
const double * __restrict__ Dnnzs,
const double * __restrict__ Bnnzs,
const int * __restrict__ Ccols,
const int * __restrict__ Bcols,
const double * __restrict__ x,
double * __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers
)
template<class Scalar>
__global__ void apply_well_contributions(const Scalar* __restrict__ Cnnzs,
const Scalar* __restrict__ Dnnzs,
const Scalar* __restrict__ Bnnzs,
const int* __restrict__ Ccols,
const int* __restrict__ Bcols,
const Scalar* __restrict__ x,
Scalar* __restrict__ y,
const int dim,
const int dim_wells,
const unsigned int * __restrict__ val_pointers)
{
const int idx_b = blockIdx.x;
const int idx_t = threadIdx.x;
@ -57,9 +56,9 @@ __global__ void apply_well_contributions(
const int c = lane % dim; // col in block
const int r = (lane / dim) % dim_wells; // row in block
extern __shared__ double smem[];
double * __restrict__ z1 = smem;
double * __restrict__ z2 = z1 + dim_wells;
extern __shared__ unsigned char smem[];
Scalar* __restrict__ z1 = reinterpret_cast<Scalar*>(smem);
Scalar* __restrict__ z2 = z1 + dim_wells;
if (idx_t < dim_wells) {
z1[idx_t] = 0.0;
@ -70,7 +69,7 @@ __global__ void apply_well_contributions(
// z1 = B * x
if (idx_t < num_active_threads) {
// multiply all blocks with x
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
while (b < val_size + val_pointers[idx_b]) {
int colIdx = Bcols[b];
@ -106,7 +105,7 @@ __global__ void apply_well_contributions(
// z2 = D^-1 * B * x = D^-1 * z1
if (idx_t < dim_wells) {
double temp = 0.0;
Scalar temp = 0.0;
for (int c = 0; c < dim_wells; ++c) {
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
}
@ -118,7 +117,7 @@ __global__ void apply_well_contributions(
// y -= C^T * D^-1 * B * x
// use dim * val_size threads, each block is assigned 'dim' threads
if (idx_t < dim * val_size) {
double temp = 0.0;
Scalar temp = 0.0;
int b = idx_t / dim + val_pointers[idx_b];
int cc = idx_t % dim;
int colIdx = Ccols[b];
@ -127,13 +126,13 @@ __global__ void apply_well_contributions(
}
y[colIdx * dim + cc] -= temp;
}
}
WellContributionsCuda::~WellContributionsCuda()
template<class Scalar>
WellContributionsCuda<Scalar>::~WellContributionsCuda()
{
// delete data for StandardWell
if (num_std_wells > 0) {
if (this->num_std_wells > 0) {
cudaFree(d_Cnnzs);
cudaFree(d_Dnnzs);
cudaFree(d_Bnnzs);
@ -142,80 +141,108 @@ WellContributionsCuda::~WellContributionsCuda()
cudaFree(d_val_pointers);
}
if (num_ms_wells > 0 && h_x) {
if (this->num_ms_wells > 0 && h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
}
}
void WellContributionsCuda::APIalloc()
template<class Scalar>
void WellContributionsCuda<Scalar>::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaMalloc((void**)&d_Cnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Dnnzs,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
cudaMalloc((void**)&d_Bnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&this->d_val_pointers, sizeof(unsigned int) * (this->num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributionsCuda::apply(double *d_x, double *d_y)
template<class Scalar>
void WellContributionsCuda<Scalar>::apply(Scalar* d_x, Scalar* d_y)
{
// apply MultisegmentWells
// make sure the stream is empty if timing measurements are done
cudaStreamSynchronize(stream);
if (num_ms_wells > 0) {
if (this->num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
cudaMallocHost(&h_x, sizeof(Scalar) * this->N);
cudaMallocHost(&h_y, sizeof(Scalar) * this->N);
}
// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_x, d_x, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_y, h_y, sizeof(Scalar) * this->N,
cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}
// apply StandardWells
if (num_std_wells > 0) {
int smem_size = 2 * sizeof(double) * dim_wells;
apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
if (this->num_std_wells > 0) {
int smem_size = 2 * sizeof(Scalar) * this->dim_wells;
apply_well_contributions <<< this->num_std_wells, 32, smem_size, stream>>>(d_Cnnzs,
d_Dnnzs,
d_Bnnzs,
d_Ccols,
d_Bcols,
d_x,
d_y,
this->dim,
this->dim_wells,
this->d_val_pointers);
}
}
void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
template<class Scalar>
void WellContributionsCuda<Scalar>::APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_Cnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
break;
case MatrixType::D:
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Dnnzs + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(Scalar) * this->dim_wells * this->dim_wells,
cudaMemcpyHostToDevice);
break;
case MatrixType::B:
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
cudaMemcpy(d_val_pointers, val_pointers.data(), sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_Bnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
cudaMemcpy(d_val_pointers, this->val_pointers.data(),
sizeof(unsigned int) * (this->num_std_wells + 1),
cudaMemcpyHostToDevice);
}
break;
default:
@ -224,13 +251,16 @@ void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, doubl
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void WellContributionsCuda<Scalar>::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->setCudaStream(stream_);
}
}
template class WellContributionsCuda<double>;
} //namespace Opm
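Annotation, a minimal standalone sketch that is not from the commit, of the dynamic shared-memory pattern adopted in apply_well_contributions above: a templated kernel cannot declare an extern __shared__ array of Scalar for several Scalar types at once, so the buffer is declared as raw bytes and reinterpreted.
#include <cuda_runtime.h>

template<class Scalar>
__global__ void scaleWithSharedBuffer(Scalar* data, int n, Scalar factor)
{
    extern __shared__ unsigned char smem[];           // raw dynamic shared memory
    Scalar* buf = reinterpret_cast<Scalar*>(smem);    // typed view, one Scalar per thread

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        buf[threadIdx.x] = data[i] * factor;
        data[i] = buf[threadIdx.x];
    }
}

// launch: the shared-memory byte count is passed explicitly, as smem_size is above, e.g.
// scaleWithSharedBuffer<<<blocks, threads, threads * sizeof(double), stream>>>(d, n, 2.0);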

View File

@ -25,10 +25,10 @@
#include <cuda_runtime.h>
namespace Opm
{
namespace Opm {
class WellContributionsCuda : public WellContributions
template<class Scalar>
class WellContributionsCuda : public WellContributions<Scalar>
{
public:
~WellContributionsCuda() override;
@ -41,33 +41,35 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
void apply(Scalar* d_x, Scalar* d_y);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
Scalar* d_Cnnzs = nullptr;
Scalar* d_Dnnzs = nullptr;
Scalar* d_Bnnzs = nullptr;
int* d_Ccols = nullptr;
int* d_Bcols = nullptr;
Scalar* d_z1 = nullptr;
Scalar* d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;
Scalar* h_x = nullptr;
Scalar* h_y = nullptr;
};
} //namespace Opm

View File

@ -44,22 +44,20 @@
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar, block_size>::
cusparseSolverBackend(int verbosity_, int maxit_,
Scalar tolerance_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, deviceID_)
{
// initialize CUDA device, stream and libraries
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
@ -67,7 +65,8 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
std::ostringstream out;
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
out << "Name GPU: " << props.name << ", Compute Capability: "
<< props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
@ -84,26 +83,29 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaCheckLastError("Could not set stream to cusparse");
}
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
{
finalize();
}
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho = 1.0, rhop;
Scalar alpha, nalpha, beta;
Scalar omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
float it;
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -147,7 +149,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -178,7 +180,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@ -190,7 +192,6 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if (norm < tolerance * norm_0) {
break;
}
@ -210,15 +211,18 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -232,46 +236,49 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
out << "Initializing GPU, matrix size: " << Nb
<< " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "Maxit: " << maxit << std::scientific
<< ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw, sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw, sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");
#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
cudaCheckLastError("Could not allocate pinned memory");
#endif
initialized = true;
} // end initialize()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::finalize()
{
if (initialized) {
cudaFree(d_x);
cudaFree(d_b);
@ -307,40 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
}
} // end finalize()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals,
nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
}
if (verbosity >= 3) {
@ -353,33 +374,43 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
}
} // end copy_system_to_gpu()
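// Sketch (illustrative, not part of this patch): the COPY_ROW_BY_ROW path above stages the
// block values in page-locked ("pinned") host memory before calling cudaMemcpyAsync, because
// an async copy from ordinary pageable memory silently degrades to a synchronous transfer.
// A minimal, self-contained restatement of that idea; names and sizes are placeholders only.
#include <cuda_runtime.h>
#include <cstring>

void stage_and_upload(const double* src, std::size_t n, cudaStream_t stream, double* d_dst)
{
    double* pinned = nullptr;
    cudaMallocHost(reinterpret_cast<void**>(&pinned), n * sizeof(double)); // page-locked staging buffer
    std::memcpy(pinned, src, n * sizeof(double));                          // pack on the host
    cudaMemcpyAsync(d_dst, pinned, n * sizeof(double),
                    cudaMemcpyHostToDevice, stream);                       // true asynchronous H2D copy
    cudaStreamSynchronize(stream);                                         // wait before reusing 'pinned'
    cudaFreeHost(pinned);
}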
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif
@ -394,10 +425,9 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
{
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
@ -472,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
return true;
} // end analyse_matrix()
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
{
Timer t;
cusparseDbsrilu02(cusparseHandle, order, \
@ -497,23 +528,24 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
return true;
} // end create_preconditioner()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
// actually solve
gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
{
Timer t;
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (verbosity > 2) {
@ -523,14 +555,13 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus cusparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -551,18 +582,14 @@ SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<Blo
return SolverStatus::BDA_SOLVER_SUCCESS;
}
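// Sketch (illustrative, not part of this patch): rough call order of the backend's public
// interface as declared in the header below; matrix, b, jacMatrix, wellContribs and x are
// assumed to exist already and namespace qualifiers are abbreviated. The first call to
// solve_system() triggers initialize() + copy_system_to_gpu(); later calls only refresh the
// values already resident on the GPU via update_system_on_gpu().
//
//   cusparseSolverBackend<double, 3> backend(/*verbosity=*/1, /*maxit=*/200,
//                                            /*tolerance=*/1e-2, /*deviceID=*/0);
//   BdaResult res;
//   auto status = backend.solve_system(matrix, b, jacMatrix, wellContribs, res);
//   if (status == SolverStatus::BDA_SOLVER_SUCCESS) {
//       backend.get_result(x);   // copies d_x back into the caller's array x
//   }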
#define INSTANTIATE_TYPE(T) \
template class cusparseSolverBackend<T,1>; \
template class cusparseSolverBackend<T,2>; \
template class cusparseSolverBackend<T,3>; \
template class cusparseSolverBackend<T,4>; \
template class cusparseSolverBackend<T,5>; \
template class cusparseSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -28,16 +28,13 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<block_size> {
typedef BdaSolver<block_size> Base;
template<class Scalar, unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
{
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -50,7 +47,6 @@ class cusparseSolverBackend : public BdaSolver<block_size> {
using Base::initialized;
private:
cublasHandle_t cublasHandle;
cusparseHandle_t cusparseHandle;
cudaStream_t stream;
@ -58,13 +54,13 @@ private:
bsrilu02Info_t info_M;
bsrsv2Info_t info_L, info_U;
// b: bsr matrix, m: preconditioner
double *d_bVals, *d_mVals;
Scalar *d_bVals, *d_mVals;
int *d_bCols, *d_mCols;
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer;
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
bool analysis_done = false;
@ -77,12 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Clean memory
void finalize();
@ -92,14 +89,18 @@ private:
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -112,17 +113,16 @@ private:
/// Solve linear system
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions& wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);
public:
/// Construct a cusparseSolver
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
/// \param[in] deviceID the device to be used
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
cusparseSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int deviceID);
/// Destroy a cusparseSolver, and free memory
~cusparseSolverBackend();
@ -134,17 +134,19 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class cusparseSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -31,33 +31,29 @@
#include <sstream>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -77,30 +73,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
Timer t_convert;
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
} else {
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
}
Timer t_analysis;
std::ostringstream out;
if (opencl_ilu_parallel) {
out << "opencl_ilu_parallel: true (level_scheduling)\n";
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb,
&numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else {
out << "opencl_ilu_parallel: false\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for(int i = 0; i < Nb; ++i){
for (int i = 0; i < Nb; ++i) {
rowsPerColor.emplace_back(1);
}
}
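// Sketch (illustrative, not part of this patch, and not the library's findLevelScheduling):
// what level scheduling computes under the usual definition. A row can be processed once all
// rows it depends on (its strictly lower-triangular neighbours) are done, so
// level[i] = 1 + max(level[j]) over the nonzeros j < i of row i; rows sharing a level form
// one "color" and can be solved in parallel.
#include <algorithm>
#include <vector>

std::vector<int> levelsFromCsrPattern(const std::vector<int>& rowPointers,
                                      const std::vector<int>& colIndices)
{
    const int n = static_cast<int>(rowPointers.size()) - 1;
    std::vector<int> level(n, 0);
    for (int i = 0; i < n; ++i) {
        for (int p = rowPointers[i]; p < rowPointers[i + 1]; ++p) {
            const int j = colIndices[p];
            if (j < i) { // strictly lower part only
                level[i] = std::max(level[i], level[j] + 1);
            }
        }
    }
    return level; // number of colors == max(level) + 1 (for n > 0)
}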
@ -118,44 +117,52 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
invDiagVals.resize(mat->Nb * bs * bs);
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Lmat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
#endif
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
#else
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif
events.resize(3);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0,
mat->Nb * sizeof(Scalar) * bs * bs,
invDiagVals.data(), nullptr, &events[0]);
rowsPerColorPrefix.resize(numColors + 1); // resize value-initializes the new entries to 0

for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0,
(numColors + 1) * sizeof(int),
rowsPerColorPrefix.data(), nullptr, &events[1]);
if (opencl_ilu_parallel) {
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned), fromOrder.data(),
nullptr, &events[2]);
} else {
// fromOrder is not initialized, so use something else to fill s.rowIndices
// s.rowIndices[i] == i must hold, since every rowidx is mapped to itself (i.e. no actual mapping)
// rowsPerColorPrefix is misused here, it contains an increasing sequence (0, 1, 2, ...)
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), rowsPerColorPrefix.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned),
rowsPerColorPrefix.data(), nullptr, &events[2]);
}
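// Worked example (illustrative, not part of this patch): with opencl_ilu_parallel == false
// every row is its own color, so rowsPerColor == {1, 1, ..., 1} and the prefix sum built
// above becomes rowsPerColorPrefix == {0, 1, 2, ..., Nb}. E.g. for Nb == 4:
//   rowsPerColor = {1,1,1,1}  ->  rowsPerColorPrefix = {0,1,2,3,4},
// whose first Nb entries {0,1,2,3} are exactly the identity mapping s.rowIndices[i] == i
// required here, which is why the prefix array can double as the row-index buffer.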
cl::WaitForEvents(events);
@ -168,17 +175,15 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
return true;
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -186,7 +191,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
Timer t_copy;
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
memcpy(LUmat->nnzValues, matToDecompose->nnzValues,
sizeof(Scalar) * bs * bs * matToDecompose->nnzbs);
if (verbosity >= 3){
std::ostringstream out;
@ -205,7 +211,9 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
Timer t_copyToGpu;
events.resize(1);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0,
LUmat->nnzbs * bs * bs * sizeof(Scalar),
LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
@ -213,14 +221,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
auto candidate = std::find(LUmat->colIndices + rowStart,
LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int),
diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int),
LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int),
LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
@ -242,11 +254,12 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
out << "color " << color << ": " << firstRow << " - " << lastRow
<< " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
}
if (verbosity >= 3) {
@ -259,43 +272,42 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
template<class Scalar, unsigned int block_size>
void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
const Scalar relaxation = 0.9;
cl::Event event;
Timer t_apply;
for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
// apply relaxation
OpenclKernels::scale(x, relaxation, N);
OpenclKernels<Scalar>::scale(x, relaxation, N);
if (verbosity >= 4) {
std::ostringstream out;
@ -304,20 +316,14 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
}
}
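// Sketch (illustrative, not part of this patch): color by color, the two kernel loops above
// implement the usual ILU0 application x = omega * U^{-1} (L^{-1} y) with omega = 0.9.
// A scalar (block_size == 1) CPU restatement of the same two sweeps, assuming the factors are
// stored in one CSR matrix LU with unit L-diagonal and invDiag holding the inverted U-diagonal:
#include <vector>

void iluApply(const std::vector<int>& rowPtr, const std::vector<int>& col,
              const std::vector<double>& LU, const std::vector<int>& diagIdx,
              const std::vector<double>& invDiag,
              const std::vector<double>& y, std::vector<double>& x, double omega = 0.9)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    for (int i = 0; i < n; ++i) {               // forward sweep: L z = y (unit diagonal)
        double s = y[i];
        for (int p = rowPtr[i]; p < diagIdx[i]; ++p)
            s -= LU[p] * x[col[p]];
        x[i] = s;
    }
    for (int i = n - 1; i >= 0; --i) {          // backward sweep: U x = z
        double s = x[i];
        for (int p = diagIdx[i] + 1; p < rowPtr[i + 1]; ++p)
            s -= LU[p] * x[col[p]];
        x[i] = invDiag[i] * s;
    }
    for (double& xi : x)                        // relaxation, as in OpenclKernels::scale
        xi *= omega;
}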
#define INSTANCE_TYPE(T) \
template class BILU0<T,1>; \
template class BILU0<T,2>; \
template class BILU0<T,3>; \
template class BILU0<T,4>; \
template class BILU0<T,5>; \
template class BILU0<T,6>;
INSTANCE_TYPE(double)
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BILU0<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -29,18 +29,15 @@
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
/// The preconditioner is applied via two exact triangular solves
template <unsigned int block_size>
class BILU0 : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class BILU0 : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -53,11 +50,11 @@ class BILU0 : public Preconditioner<block_size>
using Base::err;
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> LUmat{};
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> Lmat{}, Umat{};
#endif
std::vector<double> invDiagVals;
std::vector<Scalar> invDiagVals;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
@ -67,7 +64,7 @@ private:
bool opencl_ilu_parallel;
typedef struct {
struct GPU_storage {
cl::Buffer invDiagVals; // nnz values of diagonal blocks of the matrix, inverted
cl::Buffer diagIndex; // index of diagonal block of each row, used to differentiate between lower and upper triangular part
cl::Buffer rowsPerColor; // number of rows for every color
@ -80,7 +77,7 @@ private:
#else
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
};
GPU_storage s;
@ -93,21 +90,25 @@ public:
BILU0(bool opencl_ilu_parallel, int verbosity);
// analysis, extract parallelism if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(const cl::Buffer& y, cl::Buffer& x) override;
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
get_preconditioner_structure()
{
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)},
{LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
}
std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
@ -120,8 +121,6 @@ public:
}
};
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -34,26 +34,25 @@
#include <sstream>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
BISAI<block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_)
template<class Scalar, unsigned int block_size>
BISAI<Scalar,block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
{
#if CHOW_PATEL
OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
#endif
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel_, verbosity_);
bilu0 = std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
}
template <unsigned int block_size>
void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
@ -61,7 +60,9 @@ void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::s
bilu0->setOpencl(context, queue);
}
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices){
std::vector<int>
buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices)
{
std::vector<int> aux(colPointers); // colPointers must be copied to this vector
std::vector<int> csrToCscOffsetMap(rowIndices.size()); // map must have the same size as the indices vector
@ -77,14 +78,15 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
return csrToCscOffsetMap;
}
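// Worked example (illustrative, not part of this patch), following the contract documented in
// the header further down: entry i of the CSR value array lives at position
// csrToCscOffsetMap[i] of the CSC value array. For a 2x2 block pattern with all four blocks
// present, the CSR order is (0,0),(0,1),(1,0),(1,1) and the CSC order is (0,0),(1,0),(0,1),(1,1),
// so csrToCscOffsetMap == {0, 2, 1, 3}.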
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
@ -105,21 +107,22 @@ bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
}
}
template <unsigned int block_size>
void BISAI<block_size>::buildLowerSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
{
lower.subsystemPointers.assign(Nb + 1, 0);
Dune::Timer t_buildLowerSubsystemsStructures;
for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = diagIndex[tcol] + 1;
int lrow = colPointers[tcol + 1];
int nx = lrow - frow;
int nv = 0;
for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = sweep + 1; xid < nx; xid++){
for(int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = sweep + 1; xid < nx; xid++) {
for (int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++) {
if(rowIndices[ptr] == rowIndices[frow + xid]){
lower.nzIndices.push_back(csrToCscOffsetMap[ptr]);
lower.knownRhsIndices.push_back(csrToCscOffsetMap[frow + sweep]);
@ -133,29 +136,31 @@ void BISAI<block_size>::buildLowerSubsystemsStructures(){
lower.subsystemPointers[tcol + 1] = lower.subsystemPointers[tcol] + nv;
}
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildLowerSubsystemsStructures time: " << t_buildLowerSubsystemsStructures.stop() << " s";
out << "BISAI buildLowerSubsystemsStructures time: "
<< t_buildLowerSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
void BISAI<block_size>::buildUpperSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
{
upper.subsystemPointers.assign(Nb + 1, 0);
Dune::Timer t_buildUpperSubsystemsStructures;
for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = colPointers[tcol];
int lrow = diagIndex[tcol];
int nx = lrow - frow + 1;
int nv = 0;
for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = 0; xid < nx; xid++){
for(int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++){
if(rowIndices[ptr] == rowIndices[lrow - xid]){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = 0; xid < nx; xid++) {
for (int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++) {
if (rowIndices[ptr] == rowIndices[lrow - xid]) {
upper.nzIndices.push_back(csrToCscOffsetMap[ptr]);
upper.knownRhsIndices.push_back(csrToCscOffsetMap[lrow - sweep]);
upper.unknownRhsIndices.push_back(csrToCscOffsetMap[lrow - xid]);
@ -168,15 +173,17 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
upper.subsystemPointers[tcol + 1] = upper.subsystemPointers[tcol] + nv;
}
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildUpperSubsystemsStructures time: " << t_buildUpperSubsystemsStructures.stop() << " s";
out << "BISAI buildUpperSubsystemsStructures time: "
<< t_buildUpperSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
@ -199,48 +206,93 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
buildLowerSubsystemsStructures();
buildUpperSubsystemsStructures();
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * colPointers.size());
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * rowIndices.size());
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * csrToCscOffsetMap.size());
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * diagIndex.size());
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb * bs);
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.subsystemPointers.size());
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.subsystemPointers.size());
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * colPointers.size());
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * rowIndices.size());
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * csrToCscOffsetMap.size());
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * diagIndex.size());
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * nnzb * bs * bs);
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * nnzb * bs * bs);
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * Nb * bs);
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.subsystemPointers.size());
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.subsystemPointers.size());
if(!lower.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.nzIndices.size());
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.knownRhsIndices.size());
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.unknownRhsIndices.size());
if (!lower.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.nzIndices.size());
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.knownRhsIndices.size());
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * lower.unknownRhsIndices.size());
}
if(!upper.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.nzIndices.size());
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.knownRhsIndices.size());
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.unknownRhsIndices.size());
if (!upper.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.nzIndices.size());
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.knownRhsIndices.size());
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(int) * upper.unknownRhsIndices.size());
}
events.resize(6);
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0, colPointers.size() * sizeof(int), colPointers.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0, rowIndices.size() * sizeof(int), rowIndices.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0, csrToCscOffsetMap.size() * sizeof(int), csrToCscOffsetMap.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, diagIndex.size() * sizeof(int), diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0, sizeof(int) * lower.subsystemPointers.size(), lower.subsystemPointers.data(), nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0, sizeof(int) * upper.subsystemPointers.size(), upper.subsystemPointers.data(), nullptr, &events[5]);
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0,
colPointers.size() * sizeof(int),
colPointers.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0,
rowIndices.size() * sizeof(int),
rowIndices.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0,
csrToCscOffsetMap.size() * sizeof(int),
csrToCscOffsetMap.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0,
diagIndex.size() * sizeof(int),
diagIndex.data(), nullptr, &events[3]);
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0,
sizeof(int) * lower.subsystemPointers.size(),
lower.subsystemPointers.data(), nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0,
sizeof(int) * upper.subsystemPointers.size(),
upper.subsystemPointers.data(), nullptr, &events[5]);
if(!lower.nzIndices.empty()){
if (!lower.nzIndices.empty()) {
events.resize(events.size() + 3);
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0, sizeof(int) * lower.nzIndices.size(), lower.nzIndices.data(), nullptr, &events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.knownRhsIndices.size(), lower.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.unknownRhsIndices.size(), lower.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0,
sizeof(int) * lower.nzIndices.size(),
lower.nzIndices.data(), nullptr,
&events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0,
sizeof(int) * lower.knownRhsIndices.size(),
lower.knownRhsIndices.data(), nullptr,
&events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0,
sizeof(int) * lower.unknownRhsIndices.size(),
lower.unknownRhsIndices.data(), nullptr,
&events[events.size() - 1]);
}
if(!upper.nzIndices.empty()){
if (!upper.nzIndices.empty()) {
events.resize(events.size() + 3);
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE, 0, sizeof(int) * upper.nzIndices.size(), upper.nzIndices.data(), nullptr, &events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.knownRhsIndices.size(), upper.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.unknownRhsIndices.size(), upper.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE,
0, sizeof(int) * upper.nzIndices.size(),
upper.nzIndices.data(), nullptr,
&events[events.size() - 3]);
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0,
sizeof(int) * upper.knownRhsIndices.size(),
upper.knownRhsIndices.data(), nullptr,
&events[events.size() - 2]);
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0,
sizeof(int) * upper.unknownRhsIndices.size(),
upper.unknownRhsIndices.data(), nullptr,
&events[events.size() - 1]);
}
cl::WaitForEvents(events);
@ -255,16 +307,24 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
std::tie(d_LUvals, d_invDiagVals) = bilu0->get_preconditioner_data();
events.resize(2);
err = queue->enqueueFillBuffer(d_invLvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[0]);
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[1]);
err = queue->enqueueFillBuffer(d_invLvals, 0, 0,
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[0]);
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0,
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
OpenclKernels<Scalar>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
d_lower.subsystemPointers, d_lower.nzIndices,
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
d_LUvals, d_invLvals, Nb);
OpenclKernels<Scalar>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
d_csrToCscOffsetMap, d_upper.subsystemPointers,
d_upper.nzIndices, d_upper.unknownRhsIndices,
d_upper.knownRhsIndices, d_LUvals,
d_invDiagVals, d_invUvals, Nb);
if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
OpmLog::info(out.str());
@ -273,33 +333,34 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
return true;
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
{
const unsigned int bs = block_size;
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
OpenclKernels<Scalar>::spmv(d_invLvals, d_rowIndices, d_colPointers,
x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels<Scalar>::spmv(d_invUvals, d_rowIndices, d_colPointers,
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
}
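// Sketch (illustrative, not part of this patch): in exact arithmetic the two spmv calls above
// apply y = M_U * (x + M_L * x), where M_L is assumed to hold the strictly lower part of the
// approximate inverse of L (its unit diagonal is implied, hence the "spmv with addition") and
// M_U the approximate inverse of U including its inverted diagonal. A dense restatement:
#include <vector>

std::vector<double> isaiApply(const std::vector<std::vector<double>>& ML, // strictly lower, no diagonal
                              const std::vector<std::vector<double>>& MU, // upper incl. diagonal
                              const std::vector<double>& x)
{
    const std::size_t n = x.size();
    std::vector<double> z(x), y(n, 0.0);
    for (std::size_t i = 0; i < n; ++i)          // z = x + ML * x
        for (std::size_t j = 0; j < n; ++j)
            z[i] += ML[i][j] * x[j];
    for (std::size_t i = 0; i < n; ++i)          // y = MU * z
        for (std::size_t j = 0; j < n; ++j)
            y[i] += MU[i][j] * z[j];
    return y;
}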
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BISAI<n>;
#define INSTANCE_TYPE(T) \
template class BISAI<T,1>; \
template class BISAI<T,2>; \
template class BISAI<T,3>; \
template class BISAI<T,4>; \
template class BISAI<T,5>; \
template class BISAI<T,6>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
INSTANCE_TYPE(double)
#undef INSTANTIATE_BDA_FUNCTIONS
}
}
} // namespace Opm::Accelerator

View File

@ -26,19 +26,16 @@
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
template <unsigned int block_size>
class BISAI : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class BISAI : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -57,8 +54,8 @@ private:
std::vector<int> rowIndices;
std::vector<int> diagIndex;
std::vector<int> csrToCscOffsetMap;
std::vector<double> invLvals;
std::vector<double> invUvals;
std::vector<Scalar> invLvals;
std::vector<Scalar> invUvals;
cl::Buffer d_colPointers;
cl::Buffer d_rowIndices;
@ -71,10 +68,10 @@ private:
cl::Buffer d_invL_x;
bool opencl_ilu_parallel;
std::unique_ptr<BILU0<block_size> > bilu0;
std::unique_ptr<BILU0<Scalar,block_size>> bilu0;
/// Struct that holds the structure of the small subsystems for each column
typedef struct{
struct subsystemStructure {
/// This vector holds the cumulative sum for the number of non-zero blocks for each subsystem.
/// Works similarly to row and column pointers for the CSR and CSC matrix representations.
std::vector<int> subsystemPointers;
@ -88,15 +85,15 @@ private:
std::vector<int> knownRhsIndices;
/// This vector holds the indices of the unknown values of the right hand sides of the subsystems.
std::vector<int> unknownRhsIndices;
} subsystemStructure;
};
/// GPU version of subsystemStructure
typedef struct{
struct subsystemStructureGPU {
cl::Buffer subsystemPointers;
cl::Buffer nzIndices;
cl::Buffer knownRhsIndices;
cl::Buffer unknownRhsIndices;
} subsystemStructureGPU;
};
subsystemStructure lower, upper;
subsystemStructureGPU d_lower, d_upper;
@ -113,15 +110,18 @@ public:
BISAI(bool opencl_ilu_parallel, int verbosity);
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue) override;
// analysis, extract parallelism
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;
@ -132,7 +132,6 @@ public:
/// in the csrToCscOffsetMap[i]-th position in the CSC representation.
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices);
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -34,37 +34,32 @@
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
namespace Opm::Accelerator {
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
CPR<block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
diagIndices.resize(1);
}
template <unsigned int block_size>
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
bilu0->setOpencl(context, queue);
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
{
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
@ -75,8 +70,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
{
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
@ -88,8 +85,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat)
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
{
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
@ -108,8 +107,10 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *
return result;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
bool CPR<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat_)
{
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_);
if (verbosity >= 3) {
@ -128,26 +129,30 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
return result;
}
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
template<class Scalar>
Scalar get_absmax(const Scalar* data, const int N)
{
return std::abs(*std::max_element(data, data + N,
[](Scalar a, Scalar b)
{ return std::fabs(a) < std::fabs(b); }));
}
// solve A^T * x = b
void solve_transposed_3x3(const double *A, const double *b, double *x) {
template<class Scalar>
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
{
const int B = 3;
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
double t4 = A[0+0*B] * A[1+1*B];
double t6 = A[0+0*B] * A[1+2*B];
double t8 = A[0+1*B] * A[1+0*B];
double t10 = A[0+2*B] * A[1+0*B];
double t12 = A[0+1*B] * A[2+0*B];
double t14 = A[0+2*B] * A[2+0*B];
Scalar t4 = A[0+0*B] * A[1+1*B];
Scalar t6 = A[0+0*B] * A[1+2*B];
Scalar t8 = A[0+1*B] * A[1+0*B];
Scalar t10 = A[0+2*B] * A[1+0*B];
Scalar t12 = A[0+1*B] * A[2+0*B];
Scalar t14 = A[0+2*B] * A[2+0*B];
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
@ -162,44 +167,49 @@ void solve_transposed_3x3(const double *A, const double *b, double *x) {
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
}
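// Sketch (illustrative, not part of this patch): a quick sanity check of the routine above.
// With A = I (row-major, B = 3) the transposed solve must return b unchanged; for a general A
// one can verify A^T * x == b directly, as below. The helper name is hypothetical.
#include <cassert>
#include <cmath>

void check_solve_transposed_3x3()
{
    const double A[9] = {2, 0, 0,
                         1, 3, 0,
                         0, 1, 4};   // row-major 3x3
    const double b[3] = {1, 2, 3};
    double x[3];
    solve_transposed_3x3(A, b, x);   // solves A^T x = b
    for (int r = 0; r < 3; ++r) {
        double s = 0.0;
        for (int c = 0; c < 3; ++c)
            s += A[c * 3 + r] * x[c];   // (A^T)[r][c] == A[c][r]
        assert(std::fabs(s - b[r]) < 1e-12);
    }
}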
template <unsigned int block_size>
void CPR<block_size>::init_opencl_buffers() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar, block_size>::init_opencl_buffers()
{
d_Amatrices.reserve(num_levels);
d_Rmatrices.reserve(num_levels - 1);
d_invDiags.reserve(num_levels - 1);
for (Matrix& m : Amatrices) {
for (Matrix<Scalar>& m : Amatrices) {
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
}
for (Matrix& m : Rmatrices) {
for (Matrix<Scalar>& m : Rmatrices) {
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
}
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
}
template <unsigned int block_size>
void CPR<block_size>::opencl_upload() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::opencl_upload()
{
d_mat->upload(queue.get(), mat);
err = CL_SUCCESS;
events.resize(2 * Rmatrices.size() + 1);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
nullptr, &events[2*i+2]);
}
cl::WaitForEvents(events);
events.clear();
@ -212,9 +222,10 @@ void CPR<block_size>::opencl_upload() {
}
}
template <unsigned int block_size>
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::
create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
{
this->mat = mat_;
coarse_vals.resize(nnzb);
@ -222,8 +233,8 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
coarse_y.resize(Nb);
weights.resize(N);
try{
double rhs[] = {0, 0, 0};
try {
Scalar rhs[] = {0, 0, 0};
rhs[pressure_idx] = 1;
// find diagonal index for each row
@ -241,12 +252,12 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
// calculate weights for each row
for (int row = 0; row < Nb; ++row) {
// solve to find weights
double *row_weights = weights.data() + block_size * row; // weights for this row
Scalar* row_weights = weights.data() + block_size * row; // weights for this row
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
// normalize weights for this row
double abs_max = get_absmax(row_weights, block_size);
for(unsigned int i = 0; i < block_size; i++){
Scalar abs_max = get_absmax(row_weights, block_size);
for (unsigned int i = 0; i < block_size; i++) {
row_weights[i] /= abs_max;
}
}
@ -257,9 +268,9 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
double *block = mat->nnzValues + idx * block_size * block_size;
double *row_weights = weights.data() + block_size * row;
double value = 0.0;
Scalar* block = mat->nnzValues + idx * block_size * block_size;
Scalar* row_weights = weights.data() + block_size * row;
Scalar value = 0.0;
for (unsigned int i = 0; i < block_size; ++i) {
value += block[block_size * i + pressure_idx] * row_weights[i];
}
@ -276,10 +287,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
if (recalculate_aggregates) {
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
typedef DuneMat::CreateIterator Iter;
using Iter = typename DuneMat::CreateIterator;
// setup sparsity pattern
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
int start = mat->rowPointers[row.index()];
int end = mat->rowPointers[row.index() + 1];
for (int idx = start; idx < end; ++idx) {
@ -302,7 +313,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
Dune::Amg::SequentialInformation seqinfo;
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
Opm::PropertyTree property_tree;
PropertyTree property_tree;
property_tree.put("alpha", 0.333333333333);
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
@ -315,7 +326,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
num_pre_smooth_steps = c.getNoPreSmoothSteps();
num_post_smooth_steps = c.getNoPostSmoothSteps();
dune_amg->build<OverlapFlags>(c);
dune_amg->template build<OverlapFlags>(c);
analyzeHierarchy();
analyzeAggregateMaps();
@ -351,10 +362,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeHierarchy() {
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::analyzeHierarchy()
{
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
// store coarsest AMG level in umfpack format, also performs LU decomposition
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
@ -372,8 +383,8 @@ void CPR<block_size>::analyzeHierarchy() {
// matrixIter.dereference() returns MatrixAdapter
// matrixIter.dereference().getmat() returns BCRSMat
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for (int level = 0; level < num_levels; ++matrixIter, ++level) {
const auto& A = matrixIter.dereference().getmat();
level_sizes[level] = A.N();
diagIndices[level].reserve(A.N());
@ -395,38 +406,38 @@ void CPR<block_size>::analyzeHierarchy() {
}
}
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
Amatrices.back().colIndices);
// compute inverse diagonal values for current level
invDiags.emplace_back(A.N());
for (unsigned int row = 0; row < A.N(); ++row) {
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
}
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeAggregateMaps() {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::analyzeAggregateMaps()
{
PcolIndices.resize(num_levels - 1);
Rmatrices.clear();
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
DuneAmg::AggregatesMap *map = *mapIter;
typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
typename DuneAmg::AggregatesMap* map = *mapIter;
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
// get indices for each row of P and R
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
PcolIndices[level].resize(level_sizes[level]);
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
const long int diff = ai - map->begin();
PcolIndices[level][diff] = *ai;
@ -446,19 +457,20 @@ void CPR<block_size>::analyzeAggregateMaps() {
}
}
template <unsigned int block_size>
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
OpenclMatrix *A = &d_Amatrices[level];
OpenclMatrix *R = &d_Rmatrices[level];
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
{
OpenclMatrix<Scalar>* A = &d_Amatrices[level];
OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
int Ncur = A->Nb;
if (level == num_levels - 1) {
// solve coarsest level
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
events.resize(1);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
@ -470,7 +482,8 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
umfpack.apply(h_x.data(), h_y.data());
events.resize(1);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
@ -486,34 +499,37 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
// presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
// move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
x, y, t, Ncur, 1);
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
}
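
As a reading aid for amg_cycle_gpu above, here is a minimal CPU-only sketch of the same V-cycle structure (damped-Jacobi pre-smoothing, restriction of the residual, recursive coarse solve, prolongation of the correction, damped-Jacobi post-smoothing). The Level struct and helper names are hypothetical stand-ins for the per-level OpenclMatrix data and kernels; the coarsest-level direct (UMFPACK) solve is replaced by a single diagonal sweep, and piecewise-constant aggregation is assumed for R and P, in line with the Rmatrices/PcolIndices set up in create_preconditioner_amg and analyzeAggregateMaps.

#include <vector>

// Hypothetical scalar CSR level, mirroring the data amg_cycle_gpu reads per level.
struct Level {
    std::vector<double> Avals;        // CSR values of A
    std::vector<int> Acols, Arows;    // CSR pattern of A
    std::vector<double> invDiag;      // 1/diag(A), as in invDiags
    std::vector<int> Pcol;            // aggregate index per fine row, as in PcolIndices
    int N = 0;                        // rows on this level
    int Ncoarse = 0;                  // rows on the next coarser level
};

// t = y - A*x
static void csr_residual(const Level& L, const std::vector<double>& x,
                         const std::vector<double>& y, std::vector<double>& t)
{
    for (int row = 0; row < L.N; ++row) {
        double sum = 0.0;
        for (int idx = L.Arows[row]; idx < L.Arows[row + 1]; ++idx) {
            sum += L.Avals[idx] * x[L.Acols[idx]];
        }
        t[row] = y[row] - sum;
    }
}

static void v_cycle(const std::vector<Level>& levels, int level,
                    const std::vector<double>& y, std::vector<double>& x,
                    unsigned pre_steps, unsigned post_steps, double damping = 0.65)
{
    const Level& L = levels[level];
    if (level + 1 == static_cast<int>(levels.size())) {
        // coarsest level: stand-in for the direct solve
        for (int i = 0; i < L.N; ++i) { x[i] = L.invDiag[i] * y[i]; }
        return;
    }
    std::vector<double> t(L.N), f(L.Ncoarse, 0.0), u(L.Ncoarse, 0.0);
    for (unsigned s = 0; s < pre_steps; ++s) {      // damped Jacobi pre-smoothing
        csr_residual(L, x, y, t);
        for (int i = 0; i < L.N; ++i) { x[i] += damping * L.invDiag[i] * t[i]; }
    }
    csr_residual(L, x, y, t);                       // restrict residual: f = R*t
    for (int i = 0; i < L.N; ++i) { f[L.Pcol[i]] += t[i]; }
    v_cycle(levels, level + 1, f, u, pre_steps, post_steps, damping);
    for (int i = 0; i < L.N; ++i) { x[i] += u[L.Pcol[i]]; }   // prolongate correction
    for (unsigned s = 0; s < post_steps; ++s) {     // damped Jacobi post-smoothing
        csr_residual(L, x, y, t);
        for (int i = 0; i < L.N; ++i) { x[i] += damping * L.invDiag[i] * t[i]; }
    }
}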
// x = prec(y)
template <unsigned int block_size>
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
{
// 0-initialize u and x vectors
events.resize(d_u.size() + 1);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
sizeof(Scalar) * Nb, nullptr, &events[0]);
for (unsigned int i = 0; i < d_u.size(); ++i) {
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
}
cl::WaitForEvents(events);
events.clear();
@ -522,16 +538,18 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
}
template <unsigned int block_size>
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
template<class Scalar, unsigned int block_size>
void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
Dune::Timer t_bilu0;
bilu0->apply(y, x);
if (verbosity >= 4) {
@ -549,20 +567,14 @@ void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
}
}
#define INSTANCE_TYPE(T) \
template class CPR<T,1>; \
template class CPR<T,2>; \
template class CPR<T,3>; \
template class CPR<T,4>; \
template class CPR<T,5>; \
template class CPR<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class CPR<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
INSTANCE_TYPE(double)
} // namespace Opm::Accelerator
View File
@ -33,18 +33,15 @@
#include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
template <unsigned int block_size>
class CPR : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class CPR : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -58,25 +55,25 @@ class CPR : public Preconditioner<block_size>
private:
int num_levels;
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
std::vector<cl::Buffer> d_PcolIndices;
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
std::vector<cl::Buffer> d_invDiags;
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
std::unique_ptr<OpenclMatrix> d_mat; // stores blocked matrix
std::unique_ptr<OpenclMatrix<Scalar>> d_mat; // stores blocked matrix
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
BlockedMatrix *mat = nullptr; // input matrix, blocked
std::unique_ptr<BILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
BlockedMatrix<Scalar>* mat = nullptr; // input matrix, blocked
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
std::unique_ptr<DuneAmg> dune_amg;
@ -91,7 +88,7 @@ private:
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
// Analyze the AMG hierarchy build by Dune
@ -112,32 +109,35 @@ private:
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
void create_preconditioner_amg(BlockedMatrix *mat);
void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
public:
CPR(bool opencl_ilu_parallel, int verbosity);
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue) override;
// applies blocked ilu0
// also applies amg for pressure component
void apply(const cl::Buffer& y, cl::Buffer& x) override;
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;
};
// solve A^T * x = b
// A should represent a 3x3 matrix
// x and b are vectors with 3 elements
void solve_transposed_3x3(const double *A, const double *b, double *x);
template<class Scalar>
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
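
For reference, one possible implementation of this helper is Cramer's rule applied to the transposed block; the sketch below is illustrative only and may differ from the implementation in the repository. In create_preconditioner_amg the right-hand side b is a unit vector in the pressure component, and the resulting row weights are afterwards normalized by their largest absolute entry.

// Solve A^T * x = b for a row-major 3x3 block A; assumes the block is non-singular.
template<class Scalar>
void solve_transposed_3x3_sketch(const Scalar* A, const Scalar* b, Scalar* x)
{
    // The rows of A^T are the columns of A.
    const Scalar t[9] = { A[0], A[3], A[6],
                          A[1], A[4], A[7],
                          A[2], A[5], A[8] };
    auto det3 = [](const Scalar* m) {
        return m[0] * (m[4] * m[8] - m[5] * m[7])
             - m[1] * (m[3] * m[8] - m[5] * m[6])
             + m[2] * (m[3] * m[7] - m[4] * m[6]);
    };
    const Scalar det = det3(t);
    for (int col = 0; col < 3; ++col) {
        Scalar m[9];
        for (int i = 0; i < 9; ++i) { m[i] = t[i]; }
        // replace column 'col' of A^T by b (Cramer's rule)
        m[col] = b[0]; m[3 + col] = b[1]; m[6 + col] = b[2];
        x[col] = det3(m) / det;
    }
}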
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif
View File
@ -31,12 +31,19 @@ namespace Opm
namespace Accelerator
{
void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue,
Scalar* vals, int* cols, int* rows)
{
std::vector<cl::Event> events(3);
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0,
sizeof(Scalar) * block_size * block_size * nnzbs,
vals, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs,
cols, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1),
rows, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@ -46,7 +53,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int
}
}
void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix)
{
if (block_size != 1) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
@ -54,7 +63,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
}
void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
template<class Scalar>
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix)
{
if (matrix->block_size != block_size) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
@ -62,5 +73,7 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
}
template class OpenclMatrix<double>;
} // namespace Accelerator
} // namespace Opm
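
A short usage sketch for the now-templated class: build a small scalar CSR matrix on the host and copy it to the device. A valid cl::Context and cl::CommandQueue are assumed to already exist (as they do inside the solver backend); the names context and queue are placeholders and error handling is omitted.

// Sketch only: 'context' and 'queue' are assumed to be valid OpenCL objects.
// 2x2 scalar CSR matrix [[4, 1], [0, 3]] -> Nb = Mb = 2, nnzbs = 3, block_size = 1
std::vector<double> vals = {4.0, 1.0, 3.0};
std::vector<int>    cols = {0, 1, 1};
std::vector<int>    rows = {0, 2, 3};

Opm::Accelerator::OpenclMatrix<double> d_A(&context, 2, 2, 3, 1);
d_A.upload(&queue, vals.data(), cols.data(), rows.data());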
View File
@ -29,28 +29,30 @@ namespace Opm
namespace Accelerator
{
class Matrix;
class BlockedMatrix;
template<class Scalar> class Matrix;
template<class Scalar> class BlockedMatrix;
/// This struct resembles a csr matrix, only doubles are supported
/// The matrix data is stored in OpenCL Buffers
class OpenclMatrix {
template<class Scalar>
class OpenclMatrix
{
public:
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
: Nb(Nb_),
Mb(Mb_),
nnzbs(nnzbs_),
block_size(block_size_)
{
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * block_size * block_size * nnzbs);
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
}
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
void upload(cl::CommandQueue *queue, Matrix *matrix);
void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
void upload(cl::CommandQueue* queue, Scalar* vals, int* cols, int* rows);
void upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix);
void upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix);
cl::Buffer nnzValues;
cl::Buffer colIndices;
View File
@ -30,61 +30,58 @@
#include <memory>
#include <string>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
template <unsigned int block_size>
void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void Preconditioner<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
}
template <unsigned int block_size>
std::unique_ptr<Preconditioner<block_size>>
Preconditioner<block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
template<class Scalar, unsigned int block_size>
std::unique_ptr<Preconditioner<Scalar,block_size>>
Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
{
switch (type ) {
case Type::BILU0:
return std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
case Type::CPR:
return std::make_unique<CPR<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
case Type::BISAI:
return std::make_unique<BISAI<block_size> >(opencl_ilu_parallel, verbosity);
return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
}
OPM_THROW(std::logic_error,
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
}
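
A hedged usage sketch of this factory: the caller fixes the scalar type and block size as template arguments and selects the variant at run time, receiving an owning pointer to the chosen implementation. The argument values below are placeholders.

using Prec = Opm::Accelerator::Preconditioner<double, 3>;
auto prec = Prec::create(Prec::Type::CPR,
                         /*opencl_ilu_parallel=*/true,
                         /*verbosity=*/1);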
template <unsigned int block_size>
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool Preconditioner<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat,
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
{
return analyze_matrix(mat);
}
template <unsigned int block_size>
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
template<class Scalar, unsigned int block_size>
bool Preconditioner<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat,
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
{
return create_preconditioner(mat);
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(Type, bool, int); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
#define INSTANCE_TYPE(T) \
template class Preconditioner<T,1>; \
template class Preconditioner<T,2>; \
template class Preconditioner<T,3>; \
template class Preconditioner<T,4>; \
template class Preconditioner<T,5>; \
template class Preconditioner<T,6>;
INSTANCE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace Accelerator
} //namespace Opm
} // namespace Opm::Accelerator
View File
@ -24,17 +24,13 @@
#include <memory>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
class BlockedMatrix;
template<class Scalar> class BlockedMatrix;
template <unsigned int block_size>
template<class Scalar, unsigned int block_size>
class Preconditioner
{
protected:
int N = 0; // number of rows of the matrix
int Nb = 0; // number of blockrows of the matrix
@ -65,7 +61,8 @@ public:
virtual ~Preconditioner() = default;
// nested Preconditioners might need to override this
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
virtual void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue);
// apply preconditioner, x = prec(y)
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
@ -73,16 +70,17 @@ public:
// analyze matrix, e.g. the sparsity pattern
// probably only called once
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat) = 0;
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat);
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat) = 0;
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat);
};
} //namespace Accelerator
} //namespace Opm
} // namespace Opm::Accelerator
#endif
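
The comments above note that the two-parameter analyze_matrix/create_preconditioner fall back to the one-parameter versions unless a subclass overrides them. A self-contained sketch of that defaulting pattern, with hypothetical names (Mat, PrecBase, SimplePrec) rather than the OPM types:

#include <iostream>

struct Mat {};   // hypothetical stand-in for BlockedMatrix<Scalar>

struct PrecBase {
    virtual ~PrecBase() = default;
    virtual bool analyze_matrix(Mat* mat) = 0;
    virtual bool analyze_matrix(Mat* mat, Mat* /*jacMat*/)
    { return analyze_matrix(mat); }            // default: ignore jacMat
};

struct SimplePrec : PrecBase {
    bool analyze_matrix(Mat* mat) override { return mat != nullptr; }
    using PrecBase::analyze_matrix;            // keep the two-argument overload visible
};

int main()
{
    Mat A, J;
    SimplePrec p;
    std::cout << p.analyze_matrix(&A, &J) << '\n';   // prints 1, forwarded to the 1-arg version
}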
View File
@ -18,52 +18,71 @@
*/
#include <config.h>
#include <cmath>
#include <sstream>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
#include <cmath>
#include <sstream>
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
template<class Scalar> int OpenclKernels<Scalar>::verbosity;
template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
template<class Scalar>
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
template<class Scalar>
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
template<class Scalar>
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
template<class Scalar>
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
template<class Scalar>
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
template<class Scalar>
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
template<class Scalar>
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
template<class Scalar>
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
return A / B + (A % B > 0);
}
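
For example, ceilDivision(1000, 256) = 1000/256 + (1000 % 256 > 0) = 3 + 1 = 4, i.e. four work groups of 256 items are needed to cover 1000 elements.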
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
template<class Scalar>
void OpenclKernels<Scalar>::init(cl::Context *context,
cl::CommandQueue *queue_,
std::vector<cl::Device>& devices, int verbosity_)
{
if (initialized) {
OpmLog::debug("Warning OpenclKernels is already initialized");
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
initialized = true;
} // end get_opencl_kernels()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
Scalar gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
return gpu_sum;
}
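
The dot_1 kernel produces one partial sum per work group; the final accumulation over tmp happens on the host, as the loop above shows. A CPU-only sketch of the same two-stage pattern, with a hypothetical helper and the fixed group size of 256 used in the kernel launch:

#include <algorithm>
#include <cstddef>
#include <vector>

// Assumes a.size() == b.size().
double two_stage_dot(const std::vector<double>& a, const std::vector<double>& b)
{
    const std::size_t group = 256;
    const std::size_t n = a.size();
    const std::size_t groups = n / group + (n % group > 0);   // ceilDivision
    std::vector<double> partial(groups, 0.0);
    for (std::size_t g = 0; g < groups; ++g) {                 // stage 1: per-group sums (GPU kernel)
        const std::size_t end = std::min(n, (g + 1) * group);
        for (std::size_t i = g * group; i < end; ++i) {
            partial[g] += a[i] * b[i];
        }
    }
    double sum = 0.0;
    for (double p : partial) { sum += p; }                     // stage 2: host-side finalization
    return sum;
}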
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
Scalar gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
template<class Scalar>
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
}
}
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const double omega, const double beta, int N)
template<class Scalar>
void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const Scalar omega, const Scalar beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
}
}
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
}
}
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
}
}
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
template<class Scalar>
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
}
}
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
template<class Scalar>
void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_spmv;
cl::Event event;
if (block_size > 1) {
if (add) {
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
}
} else {
if (reset) {
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
}
}
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
}
}
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_residual;
cl::Event event;
if (block_size > 1) {
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
}
if (verbosity >= 4) {
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = 128;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int num_hwarps_per_group = work_group_size / 16;
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
}
}
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
template<class Scalar>
void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
}
}
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiL;
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiU;
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
} // namespace Accelerator
} // namespace Opm
template class OpenclKernels<double>;
} // namespace Opm::Accelerator
View File
@ -26,10 +26,7 @@
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
template<class Scalar>
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
@ -117,12 +115,12 @@ public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const Scalar a, int N);
static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
@ -150,7 +148,40 @@ public:
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
};
} // namespace Accelerator
} // namespace Opm
#if CHOW_PATEL
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
#else
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
#endif
#define DECLARE_INSTANCE(T) \
DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::axpy_str; \
template<> const std::string OpenclKernels<T>::scale_str; \
template<> const std::string OpenclKernels<T>::vmul_str; \
template<> const std::string OpenclKernels<T>::dot_1_str; \
template<> const std::string OpenclKernels<T>::norm_str; \
template<> const std::string OpenclKernels<T>::custom_str; \
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
template<> const std::string OpenclKernels<T>::spmv_str; \
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
template<> const std::string OpenclKernels<T>::residual_str; \
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
template<> const std::string OpenclKernels<T>::isaiL_str; \
template<> const std::string OpenclKernels<T>::isaiU_str;
DECLARE_INSTANCE(double)
} // namespace Opm::Accelerator
#endif
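
The DECLARE_INSTANCE block above only declares explicit specializations of the per-type kernel-source strings; the definitions holding the actual OpenCL source text are provided elsewhere (in a generated source file). A reduced, self-contained sketch of that declare/define pattern follows; the class, member name and kernel body are hypothetical and only illustrate the mechanism, not the project's real axpy source.

#include <iostream>
#include <string>

template<class Scalar>
struct Kernels {
    static const std::string axpy_src;
};

// Header-style part: declaration only (no initializer), as in DECLARE_INSTANCE(double).
template<> const std::string Kernels<double>::axpy_src;

// Source-style part: the definition that would normally be generated into a .cpp file.
template<> const std::string Kernels<double>::axpy_src = R"(
__kernel void axpy(__global const double* in, const double a,
                   __global double* out, const unsigned int N)
{
    const unsigned int i = get_global_id(0);
    if (i < N) out[i] += a * in[i];
}
)";

int main()
{
    std::cout << Kernels<double>::axpy_src;   // at runtime the kernel source is ordinary string data
}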
View File
@ -37,41 +37,50 @@
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
#define COPY_ROW_BY_ROW 0
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, bool opencl_ilu_parallel_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_parallel(opencl_ilu_parallel_) {
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_,
int maxit_,
Scalar tolerance_,
unsigned int platformID_,
unsigned int deviceID_,
bool opencl_ilu_parallel_,
std::string linsolver)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
bool use_cpr, use_isai;
if (linsolver.compare("ilu0") == 0) {
if (linsolver == "ilu0") {
use_cpr = false;
use_isai = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
} else if (linsolver == "cpr_quasiimpes") {
use_cpr = true;
use_isai = false;
} else if (linsolver.compare("isai") == 0) {
} else if (linsolver == "isai") {
use_cpr = false;
use_isai = true;
} else if (linsolver.compare("cpr_trueimpes") == 0) {
OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
} else if (linsolver == "cpr_trueimpes") {
OPM_THROW(std::logic_error, "Error openclSolver does not support "
"--linear-solver=cpr_trueimpes");
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
}
using PreconditionerType = typename Preconditioner<block_size>::Type;
using PreconditionerType = Preconditioner<Scalar,block_size>;
if (use_cpr) {
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
opencl_ilu_parallel, verbosity);
} else if (use_isai) {
prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::BISAI,
opencl_ilu_parallel, verbosity);
} else {
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, opencl_ilu_parallel, verbosity);
prec = PreconditionerType::create(PreconditionerType::Type::BILU0,
opencl_ilu_parallel, verbosity);
}
std::ostringstream out;
@ -103,7 +112,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
out.clear();
if (platforms.size() <= platformID) {
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
} else {
std::string platform_info;
out << "Chosen:\n";
@ -119,7 +128,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
if (devices.empty()) {
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
OPM_THROW(std::logic_error, "Error openclSolver is selected but "
"no OpenCL devices are found");
}
out << "Found " << devices.size() << " OpenCL devices" << "\n";
@ -203,8 +213,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -217,26 +226,33 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
}
}
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, bool opencl_ilu_parallel_) :
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
openclSolverBackend<Scalar,block_size>::
openclSolverBackend(int verbosity_, int maxit_,
Scalar tolerance_, bool opencl_ilu_parallel_)
: Base(verbosity_, maxit_, tolerance_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
Scalar norm, norm_0;
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@ -246,15 +262,15 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// set initial values
events.resize(5);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
rho = 1.0;
alpha = 1.0;
omega = 1.0;
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -263,7 +279,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -277,11 +293,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
}
if (verbosity >= 3) {
queue->finish();
@ -298,7 +314,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// v = A * pw
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
@ -306,20 +322,20 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
if(verbosity >= 3) {
if (verbosity >= 3) {
queue->finish();
t_well.stop();
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -343,8 +359,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// t = A * s
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if(verbosity >= 3){
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
t_well.start();
@ -352,7 +368,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
queue->finish();
@ -360,12 +376,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -382,7 +398,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
}
res.iterations = std::min(it, (float)maxit);
res.iterations = std::min(it, static_cast<float>(maxit));
res.reduction = norm / norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
res.elapsed = t_total.stop();
@ -390,7 +406,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed <<
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
@ -405,9 +422,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
}
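For orientation, the loop implemented by gpu_pbicgstab above is the standard preconditioned BiCGStab recurrence. A host-side sketch with plain std::vector and caller-supplied matvec/preconditioner callbacks (illustrative only; no GPU buffers, well contributions or timing):

#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

using Vec = std::vector<double>;
using Op  = std::function<void(const Vec&, Vec&)>; // writes f(x) into the second argument

static double dot(const Vec& a, const Vec& b)
{
    double s = 0.0;
    for (std::size_t i = 0; i < a.size(); ++i) { s += a[i] * b[i]; }
    return s;
}

// Preconditioned BiCGStab for A*x = b with initial guess x0 = 0 (sketch, not the GPU code).
bool bicgstab(const Op& matvec, const Op& precond,
              const Vec& b, Vec& x, int maxit, double tolerance)
{
    const std::size_t N = b.size();
    x.assign(N, 0.0);
    Vec r = b, rw = b, p = b;                           // r0 = b - A*x0 = b
    Vec pw(N), v(N), s(N), t(N);
    double rho = 1.0, alpha = 1.0, omega = 1.0;
    const double norm_0 = std::sqrt(dot(r, r));

    for (int it = 1; it <= maxit; ++it) {
        const double rhop = rho;
        rho = dot(rw, r);
        if (it > 1) {
            const double beta = (rho / rhop) * (alpha / omega);
            for (std::size_t i = 0; i < N; ++i) {       // p = r + beta*(p - omega*v)
                p[i] = r[i] + beta * (p[i] - omega * v[i]);
            }
        }
        precond(p, pw);                                 // pw = M^-1 * p
        matvec(pw, v);                                  // v  = A * pw
        alpha = rho / dot(rw, v);
        for (std::size_t i = 0; i < N; ++i) { x[i] += alpha * pw[i]; r[i] -= alpha * v[i]; }
        if (std::sqrt(dot(r, r)) < tolerance * norm_0) { return true; }

        precond(r, s);                                  // s = M^-1 * r
        matvec(s, t);                                   // t = A * s
        omega = dot(t, r) / dot(t, t);
        for (std::size_t i = 0; i < N; ++i) { x[i] += omega * s[i]; r[i] -= omega * t[i]; }
        if (std::sqrt(dot(r, r)) < tolerance * norm_0) { return true; }
    }
    return false;
}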
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -437,22 +456,21 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
mat = matrix;
jacMat = jacMatrix;
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -467,8 +485,10 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
copy_system_to_gpu()
{
Timer t;
events.resize(5);
@ -476,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
@ -504,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system_on_gpu()
{
Timer t;
events.resize(3);
@ -513,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, vals_contiguous.data(),
nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
sizeof(Scalar) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
@ -538,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
analyze_matrix()
{
Timer t;
bool success;
@ -560,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
return success;
} // end analyze_matrix()
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
update_system(Scalar* vals, Scalar* b)
{
Timer t;
mat->nnzValues = vals;
@ -575,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
}
} // end update_system()
template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool openclSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result;
@ -594,9 +631,10 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;
// actually solve
@ -604,7 +642,8 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
gpu_pbicgstab(wellContribs, res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
oss << "openclSolverBackend::solve_system error: " << error.what()
<< "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
@ -618,17 +657,17 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
out << "openclSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void openclSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);
if (verbosity > 2) {
std::ostringstream out;
@ -637,13 +676,13 @@ void openclSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus openclSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -668,21 +707,14 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class openclSolverBackend<T,1>; \
template class openclSolverBackend<T,2>; \
template class openclSolverBackend<T,3>; \
template class openclSolverBackend<T,4>; \
template class openclSolverBackend<T,5>; \
template class openclSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, bool, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -27,16 +27,13 @@
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements an OpenCL-based ILU0-BiCGStab solver on GPU
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class openclSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double *h_b = nullptr; // b vector, on host
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
Scalar* h_b = nullptr; // b vector, on host
std::vector<Scalar> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
@ -63,12 +60,12 @@ private:
bool useJacMatrix = false;
std::unique_ptr<Preconditioner<block_size> > prec;
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
std::vector<cl::Event> events;
cl_int err;
@ -76,12 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Copy linear system to GPU
void copy_system_to_gpu();
@ -89,7 +87,7 @@ private:
/// Reassign pointers, in case the addresses of the Dune variables have changed
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector b, contains N values
void update_system(double *vals, double *b);
void update_system(Scalar* vals, Scalar* b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();
@ -106,11 +104,11 @@ private:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// could be empty
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
public:
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::shared_ptr<cl::Context> context{};
std::shared_ptr<cl::CommandQueue> queue{};
/// Construct an openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
@ -121,11 +119,13 @@ public:
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
/// only ilu0, cpr_quasiimpes and isai are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
unsigned int platformID, unsigned int deviceID,
bool opencl_ilu_parallel, std::string linsolver);
/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
openclSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, bool opencl_ilu_parallel);
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A
@ -134,8 +134,11 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@ -143,19 +146,16 @@ public:
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
/// Set OpenCL objects
/// This class either creates them based on platformID and deviceID or receives them through this function
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
void setOpencl(std::shared_ptr<cl::Context>& context,
std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -25,93 +25,122 @@
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
namespace Opm
{
namespace Opm {
using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
template<class Scalar>
void WellContributionsOCL<Scalar>::
setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_)
{
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
template<class Scalar>
void WellContributionsOCL<Scalar>::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
{
OpenclKernels<Scalar>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
*d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, this->dim, this->dim_wells,
*d_val_pointers_ocl, this->num_std_wells);
}
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
template<class Scalar>
void WellContributionsOCL<Scalar>::apply_mswells(cl::Buffer d_x, cl::Buffer d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_y.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
h_y.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsOCL<Scalar>::apply(cl::Buffer d_x, cl::Buffer d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
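For context, apply_stdwells launches the kernel that applies each standard well's Schur-complement contribution to y. Assuming the usual convention for these blocks (y -= C^T * (D^-1 * (B * x)), with B and C holding one dim_wells x dim block per perforated cell and D^-1 one dim_wells x dim_wells block per well), a dense host-side sketch of that product for a single well looks like this (all names hypothetical, not the API above):

#include <vector>

void apply_stdwell_host(const std::vector<double>& B,     // nblocks * dim_wells * dim, row-major blocks
                        const std::vector<double>& Dinv,  // dim_wells * dim_wells, already inverted
                        const std::vector<double>& C,     // nblocks * dim_wells * dim, row-major blocks
                        const std::vector<int>& cols,     // perforated cell indices
                        const std::vector<double>& x,     // ncells * dim
                        std::vector<double>& y,           // ncells * dim
                        int dim, int dim_wells)
{
    const int nblocks = static_cast<int>(cols.size());
    std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);

    // z1 = B * x (gather from the perforated cells)
    for (int b = 0; b < nblocks; ++b)
        for (int r = 0; r < dim_wells; ++r)
            for (int c = 0; c < dim; ++c)
                z1[r] += B[b * dim_wells * dim + r * dim + c] * x[cols[b] * dim + c];

    // z2 = D^-1 * z1
    for (int r = 0; r < dim_wells; ++r)
        for (int c = 0; c < dim_wells; ++c)
            z2[r] += Dinv[r * dim_wells + c] * z1[c];

    // y -= C^T * z2 (scatter back to the perforated cells)
    for (int b = 0; b < nblocks; ++b)
        for (int c = 0; c < dim; ++c)
            for (int r = 0; r < dim_wells; ++r)
                y[cols[b] * dim + c] -= C[b * dim_wells * dim + r * dim + c] * z2[r];
}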
void WellContributionsOCL::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsOCL<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE,
sizeof(int) * this->num_blocks_so_far,
sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
sizeof(Scalar) * this->dim_wells * this->dim_wells,
values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE,
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE,
sizeof(int) * this->num_blocks_so_far, sizeof(int) * val_size,
colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0,
sizeof(unsigned int) * (this->num_std_wells + 1),
this->val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
@ -122,14 +151,21 @@ void WellContributionsOCL::APIaddMatrix(MatrixType type,
}
}
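As a concrete example of the offsets used above: with dim = 3, dim_wells = 4 and a well contributing val_size = 5 blocks, the MatrixType::C (and B) write starts at byte offset sizeof(Scalar) * num_blocks_so_far * 12 and covers 5 * 12 = 60 scalars, the MatrixType::D write starts at sizeof(Scalar) * num_std_wells_so_far * 16 and covers 16 scalars, and val_pointers[num_std_wells_so_far] records num_blocks_so_far so the kernels can locate each well's block range.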
void WellContributionsOCL::APIalloc()
template<class Scalar>
void WellContributionsOCL<Scalar>::APIalloc()
{
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
sizeof(unsigned int) * (this->num_std_wells + 1));
}
} //namespace Opm
template class WellContributionsOCL<double>;
} // namespace Opm

View File

@ -29,10 +29,10 @@
#include <vector>
namespace Opm
{
namespace Opm {
class WellContributionsOCL : public WellContributions
template<class Scalar>
class WellContributionsOCL : public WellContributions<Scalar>
{
public:
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
@ -45,7 +45,10 @@ protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
cl::Context* context;
cl::CommandQueue* queue;
@ -55,10 +58,10 @@ protected:
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm
} // namespace Opm
#endif

View File

@ -47,27 +47,28 @@
#undef HIP_HAVE_CUDA_DEFINED
#endif
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
rocalutionSolverBackend<block_size>::rocalutionSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
template<class Scalar, unsigned int block_size>
rocalutionSolverBackend<Scalar,block_size>::
rocalutionSolverBackend(int verbosity_, int maxit_, Scalar tolerance_)
: Base(verbosity_, maxit_, tolerance_)
{
rocalution::init_rocalution();
rocalution::info_rocalution();
roc_solver = std::make_unique<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
roc_prec = std::make_unique<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
using BCGS = rocalution::BiCGStab<Mat,Vec,Scalar>;
roc_solver = std::make_unique<BCGS>();
using ILU = rocalution::ILU<Mat,Vec,Scalar>;
roc_prec = std::make_unique<ILU>();
roc_solver->Verbose(0);
roc_solver->Init(/*abs_tol=*/1e-15, tolerance, /*divergence_tol=*/1e3, maxit);
}
template <unsigned int block_size>
rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
template<class Scalar, unsigned int block_size>
rocalutionSolverBackend<Scalar,block_size>::~rocalutionSolverBackend()
{
// normally, these rocalution variables are destroyed after the destructor automatically,
// but sometimes it segfaults, both with test_rocalutionSolver and with an actual case
// release both variables here to prevent that segfault
@ -76,9 +77,10 @@ rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
rocalution::stop_rocalution();
}
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
initialize(BlockedMatrix<Scalar>* matrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -94,15 +96,16 @@ void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
initialized = true;
} // end initialize()
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
convert_matrix(BlockedMatrix<Scalar>* matrix)
{
Timer t;
for(int i = 0; i < Nb+1; ++i){
for (int i = 0; i < Nb+1; ++i) {
tmp_rowpointers[i] = matrix->rowPointers[i];
}
for(int i = 0; i < nnzb; ++i){
for (int i = 0; i < nnzb; ++i) {
tmp_colindices[i] = matrix->colIndices[i];
}
@ -112,7 +115,7 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
// BCSR_IND_BASE == 0: rocalution expects column-major
// BCSR_IND_BASE == 1: rocalution expects row-major
if (BCSR_IND_BASE == 0) {
for(int i = 0; i < nnzb; ++i){
for (int i = 0; i < nnzb; ++i) {
tmp_nnzvalues[i * block_size * block_size + 0] = matrix->nnzValues[i * block_size * block_size + 0];
tmp_nnzvalues[i * block_size * block_size + 1] = matrix->nnzValues[i * block_size * block_size + 3];
tmp_nnzvalues[i * block_size * block_size + 2] = matrix->nnzValues[i * block_size * block_size + 6];
@ -131,11 +134,12 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
}
}
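The index shuffle above hard-codes the row-major to column-major transposition of each block (shown here for the 3x3 case); an equivalent generic form for arbitrary block_size would be (a sketch, not the project's code):

// Transpose each block_size x block_size block from row-major storage (src)
// to column-major storage (dst); illustrative only.
template<class Scalar>
void transpose_blocks(const Scalar* src, Scalar* dst, int nnzb, int block_size)
{
    for (int i = 0; i < nnzb; ++i) {
        for (int r = 0; r < block_size; ++r) {
            for (int c = 0; c < block_size; ++c) {
                dst[i * block_size * block_size + c * block_size + r] =
                    src[i * block_size * block_size + r * block_size + c];
            }
        }
    }
}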
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocalutionSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void rocalutionSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
std::copy(h_x.begin(), h_x.end(), x);
@ -147,13 +151,13 @@ void rocalutionSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus rocalutionSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix.get());
@ -161,21 +165,20 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
tmp_rowpointers = new int[Nb+1];
tmp_colindices = new int[nnzb];
tmp_nnzvalues = new double[nnzb*block_size*block_size];
tmp_nnzvalues = new Scalar[nnzb*block_size*block_size];
convert_matrix(matrix.get());
rocalution::LocalVector<double> roc_x;
rocalution::LocalVector<double> roc_rhs;
rocalution::LocalMatrix<double> roc_mat;
Vec roc_x;
Vec roc_rhs;
Mat roc_mat;
// this also transfers ownership of the allocated memory to rocalution
// and sets the tmp_* pointers to nullptr
roc_mat.SetDataPtrBCSR(
&tmp_rowpointers,
&tmp_colindices,
&tmp_nnzvalues,
"matrix A", nnzb, Nb, Nb, block_size);
roc_mat.SetDataPtrBCSR(&tmp_rowpointers,
&tmp_colindices,
&tmp_nnzvalues,
"matrix A", nnzb, Nb, Nb, block_size);
roc_mat.MoveToAccelerator();
roc_x.MoveToAccelerator();
@ -196,7 +199,7 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
// so it just calls ILU::Build() every time
roc_solver->ReBuildNumeric();
double norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
Scalar norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
// actually solve
Dune::Timer t_solve;
@ -215,7 +218,6 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / res.iterations));
res.converged = (roc_solver->GetSolverStatus() == 2);
// copy solution vector to host vector
// if roc_x could be reused, this should be removed here
// and roc_x should be directly copied into x in get_result()
@ -224,26 +226,25 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / res.iterations << ", iterations: " << res.iterations;
out << "=== converged: " << res.converged
<< ", conv_rate: " << res.conv_rate
<< ", time: " << res.elapsed <<
", time per iteration: " << res.elapsed / res.iterations
<< ", iterations: " << res.iterations;
OpmLog::info(out.str());
}
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class rocalutionSolverBackend<T,1>; \
template class rocalutionSolverBackend<T,2>; \
template class rocalutionSolverBackend<T,3>; \
template class rocalutionSolverBackend<T,4>; \
template class rocalutionSolverBackend<T,5>; \
template class rocalutionSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocalutionSolverBackend<n>::rocalutionSolverBackend(int, int, double);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -31,17 +31,14 @@ template<class Scalar> class LocalMatrix;
template<class Scalar> class LocalVector;
}
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a rocalution-based linear solver on GPU
/// It uses ilu0-bicgstab
template <unsigned int block_size>
class rocalutionSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class rocalutionSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -55,31 +52,34 @@ class rocalutionSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
std::vector<double> h_x; // store solution vector on host
std::vector<Scalar> h_x; // store solution vector on host
int *tmp_rowpointers; // store matrix on host, this pointer is given to and freed by rocalution
int *tmp_colindices; // store matrix on host, this pointer is given to and freed by rocalution
double *tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
Scalar* tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
std::unique_ptr<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_prec;
std::unique_ptr<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_solver;
using Mat = rocalution::LocalMatrix<Scalar>;
using Vec = rocalution::LocalVector<Scalar>;
std::unique_ptr<rocalution::ILU<Mat,Vec,Scalar>> roc_prec;
std::unique_ptr<rocalution::BiCGStab<Mat,Vec,Scalar>> roc_solver;
/// Initialize sizes and allocate memory
/// \param[in] matrix matrix A
void initialize(BlockedMatrix *matrix);
void initialize(BlockedMatrix<Scalar>* matrix);
/// Convert matrix to rocalution format
/// copy matrix to raw pointers, which are given to and freed by rocalution
/// \param[in] matrix matrix A
void convert_matrix(BlockedMatrix *matrix);
void convert_matrix(BlockedMatrix<Scalar>* matrix);
public:
/// Construct a rocalutionSolver
/// also initialize rocalution library and rocalution variables
/// \param[in] linear_solver_verbosity verbosity of rocalutionSolver
/// \param[in] maxit maximum number of iterations for rocalutionSolver
/// \param[in] tolerance required relative tolerance for rocalutionSolver
rocalutionSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
rocalutionSolverBackend(int linear_solver_verbosity,
int maxit, Scalar tolerance);
/// Destroy a rocalutionSolver, and free memory
~rocalutionSolverBackend();
@ -91,17 +91,19 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class rocalutionSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -93,20 +93,20 @@
extern std::shared_ptr<std::thread> copyThread;
#endif //HAVE_OPENMP
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
template<class Scalar, unsigned int block_size>
rocsparseSolverBackend<Scalar,block_size>::
rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
unsigned int platformID_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
{
int numDevices = 0;
HIP_CHECK(hipGetDeviceCount(&numDevices));
if (static_cast<int>(deviceID) >= numDevices) {
OPM_THROW(std::runtime_error, "Error chosen too high HIP device ID");
OPM_THROW(std::runtime_error, "Invalid HIP device ID");
}
HIP_CHECK(hipSetDevice(deviceID));
@ -126,45 +126,45 @@ rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int m
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
}
template <unsigned int block_size>
rocsparseSolverBackend<block_size>::~rocsparseSolverBackend() {
template<class Scalar, unsigned int block_size>
rocsparseSolverBackend<Scalar,block_size>::~rocsparseSolverBackend()
{
hipError_t hipstatus = hipStreamSynchronize(stream);
if(hipstatus != hipSuccess){
if (hipstatus != hipSuccess) {
OpmLog::error("Could not synchronize with hipStream");
}
hipstatus = hipStreamDestroy(stream);
if(hipstatus != hipSuccess){
if (hipstatus != hipSuccess) {
OpmLog::error("Could not destroy hipStream");
}
rocsparse_status status1 = rocsparse_destroy_handle(handle);
if(status1 != rocsparse_status_success){
if (status1 != rocsparse_status_success) {
OpmLog::error("Could not destroy rocsparse handle");
}
rocblas_status status2 = rocblas_destroy_handle(blas_handle);
if(status2 != rocblas_status_success){
if (status2 != rocblas_status_success) {
OpmLog::error("Could not destroy rocblas handle");
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellContributions& wellContribs,
BdaResult& res)
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
float it = 0.5;
double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
// set stream here, the WellContributions object is destroyed every linear solve
// the number of wells can change every linear solve
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).setStream(stream);
}
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
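// e.g. HIP 5.4.2 gives 5 * 10000000 + 4 * 100000 + 2 = 50400002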
@ -253,8 +253,8 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_pw, d_v);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -312,15 +312,15 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
d_Avals, d_Arows, d_Acols, block_size,
d_s, &zero, d_t));
#endif
if(verbosity >= 3){
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
t_spmv.stop();
t_well.start();
}
// apply wellContributions
if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_s, d_t);
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -360,8 +360,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
if (verbosity >= 1) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged
<< ", conv_rate: " << res.conv_rate
<< ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it
<< ", iterations: " << it;
OpmLog::info(out.str());
}
if (verbosity >= 3) {
@ -375,9 +378,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
}
}
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -390,12 +395,14 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
out << "Initializing GPU, matrix size: "
<< Nb << " blockrows, nnzb: " << nnzb << "\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
out << "Maxit: " << maxit
<< std::scientific << ", tolerance: " << tolerance << "\n"
<< "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
@ -403,26 +410,26 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
mat = matrix;
jacMat = jacMatrix;
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb));
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz));
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N));
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(Scalar) * nnz));
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
if (useJacMatrix) {
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
} else { // preconditioner matrix is same
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
d_Mcols = d_Acols;
d_Mrows = d_Arows;
}
@ -430,26 +437,43 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
initialized = true;
} // end initialize()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(Scalar *b)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers,
sizeof(rocsparse_int) * (Nb + 1),
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices,
sizeof(rocsparse_int) * nnzb,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues,
sizeof(Scalar) * nnz,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
sizeof(rocsparse_int) * (Nb + 1),
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
sizeof(rocsparse_int) * nnzbs_prec,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
if (verbosity >= 3) {
@ -459,29 +483,36 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
std::ostringstream out;
out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
out << "---rocsparseSolver::cum copy: " << c_copy << " s";
OpmLog::info(out.str());
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(Scalar* b)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(Scalar) * nnz,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
hipMemcpyHostToDevice, stream));
if (useJacMatrix) {
#if HAVE_OPENMP
if (omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
hipMemcpyHostToDevice, stream));
} else {
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
}
if (verbosity >= 3) {
HIP_CHECK(hipStreamSynchronize(stream));
@ -493,8 +524,10 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
}
} // end update_system_on_gpu()
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
analyze_matrix()
{
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;
@ -523,7 +556,8 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M,
std::max(d_bufferSize_L, d_bufferSize_U));
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
@ -571,9 +605,10 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
return true;
} // end analyze_matrix()
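Note that the analysis above still calls the double-precision rocsparse entry points (the d prefix in rocsparse_dbsrsv_buffer_size). If a float instantiation of this backend is added later, these calls would presumably need a small dispatch on Scalar. A hypothetical sketch, mirroring the buffer-size call shown above and not part of this commit:

// Hypothetical dispatch on Scalar; rocsparse ships _s and _d variants
// of the bsrsv routines. (Requires <type_traits> for std::is_same_v.)
if constexpr (std::is_same_v<Scalar, float>) {
    ROCSPARSE_CHECK(rocsparse_sbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
                                                 descr_U, d_Mvals, d_Mrows, d_Mcols,
                                                 block_size, ilu_info, &d_bufferSize_U));
} else {
    ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
                                                 descr_U, d_Mvals, d_Mrows, d_Mcols,
                                                 block_size, ilu_info, &d_bufferSize_U));
}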
template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;
bool result = true;
@ -598,9 +633,10 @@ bool rocsparseSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;
// actually solve
@ -612,17 +648,18 @@ void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellCon
out << "rocsparseSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(Scalar) * N,
hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately
if (verbosity >= 3) {
@ -632,13 +669,13 @@ void rocsparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()
template <unsigned int block_size>
SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus rocsparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -662,19 +699,14 @@ SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<Bl
return SolverStatus::BDA_SOLVER_SUCCESS;
}
#define INSTANTIATE_TYPE(T) \
template class rocsparseSolverBackend<T,1>; \
template class rocsparseSolverBackend<T,2>; \
template class rocsparseSolverBackend<T,3>; \
template class rocsparseSolverBackend<T,4>; \
template class rocsparseSolverBackend<T,5>; \
template class rocsparseSolverBackend<T,6>;
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocsparseSolverBackend<n>::rocsparseSolverBackend( \
int, int, double, unsigned int, unsigned int);
INSTANTIATE_TYPE(double)
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
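With the backend now templated on the scalar type, supporting single precision would in principle only take one more instantiation next to INSTANTIATE_TYPE(double). A hypothetical sketch, not part of this change and only meaningful once the rocsparse calls dispatch on Scalar:

// Hypothetical: single-precision instantiation of the rocsparse backend.
INSTANTIATE_TYPE(float)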

View File

@ -31,16 +31,13 @@
#include <hip/hip_version.h>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
/// This class implements a rocsparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;
using Base::N;
using Base::Nb;
@ -54,14 +51,13 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double c_copy = 0.0; // cumulative timer measuring the total time it takes to transfer the data to the GPU
bool useJacMatrix = false;
bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
rocsparse_direction dir = rocsparse_direction_row;
@ -77,31 +73,31 @@ private:
rocsparse_int *d_Arows, *d_Mrows;
rocsparse_int *d_Acols, *d_Mcols;
double *d_Avals, *d_Mvals;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_Avals, *d_Mvals;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
int ver;
char rev[64];
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
/// Copy linear system to GPU
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *b);
void copy_system_to_gpu(Scalar* b);
/// Update linear system on GPU
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *b);
void update_system_on_gpu(Scalar* b);
/// Analyze sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -114,16 +110,20 @@ private:
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// Construct a rocsparseSolver
/// \param[in] linear_solver_verbosity verbosity of rocsparseSolver
/// \param[in] maxit maximum number of iterations for rocsparseSolver
/// \param[in] tolerance required relative tolerance for rocsparseSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
rocsparseSolverBackend(int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID);
/// For the CPR coarse solver
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
@ -138,8 +138,11 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@ -147,13 +150,10 @@ public:
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;
}; // end class rocsparseSolverBackend
} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
#endif

View File

@ -56,17 +56,17 @@ namespace Opm
#ifdef __HIP__
/// HIP kernel to apply the standard wellcontributions
__global__ void stdwell_apply(
const double *Cnnzs,
const double *Dnnzs,
const double *Bnnzs,
const unsigned *Ccols,
const unsigned *Bcols,
const double *x,
double *y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
template<class Scalar>
__global__ void stdwell_apply(const Scalar* Cnnzs,
const Scalar* Dnnzs,
const Scalar* Bnnzs,
const unsigned* Ccols,
const unsigned* Bcols,
const Scalar* x,
Scalar* y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
{
unsigned wgId = blockIdx.x;
unsigned wiId = threadIdx.x;
@ -76,16 +76,16 @@ __global__ void stdwell_apply(
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
unsigned c = wiId % dim;
unsigned r = (wiId/dim) % dim_wells;
double temp;
Scalar temp;
extern __shared__ double localSum[];
double *z1 = localSum + gridDim.x;
double *z2 = z1 + dim_wells;
extern __shared__ Scalar localSum[];
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
if (wiId < numActiveWorkItems) {
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
while (b < valSize + val_pointers[wgId]) {
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
@ -99,14 +99,14 @@ __global__ void stdwell_apply(
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
if (wiId < valsPerBlock){
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
if(c == 0 && wiId < valsPerBlock){
for(unsigned i = dim - 1; i > 0; --i){
if (c == 0 && wiId < valsPerBlock){
for (unsigned i = dim - 1; i > 0; --i) {
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
@ -117,7 +117,7 @@ __global__ void stdwell_apply(
if(wiId < dim_wells){
temp = 0.0;
for(unsigned i = 0; i < dim_wells; ++i){
for (unsigned i = 0; i < dim_wells; ++i) {
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
@ -125,10 +125,10 @@ __global__ void stdwell_apply(
__syncthreads();
if(wiId < dim*valSize){
if (wiId < dim*valSize){
temp = 0.0;
unsigned bb = wiId/dim + val_pointers[wgId];
for (unsigned j = 0; j < dim_wells; ++j){
for (unsigned j = 0; j < dim_wells; ++j) {
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
@ -138,17 +138,26 @@ __global__ void stdwell_apply(
}
#endif
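One caveat if this kernel is ever instantiated for more than one Scalar type: writing extern __shared__ Scalar localSum[] declares the same dynamic shared-memory symbol with different element types, which compilers typically reject across instantiations. A common workaround, shown only as a hypothetical sketch and not part of this commit, is to declare untyped shared memory once and cast per instantiation:

// Hypothetical workaround for templated dynamic shared memory.
extern __shared__ unsigned char localMem[];
Scalar* localSum = reinterpret_cast<Scalar*>(localMem);
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;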
void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
[[maybe_unused]] double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_stdwells([[maybe_unused]] Scalar* d_x,
[[maybe_unused]] Scalar* d_y)
{
#ifdef __HIP__
unsigned gridDim = num_std_wells;
unsigned blockDim = 64;
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
// dim3(N) will create a vector {N, 1, 1}
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
d_x, d_y, dim, dim_wells, d_val_pointers_hip
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
d_Dnnzs_hip,
d_Bnnzs_hip,
d_Ccols_hip,
d_Bcols_hip,
d_x,
d_y,
dim,
dim_wells,
d_val_pointers_hip
);
HIP_CHECK(hipStreamSynchronize(stream));
#else
@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
#endif
}
void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_mswells(Scalar* d_x, Scalar* d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));
// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::apply(double *d_x, double *d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply(Scalar* d_x, Scalar* d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}
if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}
void WellContributionsRocsparse::setStream(hipStream_t stream_){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
{
stream = stream_;
}
void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Ccols_hip) * val_size,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::D:
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
hipMemcpyHostToDevice, stream));
break;
case MatrixType::B:
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Bcols_hip) * val_size,
hipMemcpyHostToDevice, stream));
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
hipMemcpyHostToDevice, stream));
}
break;
@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
HIP_CHECK(hipStreamSynchronize(stream));
}
void WellContributionsRocsparse::APIalloc()
template<class Scalar>
void WellContributionsRocsparse<Scalar>::APIalloc()
{
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
}
} //namespace Opm
template class WellContributionsRocsparse<double>;
} // namespace Opm
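A possible follow-up, not part of this commit: the byte counts in APIalloc and APIaddMatrix use sizeof of the device pointers (for example sizeof(d_Cnnzs_hip)), which equals sizeof(double) on 64-bit hosts but would not equal sizeof(float). If a float instantiation is ever added, sizing by the element type would keep allocations and transfers exact; a hypothetical sketch:

// Hypothetical tightening: size buffers by the element type, not the pointer type.
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
                    sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
                    sizeof(unsigned) * (this->num_std_wells + 1)));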

View File

@ -26,33 +26,35 @@
#include <vector>
namespace Opm {
namespace Opm
{
class WellContributionsRocsparse : public WellContributions
template<class Scalar>
class WellContributionsRocsparse : public WellContributions<Scalar>
{
private:
hipStream_t stream;
public:
void apply_stdwells(double *d_x, double *d_y);
void apply_mswells(double *d_x, double *d_y);
void apply(double *d_x, double *d_y);
void apply_stdwells(Scalar* d_x, Scalar* d_y);
void apply_mswells(Scalar* d_x, Scalar* d_y);
void apply(Scalar* d_x, Scalar* d_y);
void setStream(hipStream_t stream);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;
double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;
Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
unsigned *d_Ccols_hip, *d_Bcols_hip;
unsigned *d_val_pointers_hip;
std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};
} //namespace Opm

View File

@ -90,7 +90,7 @@ struct EnableTerminalOutput {
namespace Opm {
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
/// Class for handling the blackoil well model.
@ -287,7 +287,7 @@ class WellContributions;
#if COMPILE_BDA_BRIDGE
// accumulate the contributions of all Wells in the WellContributions object
void getWellContributions(WellContributions& x) const;
void getWellContributions(WellContributions<Scalar>& x) const;
#endif
// apply well model with scaling of alpha

View File

@ -1568,7 +1568,7 @@ namespace Opm {
template<typename TypeTag>
void
BlackoilWellModel<TypeTag>::
getWellContributions(WellContributions& wellContribs) const
getWellContributions(WellContributions<Scalar>& wellContribs) const
{
// prepare for StandardWells
wellContribs.setBlockSize(StandardWell<TypeTag>::Indices::numEq, StandardWell<TypeTag>::numStaticWellEq);

View File

@ -202,7 +202,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
#if COMPILE_BDA_BRIDGE
template<class Scalar, int numWellEq, int numEq>
void MultisegmentWellEquations<Scalar,numWellEq,numEq>::
extract(WellContributions& wellContribs) const
extract(WellContributions<Scalar>& wellContribs) const
{
unsigned int Mb = duneB_.N(); // number of blockrows in duneB_, duneC_ and duneD_
unsigned int BnumBlocks = duneB_.nonzeroes();

View File

@ -39,7 +39,7 @@ namespace Opm
template<class Scalar, int numWellEq, int numEq> class MultisegmentWellEquationAccess;
template<class Scalar> class MultisegmentWellGeneric;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -105,7 +105,7 @@ public:
#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(WellContributions& wellContribs) const;
void extract(WellContributions<Scalar>& wellContribs) const;
#endif
//! \brief Add the matrices of this well to the sparse matrix adapter.

View File

@ -198,7 +198,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
template<class Scalar, int numEq>
void StandardWellEquations<Scalar,numEq>::
extract(const int numStaticWellEq,
WellContributions& wellContribs) const
WellContributions<Scalar>& wellContribs) const
{
std::vector<int> colIndices;
std::vector<Scalar> nnzValues;
@ -216,7 +216,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::C,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::C,
colIndices.data(), nnzValues.data(), duneC_.nonzeroes());
// invDuneD
@ -229,7 +229,7 @@ extract(const int numStaticWellEq,
nnzValues.emplace_back(invDuneD_[0][0][i][j]);
}
}
wellContribs.addMatrix(WellContributions::MatrixType::D,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::D,
colIndices.data(), nnzValues.data(), 1);
// duneB
@ -245,7 +245,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::B,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::B,
colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
}
#endif

View File

@ -37,7 +37,7 @@ namespace Opm
template<class Scalar> class ParallelWellInfo;
template<class Scalar, int numEq> class StandardWellEquationAccess;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -102,7 +102,7 @@ public:
#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(const int numStaticWellEq,
WellContributions& wellContribs) const;
WellContributions<Scalar>& wellContribs) const;
#endif
//! \brief Add the matrices of this well to the sparse matrix adapter.

View File

@ -38,7 +38,7 @@ class ConvergenceReport;
class DeferredLogger;
class Schedule;
class SummaryState;
class WellContributions;
template<class Scalar> class WellContributions;
template<class FluidSystem, class Indices> class WellInterfaceIndices;
template<class Scalar> class WellState;

View File

@ -272,7 +272,7 @@ computeBhpAtThpLimitProd(const std::function<std::vector<Scalar>(const Scalar)>&
"find bhp-point where production becomes non-zero for well " + well_.name());
return std::nullopt;
}
const std::array<Scalar, 2> range {controls.bhp_limit, *bhp_max};
const std::array<Scalar, 2> range {static_cast<Scalar>(controls.bhp_limit), *bhp_max};
return this->computeBhpAtThpLimit(frates, fbhp, range, deferred_logger);
}
@ -518,9 +518,9 @@ computeBhpAtThpLimitInjImpl(const std::function<std::vector<Scalar>(const Scalar
// Get the flo samples, add extra samples at low rates and bhp
// limit point if necessary.
std::vector<Scalar> flo_samples = table.getFloAxis();
std::vector<double> flo_samples = table.getFloAxis();
if (flo_samples[0] > 0.0) {
const Scalar f0 = flo_samples[0];
const double f0 = flo_samples[0];
flo_samples.insert(flo_samples.begin(), { f0/20.0, f0/10.0, f0/5.0, f0/2.0 });
}
const Scalar flo_bhp_limit = flo(frates(controls.bhp_limit));
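The flo samples stay double here because the table axis is stored in double; converting them into Scalar up front would be an alternative. A hypothetical sketch, assuming getFloAxis() keeps returning a vector of double (not part of this change):

// Hypothetical alternative: copy the flo axis into the Scalar type once.
const auto& flo_axis = table.getFloAxis();
std::vector<Scalar> flo_samples(flo_axis.begin(), flo_axis.end());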

View File

@ -123,7 +123,7 @@ testCusparseSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -138,7 +138,7 @@ testCusparseSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Mat
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

View File

@ -120,7 +120,7 @@ testOpenclSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>&
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -135,7 +135,7 @@ testOpenclSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matri
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

View File

@ -96,7 +96,7 @@ testRocalutionSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix,
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create(accelerator_mode, true);
auto wellContribs = Opm::WellContributions<double>::create(accelerator_mode, true);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,

View File

@ -127,7 +127,7 @@ testRocsparseSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -142,7 +142,7 @@ testRocsparseSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>,
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different