Mirror of https://github.com/OPM/opm-simulators.git (synced 2024-11-25 18:50:19 -06:00)

commit bcbac79486
Merge pull request #5380 from akva2/linalg_template_scalar

LinAlg classes: template Scalar type
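Every file touched by this PR follows the same pattern: the scalar type that was hard-coded as double becomes a Scalar template parameter (usually deduced from the vector's field_type), and the explicit instantiations are regenerated per type through INSTANTIATE_TYPE-style macros. A minimal sketch of the pattern, with illustrative names rather than the real opm-simulators classes:

// Minimal sketch of the refactoring pattern (illustrative names only).
// Before: the scalar type was fixed.
//   struct SolverInfo { double tolerance; };
// After: the scalar type is a template parameter.
template<class Scalar>
struct SolverInfo
{
    Scalar tolerance;
};

// Explicit instantiation per supported type, mirroring the new
// INSTANTIATE_TYPE(double) macros; float could be added later
// without touching the class itself.
template struct SolverInfo<double>;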
@@ -31,7 +31,7 @@ endif()
foreach(CL ${CL_LIST})
get_filename_component(FNAME ${CL} NAME_WE)

file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
file(READ "${CL}" CL_CONTENT)
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")

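The CMake loop above embeds every .cl kernel into the generated source file as a raw string; after this change the definition is emitted for the OpenclKernels<double> specialization instead of a plain class member. Roughly what the generated code looks like for a hypothetical kernel file axpy.cl (kernel body elided, raw-string delimiter simplified):

// Illustrative output of the CMake loop for a hypothetical axpy.cl;
// the real generated file contains one such definition per kernel.
template<> const std::string OpenclKernels<double>::axpy_str = R"(
__kernel void axpy(__global double *in, const double a, __global double *out, const int N)
{
    /* kernel body elided */
}
)";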
@ -50,15 +50,14 @@
|
||||
std::shared_ptr<std::thread> copyThread;
|
||||
#endif // HAVE_OPENMP
|
||||
|
||||
namespace Opm {
|
||||
namespace detail {
|
||||
namespace Opm::detail {
|
||||
|
||||
template<class Matrix, class Vector>
|
||||
BdaSolverInfo<Matrix,Vector>::
|
||||
BdaSolverInfo(const std::string& accelerator_mode,
|
||||
const int linear_solver_verbosity,
|
||||
const int maxit,
|
||||
const double tolerance,
|
||||
const Scalar tolerance,
|
||||
const int platformID,
|
||||
const int deviceID,
|
||||
const bool opencl_ilu_parallel,
|
||||
@ -104,7 +103,7 @@ apply(Vector& rhs,
|
||||
{
|
||||
bool use_gpu = bridge_->getUseGpu();
|
||||
if (use_gpu) {
|
||||
auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
|
||||
auto wellContribs = WellContributions<Scalar>::create(accelerator_mode_, useWellConn);
|
||||
bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
|
||||
|
||||
// the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
|
||||
@ -179,8 +178,9 @@ blockJacobiAdjacency(const Grid& grid,
|
||||
const auto& gridView = grid.leafGridView();
|
||||
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
|
||||
|
||||
//Loop over cells
|
||||
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
|
||||
// Loop over cells
|
||||
for (Iter row = blockJacobiForGPUILU0_->createbegin();
|
||||
row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
|
||||
{
|
||||
const auto& elem = *elemIt;
|
||||
size_type idx = lid.id(elem);
|
||||
@ -221,25 +221,26 @@ copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
|
||||
auto outerCol = (*outerRow).begin();
|
||||
for (auto col = (*row).begin(); col != (*row).end(); ++col) {
|
||||
// outerRow is guaranteed to have all column entries that row has!
|
||||
while(outerCol.index() < col.index()) ++outerCol;
|
||||
while (outerCol.index() < col.index()) {
|
||||
++outerCol;
|
||||
}
|
||||
assert(outerCol.index() == col.index());
|
||||
*col = *outerCol; // copy nonzero block
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<int Dim>
|
||||
using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
|
||||
template<int Dim>
|
||||
using BV = Dune::BlockVector<Dune::FieldVector<double,Dim>>;
|
||||
template<class Scalar, int Dim>
|
||||
using BM = Dune::BCRSMatrix<MatrixBlock<Scalar,Dim,Dim>>;
|
||||
template<class Scalar, int Dim>
|
||||
using BV = Dune::BlockVector<Dune::FieldVector<Scalar,Dim>>;
|
||||
|
||||
|
||||
#define INSTANCE_GRID(Dim, Grid) \
|
||||
template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
|
||||
prepare(const Grid&, \
|
||||
const Dune::CartesianIndexMapper<Grid>&, \
|
||||
const std::vector<Well>&, \
|
||||
const std::vector<int>&, \
|
||||
#define INSTANTIATE_GRID(T, Dim, Grid) \
|
||||
template void BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>:: \
|
||||
prepare(const Grid&, \
|
||||
const Dune::CartesianIndexMapper<Grid>&, \
|
||||
const std::vector<Well>&, \
|
||||
const std::vector<int>&, \
|
||||
const std::size_t, const bool);
|
||||
using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
|
||||
#if HAVE_DUNE_ALUGRID
|
||||
@ -248,23 +249,26 @@ using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
|
||||
#else
|
||||
using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridNoComm>;
|
||||
#endif //HAVE_MPI
|
||||
#define INSTANCE(Dim) \
|
||||
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
|
||||
INSTANCE_GRID(Dim,Dune::CpGrid) \
|
||||
INSTANCE_GRID(Dim,ALUGrid3CN) \
|
||||
INSTANCE_GRID(Dim,PolyHedralGrid3D)
|
||||
#define INSTANTIATE(T,Dim) \
|
||||
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
|
||||
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
|
||||
INSTANTIATE_GRID(T,Dim,ALUGrid3CN) \
|
||||
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
|
||||
#else
|
||||
#define INSTANCE(Dim) \
|
||||
template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
|
||||
INSTANCE_GRID(Dim,Dune::CpGrid) \
|
||||
INSTANCE_GRID(Dim,PolyHedralGrid3D)
|
||||
#define INSTANTIATE(T,Dim) \
|
||||
template struct BdaSolverInfo<BM<T,Dim>,BV<T,Dim>>; \
|
||||
INSTANTIATE_GRID(T,Dim,Dune::CpGrid) \
|
||||
INSTANTIATE_GRID(T,Dim,PolyHedralGrid3D)
|
||||
#endif
|
||||
INSTANCE(1)
|
||||
INSTANCE(2)
|
||||
INSTANCE(3)
|
||||
INSTANCE(4)
|
||||
INSTANCE(5)
|
||||
INSTANCE(6)
|
||||
|
||||
} // namespace detail
|
||||
} // namespace Opm
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
INSTANTIATE(T,1) \
|
||||
INSTANTIATE(T,2) \
|
||||
INSTANTIATE(T,3) \
|
||||
INSTANTIATE(T,4) \
|
||||
INSTANTIATE(T,5) \
|
||||
INSTANTIATE(T,6)
|
||||
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
} // namespace Opm::detail
|
||||
|
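Taken together, the macros above replace the old per-dimension INSTANCE(n) list with a type-parameterised INSTANTIATE(T,n)/INSTANTIATE_TYPE(T) pair. A sketch of what INSTANTIATE(double,3) expands to for a single grid; the real macro repeats the prepare() instantiation for every supported grid type:

// Simplified expansion of INSTANTIATE(double,3); INSTANTIATE_GRID is
// repeated for Dune::CpGrid, ALUGrid3CN (when available) and PolyHedralGrid3D.
template struct BdaSolverInfo<BM<double,3>, BV<double,3>>;
template void BdaSolverInfo<BM<double,3>, BV<double,3>>::
prepare(const Dune::CpGrid&,
        const Dune::CartesianIndexMapper<Dune::CpGrid>&,
        const std::vector<Well>&,
        const std::vector<int>&,
        const std::size_t, const bool);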
@ -35,60 +35,61 @@ namespace Opm {
|
||||
class Well;
|
||||
|
||||
template<class Matrix, class Vector, int block_size> class BdaBridge;
|
||||
class WellContributions;
|
||||
template<class Scalar> class WellContributions;
|
||||
namespace detail {
|
||||
|
||||
template<class Matrix, class Vector>
|
||||
struct BdaSolverInfo
|
||||
{
|
||||
using WellContribFunc = std::function<void(WellContributions&)>;
|
||||
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
|
||||
using Scalar = typename Vector::field_type;
|
||||
using WellContribFunc = std::function<void(WellContributions<Scalar>&)>;
|
||||
using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
|
||||
|
||||
BdaSolverInfo(const std::string& accelerator_mode,
|
||||
const int linear_solver_verbosity,
|
||||
const int maxit,
|
||||
const double tolerance,
|
||||
const int platformID,
|
||||
const int deviceID,
|
||||
const bool opencl_ilu_parallel,
|
||||
const std::string& linsolver);
|
||||
BdaSolverInfo(const std::string& accelerator_mode,
|
||||
const int linear_solver_verbosity,
|
||||
const int maxit,
|
||||
const Scalar tolerance,
|
||||
const int platformID,
|
||||
const int deviceID,
|
||||
const bool opencl_ilu_parallel,
|
||||
const std::string& linsolver);
|
||||
|
||||
~BdaSolverInfo();
|
||||
~BdaSolverInfo();
|
||||
|
||||
template<class Grid>
|
||||
void prepare(const Grid& grid,
|
||||
const Dune::CartesianIndexMapper<Grid>& cartMapper,
|
||||
const std::vector<Well>& wellsForConn,
|
||||
const std::vector<int>& cellPartition,
|
||||
const std::size_t nonzeroes,
|
||||
const bool useWellConn);
|
||||
template<class Grid>
|
||||
void prepare(const Grid& grid,
|
||||
const Dune::CartesianIndexMapper<Grid>& cartMapper,
|
||||
const std::vector<Well>& wellsForConn,
|
||||
const std::vector<int>& cellPartition,
|
||||
const std::size_t nonzeroes,
|
||||
const bool useWellConn);
|
||||
|
||||
bool apply(Vector& rhs,
|
||||
const bool useWellConn,
|
||||
WellContribFunc getContribs,
|
||||
const int rank,
|
||||
Matrix& matrix,
|
||||
Vector& x,
|
||||
Dune::InverseOperatorResult& result);
|
||||
bool apply(Vector& rhs,
|
||||
const bool useWellConn,
|
||||
WellContribFunc getContribs,
|
||||
const int rank,
|
||||
Matrix& matrix,
|
||||
Vector& x,
|
||||
Dune::InverseOperatorResult& result);
|
||||
|
||||
bool gpuActive();
|
||||
bool gpuActive();
|
||||
|
||||
int numJacobiBlocks_ = 0;
|
||||
int numJacobiBlocks_ = 0;
|
||||
|
||||
private:
|
||||
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
|
||||
/// Do not initialize the values, that is done in copyMatToBlockJac()
|
||||
template<class Grid>
|
||||
void blockJacobiAdjacency(const Grid& grid,
|
||||
const std::vector<int>& cell_part,
|
||||
std::size_t nonzeroes);
|
||||
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
|
||||
/// Do not initialize the values, that is done in copyMatToBlockJac()
|
||||
template<class Grid>
|
||||
void blockJacobiAdjacency(const Grid& grid,
|
||||
const std::vector<int>& cell_part,
|
||||
std::size_t nonzeroes);
|
||||
|
||||
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
|
||||
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
|
||||
|
||||
std::unique_ptr<Bridge> bridge_;
|
||||
std::string accelerator_mode_;
|
||||
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
|
||||
std::vector<std::set<int>> wellConnectionsGraph_;
|
||||
std::unique_ptr<Bridge> bridge_;
|
||||
std::string accelerator_mode_;
|
||||
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
|
||||
std::vector<std::set<int>> wellConnectionsGraph_;
|
||||
};
|
||||
|
||||
}
|
||||
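In the declaration above, Scalar is deduced as Vector::field_type, so the tolerance argument and the WellContributions callback follow the precision of the vector type. A hedged usage sketch (parameter values are illustrative; note that this commit only instantiates the double variants explicitly):

// Sketch only: Scalar is deduced from the vector type. A float build
// would additionally need the corresponding explicit instantiations.
using Mat = Dune::BCRSMatrix<Opm::MatrixBlock<double,3,3>>;
using Vec = Dune::BlockVector<Dune::FieldVector<double,3>>;
Opm::detail::BdaSolverInfo<Mat,Vec> info("opencl",   // accelerator_mode
                                         1,          // linear_solver_verbosity
                                         200,        // maxit
                                         1e-3,       // tolerance (Scalar == double)
                                         0, 0,       // platformID, deviceID
                                         true,       // opencl_ilu_parallel
                                         "ilu0");    // linsolver (illustrative)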
@@ -249,8 +250,8 @@ public:
// Solve system.
Dune::InverseOperatorResult result;

std::function<void(WellContributions&)> getContribs =
[this](WellContributions& w)
std::function<void(WellContributions<Scalar>&)> getContribs =
[this](WellContributions<Scalar>& w)
{
this->simulator_.problem().wellModel().getWellContributions(w);
};

@ -50,11 +50,11 @@
|
||||
#include <opm/simulators/linalg/PreconditionerFactoryGPUIncludeWrapper.hpp>
|
||||
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Opm {
|
||||
|
||||
template <class Smoother>
|
||||
struct AMGSmootherArgsHelper {
|
||||
struct AMGSmootherArgsHelper
|
||||
{
|
||||
static auto args(const PropertyTree& prm)
|
||||
{
|
||||
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
|
||||
@ -69,10 +69,11 @@ struct AMGSmootherArgsHelper {
|
||||
};
|
||||
|
||||
template <class M, class V, class C>
|
||||
struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
|
||||
struct AMGSmootherArgsHelper<ParallelOverlappingILU0<M, V, V, C>>
|
||||
{
|
||||
static auto args(const PropertyTree& prm)
|
||||
{
|
||||
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
|
||||
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
|
||||
using SmootherArgs = typename Dune::Amg::SmootherTraits<Smoother>::Arguments;
|
||||
SmootherArgs smootherArgs;
|
||||
smootherArgs.iterations = prm.get<int>("iterations", 1);
|
||||
@ -88,7 +89,6 @@ struct AMGSmootherArgsHelper<Opm::ParallelOverlappingILU0<M, V, V, C>> {
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// trailing return type with decltype used for detecting existence of setUseFixedOrder member function by overloading the setUseFixedOrder function
|
||||
template <typename C>
|
||||
auto setUseFixedOrder(C criterion, bool booleanValue) -> decltype(criterion.setUseFixedOrder(booleanValue))
|
||||
@ -209,7 +209,7 @@ struct StandardPreconditioners {
|
||||
const std::string smoother = prm.get<std::string>("smoother", "ParOverILU0");
|
||||
// TODO: merge this with ILUn, and possibly simplify the factory to only work with ILU?
|
||||
if (smoother == "ILU0" || smoother == "ParOverILU0") {
|
||||
using Smoother = Opm::ParallelOverlappingILU0<M, V, V, C>;
|
||||
using Smoother = ParallelOverlappingILU0<M, V, V, C>;
|
||||
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
|
||||
auto sargs = AMGSmootherArgsHelper<Smoother>::args(prm);
|
||||
PrecPtr prec = std::make_shared<Dune::Amg::AMGCPR<O, V, Smoother, C>>(op, crit, sargs, comm);
|
||||
@ -279,7 +279,8 @@ struct StandardPreconditioners {
|
||||
OPM_THROW(std::logic_error,
|
||||
"Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, false>;
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, false>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
|
||||
op, prm, weightsCalculator, pressureIndex, comm);
|
||||
});
|
||||
@ -294,7 +295,8 @@ struct StandardPreconditioners {
|
||||
OPM_THROW(std::logic_error,
|
||||
"Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Comm, true>;
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy = PressureTransferPolicy<O, Comm, Scalar, true>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
|
||||
op, prm, weightsCalculator, pressureIndex, comm);
|
||||
});
|
||||
@ -311,7 +313,8 @@ struct StandardPreconditioners {
|
||||
OPM_THROW(std::logic_error,
|
||||
"Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using LevelTransferPolicy = Opm::PressureBhpTransferPolicy<O, Comm, false>;
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy = PressureBhpTransferPolicy<O, Comm, Scalar, false>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy, Comm>>(
|
||||
op, prm, weightsCalculator, pressureIndex, comm);
|
||||
});
|
||||
@ -321,12 +324,12 @@ struct StandardPreconditioners {
|
||||
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
using field_type = typename V::field_type;
|
||||
using CuILU0 = typename Opm::cuistl::
|
||||
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
using CuILU0 = typename cuistl::
|
||||
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
auto cuILU0 = std::make_shared<CuILU0>(op.getmat(), w);
|
||||
|
||||
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
|
||||
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(cuILU0);
|
||||
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
return wrapped;
|
||||
});
|
||||
|
||||
@ -334,21 +337,21 @@ struct StandardPreconditioners {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
using field_type = typename V::field_type;
|
||||
using CuJac =
|
||||
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
auto cuJac = std::make_shared<CuJac>(op.getmat(), w);
|
||||
|
||||
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
|
||||
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
|
||||
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
return wrapped;
|
||||
});
|
||||
|
||||
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
|
||||
using field_type = typename V::field_type;
|
||||
using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
auto cuDILU = std::make_shared<CuDILU>(op.getmat());
|
||||
|
||||
auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
|
||||
auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
|
||||
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
|
||||
return wrapped;
|
||||
});
|
||||
#endif
|
||||
@ -368,11 +371,11 @@ struct StandardPreconditioners {
|
||||
// Already a parallel preconditioner. Need to pass comm, but no need to wrap it in a BlockPreconditioner.
|
||||
if (ilulevel == 0) {
|
||||
const std::size_t num_interior = interiorIfGhostLast(comm);
|
||||
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
|
||||
op.getmat(), comm, w, Opm::MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
|
||||
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
|
||||
op.getmat(), comm, w, MILU_VARIANT::ILU, num_interior, redblack, reorder_spheres);
|
||||
} else {
|
||||
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, Comm>>(
|
||||
op.getmat(), comm, ilulevel, w, Opm::MILU_VARIANT::ILU, redblack, reorder_spheres);
|
||||
return std::make_shared<ParallelOverlappingILU0<M, V, V, Comm>>(
|
||||
op.getmat(), comm, ilulevel, w, MILU_VARIANT::ILU, redblack, reorder_spheres);
|
||||
}
|
||||
}
|
||||
|
||||
@ -412,8 +415,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
using P = PropertyTree;
|
||||
F::addCreator("ILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), 0, w, Opm::MILU_VARIANT::ILU);
|
||||
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), 0, w, MILU_VARIANT::ILU);
|
||||
});
|
||||
F::addCreator("DuneILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
@ -424,14 +427,14 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
F::addCreator("ParOverILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
const int n = prm.get<int>("ilulevel", 0);
|
||||
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
|
||||
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), n, w, MILU_VARIANT::ILU);
|
||||
});
|
||||
F::addCreator("ILUn", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
const int n = prm.get<int>("ilulevel", 0);
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
return std::make_shared<Opm::ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), n, w, Opm::MILU_VARIANT::ILU);
|
||||
return std::make_shared<ParallelOverlappingILU0<M, V, V, C>>(
|
||||
op.getmat(), n, w, MILU_VARIANT::ILU);
|
||||
});
|
||||
F::addCreator("DILU", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
DUNE_UNUSED_PARAMETER(prm);
|
||||
@ -513,11 +516,16 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
}
|
||||
});
|
||||
F::addCreator("famg", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
|
||||
Dune::Amg::Parameters parms;
|
||||
parms.setNoPreSmoothSteps(1);
|
||||
parms.setNoPostSmoothSteps(1);
|
||||
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
|
||||
if constexpr (std::is_same_v<typename V::field_type, float>) {
|
||||
OPM_THROW(std::logic_error, "famg requires UMFPack which is not available for floats");
|
||||
return nullptr;
|
||||
} else {
|
||||
auto crit = AMGHelper<O, C, M, V>::criterion(prm);
|
||||
Dune::Amg::Parameters parms;
|
||||
parms.setNoPreSmoothSteps(1);
|
||||
parms.setNoPostSmoothSteps(1);
|
||||
return getRebuildOnUpdateWrapper<Dune::Amg::FastAMG<O, V>>(op, crit, parms);
|
||||
}
|
||||
});
|
||||
}
|
||||
if constexpr (std::is_same_v<O, WellModelMatrixAdapter<M, V, V, false>>) {
|
||||
@ -527,8 +535,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
|
||||
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy
|
||||
= Opm::PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
|
||||
= PressureBhpTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
|
||||
op, prm, weightsCalculator, pressureIndex);
|
||||
});
|
||||
@ -540,7 +549,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
|
||||
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, false>;
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, false>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
|
||||
op, prm, weightsCalculator, pressureIndex);
|
||||
});
|
||||
@ -550,7 +560,8 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
if (pressureIndex == std::numeric_limits<std::size_t>::max()) {
|
||||
OPM_THROW(std::logic_error, "Pressure index out of bounds. It needs to specified for CPR");
|
||||
}
|
||||
using LevelTransferPolicy = Opm::PressureTransferPolicy<O, Dune::Amg::SequentialInformation, true>;
|
||||
using Scalar = typename V::field_type;
|
||||
using LevelTransferPolicy = PressureTransferPolicy<O, Dune::Amg::SequentialInformation, Scalar, true>;
|
||||
return std::make_shared<OwningTwoLevelPreconditioner<O, V, LevelTransferPolicy>>(
|
||||
op, prm, weightsCalculator, pressureIndex);
|
||||
});
|
||||
@ -559,9 +570,9 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
F::addCreator("CUILU0", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
using field_type = typename V::field_type;
|
||||
using CuILU0 = typename Opm::cuistl::
|
||||
CuSeqILU0<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuILU0>>(
|
||||
using CuILU0 = typename cuistl::
|
||||
CuSeqILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CuILU0>>(
|
||||
std::make_shared<CuILU0>(op.getmat(), w));
|
||||
});
|
||||
|
||||
@ -571,10 +582,10 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
|
||||
using matrix_type_to =
|
||||
typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
|
||||
using CuILU0 = typename Opm::cuistl::
|
||||
CuSeqILU0<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
|
||||
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
|
||||
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
|
||||
using CuILU0 = typename cuistl::
|
||||
CuSeqILU0<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
|
||||
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuILU0>;
|
||||
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
|
||||
auto converted = std::make_shared<Converter>(op.getmat());
|
||||
auto adapted = std::make_shared<Adapter>(std::make_shared<CuILU0>(converted->getConvertedMatrix(), w));
|
||||
converted->setUnderlyingPreconditioner(adapted);
|
||||
@ -585,24 +596,24 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
|
||||
const double w = prm.get<double>("relaxation", 1.0);
|
||||
using field_type = typename V::field_type;
|
||||
using CUJac =
|
||||
typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(
|
||||
typename cuistl::CuJac<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUJac>>(
|
||||
std::make_shared<CUJac>(op.getmat(), w));
|
||||
});
|
||||
|
||||
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
|
||||
using field_type = typename V::field_type;
|
||||
using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
|
||||
using CUDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
|
||||
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
|
||||
});
|
||||
|
||||
F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
|
||||
using block_type = typename V::block_type;
|
||||
using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
|
||||
using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
|
||||
using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
|
||||
using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
|
||||
using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
|
||||
using CuDILU = typename cuistl::CuDILU<matrix_type_to, cuistl::CuVector<float>, cuistl::CuVector<float>>;
|
||||
using Adapter = typename cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
|
||||
using Converter = typename cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
|
||||
auto converted = std::make_shared<Converter>(op.getmat());
|
||||
auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
|
||||
converted->setUnderlyingPreconditioner(adapted);
|
||||
@ -744,7 +755,7 @@ using OpFSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Dune::FieldMatrix<double, Di
|
||||
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
|
||||
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
|
||||
template <int Dim>
|
||||
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<Opm::MatrixBlock<double, Dim, Dim>>,
|
||||
using OpBSeq = Dune::MatrixAdapter<Dune::BCRSMatrix<MatrixBlock<double, Dim, Dim>>,
|
||||
Dune::BlockVector<Dune::FieldVector<double, Dim>>,
|
||||
Dune::BlockVector<Dune::FieldVector<double, Dim>>>;
|
||||
|
||||
|
@ -76,31 +76,36 @@ namespace Opm
|
||||
|
||||
namespace Details
|
||||
{
|
||||
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
|
||||
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
|
||||
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
|
||||
template <class Comm>
|
||||
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
|
||||
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
|
||||
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
PressureVectorType<Scalar>>;
|
||||
template<class Scalar, class Comm>
|
||||
using ParCoarseOperatorType
|
||||
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
|
||||
template <class Comm>
|
||||
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
Comm>;
|
||||
template<class Scalar, class Comm>
|
||||
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
|
||||
SeqCoarseOperatorType,
|
||||
ParCoarseOperatorType<Comm>>;
|
||||
SeqCoarseOperatorType<Scalar>,
|
||||
ParCoarseOperatorType<Scalar,Comm>>;
|
||||
} // namespace Details
|
||||
|
||||
template <class FineOperator, class Communication, bool transpose = false>
|
||||
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
|
||||
template<class FineOperator, class Communication, class Scalar, bool transpose = false>
|
||||
class PressureBhpTransferPolicy : public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
|
||||
{
|
||||
public:
|
||||
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
|
||||
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
|
||||
typedef Communication ParallelInformation;
|
||||
typedef typename FineOperator::domain_type FineVectorType;
|
||||
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
|
||||
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
|
||||
using ParallelInformation = Communication;
|
||||
using FineVectorType= typename FineOperator::domain_type;
|
||||
|
||||
public:
|
||||
PressureBhpTransferPolicy(const Communication& comm,
|
||||
const FineVectorType& weights,
|
||||
const Opm::PropertyTree& prm,
|
||||
const PropertyTree& prm,
|
||||
const std::size_t pressureIndex)
|
||||
: communication_(&const_cast<Communication&>(comm))
|
||||
, weights_(weights)
|
||||
@ -109,7 +114,7 @@ namespace Opm
|
||||
{
|
||||
}
|
||||
|
||||
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
|
||||
void createCoarseLevelSystem(const FineOperator& fineOperator) override
|
||||
{
|
||||
OPM_TIMEBLOCK(createCoarseLevelSystem);
|
||||
using CoarseMatrix = typename CoarseOperator::matrix_type;
|
||||
@ -164,7 +169,7 @@ namespace Opm
|
||||
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
|
||||
}
|
||||
|
||||
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
|
||||
void calculateCoarseEntries(const FineOperator& fineOperator) override
|
||||
{
|
||||
OPM_TIMEBLOCK(calculateCoarseEntries);
|
||||
const auto& fineMatrix = fineOperator.getmat();
|
||||
@ -175,7 +180,7 @@ namespace Opm
|
||||
auto entryCoarse = rowCoarse->begin();
|
||||
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
|
||||
assert(entry.index() == entryCoarse.index());
|
||||
double matrix_el = 0;
|
||||
Scalar matrix_el = 0;
|
||||
if (transpose) {
|
||||
const auto& bw = weights_[entry.index()];
|
||||
for (std::size_t i = 0; i < bw.size(); ++i) {
|
||||
@ -203,7 +208,7 @@ namespace Opm
|
||||
}
|
||||
}
|
||||
|
||||
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
|
||||
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
|
||||
{
|
||||
OPM_TIMEBLOCK(moveToCoarseLevel);
|
||||
//NB we iterate over fine assuming welldofs is at the end
|
||||
@ -214,7 +219,7 @@ namespace Opm
|
||||
|
||||
for (auto block = begin; block != end; ++block) {
|
||||
const auto& bw = weights_[block.index()];
|
||||
double rhs_el = 0.0;
|
||||
Scalar rhs_el = 0.0;
|
||||
if (transpose) {
|
||||
rhs_el = (*block)[pressure_var_index_];
|
||||
} else {
|
||||
@ -228,7 +233,7 @@ namespace Opm
|
||||
this->lhs_ = 0;
|
||||
}
|
||||
|
||||
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
|
||||
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
|
||||
{
|
||||
OPM_TIMEBLOCK(moveToFineLevel);
|
||||
//NB we iterate over fine assuming welldofs is at the end
|
||||
@ -246,7 +251,7 @@ namespace Opm
|
||||
}
|
||||
}
|
||||
|
||||
virtual PressureBhpTransferPolicy* clone() const override
|
||||
PressureBhpTransferPolicy* clone() const override
|
||||
{
|
||||
return new PressureBhpTransferPolicy(*this);
|
||||
}
|
||||
|
@ -28,39 +28,40 @@
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
namespace Details
|
||||
{
|
||||
using PressureMatrixType = Dune::BCRSMatrix<Opm::MatrixBlock<double, 1, 1>>;
|
||||
using PressureVectorType = Dune::BlockVector<Dune::FieldVector<double, 1>>;
|
||||
using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType, PressureVectorType, PressureVectorType>;
|
||||
template <class Comm>
|
||||
namespace Opm { namespace Details {
|
||||
template<class Scalar> using PressureMatrixType = Dune::BCRSMatrix<MatrixBlock<Scalar, 1, 1>>;
|
||||
template<class Scalar> using PressureVectorType = Dune::BlockVector<Dune::FieldVector<Scalar, 1>>;
|
||||
template<class Scalar> using SeqCoarseOperatorType = Dune::MatrixAdapter<PressureMatrixType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
PressureVectorType<Scalar>>;
|
||||
template<class Scalar, class Comm>
|
||||
using ParCoarseOperatorType
|
||||
= Dune::OverlappingSchwarzOperator<PressureMatrixType, PressureVectorType, PressureVectorType, Comm>;
|
||||
template <class Comm>
|
||||
= Dune::OverlappingSchwarzOperator<PressureMatrixType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
PressureVectorType<Scalar>,
|
||||
Comm>;
|
||||
template<class Scalar, class Comm>
|
||||
using CoarseOperatorType = std::conditional_t<std::is_same<Comm, Dune::Amg::SequentialInformation>::value,
|
||||
SeqCoarseOperatorType,
|
||||
ParCoarseOperatorType<Comm>>;
|
||||
SeqCoarseOperatorType<Scalar>,
|
||||
ParCoarseOperatorType<Scalar,Comm>>;
|
||||
} // namespace Details
|
||||
|
||||
|
||||
|
||||
template <class FineOperator, class Communication, bool transpose = false>
|
||||
template <class FineOperator, class Communication, class Scalar, bool transpose = false>
|
||||
class PressureTransferPolicy
|
||||
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Communication>>
|
||||
: public Dune::Amg::LevelTransferPolicyCpr<FineOperator, Details::CoarseOperatorType<Scalar,Communication>>
|
||||
{
|
||||
public:
|
||||
typedef typename Details::CoarseOperatorType<Communication> CoarseOperator;
|
||||
typedef Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator> ParentType;
|
||||
typedef Communication ParallelInformation;
|
||||
typedef typename FineOperator::domain_type FineVectorType;
|
||||
using CoarseOperator = typename Details::CoarseOperatorType<Scalar,Communication>;
|
||||
using ParentType = Dune::Amg::LevelTransferPolicyCpr<FineOperator, CoarseOperator>;
|
||||
using ParallelInformation = Communication;
|
||||
using FineVectorType = typename FineOperator::domain_type;
|
||||
|
||||
public:
|
||||
PressureTransferPolicy(const Communication& comm,
|
||||
const FineVectorType& weights,
|
||||
const Opm::PropertyTree& /*prm*/,
|
||||
const PropertyTree& /*prm*/,
|
||||
int pressure_var_index)
|
||||
: communication_(&const_cast<Communication&>(comm))
|
||||
, weights_(weights)
|
||||
@ -68,7 +69,7 @@ public:
|
||||
{
|
||||
}
|
||||
|
||||
virtual void createCoarseLevelSystem(const FineOperator& fineOperator) override
|
||||
void createCoarseLevelSystem(const FineOperator& fineOperator) override
|
||||
{
|
||||
using CoarseMatrix = typename CoarseOperator::matrix_type;
|
||||
const auto& fineLevelMatrix = fineOperator.getmat();
|
||||
@ -92,7 +93,7 @@ public:
|
||||
this->operator_ = Dune::Amg::ConstructionTraits<CoarseOperator>::construct(oargs);
|
||||
}
|
||||
|
||||
virtual void calculateCoarseEntries(const FineOperator& fineOperator) override
|
||||
void calculateCoarseEntries(const FineOperator& fineOperator) override
|
||||
{
|
||||
const auto& fineMatrix = fineOperator.getmat();
|
||||
*coarseLevelMatrix_ = 0;
|
||||
@ -102,7 +103,7 @@ public:
|
||||
auto entryCoarse = rowCoarse->begin();
|
||||
for (auto entry = row->begin(), entryEnd = row->end(); entry != entryEnd; ++entry, ++entryCoarse) {
|
||||
assert(entry.index() == entryCoarse.index());
|
||||
double matrix_el = 0;
|
||||
Scalar matrix_el = 0;
|
||||
if (transpose) {
|
||||
const auto& bw = weights_[entry.index()];
|
||||
for (std::size_t i = 0; i < bw.size(); ++i) {
|
||||
@ -120,7 +121,7 @@ public:
|
||||
assert(rowCoarse == coarseLevelMatrix_->end());
|
||||
}
|
||||
|
||||
virtual void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
|
||||
void moveToCoarseLevel(const typename ParentType::FineRangeType& fine) override
|
||||
{
|
||||
// Set coarse vector to zero
|
||||
this->rhs_ = 0;
|
||||
@ -129,7 +130,7 @@ public:
|
||||
|
||||
for (auto block = begin; block != end; ++block) {
|
||||
const auto& bw = weights_[block.index()];
|
||||
double rhs_el = 0.0;
|
||||
Scalar rhs_el = 0.0;
|
||||
if (transpose) {
|
||||
rhs_el = (*block)[pressure_var_index_];
|
||||
} else {
|
||||
@ -143,7 +144,7 @@ public:
|
||||
this->lhs_ = 0;
|
||||
}
|
||||
|
||||
virtual void moveToFineLevel(typename ParentType::FineDomainType& fine) override
|
||||
void moveToFineLevel(typename ParentType::FineDomainType& fine) override
|
||||
{
|
||||
auto end = fine.end(), begin = fine.begin();
|
||||
|
||||
@ -159,7 +160,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
virtual PressureTransferPolicy* clone() const override
|
||||
PressureTransferPolicy* clone() const override
|
||||
{
|
||||
return new PressureTransferPolicy(*this);
|
||||
}
|
||||
|
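The Details aliases above now carry the scalar type, while CoarseOperatorType still selects the sequential or parallel coarse operator from the communication type at compile time. A small illustration of how the std::conditional_t resolves (not part of the diff):

// Illustration only: how Details::CoarseOperatorType<Scalar,Comm> resolves.
using SeqOp = Opm::Details::CoarseOperatorType<double, Dune::Amg::SequentialInformation>;
static_assert(std::is_same_v<SeqOp, Opm::Details::SeqCoarseOperatorType<double>>,
              "sequential runs use a plain MatrixAdapter over the 1x1 pressure system");

using Comm  = Dune::OwnerOverlapCopyCommunication<int,int>;
using ParOp = Opm::Details::CoarseOperatorType<double, Comm>;
static_assert(std::is_same_v<ParOp, Opm::Details::ParCoarseOperatorType<double, Comm>>,
              "parallel runs use an OverlappingSchwarzOperator with the given Comm");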
@ -52,56 +52,70 @@
|
||||
|
||||
typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Opm {
|
||||
|
||||
using Opm::Accelerator::BdaResult;
|
||||
using Opm::Accelerator::BdaSolver;
|
||||
using Opm::Accelerator::SolverStatus;
|
||||
using Accelerator::BdaResult;
|
||||
using Accelerator::BdaSolver;
|
||||
using Accelerator::SolverStatus;
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
|
||||
int linear_solver_verbosity,
|
||||
[[maybe_unused]] int maxit,
|
||||
[[maybe_unused]] double tolerance,
|
||||
[[maybe_unused]] unsigned int platformID,
|
||||
[[maybe_unused]] unsigned int deviceID,
|
||||
[[maybe_unused]] bool opencl_ilu_parallel,
|
||||
[[maybe_unused]] std::string linsolver)
|
||||
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
|
||||
template<class BridgeMatrix, class BridgeVector, int block_size>
|
||||
BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
BdaBridge(std::string accelerator_mode_,
|
||||
int linear_solver_verbosity,
|
||||
[[maybe_unused]] int maxit,
|
||||
[[maybe_unused]] Scalar tolerance,
|
||||
[[maybe_unused]] unsigned int platformID,
|
||||
[[maybe_unused]] unsigned int deviceID,
|
||||
[[maybe_unused]] bool opencl_ilu_parallel,
|
||||
[[maybe_unused]] std::string linsolver)
|
||||
: verbosity(linear_solver_verbosity)
|
||||
, accelerator_mode(accelerator_mode_)
|
||||
{
|
||||
if (accelerator_mode.compare("cusparse") == 0) {
|
||||
#if HAVE_CUDA
|
||||
use_gpu = true;
|
||||
backend.reset(new Opm::Accelerator::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
|
||||
using CU = Accelerator::cusparseSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
||||
#endif
|
||||
} else if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
use_gpu = true;
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
|
||||
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<OCL>(linear_solver_verbosity,
|
||||
maxit,
|
||||
tolerance,
|
||||
platformID,
|
||||
deviceID,
|
||||
opencl_ilu_parallel,
|
||||
linsolver);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
} else if (accelerator_mode.compare("amgcl") == 0) {
|
||||
#if HAVE_AMGCL
|
||||
use_gpu = true; // should be replaced by a 'use_bridge' boolean
|
||||
backend.reset(new Opm::Accelerator::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
|
||||
using AMGCL = Accelerator::amgclSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<AMGCL>(linear_solver_verbosity, maxit,
|
||||
tolerance, platformID, deviceID);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
|
||||
#endif
|
||||
} else if (accelerator_mode.compare("rocalution") == 0) {
|
||||
#if HAVE_ROCALUTION
|
||||
use_gpu = true; // should be replaced by a 'use_bridge' boolean
|
||||
backend.reset(new Opm::Accelerator::rocalutionSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance));
|
||||
using ROCA = Accelerator::rocalutionSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<ROCA>(linear_solver_verbosity, maxit, tolerance);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake");
|
||||
#endif
|
||||
} else if (accelerator_mode.compare("rocsparse") == 0) {
|
||||
#if HAVE_ROCSPARSE
|
||||
use_gpu = true; // should be replaced by a 'use_bridge' boolean
|
||||
backend.reset(new Opm::Accelerator::rocsparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
|
||||
using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
|
||||
tolerance, platformID, deviceID);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
|
||||
#endif
|
||||
@ -112,13 +126,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <class BridgeMatrix>
|
||||
int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::size_type>& diag_indices) {
|
||||
int replaceZeroDiagonal(BridgeMatrix& mat,
|
||||
std::vector<typename BridgeMatrix::size_type>& diag_indices)
|
||||
{
|
||||
using Scalar = typename BridgeMatrix::field_type;
|
||||
int numZeros = 0;
|
||||
const int dim = mat[0][0].N(); // might be replaced with BridgeMatrix::block_type::size()
|
||||
const double zero_replace = 1e-15;
|
||||
const Scalar zero_replace = 1e-15;
|
||||
if (diag_indices.empty()) {
|
||||
int Nb = mat.N();
|
||||
diag_indices.reserve(Nb);
|
||||
@ -134,7 +149,7 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
|
||||
}
|
||||
diag_indices.emplace_back(diag.offset());
|
||||
}
|
||||
}else{
|
||||
} else {
|
||||
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
|
||||
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
|
||||
@ -151,13 +166,15 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
|
||||
return numZeros;
|
||||
}
|
||||
|
||||
|
||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||
// sparsity pattern should stay the same
|
||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
copySparsityPatternFromISTL(const BridgeMatrix& mat,
|
||||
std::vector<int>& h_rows,
|
||||
std::vector<int>& h_cols)
|
||||
{
|
||||
h_rows.clear();
|
||||
h_cols.clear();
|
||||
|
||||
@ -172,17 +189,19 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
|
||||
OPM_THROW(std::logic_error,
|
||||
"Error size of rows do not sum to number of nonzeroes "
|
||||
"in BdaBridge::copySparsityPatternFromISTL()");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// check if the nnz values of the matrix are in contiguous memory
|
||||
// this is done by checking if the distance between the last value of the last block of row 0 and
|
||||
// the first value of the first row of row 1 is equal to 1
|
||||
// if the matrix only has 1 row, it is always contiguous
|
||||
template <class BridgeMatrix>
|
||||
void checkMemoryContiguous(const BridgeMatrix& mat) {
|
||||
void checkMemoryContiguous(const BridgeMatrix& mat)
|
||||
{
|
||||
auto block_size = mat[0][0].N();
|
||||
auto row = mat.begin();
|
||||
auto last_of_row0 = row->begin();
|
||||
@ -199,14 +218,14 @@ void checkMemoryContiguous(const BridgeMatrix& mat) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
|
||||
BridgeMatrix* jacMat,
|
||||
int numJacobiBlocks,
|
||||
BridgeVector& b,
|
||||
WellContributions& wellContribs,
|
||||
InverseOperatorResult& res)
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
solve_system(BridgeMatrix* bridgeMat,
|
||||
BridgeMatrix* jacMat,
|
||||
int numJacobiBlocks,
|
||||
BridgeVector& b,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
InverseOperatorResult& res)
|
||||
{
|
||||
if (use_gpu) {
|
||||
BdaResult result;
|
||||
@ -221,38 +240,48 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
return;
|
||||
}
|
||||
|
||||
using Mat = Accelerator::BlockedMatrix<Scalar>;
|
||||
if (!matrix) {
|
||||
h_rows.reserve(Nb+1);
|
||||
h_cols.reserve(nnzb);
|
||||
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
|
||||
checkMemoryContiguous(*bridgeMat);
|
||||
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
|
||||
matrix = std::make_unique<Mat>(Nb, nnzb, block_size,
|
||||
static_cast<Scalar*>(&(((*bridgeMat)[0][0][0][0]))),
|
||||
h_cols.data(),
|
||||
h_rows.data());
|
||||
}
|
||||
|
||||
Dune::Timer t_zeros;
|
||||
int numZeros = replaceZeroDiagonal(*bridgeMat, diagIndices);
|
||||
if (verbosity >= 2) {
|
||||
std::ostringstream out;
|
||||
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
|
||||
out << "Checking zeros took: " << t_zeros.stop() << " s, found "
|
||||
<< numZeros << " zeros";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
if (numJacobiBlocks >= 2) {
|
||||
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
|
||||
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes()
|
||||
: h_jacRows.back();
|
||||
|
||||
if (!jacMatrix) {
|
||||
h_jacRows.reserve(Nb+1);
|
||||
h_jacCols.reserve(jacNnzb);
|
||||
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
|
||||
checkMemoryContiguous(*jacMat);
|
||||
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
|
||||
jacMatrix = std::make_unique<Mat>(Nb, jacNnzb, block_size,
|
||||
static_cast<Scalar*>(&(((*jacMat)[0][0][0][0]))),
|
||||
h_jacCols.data(),
|
||||
h_jacRows.data());
|
||||
}
|
||||
|
||||
Dune::Timer t_zeros2;
|
||||
int jacNumZeros = replaceZeroDiagonal(*jacMat, jacDiagIndices);
|
||||
if (verbosity >= 2) {
|
||||
std::ostringstream out;
|
||||
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
|
||||
out << "Checking zeros for jacMat took: " << t_zeros2.stop()
|
||||
<< " s, found " << jacNumZeros << " zeros";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
@ -260,17 +289,23 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
/////////////////////////
|
||||
// actually solve
|
||||
// assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
|
||||
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
|
||||
SolverStatus status = backend->solve_system(matrix,
|
||||
static_cast<Scalar*>(&(b[0][0])),
|
||||
jacMatrix, wellContribs, result);
|
||||
|
||||
switch(status) {
|
||||
switch (status) {
|
||||
case SolverStatus::BDA_SOLVER_SUCCESS:
|
||||
//OpmLog::info("BdaSolver converged");
|
||||
break;
|
||||
case SolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
|
||||
OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
OpmLog::warning("BdaSolver could not analyse level information of matrix, "
|
||||
"perhaps there is still a 0.0 on the diagonal of a "
|
||||
"block on the diagonal");
|
||||
break;
|
||||
case SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
|
||||
OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
OpmLog::warning("BdaSolver could not create preconditioner, "
|
||||
"perhaps there is still a 0.0 on the diagonal "
|
||||
"of a block on the diagonal");
|
||||
break;
|
||||
default:
|
||||
OpmLog::warning("BdaSolver returned unknown status code");
|
||||
@ -286,21 +321,27 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unused]] BridgeVector& x) {
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
get_result([[maybe_unused]] BridgeVector& x)
|
||||
{
|
||||
if (use_gpu) {
|
||||
backend->get_result(static_cast<double*>(&(x[0][0])));
|
||||
backend->get_result(static_cast<Scalar*>(&(x[0][0])));
|
||||
}
|
||||
}
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
|
||||
[[maybe_unused]] unsigned N) {
|
||||
if(accelerator_mode.compare("opencl") == 0){
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
|
||||
initWellContributions([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
[[maybe_unused]] unsigned N)
|
||||
{
|
||||
if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
|
||||
static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
|
||||
using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
|
||||
const auto openclBackend = static_cast<const OCL*>(backend.get());
|
||||
using WCOCL = WellContributionsOCL<Scalar>;
|
||||
static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
|
||||
openclBackend->queue.get());
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
@ -309,23 +350,20 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
|
||||
}
|
||||
|
||||
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||
n>; \
|
||||
\
|
||||
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<double, n, n>, std::allocator<Dune::FieldMatrix<double, n, n> > >, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||
n>;
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(T,n) \
|
||||
template class BdaBridge<Dune::BCRSMatrix<MatrixBlock<T,n,n>>, \
|
||||
Dune::BlockVector<Dune::FieldVector<T,n>>,n>; \
|
||||
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<T,n,n>>, \
|
||||
Dune::BlockVector<Dune::FieldVector<T,n>>,n>;
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,1) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,2) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,3) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,4) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,5) \
|
||||
INSTANTIATE_BDA_FUNCTIONS(T,6)
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
} // namespace Opm
|
||||
|
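As in the other translation units, the explicit instantiations are now driven by a per-type macro; the comment in the diff notes that the tests use Dune::FieldMatrix while Flow uses Opm::MatrixBlock. A sketch of what INSTANTIATE_BDA_FUNCTIONS(double,3) expands to:

// Simplified expansion of INSTANTIATE_BDA_FUNCTIONS(double,3).
template class BdaBridge<Dune::BCRSMatrix<MatrixBlock<double,3,3>>,
                         Dune::BlockVector<Dune::FieldVector<double,3>>, 3>;
template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<double,3,3>>,
                         Dune::BlockVector<Dune::FieldVector<double,3>>, 3>;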
@@ -27,7 +27,7 @@
namespace Opm
{

class WellContributions;
template<class Scalar> class WellContributions;

typedef Dune::InverseOperatorResult InverseOperatorResult;

@ -36,12 +36,13 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
class BdaBridge
|
||||
{
|
||||
private:
|
||||
using Scalar = typename BridgeVector::field_type;
|
||||
int verbosity = 0;
|
||||
bool use_gpu = false;
|
||||
std::string accelerator_mode;
|
||||
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
|
||||
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::unique_ptr<Accelerator::BdaSolver<Scalar,block_size>> backend;
|
||||
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
|
||||
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
|
||||
std::vector<typename BridgeMatrix::size_type> diagIndices; // contains offsets of the diagonal blocks wrt start of the row, used for replaceZeroDiagonal()
|
||||
@ -57,8 +58,14 @@ public:
|
||||
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
||||
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
|
||||
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
|
||||
BdaBridge(std::string accelerator_mode, int linear_solver_verbosity, int maxit, double tolerance,
|
||||
unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
|
||||
BdaBridge(std::string accelerator_mode,
|
||||
int linear_solver_verbosity,
|
||||
int maxit,
|
||||
Scalar tolerance,
|
||||
unsigned int platformID,
|
||||
unsigned int deviceID,
|
||||
bool opencl_ilu_parallel,
|
||||
std::string linsolver);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
@ -69,7 +76,12 @@ public:
|
||||
/// \param[in] b vector b, should be of type Dune::BlockVector
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] result summary of solver result
|
||||
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
|
||||
void solve_system(BridgeMatrix* bridgeMat,
|
||||
BridgeMatrix* jacMat,
|
||||
int numJacobiBlocks,
|
||||
BridgeVector& b,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
InverseOperatorResult &result);
|
||||
|
||||
/// Get the resulting x vector
|
||||
/// \param[inout] x vector x, should be of type Dune::BlockVector
|
||||
@ -77,7 +89,8 @@ public:
|
||||
|
||||
/// Return whether the BdaBridge will use the GPU or not
|
||||
/// return whether the BdaBridge will use the GPU or not
|
||||
bool getUseGpu(){
|
||||
bool getUseGpu()
|
||||
{
|
||||
return use_gpu;
|
||||
}
|
||||
|
||||
@ -85,19 +98,21 @@ public:
|
||||
/// \param[in] mat input matrix, probably BCRSMatrix
|
||||
/// \param[out] h_rows rowpointers
|
||||
/// \param[out] h_cols columnindices
|
||||
static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
|
||||
static void copySparsityPatternFromISTL(const BridgeMatrix& mat,
|
||||
std::vector<int>& h_rows,
|
||||
std::vector<int>& h_cols);
|
||||
|
||||
/// Initialize the WellContributions object with opencl context and queue
|
||||
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
|
||||
/// \param[in] wellContribs container to hold all WellContributions
|
||||
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
|
||||
void initWellContributions(WellContributions& wellContribs, unsigned N);
|
||||
void initWellContributions(WellContributions<Scalar>& wellContribs, unsigned N);
|
||||
|
||||
/// Return the selected accelerator mode, this is input via the command-line
|
||||
std::string getAccleratorName(){
|
||||
std::string getAccleratorName()
|
||||
{
|
||||
return accelerator_mode;
|
||||
}
|
||||
|
||||
}; // end class BdaBridge
|
||||
|
||||
}
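// Minimal usage sketch (illustrative, not part of the patch): how the templated
// bridge is typically driven. The typedefs, the parameter values and the exact
// get_result() signature are assumptions for this example; only the constructor
// and solve_system() signatures are taken from the header above.
template<class M, class V, int bs>
void exampleBridgeSolve(M& A, V& b, V& x)
{
    using Scalar = typename V::field_type;
    Opm::BdaBridge<M,V,bs> bridge("opencl", /*verbosity*/ 1, /*maxit*/ 200,
                                  /*tolerance*/ Scalar(1e-2), /*platformID*/ 0,
                                  /*deviceID*/ 0, /*opencl_ilu_parallel*/ true,
                                  /*linsolver*/ "ilu0");
    auto wellContribs = Opm::WellContributions<Scalar>::create("opencl", false);
    Dune::InverseOperatorResult result;
    bridge.solve_system(&A, /*jacMat*/ nullptr, /*numJacobiBlocks*/ 0,
                        b, *wellContribs, result);
    bridge.get_result(x); // assumed to copy the solution back into x
}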
|
||||
|
@ -25,70 +25,86 @@
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace Opm {
|
||||
|
||||
class WellContributions;
|
||||
template<class Scalar> class WellContributions;
|
||||
|
||||
namespace Accelerator {
|
||||
enum class SolverStatus {
|
||||
BDA_SOLVER_SUCCESS,
|
||||
BDA_SOLVER_ANALYSIS_FAILED,
|
||||
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
|
||||
BDA_SOLVER_UNKNOWN_ERROR
|
||||
};
|
||||
|
||||
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
|
||||
/// This class is abstract, no instantiations of it can be made, only of its children
|
||||
template <unsigned int block_size>
|
||||
class BdaSolver
|
||||
{
|
||||
enum class SolverStatus {
|
||||
BDA_SOLVER_SUCCESS,
|
||||
BDA_SOLVER_ANALYSIS_FAILED,
|
||||
BDA_SOLVER_CREATE_PRECONDITIONER_FAILED,
|
||||
BDA_SOLVER_UNKNOWN_ERROR
|
||||
};
|
||||
|
||||
protected:
|
||||
/// This class serves to simplify choosing between different backend solvers, such as cusparseSolver and openclSolver
|
||||
/// This class is abstract, no instantiations of it can be made, only of its children
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class BdaSolver
|
||||
{
|
||||
protected:
|
||||
// verbosity
|
||||
// 0: print nothing during solves, only when initializing
|
||||
// 1: print number of iterations and final norm
|
||||
// 2: also print norm each iteration
|
||||
// 3: also print timings of different backend functions
|
||||
int verbosity = 0;
|
||||
|
||||
// verbosity
|
||||
// 0: print nothing during solves, only when initializing
|
||||
// 1: print number of iterations and final norm
|
||||
// 2: also print norm each iteration
|
||||
// 3: also print timings of different backend functions
|
||||
int maxit = 200;
|
||||
Scalar tolerance = 1e-2;
|
||||
|
||||
int verbosity = 0;
|
||||
int N; // number of rows
|
||||
int Nb; // number of blocked rows (Nb*block_size == N)
|
||||
int nnz; // number of nonzeroes (scalars)
|
||||
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
|
||||
|
||||
int maxit = 200;
|
||||
double tolerance = 1e-2;
|
||||
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
|
||||
unsigned int deviceID = 0; // ID of the device to be used
|
||||
|
||||
int N; // number of rows
|
||||
int Nb; // number of blocked rows (Nb*block_size == N)
|
||||
int nnz; // number of nonzeroes (scalars)
|
||||
int nnzb; // number of nonzero blocks (nnzb*block_size*block_size == nnz)
|
||||
bool initialized = false;
|
||||
|
||||
unsigned int platformID = 0; // ID of OpenCL platform to be used, only used by openclSolver now
|
||||
unsigned int deviceID = 0; // ID of the device to be used
|
||||
public:
|
||||
/// Construct a BdaSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of solver
|
||||
/// \param[in] maxit maximum number of iterations for solver
|
||||
/// \param[in] tolerance required relative tolerance for solver
|
||||
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
|
||||
/// \param[in] deviceID the device to be used
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, Scalar tolerance_)
|
||||
: verbosity(linear_solver_verbosity)
|
||||
, maxit(max_it)
|
||||
, tolerance(tolerance_)
|
||||
{}
|
||||
BdaSolver(int linear_solver_verbosity, int max_it,
|
||||
Scalar tolerance_, unsigned int deviceID_)
|
||||
: verbosity(linear_solver_verbosity)
|
||||
, maxit(max_it)
|
||||
, tolerance(tolerance_)
|
||||
, deviceID(deviceID_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it,
|
||||
double tolerance_, unsigned int platformID_,
|
||||
unsigned int deviceID_)
|
||||
: verbosity(linear_solver_verbosity)
|
||||
, maxit(max_it)
|
||||
, tolerance(tolerance_)
|
||||
, platformID(platformID_)
|
||||
, deviceID(deviceID_)
|
||||
{}
|
||||
|
||||
bool initialized = false;
|
||||
/// Define virtual destructor, so that the derived class destructor will be called
|
||||
virtual ~BdaSolver() = default;
|
||||
|
||||
public:
|
||||
/// Construct a BdaSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of solver
|
||||
/// \param[in] maxit maximum number of iterations for solver
|
||||
/// \param[in] tolerance required relative tolerance for solver
|
||||
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
|
||||
/// \param[in] deviceID the device to be used
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
||||
/// Define as pure virtual functions, so derived classes must implement them
|
||||
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) = 0;
|
||||
|
||||
/// Define virtual destructor, so that the derived class destructor will be called
|
||||
virtual ~BdaSolver() {};
|
||||
|
||||
/// Define as pure virtual functions, so derived classes must implement them
|
||||
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
|
||||
|
||||
virtual void get_result(double *x) = 0;
|
||||
|
||||
}; // end class BdaSolver
|
||||
virtual void get_result(Scalar* x) = 0;
|
||||
}; // end class BdaSolver
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
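// Minimal sketch (illustrative, not part of the patch) of what a concrete backend
// now has to provide: it derives from BdaSolver<Scalar,block_size> and implements
// the two pure virtual functions declared above. The class name "MySolverBackend"
// is hypothetical; the snippet is assumed to live inside namespace Opm::Accelerator.
template<class Scalar, unsigned int block_size>
class MySolverBackend : public BdaSolver<Scalar,block_size>
{
    using Base = BdaSolver<Scalar,block_size>;
    using Base::maxit;
    using Base::tolerance;

public:
    MySolverBackend(int verbosity, int max_it, Scalar tol)
        : Base(verbosity, max_it, tol)
    {}

    SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
                              Scalar* b,
                              std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
                              WellContributions<Scalar>& wellContribs,
                              BdaResult& res) override
    {
        // ... run the actual backend solver here ...
        return SolverStatus::BDA_SOLVER_SUCCESS;
    }

    void get_result(Scalar* x) override
    {
        // ... copy the solution into x ...
    }
};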
|
||||
|
@ -17,9 +17,6 @@
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#include <cstring>
#include <cmath>

#include <config.h>

#include <opm/common/OpmLog/OpmLog.hpp>
@ -29,16 +26,10 @@
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>

namespace Opm
namespace Opm::Accelerator {

void sortRow(int *colIndices, int *data, int left, int right)
{
namespace Accelerator
{

using Opm::OpmLog;


void sortRow(int *colIndices, int *data, int left, int right) {
    int l = left;
    int r = right;
    int middle = colIndices[(l + r) >> 1];
@ -67,14 +58,14 @@ void sortRow(int *colIndices, int *data, int left, int right) {
        sortRow(colIndices, data, l, right);
}

// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
// a = a - (b * c)
void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
template<class Scalar>
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size)
{
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            double temp = 0.0;
            Scalar temp = 0.0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += b[block_size * row + k] * c[block_size * k + col];
            }
@ -84,11 +75,12 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
    }
}

/* Perform a 3x3 matrix-matrix multiplication on two blocks */

void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
template<class Scalar>
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size)
{
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
            double temp = 0;
            Scalar temp = 0;
            for (unsigned int k = 0; k < block_size; k++) {
                temp += mat1[block_size * row + k] * mat2[block_size * k + col];
            }
@ -97,5 +89,10 @@ void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_si
    }
}

} // namespace Accelerator
} // namespace Opm
#define INSTANCE_TYPE(T) \
    template void blockMultSub(T*, T*, T*, unsigned int); \
    template void blockMult(T*, T*, T*, unsigned int);

INSTANCE_TYPE(double)

} // namespace Opm::Accelerator
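// Worked example (illustrative, not part of the patch): with block_size == 2 the
// blocks are plain row-major 2x2 matrices, so blockMult is an ordinary matrix
// product and blockMultSub subtracts such a product in place.
void blockMultExample()
{
    double b[4] = {1, 2, 3, 4};
    double c[4] = {5, 6, 7, 8};
    double r[4] = {};
    blockMult(b, c, r, 2);    // r = b*c = {19, 22, 43, 50}
    blockMultSub(r, b, c, 2); // r = r - b*c, i.e. all zeros again
}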
@ -20,44 +20,40 @@
|
||||
#ifndef OPM_BLOCKED_MATRIX_HPP
|
||||
#define OPM_BLOCKED_MATRIX_HPP
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
|
||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||
template<class Scalar>
|
||||
class BlockedMatrix
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
/// Allocate BlockedMatrix and data arrays with given sizes
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
/// \param[in] block_size the number of rows and columns for each block
|
||||
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
|
||||
: nnzValues(new double[nnzbs_*block_size_*block_size_]),
|
||||
colIndices(new int[nnzbs_*block_size_*block_size_]),
|
||||
rowPointers(new int[Nb_+1]),
|
||||
Nb(Nb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_),
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(true)
|
||||
: nnzValues(new Scalar[nnzbs_*block_size_*block_size_])
|
||||
, colIndices(new int[nnzbs_*block_size_*block_size_])
|
||||
, rowPointers(new int[Nb_+1])
|
||||
, Nb(Nb_)
|
||||
, nnzbs(nnzbs_)
|
||||
, block_size(block_size_)
|
||||
, deleteNnzs(true)
|
||||
, deleteSparsity(true)
|
||||
{}
|
||||
|
||||
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
|
||||
/// \param[in] M matrix to be copied
|
||||
BlockedMatrix(const BlockedMatrix& M)
|
||||
: nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
|
||||
colIndices(M.colIndices),
|
||||
rowPointers(M.rowPointers),
|
||||
Nb(M.Nb),
|
||||
nnzbs(M.nnzbs),
|
||||
block_size(M.block_size),
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(false)
|
||||
: nnzValues(new Scalar[M.nnzbs*M.block_size*M.block_size])
|
||||
, colIndices(M.colIndices)
|
||||
, rowPointers(M.rowPointers)
|
||||
, Nb(M.Nb)
|
||||
, nnzbs(M.nnzbs)
|
||||
, block_size(M.block_size)
|
||||
, deleteNnzs(true)
|
||||
, deleteSparsity(false)
|
||||
{}
|
||||
|
||||
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
|
||||
@ -67,18 +63,20 @@ public:
|
||||
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
|
||||
/// \param[in] colIndices array of column indices, contains nnzb entries
|
||||
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
|
||||
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
||||
: nnzValues(nnzValues_),
|
||||
colIndices(colIndices_),
|
||||
rowPointers(rowPointers_),
|
||||
Nb(Nb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_),
|
||||
deleteNnzs(false),
|
||||
deleteSparsity(false)
|
||||
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_,
|
||||
Scalar* nnzValues_, int *colIndices_, int *rowPointers_)
|
||||
: nnzValues(nnzValues_)
|
||||
, colIndices(colIndices_)
|
||||
, rowPointers(rowPointers_)
|
||||
, Nb(Nb_)
|
||||
, nnzbs(nnzbs_)
|
||||
, block_size(block_size_)
|
||||
, deleteNnzs(false)
|
||||
, deleteSparsity(false)
|
||||
{}
|
||||
|
||||
~BlockedMatrix(){
|
||||
~BlockedMatrix()
|
||||
{
|
||||
if (deleteNnzs) {
|
||||
delete[] nnzValues;
|
||||
}
|
||||
@ -88,8 +86,7 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
double *nnzValues;
|
||||
Scalar* nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
int Nb;
|
||||
@ -99,14 +96,13 @@ public:
|
||||
bool deleteSparsity;
|
||||
};
|
||||
|
||||
|
||||
/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
|
||||
/// These ints aren't actually nonzeroes, but represent a mapping used later
|
||||
/// \param[inout] colIndices represent keys in sorting
|
||||
/// \param[inout] data sorted according to the colIndices
|
||||
/// \param[in] left lower index of data of row
|
||||
/// \param[in] right upper index of data of row
|
||||
void sortRow(int *colIndices, int *data, int left, int right);
|
||||
void sortRow(int* colIndices, int* data, int left, int right);
|
||||
|
||||
/// Multiply and subtract blocks
|
||||
/// a = a - (b * c)
|
||||
@ -114,7 +110,8 @@ void sortRow(int *colIndices, int *data, int left, int right);
|
||||
/// \param[in] b input block
|
||||
/// \param[in] c input block
|
||||
/// \param[in] block_size size of block
|
||||
void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
|
||||
template<class Scalar>
|
||||
void blockMultSub(Scalar* a, Scalar* b, Scalar* c, unsigned int block_size);
|
||||
|
||||
/// Perform a matrix-matrix multiplication on two blocks
|
||||
/// resMat = mat1 * mat2
|
||||
@ -122,9 +119,9 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
|
||||
/// \param[in] mat2 input block 2
|
||||
/// \param[out] resMat output block
|
||||
/// \param[in] block_size size of block
|
||||
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
|
||||
template<class Scalar>
|
||||
void blockMult(Scalar* mat1, Scalar* mat2, Scalar* resMat, unsigned int block_size);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
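// Layout sketch (illustrative, not part of the patch): a 2x2 block matrix with
// three nonzero 2x2 blocks,
//
//     | A00  A01 |
//     |  0   A11 |
//
// is stored in a BlockedMatrix<double> as
//
//     rowPointers = {0, 2, 3}           // Nb+1 entries
//     colIndices  = {0, 1, 1}           // one entry per nonzero block
//     nnzValues   = {a00,a01,a10,a11,   // A00, row-major
//                    b00,b01,b10,b11,   // A01
//                    c00,c01,c10,c11}   // A11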
|
||||
|
@ -29,17 +29,17 @@ namespace Accelerator

/// This struct resembles a csr matrix, only doubles are supported
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
class Matrix {

template<class Scalar>
class Matrix
{
public:

    /// Allocate square Matrix and data arrays with given sizes
    /// \param[in] N number of rows
    /// \param[in] nnzs number of nonzeros
    Matrix(int N_, int nnzs_)
    : N(N_),
      M(N_),
      nnzs(nnzs_)
        : N(N_)
        , M(N_)
        , nnzs(nnzs_)
    {
        nnzValues.resize(nnzs);
        colIndices.resize(nnzs);
@ -51,12 +51,12 @@ public:
    /// \param[in] M number of columns
    /// \param[in] nnzs number of nonzeros
    Matrix(int N_, int M_, int nnzs_)
    : Matrix(N_, nnzs_)
        : Matrix(N_, nnzs_)
    {
        M = M_;
    }

    std::vector<double> nnzValues;
    std::vector<Scalar> nnzValues;
    std::vector<int> colIndices;
    std::vector<int> rowPointers;
    int N, M;
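// Minimal sketch (illustrative, not part of the patch): filling a small unblocked
// CSR matrix. Public members and the usual CSR convention for rowPointers (N+1
// entries) are assumptions for this example.
void matrixExample()
{
    Opm::Accelerator::Matrix<double> A(/*N*/ 2, /*nnzs*/ 3);
    A.nnzValues   = {4.0, 1.0, 5.0};
    A.colIndices  = {0, 1, 1};
    A.rowPointers = {0, 2, 3};
}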
@ -29,21 +29,27 @@
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
|
||||
unsigned int Mb_,
|
||||
std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
|
||||
unsigned int DnumBlocks_, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
|
||||
std::vector<double> &Cvalues)
|
||||
:
|
||||
dim(dim_), // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
dim_wells(dim_wells_), // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
M(Mb_ * dim_wells), // number of rows, M == dim_wells*Mb
|
||||
Mb(Mb_), // number of blockrows in C, D and B
|
||||
DnumBlocks(DnumBlocks_), // number of blocks in D
|
||||
template<class Scalar>
|
||||
MultisegmentWellContribution<Scalar>::
|
||||
MultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
|
||||
unsigned int Mb_,
|
||||
std::vector<Scalar>& Bvalues,
|
||||
std::vector<unsigned int>& BcolIndices,
|
||||
std::vector<unsigned int>& BrowPointers,
|
||||
unsigned int DnumBlocks_,
|
||||
Scalar* Dvalues,
|
||||
UMFPackIndex* DcolPointers,
|
||||
UMFPackIndex* DrowIndices,
|
||||
std::vector<Scalar>& Cvalues)
|
||||
: dim(dim_) // size of blockvectors in vectors x and y, equal to MultisegmentWell::numEq
|
||||
, dim_wells(dim_wells_) // size of blocks in C, B and D, equal to MultisegmentWell::numWellEq
|
||||
, M(Mb_ * dim_wells) // number of rows, M == dim_wells*Mb
|
||||
, Mb(Mb_) // number of blockrows in C, D and B
|
||||
, DnumBlocks(DnumBlocks_) // number of blocks in D
|
||||
// copy data for matrix D into vectors to prevent it going out of scope
|
||||
Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells),
|
||||
Dcols(DcolPointers, DcolPointers + M + 1),
|
||||
Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
|
||||
, Dvals(Dvalues, Dvalues + DnumBlocks * dim_wells * dim_wells)
|
||||
, Dcols(DcolPointers, DcolPointers + M + 1)
|
||||
, Drows(DrowIndices, DrowIndices + DnumBlocks * dim_wells * dim_wells)
|
||||
{
|
||||
Cvals = std::move(Cvalues);
|
||||
Bvals = std::move(Bvalues);
|
||||
@ -57,17 +63,18 @@ MultisegmentWellContribution::MultisegmentWellContribution(unsigned int dim_, un
|
||||
umfpack_di_numeric(Dcols.data(), Drows.data(), Dvals.data(), UMFPACK_Symbolic, &UMFPACK_Numeric, nullptr, nullptr);
|
||||
}
|
||||
|
||||
MultisegmentWellContribution::~MultisegmentWellContribution()
|
||||
template<class Scalar>
|
||||
MultisegmentWellContribution<Scalar>::~MultisegmentWellContribution()
|
||||
{
|
||||
umfpack_di_free_symbolic(&UMFPACK_Symbolic);
|
||||
umfpack_di_free_numeric(&UMFPACK_Numeric);
|
||||
}
|
||||
|
||||
|
||||
// Apply the MultisegmentWellContribution, similar to MultisegmentWell::apply()
|
||||
// h_x and h_y reside on host
|
||||
// y -= (C^T * (D^-1 * (B * x)))
|
||||
void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
template<class Scalar>
|
||||
void MultisegmentWellContribution<Scalar>::apply(Scalar* h_x, Scalar* h_y)
|
||||
{
|
||||
OPM_TIMEBLOCK(apply);
|
||||
// reset z1 and z2
|
||||
@ -80,7 +87,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
|
||||
unsigned int colIdx = Bcols[blockID];
|
||||
for (unsigned int j = 0; j < dim_wells; ++j) {
|
||||
double temp = 0.0;
|
||||
Scalar temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim; ++k) {
|
||||
temp += Bvals[blockID * dim * dim_wells + j * dim + k] * h_x[colIdx * dim + k];
|
||||
}
|
||||
@ -100,7 +107,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
|
||||
unsigned int colIdx = Bcols[blockID];
|
||||
for (unsigned int j = 0; j < dim; ++j) {
|
||||
double temp = 0.0;
|
||||
Scalar temp = 0.0;
|
||||
for (unsigned int k = 0; k < dim_wells; ++k) {
|
||||
temp += Cvals[blockID * dim * dim_wells + j + k * dim] * z2[row * dim_wells + k];
|
||||
}
|
||||
@ -111,11 +118,14 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
|
||||
}
|
||||
|
||||
#if HAVE_CUDA
|
||||
void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
|
||||
template<class Scalar>
|
||||
void MultisegmentWellContribution<Scalar>::setCudaStream(cudaStream_t stream_)
|
||||
{
|
||||
stream = stream_;
|
||||
}
|
||||
#endif
|
||||
|
||||
template class MultisegmentWellContribution<double>;
|
||||
|
||||
} //namespace Opm
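// Dimension sketch for apply() above (illustrative, not part of the patch), with
// dim = numEq and dim_wells = numWellEq:
//
//   z1 = B * x      : M = Mb*dim_wells entries; B has Mb block rows, each block
//                     is dim_wells x dim and multiplies dim entries of x
//   z2 = D^-1 * z1  : M entries; the sparse solve uses the UMFPACK factorization
//                     prepared in the constructor
//   y -= C^T * z2   : C stores blocks of size dim_wells x dim like B and is
//                     applied transposed, so the update lands back in the
//                     reservoir-sized vector y (dim entries per cell)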
|
||||
|
||||
|
@ -41,6 +41,7 @@ namespace Opm
/// B*x and D*B*x are a vector with M*numWellEq doubles
/// C*D*B*x is a vector with N*numEq doubles.

template<class Scalar>
class MultisegmentWellContribution
{

@ -57,15 +58,15 @@ private:
    // C and B are stored in BCRS format, D is stored in CSC format (Dune::UMFPack)
    // Sparsity pattern for C is not stored, since it is the same as B
    unsigned int DnumBlocks; // number of blocks in D
    std::vector<double> Cvals;
    std::vector<double> Dvals;
    std::vector<double> Bvals;
    std::vector<Scalar> Cvals;
    std::vector<Scalar> Dvals;
    std::vector<Scalar> Bvals;
    std::vector<int> Dcols; // Column pointers, contains M+1 entries
    std::vector<unsigned int> Bcols;
    std::vector<int> Drows; // Row indices, contains DnumBlocks*dim*dim_wells entries
    std::vector<unsigned int> Brows;
    std::vector<double> z1; // z1 = B * x
    std::vector<double> z2; // z2 = D^-1 * B * x
    std::vector<Scalar> z1; // z1 = B * x
    std::vector<Scalar> z2; // z2 = D^-1 * B * x
    void *UMFPACK_Symbolic, *UMFPACK_Numeric;

    /// Translate the columnIndex if needed
@ -97,9 +98,14 @@ public:
    /// \param[in] Cvalues nonzero values of matrix C
    MultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
                                 unsigned int Mb,
                                 std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
                                 unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers,
                                 UMFPackIndex *DrowIndices, std::vector<double> &Cvalues);
                                 std::vector<Scalar>& Bvalues,
                                 std::vector<unsigned int>& BcolIndices,
                                 std::vector<unsigned int>& BrowPointers,
                                 unsigned int DnumBlocks,
                                 Scalar* Dvalues,
                                 UMFPackIndex* DcolPointers,
                                 UMFPackIndex* DrowIndices,
                                 std::vector<Scalar>& Cvalues);

    /// Destroy a MultisegmentWellContribution, and free memory
    ~MultisegmentWellContribution();
@ -108,7 +114,7 @@ public:
    /// performs y -= (C^T * (D^-1 * (B*x))) for MultisegmentWell
    /// \param[in] h_x vector x, must be on CPU
    /// \param[inout] h_y vector y, must be on CPU
    void apply(double *h_x, double *h_y);
    void apply(Scalar* h_x, Scalar* h_y);
};

} //namespace Opm
@ -39,35 +39,36 @@
|
||||
|
||||
namespace Opm {
|
||||
|
||||
WellContributions::~WellContributions() = default;
|
||||
template<class Scalar>
|
||||
WellContributions<Scalar>::~WellContributions() = default;
|
||||
|
||||
std::unique_ptr<WellContributions>
|
||||
WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
|
||||
template<class Scalar>
|
||||
std::unique_ptr<WellContributions<Scalar>>
|
||||
WellContributions<Scalar>::create(const std::string& accelerator_mode, bool useWellConn)
|
||||
{
|
||||
if(accelerator_mode.compare("cusparse") == 0){
|
||||
if (accelerator_mode.compare("cusparse") == 0) {
|
||||
#if HAVE_CUDA
|
||||
return std::make_unique<WellContributionsCuda>();
|
||||
return std::make_unique<WellContributionsCuda<Scalar>>();
|
||||
#else
|
||||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
|
||||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
|
||||
#endif
|
||||
}
|
||||
else if(accelerator_mode.compare("opencl") == 0){
|
||||
else if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
return std::make_unique<WellContributionsOCL>();
|
||||
return std::make_unique<WellContributionsOCL<Scalar>>();
|
||||
#else
|
||||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
|
||||
#endif
|
||||
}
|
||||
else if(accelerator_mode.compare("rocsparse") == 0){
|
||||
else if (accelerator_mode.compare("rocsparse") == 0) {
|
||||
if (!useWellConn) {
|
||||
#if HAVE_ROCSPARSE
|
||||
return std::make_unique<WellContributionsRocsparse>();
|
||||
return std::make_unique<WellContributionsRocsparse<Scalar>>();
|
||||
#else
|
||||
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
|
||||
#endif
|
||||
}
|
||||
return std::make_unique<WellContributions>();
|
||||
|
||||
}
|
||||
else if(accelerator_mode.compare("amgcl") == 0){
|
||||
if (!useWellConn) {
|
||||
@ -86,10 +87,12 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
|
||||
[[maybe_unused]] int* colIndices,
|
||||
[[maybe_unused]] double* values,
|
||||
[[maybe_unused]] unsigned int val_size)
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::
|
||||
addMatrix([[maybe_unused]] MatrixType type,
|
||||
[[maybe_unused]] int* colIndices,
|
||||
[[maybe_unused]] Scalar* values,
|
||||
[[maybe_unused]] unsigned int val_size)
|
||||
{
|
||||
#if !HAVE_CUDA && !HAVE_OPENCL
|
||||
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
|
||||
@ -107,7 +110,8 @@ void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
{
|
||||
dim = dim_;
|
||||
dim_wells = dim_wells_;
|
||||
@ -121,11 +125,14 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::setVectorSize(unsigned N_) {
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::setVectorSize(unsigned N_)
|
||||
{
|
||||
N = N_;
|
||||
}
|
||||
|
||||
void WellContributions::addNumBlocks(unsigned int numBlocks)
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::addNumBlocks(unsigned int numBlocks)
|
||||
{
|
||||
if (allocated) {
|
||||
OPM_THROW(std::logic_error, "Error cannot add more sizes after allocated in WellContributions");
|
||||
@ -134,7 +141,8 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
|
||||
num_std_wells++;
|
||||
}
|
||||
|
||||
void WellContributions::alloc()
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::alloc()
|
||||
{
|
||||
if (num_std_wells > 0) {
|
||||
val_pointers.resize(num_std_wells+1);
|
||||
@ -144,31 +152,36 @@ void WellContributions::alloc()
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::addMultisegmentWellContribution(unsigned int dim_,
|
||||
unsigned int dim_wells_,
|
||||
unsigned int Mb,
|
||||
std::vector<double>& Bvalues,
|
||||
std::vector<unsigned int>& BcolIndices,
|
||||
std::vector<unsigned int>& BrowPointers,
|
||||
unsigned int DnumBlocks,
|
||||
double* Dvalues,
|
||||
UMFPackIndex* DcolPointers,
|
||||
UMFPackIndex* DrowIndices,
|
||||
std::vector<double>& Cvalues)
|
||||
template<class Scalar>
|
||||
void WellContributions<Scalar>::
|
||||
addMultisegmentWellContribution(unsigned int dim_,
|
||||
unsigned int dim_wells_,
|
||||
unsigned int Mb,
|
||||
std::vector<Scalar>& Bvalues,
|
||||
std::vector<unsigned int>& BcolIndices,
|
||||
std::vector<unsigned int>& BrowPointers,
|
||||
unsigned int DnumBlocks,
|
||||
Scalar* Dvalues,
|
||||
UMFPackIndex* DcolPointers,
|
||||
UMFPackIndex* DrowIndices,
|
||||
std::vector<Scalar>& Cvalues)
|
||||
{
|
||||
assert(dim==dim_);
|
||||
multisegments.push_back(std::make_unique<MultisegmentWellContribution>(dim_,
|
||||
dim_wells_,
|
||||
Mb,
|
||||
Bvalues,
|
||||
BcolIndices,
|
||||
BrowPointers,
|
||||
DnumBlocks,
|
||||
Dvalues,
|
||||
DcolPointers,
|
||||
DrowIndices,
|
||||
Cvalues));
|
||||
using MSW = MultisegmentWellContribution<Scalar>;
|
||||
multisegments.push_back(std::make_unique<MSW>(dim_,
|
||||
dim_wells_,
|
||||
Mb,
|
||||
Bvalues,
|
||||
BcolIndices,
|
||||
BrowPointers,
|
||||
DnumBlocks,
|
||||
Dvalues,
|
||||
DcolPointers,
|
||||
DrowIndices,
|
||||
Cvalues));
|
||||
++num_ms_wells;
|
||||
}
|
||||
|
||||
template class WellContributions<double>;
|
||||
|
||||
} //namespace Opm
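// Usage sketch (illustrative, not part of the patch): how well data is typically
// assembled before a solve. MatrixType is assumed to be a nested public enum with
// enumerators C, D and B (only MatrixType::C appears verbatim above); the call
// order follows the functions shown in this file.
void wellContributionsExample(int* colsC, double* valsC, unsigned int nBlocksWell)
{
    auto wells = Opm::WellContributions<double>::create("opencl", /*useWellConn*/ false);
    wells->setBlockSize(/*dim*/ 3, /*dim_wells*/ 4);
    wells->setVectorSize(/*N*/ 3 * 1000);
    wells->addNumBlocks(nBlocksWell); // once per standard well, before alloc()
    wells->alloc();                   // allocate after all sizes are known
    wells->addMatrix(Opm::WellContributions<double>::MatrixType::C,
                     colsC, valsC, nBlocksWell);
    // ... likewise for MatrixType::B and MatrixType::D ...
}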
|
||||
|
@ -30,7 +30,7 @@

namespace Opm {

class MultisegmentWellContribution;
template<class Scalar> class MultisegmentWellContribution;

/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver or openclSolver.
/// If the --matrix-add-well-contributions commandline parameter is true, this class should still be used, but be empty.
@ -48,6 +48,7 @@ class MultisegmentWellContribution;
/// - get total size of all wellcontributions that must be stored here
/// - allocate memory
/// - copy data of wellcontributions
template<class Scalar>
class WellContributions
{
public:
@ -74,7 +75,7 @@ protected:
    unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
    std::vector<unsigned int> val_pointers; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols

    std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;
    std::vector<std::unique_ptr<MultisegmentWellContribution<Scalar>>> multisegments;

public:
    unsigned int getNumWells(){
@ -105,7 +106,7 @@ public:
    /// \param[in] colIndices columnindices of blocks in C or B, ignored for D
    /// \param[in] values array of nonzeroes
    /// \param[in] val_size number of blocks in C or B, ignored for D
    void addMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size);
    void addMatrix(MatrixType type, int* colIndices, Scalar* values, unsigned int val_size);

    /// Add a MultisegmentWellContribution, actually creates an object on heap that is destroyed in the destructor
    /// Matrices C and B are passed in Blocked CSR, matrix D in CSC
@ -120,19 +121,25 @@ public:
    /// \param[in] DcolPointers columnpointers of matrix D
    /// \param[in] DrowIndices rowindices of matrix D
    /// \param[in] Cvalues nonzero values of matrix C
    void addMultisegmentWellContribution(unsigned int dim, unsigned int dim_wells,
    void addMultisegmentWellContribution(unsigned int dim,
                                         unsigned int dim_wells,
                                         unsigned int Mb,
                                         std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
                                         unsigned int DnumBlocks, double *Dvalues,
                                         UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
                                         std::vector<double> &Cvalues);
                                         std::vector<Scalar>& Bvalues,
                                         std::vector<unsigned int>& BcolIndices,
                                         std::vector<unsigned int>& BrowPointers,
                                         unsigned int DnumBlocks,
                                         Scalar* Dvalues,
                                         UMFPackIndex* DcolPointers,
                                         UMFPackIndex* DrowIndices,
                                         std::vector<Scalar>& Cvalues);
protected:
    //! \brief API specific allocation.
    virtual void APIalloc() {}

    /// Api specific upload of matrix.
    virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
    virtual void APIaddMatrix(MatrixType, int*, Scalar*, unsigned int) {}
};

} //namespace Opm

#endif
@ -46,36 +46,35 @@
|
||||
#include <tuple>
|
||||
#include <vector>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
amgclSolverBackend<block_size>::amgclSolverBackend(const int verbosity_,
|
||||
const int maxit_,
|
||||
const double tolerance_,
|
||||
const unsigned int platformID_,
|
||||
const unsigned int deviceID_)
|
||||
: BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
amgclSolverBackend<Scalar,block_size>::
|
||||
amgclSolverBackend(const int verbosity_,
|
||||
const int maxit_,
|
||||
const Scalar tolerance_,
|
||||
const unsigned int platformID_,
|
||||
const unsigned int deviceID_)
|
||||
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
|
||||
{}
|
||||
|
||||
template <unsigned int block_size>
|
||||
amgclSolverBackend<block_size>::~amgclSolverBackend() {}
|
||||
template<class Scalar, unsigned int block_size>
|
||||
amgclSolverBackend<Scalar,block_size>::~amgclSolverBackend()
|
||||
{}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void amgclSolverBackend<Scalar,block_size>::initialize(int Nb_, int nnzbs)
|
||||
{
|
||||
this->Nb = Nb_;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = nnzbs;
|
||||
this->nnz = nnzbs * block_size * block_size;
|
||||
|
||||
std::ostringstream out;
|
||||
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
|
||||
out << "Initializing amgclSolverBackend, matrix size: " << Nb
|
||||
<< " blockrows, nnzb: " << nnzb << " blocks\n";
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
out << "DeviceID: " << deviceID << "\n";
|
||||
OpmLog::info(out.str());
|
||||
@ -118,7 +117,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
|
||||
prm.put("solver.maxiter", t3);
|
||||
bool t4 = prm.get("solver.verbose", verbosity >= 2);
|
||||
prm.put("solver.verbose", t4);
|
||||
out << "Using parameters from " << filename << " (with default values for omitted parameters):\n";
|
||||
out << "Using parameters from " << filename
|
||||
<< " (with default values for omitted parameters):\n";
|
||||
} else { // otherwise use default parameters, same as Dune
|
||||
prm.put("backend_type", "cpu"); // put it in the tree so it gets printed
|
||||
prm.put("precond.class", "relaxation");
|
||||
@ -142,7 +142,8 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
|
||||
} else if (backend_type_string == "vexcl") {
|
||||
backend_type = Amgcl_backend_type::vexcl;
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
|
||||
OPM_THROW(std::logic_error,
|
||||
"Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
|
||||
}
|
||||
|
||||
if (backend_type == Amgcl_backend_type::cuda) {
|
||||
@ -160,9 +161,10 @@ void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *cols) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void amgclSolverBackend<Scalar,block_size>::
|
||||
convert_sparsity_pattern(int* rows, int* cols)
|
||||
{
|
||||
Timer t;
|
||||
const unsigned int bs = block_size;
|
||||
int idx = 0; // indicates the unblocked write index
|
||||
@ -189,9 +191,10 @@ void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *co
|
||||
}
|
||||
} // end convert_sparsity_pattern()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void amgclSolverBackend<Scalar,block_size>::
|
||||
convert_data(Scalar* vals, int* rows)
|
||||
{
|
||||
Timer t;
|
||||
const unsigned int bs = block_size;
|
||||
int idx = 0; // indicates the unblocked write index
|
||||
@ -217,7 +220,9 @@ void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
|
||||
} // end convert_data()
|
||||
|
||||
#if HAVE_VEXCL
|
||||
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformID, unsigned int deviceID) {
|
||||
void initialize_vexcl(std::vector<cl::CommandQueue>& ctx,
|
||||
unsigned int platformID, unsigned int deviceID)
|
||||
{
|
||||
std::vector<cl::Platform> platforms;
|
||||
std::vector<cl::Device> devices;
|
||||
cl::Platform::get(&platforms);
|
||||
@ -245,19 +250,20 @@ void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformI
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
template <typename vexcl_matrix_type, typename vexcl_vector_type, unsigned int block_size, typename AIJInfo>
|
||||
void solve_vexcl(
|
||||
const AIJInfo& A,
|
||||
const boost::property_tree::ptree prm,
|
||||
const std::vector<cl::CommandQueue>& ctx,
|
||||
double *b,
|
||||
std::vector<double>& x,
|
||||
const int N,
|
||||
int& iters,
|
||||
double& error)
|
||||
template <typename vexcl_matrix_type, typename vexcl_vector_type,
|
||||
unsigned int block_size, typename Scalar, typename AIJInfo>
|
||||
void solve_vexcl(const AIJInfo& A,
|
||||
const boost::property_tree::ptree prm,
|
||||
const std::vector<cl::CommandQueue>& ctx,
|
||||
Scalar* b,
|
||||
std::vector<Scalar>& x,
|
||||
const int N,
|
||||
int& iters,
|
||||
Scalar& error)
|
||||
{
|
||||
typedef amgcl::backend::vexcl<vexcl_matrix_type> Backend;
|
||||
typedef amgcl::make_solver<amgcl::runtime::preconditioner<Backend>, amgcl::runtime::solver::wrapper<Backend> > Solver;
|
||||
using Backend = amgcl::backend::vexcl<vexcl_matrix_type>;
|
||||
using Solver = amgcl::make_solver<amgcl::runtime::preconditioner<Backend>,
|
||||
amgcl::runtime::solver::wrapper<Backend>>;
|
||||
|
||||
typename Solver::backend_params bprm;
|
||||
bprm.q = ctx; // set vexcl context
|
||||
@ -275,8 +281,10 @@ void solve_vexcl(
|
||||
}
|
||||
#endif
|
||||
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void amgclSolverBackend<Scalar,block_size>::
|
||||
solve_system(Scalar* b, BdaResult& res)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
try {
|
||||
@ -306,7 +314,7 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
|
||||
// reset x vector
|
||||
std::fill(x.begin(), x.end(), 0.0);
|
||||
|
||||
std::vector<double> b_(b, b + N);
|
||||
std::vector<Scalar> b_(b, b + N);
|
||||
|
||||
// create numa vectors
|
||||
typename CPU_Backend::params bprm;
|
||||
@ -349,10 +357,11 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
|
||||
if constexpr(block_size == 1){
|
||||
auto A = std::tie(N, A_rows, A_cols, A_vals);
|
||||
|
||||
solve_vexcl<double, double, block_size>(A, prm, ctx, b, x, N, iters, error);
|
||||
solve_vexcl<Scalar, Scalar, block_size>(A, prm, ctx, b, x, N, iters, error);
|
||||
} else {
|
||||
// allow vexcl to use blocked matrices
|
||||
vex::scoped_program_header h1(ctx, amgcl::backend::vexcl_static_matrix_declaration<double, block_size>());
|
||||
vex::scoped_program_header h1(ctx,
|
||||
amgcl::backend::vexcl_static_matrix_declaration<Scalar, block_size>());
|
||||
|
||||
auto Atmp = std::tie(N, A_rows, A_cols, A_vals);
|
||||
auto A = amgcl::adapter::block_matrix<dmat_type>(Atmp);
|
||||
@ -375,8 +384,8 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
|
||||
|
||||
if (verbosity >= 1) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
|
||||
out << "=== converged: " << res.converged << ", time: " << res.elapsed
|
||||
<< ", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
@ -384,14 +393,13 @@ void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
|
||||
out << "amgclSolverBackend::solve_system(): " << time_elapsed << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::get_result(double *x_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void amgclSolverBackend<Scalar,block_size>::get_result(Scalar* x_)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
std::copy(x.begin(), x.end(), x_);
|
||||
@ -403,13 +411,13 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
[[maybe_unused]] WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
SolverStatus amgclSolverBackend<Scalar,block_size>::
|
||||
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(matrix->Nb, matrix->nnzbs);
|
||||
@ -420,15 +428,14 @@ SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<Blocke
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class amgclSolverBackend<1>; \
|
||||
template class amgclSolverBackend<2>; \
|
||||
template class amgclSolverBackend<3>; \
|
||||
template class amgclSolverBackend<4>; \
|
||||
template class amgclSolverBackend<5>; \
|
||||
template class amgclSolverBackend<6>;
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class amgclSolverBackend<T,1>; \
|
||||
template class amgclSolverBackend<T,2>; \
|
||||
template class amgclSolverBackend<T,3>; \
|
||||
template class amgclSolverBackend<T,4>; \
|
||||
template class amgclSolverBackend<T,5>; \
|
||||
template class amgclSolverBackend<T,6>;
|
||||
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
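// Illustrative sketch (not part of the patch) of the property tree that
// initialize() fills when no parameter file is supplied; the keys and default
// values mirror the lines referenced above, and a parameter file can override
// them through the same tree.
void amgclDefaultParamsExample()
{
    boost::property_tree::ptree prm;
    prm.put("backend_type", "cpu");         // cpu, cuda or vexcl
    prm.put("precond.class", "relaxation");
    prm.put("solver.maxiter", 200);         // BdaSolver default maxit
    prm.put("solver.verbose", false);       // true when verbosity >= 2
}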
|
||||
|
@ -41,17 +41,14 @@
|
||||
#include <type_traits>
|
||||
#include <vector>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
|
||||
/// Note amgcl also implements blocked solvers, but looks like it needs unblocked input data
|
||||
template <unsigned int block_size>
|
||||
class amgclSolverBackend : public BdaSolver<block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class amgclSolverBackend : public BdaSolver<Scalar,block_size>
|
||||
{
|
||||
typedef BdaSolver<block_size> Base;
|
||||
using Base = BdaSolver<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -64,17 +61,16 @@ class amgclSolverBackend : public BdaSolver<block_size>
|
||||
using Base::tolerance;
|
||||
using Base::initialized;
|
||||
|
||||
using dmat_type = amgcl::static_matrix<double, block_size, block_size>; // matrix value type in double precision
|
||||
using dvec_type = amgcl::static_matrix<double, block_size, 1>; // the corresponding vector value type
|
||||
using dmat_type = amgcl::static_matrix<Scalar, block_size, block_size>; // matrix value type
|
||||
using dvec_type = amgcl::static_matrix<Scalar, block_size, 1>; // the corresponding vector value type
|
||||
using CPU_Backend = std::conditional_t<block_size == 1,
|
||||
amgcl::backend::builtin<double>,
|
||||
amgcl::backend::builtin<Scalar>,
|
||||
amgcl::backend::builtin<dmat_type>>;
|
||||
|
||||
using CPU_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CPU_Backend>,
|
||||
amgcl::runtime::solver::wrapper<CPU_Backend>>;
|
||||
|
||||
private:
|
||||
|
||||
// amgcl can use different backends, this lets the user choose
|
||||
enum Amgcl_backend_type {
|
||||
cpu,
|
||||
@ -84,18 +80,18 @@ private:
|
||||
|
||||
// store matrix in CSR format
|
||||
std::vector<unsigned> A_rows, A_cols;
|
||||
std::vector<double> A_vals, rhs;
|
||||
std::vector<double> x;
|
||||
std::vector<Scalar> A_vals, rhs;
|
||||
std::vector<Scalar> x;
|
||||
std::once_flag print_info;
|
||||
Amgcl_backend_type backend_type = cpu;
|
||||
|
||||
boost::property_tree::ptree prm; // amgcl parameters
|
||||
int iters = 0;
|
||||
double error = 0.0;
|
||||
Scalar error = 0.0;
|
||||
|
||||
#if HAVE_CUDA
|
||||
std::once_flag cuda_initialize;
|
||||
void solve_cuda(double *b);
|
||||
void solve_cuda(Scalar* b);
|
||||
#endif
|
||||
|
||||
#if HAVE_VEXCL
|
||||
@ -114,21 +110,23 @@ private:
|
||||
/// Convert the BCSR nonzero data to a CSR format
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
void convert_data(double *vals, int *rows);
|
||||
void convert_data(Scalar* vals, int* rows);
|
||||
|
||||
/// Solve linear system
|
||||
/// \param[in] b pointer to b vector
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(double *b, BdaResult &res);
|
||||
void solve_system(Scalar* b, BdaResult& res);
|
||||
|
||||
public:
|
||||
/// Construct a openclSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of openclSolver
|
||||
/// \param[in] maxit maximum number of iterations for openclSolver
|
||||
/// \param[in] tolerance required relative tolerance for openclSolver
|
||||
/// Construct an amgcl solver
|
||||
/// \param[in] linear_solver_verbosity verbosity of amgclSolver
|
||||
/// \param[in] maxit maximum number of iterations for amgclSolver
|
||||
/// \param[in] tolerance required relative tolerance for amgclSolver
|
||||
/// \param[in] platformID the OpenCL platform to be used
|
||||
/// \param[in] deviceID the device to be used
|
||||
amgclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
|
||||
amgclSolverBackend(int linear_solver_verbosity, int maxit,
|
||||
Scalar tolerance, unsigned int platformID,
|
||||
unsigned int deviceID);
|
||||
|
||||
/// Destroy a openclSolver, and free memory
|
||||
~amgclSolverBackend();
|
||||
@ -140,18 +138,18 @@ public:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) override;
|
||||
|
||||
/// Get result after linear solve, and perform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
void get_result(Scalar* x) override;
|
||||
|
||||
}; // end class amgclSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -28,18 +28,14 @@

/// This file is only compiled when both amgcl and CUDA are found by CMake

namespace Opm
namespace Opm::Accelerator {

template<class Scalar, unsigned int block_size>
void amgclSolverBackend<Scalar,block_size>::solve_cuda(Scalar* b)
{
namespace Accelerator
{

using Opm::OpmLog;


template <unsigned int block_size>
void amgclSolverBackend<block_size>::solve_cuda(double *b) {
    typedef amgcl::backend::cuda<double> CUDA_Backend;
    typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
    using CUDA_Backend = amgcl::backend::cuda<Scalar>;
    using CUDA_Solver = amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>,
                                           amgcl::runtime::solver::wrapper<CUDA_Backend>>;

    static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle

@ -67,8 +63,8 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
        OpmLog::info(out.str());
    });

    thrust::device_vector<double> B(b, b + N);
    thrust::device_vector<double> X(N, 0.0);
    thrust::device_vector<Scalar> B(b, b + N);
    thrust::device_vector<Scalar> X(N, 0.0);

    // actually solve
    std::tie(iters, error) = solve(B, X);
@ -76,19 +72,15 @@ void amgclSolverBackend<block_size>::solve_cuda(double *b) {
    thrust::copy(X.begin(), X.end(), x.begin());
}

#define INSTANTIATE_TYPE(T)                                  \
    template void amgclSolverBackend<T,1>::solve_cuda(T*);   \
    template void amgclSolverBackend<T,2>::solve_cuda(T*);   \
    template void amgclSolverBackend<T,3>::solve_cuda(T*);   \
    template void amgclSolverBackend<T,4>::solve_cuda(T*);   \
    template void amgclSolverBackend<T,5>::solve_cuda(T*);   \
    template void amgclSolverBackend<T,6>::solve_cuda(T*);

#define INSTANTIATE_BDA_FUNCTIONS(n) \
    template void amgclSolverBackend<n>::solve_cuda(double*); \
INSTANTIATE_TYPE(double)

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
|
@ -33,18 +33,17 @@ namespace Opm
|
||||
{
|
||||
|
||||
// apply WellContributions using y -= C^T * (D^-1 * (B * x))
|
||||
__global__ void apply_well_contributions(
|
||||
const double * __restrict__ Cnnzs,
|
||||
const double * __restrict__ Dnnzs,
|
||||
const double * __restrict__ Bnnzs,
|
||||
const int * __restrict__ Ccols,
|
||||
const int * __restrict__ Bcols,
|
||||
const double * __restrict__ x,
|
||||
double * __restrict__ y,
|
||||
const int dim,
|
||||
const int dim_wells,
|
||||
const unsigned int * __restrict__ val_pointers
|
||||
)
|
||||
template<class Scalar>
|
||||
__global__ void apply_well_contributions(const Scalar* __restrict__ Cnnzs,
|
||||
const Scalar* __restrict__ Dnnzs,
|
||||
const Scalar* __restrict__ Bnnzs,
|
||||
const int* __restrict__ Ccols,
|
||||
const int* __restrict__ Bcols,
|
||||
const Scalar* __restrict__ x,
|
||||
Scalar* __restrict__ y,
|
||||
const int dim,
|
||||
const int dim_wells,
|
||||
const unsigned int * __restrict__ val_pointers)
|
||||
{
|
||||
const int idx_b = blockIdx.x;
|
||||
const int idx_t = threadIdx.x;
|
||||
@ -57,9 +56,9 @@ __global__ void apply_well_contributions(
|
||||
const int c = lane % dim; // col in block
|
||||
const int r = (lane / dim) % dim_wells; // row in block
|
||||
|
||||
extern __shared__ double smem[];
|
||||
double * __restrict__ z1 = smem;
|
||||
double * __restrict__ z2 = z1 + dim_wells;
|
||||
extern __shared__ unsigned char smem[];
|
||||
Scalar* __restrict__ z1 = reinterpret_cast<Scalar*>(smem);
|
||||
Scalar* __restrict__ z2 = z1 + dim_wells;
|
||||
|
||||
if (idx_t < dim_wells) {
|
||||
z1[idx_t] = 0.0;
|
||||
@ -70,7 +69,7 @@ __global__ void apply_well_contributions(
|
||||
// z1 = B * x
|
||||
if (idx_t < num_active_threads) {
|
||||
// multiply all blocks with x
|
||||
double temp = 0.0;
|
||||
Scalar temp = 0.0;
|
||||
int b = idx_t / vals_per_block + val_pointers[idx_b]; // block id, val_size indicates number of blocks
|
||||
while (b < val_size + val_pointers[idx_b]) {
|
||||
int colIdx = Bcols[b];
|
||||
@ -106,7 +105,7 @@ __global__ void apply_well_contributions(
|
||||
|
||||
// z2 = D^-1 * B * x = D^-1 * z1
|
||||
if (idx_t < dim_wells) {
|
||||
double temp = 0.0;
|
||||
Scalar temp = 0.0;
|
||||
for (int c = 0; c < dim_wells; ++c) {
|
||||
temp += Dnnzs[idx_b * dim_wells * dim_wells + idx_t * dim_wells + c] * z1[c];
|
||||
}
|
||||
@ -118,7 +117,7 @@ __global__ void apply_well_contributions(
|
||||
// y -= C^T * D^-1 * B * x
|
||||
// use dim * val_size threads, each block is assigned 'dim' threads
|
||||
if (idx_t < dim * val_size) {
|
||||
double temp = 0.0;
|
||||
Scalar temp = 0.0;
|
||||
int b = idx_t / dim + val_pointers[idx_b];
|
||||
int cc = idx_t % dim;
|
||||
int colIdx = Ccols[b];
|
||||
@ -127,13 +126,13 @@ __global__ void apply_well_contributions(
|
||||
}
|
||||
y[colIdx * dim + cc] -= temp;
|
||||
}
|
||||
|
||||
}

WellContributionsCuda::~WellContributionsCuda()
template<class Scalar>
WellContributionsCuda<Scalar>::~WellContributionsCuda()
{
// delete data for StandardWell
if (num_std_wells > 0) {
if (this->num_std_wells > 0) {
cudaFree(d_Cnnzs);
cudaFree(d_Dnnzs);
cudaFree(d_Bnnzs);
@ -142,80 +141,108 @@ WellContributionsCuda::~WellContributionsCuda()
cudaFree(d_val_pointers);
}

if (num_ms_wells > 0 && h_x) {
if (this->num_ms_wells > 0 && h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
}
}

void WellContributionsCuda::APIalloc()
template<class Scalar>
void WellContributionsCuda<Scalar>::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaMalloc((void**)&d_Cnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Dnnzs,
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
cudaMalloc((void**)&d_Bnnzs,
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * this->num_blocks);
cudaMalloc((void**)&this->d_val_pointers, sizeof(unsigned int) * (this->num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}

// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributionsCuda::apply(double *d_x, double *d_y)
template<class Scalar>
void WellContributionsCuda<Scalar>::apply(Scalar* d_x, Scalar* d_y)
{
// apply MultisegmentWells

// make sure the stream is empty if timing measurements are done
cudaStreamSynchronize(stream);

if (num_ms_wells > 0) {
if (this->num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x == nullptr) {
cudaMallocHost(&h_x, sizeof(double) * N);
cudaMallocHost(&h_y, sizeof(double) * N);
cudaMallocHost(&h_x, sizeof(Scalar) * this->N);
cudaMallocHost(&h_y, sizeof(Scalar) * this->N);
}

// copy vectors x and y from GPU to CPU
cudaMemcpyAsync(h_x, d_x, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(double) * N, cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_x, d_x, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(h_y, d_y, sizeof(Scalar) * this->N,
cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);

// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x, h_y);
}

// copy vector y from CPU to GPU
cudaMemcpyAsync(d_y, h_y, sizeof(double) * N, cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_y, h_y, sizeof(Scalar) * this->N,
cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);
}

// apply StandardWells
if (num_std_wells > 0) {
int smem_size = 2 * sizeof(double) * dim_wells;
apply_well_contributions <<< num_std_wells, 32, smem_size, stream>>>(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_x, d_y, dim, dim_wells, d_val_pointers);
if (this->num_std_wells > 0) {
int smem_size = 2 * sizeof(Scalar) * this->dim_wells;
apply_well_contributions <<< this->num_std_wells, 32, smem_size, stream>>>(d_Cnnzs,
d_Dnnzs,
d_Bnnzs,
d_Ccols,
d_Bcols,
d_x,
d_y,
this->dim,
this->dim_wells,
this->d_val_pointers);
}
}
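The comment on apply() describes the per-well update y -= C^T * (D^-1 * (B * x)). A small CPU reference of that arithmetic for a single standard well, with the blocked sparse storage flattened into dense little matrices and all names hypothetical, is sketched below; it only illustrates the operation, not the real data layout.

// illustrative CPU reference for one well: y -= C^T * (D^-1 * (B * x))
#include <vector>

void apply_single_well(const std::vector<double>& C,     // dim_wells x n, row-major
                       const std::vector<double>& Dinv,  // dim_wells x dim_wells, already inverted
                       const std::vector<double>& B,     // dim_wells x n, row-major
                       const std::vector<double>& x,     // n
                       std::vector<double>& y,           // n, updated in place
                       int dim_wells, int n)
{
    std::vector<double> z1(dim_wells, 0.0), z2(dim_wells, 0.0);
    for (int r = 0; r < dim_wells; ++r)                  // z1 = B * x
        for (int c = 0; c < n; ++c)
            z1[r] += B[r * n + c] * x[c];
    for (int r = 0; r < dim_wells; ++r)                  // z2 = D^-1 * z1
        for (int c = 0; c < dim_wells; ++c)
            z2[r] += Dinv[r * dim_wells + c] * z1[c];
    for (int c = 0; c < n; ++c)                          // y -= C^T * z2
        for (int r = 0; r < dim_wells; ++r)
            y[c] -= C[r * n + c] * z2[r];
}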


void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
template<class Scalar>
void WellContributionsCuda<Scalar>::APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_Cnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Ccols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
break;
case MatrixType::D:
cudaMemcpy(d_Dnnzs + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(double) * dim_wells * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Dnnzs + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(Scalar) * this->dim_wells * this->dim_wells,
cudaMemcpyHostToDevice);
break;
case MatrixType::B:
cudaMemcpy(d_Bnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + num_blocks_so_far, colIndices, sizeof(int) * val_size, cudaMemcpyHostToDevice);
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
cudaMemcpy(d_val_pointers, val_pointers.data(), sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
cudaMemcpy(d_Bnnzs + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(Scalar) * val_size * this->dim * this->dim_wells,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Bcols + this->num_blocks_so_far, colIndices,
sizeof(int) * val_size, cudaMemcpyHostToDevice);
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
cudaMemcpy(d_val_pointers, this->val_pointers.data(),
sizeof(unsigned int) * (this->num_std_wells + 1),
cudaMemcpyHostToDevice);
}
break;
default:
@ -224,13 +251,16 @@ void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, doubl
cudaCheckLastError("WellContributions::addMatrix() failed");
}

void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
template<class Scalar>
void WellContributionsCuda<Scalar>::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->setCudaStream(stream_);
}
}

template class WellContributionsCuda<double>;

} //namespace Opm

@ -25,10 +25,10 @@

#include <cuda_runtime.h>


namespace Opm
{
namespace Opm {

class WellContributionsCuda : public WellContributions
template<class Scalar>
class WellContributionsCuda : public WellContributions<Scalar>
{
public:
~WellContributionsCuda() override;
@ -41,33 +41,35 @@ public:
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
void apply(Scalar* d_x, Scalar* d_y);

protected:
/// Allocate memory for the StandardWells
void APIalloc() override;

using MatrixType = typename WellContributions<Scalar>::MatrixType;

/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;

cudaStream_t stream;

// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
Scalar* d_Cnnzs = nullptr;
Scalar* d_Dnnzs = nullptr;
Scalar* d_Bnnzs = nullptr;
int* d_Ccols = nullptr;
int* d_Bcols = nullptr;
Scalar* d_z1 = nullptr;
Scalar* d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;

Scalar* h_x = nullptr;
Scalar* h_y = nullptr;
};

} //namespace Opm

@ -44,22 +44,20 @@
extern std::shared_ptr<std::thread> copyThread;
#endif // HAVE_OPENMP

namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

using Opm::OpmLog;
using Dune::Timer;

const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;


template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {

template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar, block_size>::
cusparseSolverBackend(int verbosity_, int maxit_,
Scalar tolerance_, unsigned int deviceID_)
: Base(verbosity_, maxit_, tolerance_, deviceID_)
{
// initialize CUDA device, stream and libraries
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
@ -67,7 +65,8 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
std::ostringstream out;
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
out << "Name GPU: " << props.name << ", Compute Capability: "
<< props.major << "." << props.minor;
OpmLog::info(out.str());

cudaStreamCreate(&stream);
@ -84,26 +83,29 @@ cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int max
cudaCheckLastError("Could not set stream to cusparse");
}

template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
template<class Scalar, unsigned int block_size>
cusparseSolverBackend<Scalar,block_size>::~cusparseSolverBackend()
{
finalize();
}

template <unsigned int block_size>
void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
Scalar rho = 1.0, rhop;
Scalar alpha, nalpha, beta;
Scalar omega, nomega, tmp1, tmp2;
Scalar norm, norm_0;
Scalar zero = 0.0;
Scalar one = 1.0;
Scalar mone = -1.0;
float it;

if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).setCudaStream(stream);
}

cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -147,7 +149,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_pw, d_v);
}

cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -178,7 +180,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

// apply wellContributions
if (wellContribs.getNumWells() > 0) {
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
static_cast<WellContributionsCuda<Scalar>&>(wellContribs).apply(d_s, d_t);
}

cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
@ -190,7 +192,6 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

cublasDnrm2(cublasHandle, n, d_r, 1, &norm);


if (norm < tolerance * norm_0) {
break;
}
@ -210,15 +211,18 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon

if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
out << "=== converged: " << res.converged << ", conv_rate: "
<< res.conv_rate << ", time: " << res.elapsed
<< ", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
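Note that gpu_pbicgstab is now templated on Scalar, but the BLAS-level calls it makes (cusparseDbsrmv, cublasDdot, cublasDnrm2) are the double-precision entry points, and the commit only provides the double instantiation further down. A hypothetical way a float build could later be supported is a thin compile-time dispatch wrapper; the helper name below is illustrative and not part of this commit:

// sketch: dispatch a dot product to cublasSdot or cublasDdot based on Scalar
#include <cublas_v2.h>
#include <type_traits>

template<class Scalar>
cublasStatus_t scalar_dot(cublasHandle_t h, int n,
                          const Scalar* x, const Scalar* y, Scalar* result)
{
    if constexpr (std::is_same_v<Scalar, double>) {
        return cublasDdot(h, n, x, 1, y, 1, result);   // double path, as used today
    } else {
        return cublasSdot(h, n, x, 1, y, 1, result);   // float path (not instantiated here)
    }
}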


template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
@ -232,46 +236,49 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
}

std::ostringstream out;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
out << "Initializing GPU, matrix size: " << Nb
<< " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "Maxit: " << maxit << std::scientific
<< ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());

cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw, sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw, sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_x, sizeof(Scalar) * N);
cudaMalloc((void**)&d_b, sizeof(Scalar) * N);
cudaMalloc((void**)&d_r, sizeof(Scalar) * N);
cudaMalloc((void**)&d_rw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_p, sizeof(Scalar) * N);
cudaMalloc((void**)&d_pw, sizeof(Scalar) * N);
cudaMalloc((void**)&d_s, sizeof(Scalar) * N);
cudaMalloc((void**)&d_t, sizeof(Scalar) * N);
cudaMalloc((void**)&d_v, sizeof(Scalar) * N);
cudaMalloc((void**)&d_bVals, sizeof(Scalar) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_mVals, sizeof(Scalar) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");

#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaMallocHost((void**)&vals_contiguous, sizeof(Scalar) * nnz);
cudaCheckLastError("Could not allocate pinned memory");
#endif

initialized = true;
} // end initialize()

template <unsigned int block_size>
void cusparseSolverBackend<block_size>::finalize() {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::finalize()
{
if (initialized) {
cudaFree(d_x);
cudaFree(d_b);
@ -307,40 +314,54 @@ void cusparseSolverBackend<block_size>::finalize() {
}
} // end finalize()


template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;

cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, N * sizeof(Scalar), stream);

#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals,
nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif

if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int),
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int),
cudaMemcpyHostToDevice, stream);
}

if (verbosity >= 3) {
@ -353,33 +374,43 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<Block
}
} // end copy_system_to_gpu()


// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
{
Timer t;

cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(Scalar) * N, stream);

#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous + sum, matrix->nnzValues + sum,
size_row * sizeof(Scalar) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, vals_contiguous,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
#if HAVE_OPENMP
if(omp_get_max_threads() > 1)
copyThread->join();
if (omp_get_max_threads() > 1) {
copyThread->join();
}
#endif
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
nnzbs_prec * block_size * block_size * sizeof(Scalar),
cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(Scalar),
cudaMemcpyDeviceToDevice, stream);
}
#endif

@ -394,10 +425,9 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<Blo
}
} // end update_system_on_gpu()


template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {

template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::analyse_matrix()
{
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;

@ -472,8 +502,9 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
return true;
} // end analyse_matrix()

template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool cusparseSolverBackend<Scalar,block_size>::create_preconditioner()
{
Timer t;

cusparseDbsrilu02(cusparseHandle, order, \
@ -497,23 +528,24 @@ bool cusparseSolverBackend<block_size>::create_preconditioner() {
return true;
} // end create_preconditioner()


template <unsigned int block_size>
void cusparseSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
// actually solve
gpu_pbicgstab(wellContribs, res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()


// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void cusparseSolverBackend<Scalar,block_size>::get_result(Scalar* x)
{
Timer t;

cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaMemcpyAsync(x, d_x, N * sizeof(Scalar), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);

if (verbosity > 2) {
@ -523,14 +555,13 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()



template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus cusparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -551,18 +582,14 @@ SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<Blo
return SolverStatus::BDA_SOLVER_SUCCESS;
}

#define INSTANTIATE_TYPE(T) \
template class cusparseSolverBackend<T,1>; \
template class cusparseSolverBackend<T,2>; \
template class cusparseSolverBackend<T,3>; \
template class cusparseSolverBackend<T,4>; \
template class cusparseSolverBackend<T,5>; \
template class cusparseSolverBackend<T,6>;

#define INSTANTIATE_BDA_FUNCTIONS(n) \
template cusparseSolverBackend<n>::cusparseSolverBackend(int, int, double, unsigned int); \
INSTANTIATE_TYPE(double)

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

@ -28,16 +28,13 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>

namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<block_size> {

typedef BdaSolver<block_size> Base;
template<class Scalar, unsigned int block_size>
class cusparseSolverBackend : public BdaSolver<Scalar,block_size>
{
using Base = BdaSolver<Scalar,block_size>;

using Base::N;
using Base::Nb;
@ -50,7 +47,6 @@ class cusparseSolverBackend : public BdaSolver<block_size> {
using Base::initialized;

private:

cublasHandle_t cublasHandle;
cusparseHandle_t cusparseHandle;
cudaStream_t stream;
@ -58,13 +54,13 @@ private:
bsrilu02Info_t info_M;
bsrsv2Info_t info_L, info_U;
// b: bsr matrix, m: preconditioner
double *d_bVals, *d_mVals;
Scalar *d_bVals, *d_mVals;
int *d_bCols, *d_mCols;
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer;
double *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp
Scalar *vals_contiguous; // only used if COPY_ROW_BY_ROW is true in cusparseSolverBackend.cpp

bool analysis_done = false;

@ -77,12 +73,13 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

/// Initialize GPU and allocate memory
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

/// Clean memory
void finalize();
@ -92,14 +89,18 @@ private:
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
void update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -112,17 +113,16 @@ private:
/// Solve linear system
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions& wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult &res);

public:


/// Construct a cusparseSolver
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
/// \param[in] deviceID the device to be used
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int deviceID);
cusparseSolverBackend(int linear_solver_verbosity, int maxit,
Scalar tolerance, unsigned int deviceID);

/// Destroy a cusparseSolver, and free memory
~cusparseSolverBackend();
@ -134,17 +134,19 @@ public:
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;

/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;

}; // end class cusparseSolverBackend

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

#endif

@ -31,33 +31,29 @@

#include <sstream>

namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

using Opm::OpmLog;
using Dune::Timer;

template <unsigned int block_size>
BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
template<class Scalar, unsigned int block_size>
BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
, opencl_ilu_parallel(opencl_ilu_parallel_)
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}


template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}


template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;

@ -77,30 +73,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);

LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);

Timer t_convert;
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
} else {
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
LUmat = std::make_unique<BlockedMatrix<Scalar>>(*matToDecompose);
}

Timer t_analysis;
std::ostringstream out;
if (opencl_ilu_parallel) {
out << "opencl_ilu_parallel: true (level_scheduling)\n";
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers,
CSCRowIndices.data(), CSCColPointers.data(), Nb,
&numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else {
out << "opencl_ilu_parallel: false\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for(int i = 0; i < Nb; ++i){
for (int i = 0; i < Nb; ++i) {
rowsPerColor.emplace_back(1);
}
}
@ -118,44 +117,52 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
invDiagVals.resize(mat->Nb * bs * bs);

#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Lmat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
Umat = std::make_unique<BlockedMatrix<Scalar>>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
#endif

s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
#else
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif

events.resize(3);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0,
mat->Nb * sizeof(Scalar) * bs * bs,
invDiagVals.data(), nullptr, &events[0]);

rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}

err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0,
(numColors + 1) * sizeof(int),
rowsPerColorPrefix.data(), nullptr, &events[1]);

if (opencl_ilu_parallel) {
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned), fromOrder.data(),
nullptr, &events[2]);
} else {
// fromOrder is not initialized, so use something else to fill s.rowIndices
// s.rowIndices[i] == i must hold, since every rowidx is mapped to itself (i.e. no actual mapping)
// rowsPerColorPrefix is misused here, it contains an increasing sequence (0, 1, 2, ...)
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), rowsPerColorPrefix.data(), nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0,
Nb * sizeof(unsigned),
rowsPerColorPrefix.data(), nullptr, &events[2]);
}

cl::WaitForEvents(events);
@ -168,17 +175,15 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
return true;
}



template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
{
return create_preconditioner(mat, nullptr);
}


template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BILU0<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;

@ -186,7 +191,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix

// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
Timer t_copy;
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
memcpy(LUmat->nnzValues, matToDecompose->nnzValues,
sizeof(Scalar) * bs * bs * matToDecompose->nnzbs);

if (verbosity >= 3){
std::ostringstream out;
@ -205,7 +211,9 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
Timer t_copyToGpu;

events.resize(1);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0,
LUmat->nnzbs * bs * bs * sizeof(Scalar),
LUmat->nnzValues, nullptr, &events[0]);

std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
@ -213,14 +221,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];

auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
auto candidate = std::find(LUmat->colIndices + rowStart,
LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int),
diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int),
LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int),
LUmat->rowPointers, nullptr, &events[3]);
});

cl::WaitForEvents(events);
@ -242,11 +254,12 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
out << "color " << color << ": " << firstRow << " - " << lastRow
<< " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
}

if (verbosity >= 3) {
@ -259,43 +272,42 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
return true;
} // end create_preconditioner()


// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
template<class Scalar, unsigned int block_size>
void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
const Scalar relaxation = 0.9;
cl::Event event;
Timer t_apply;

for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}

for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<Scalar>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}

// apply relaxation
OpenclKernels::scale(x, relaxation, N);
OpenclKernels<Scalar>::scale(x, relaxation, N);

if (verbosity >= 4) {
std::ostringstream out;
@ -304,20 +316,14 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
}
}
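BILU0::apply() runs two triangular solves (Lz = y forward over the colors, Ux = z backward) and then scales the result by the relaxation factor 0.9. A CPU sketch of the same steps for the scalar case (block_size == 1), with plain CSR arrays standing in for the LUvals/invDiagVals buffers and no claim to match the GPU data layout exactly:

// scalar CPU sketch of ILU0 apply: forward solve, backward solve, relaxation
#include <vector>

void ilu0_apply(const std::vector<int>& rowPtr, const std::vector<int>& colIdx,
                const std::vector<double>& vals, const std::vector<int>& diagIndex,
                const std::vector<double>& invDiag, const std::vector<double>& y,
                std::vector<double>& x, double relaxation = 0.9)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    for (int i = 0; i < n; ++i) {                  // L z = y, unit lower diagonal
        double sum = y[i];
        for (int p = rowPtr[i]; p < diagIndex[i]; ++p)
            sum -= vals[p] * x[colIdx[p]];
        x[i] = sum;
    }
    for (int i = n - 1; i >= 0; --i) {             // U x = z, using the inverted diagonal
        double sum = x[i];
        for (int p = diagIndex[i] + 1; p < rowPtr[i + 1]; ++p)
            sum -= vals[p] * x[colIdx[p]];
        x[i] = invDiag[i] * sum;
    }
    for (int i = 0; i < n; ++i)                    // relaxation, as OpenclKernels::scale does
        x[i] *= relaxation;
}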

#define INSTANCE_TYPE(T) \
template class BILU0<T,1>; \
template class BILU0<T,2>; \
template class BILU0<T,3>; \
template class BILU0<T,4>; \
template class BILU0<T,5>; \
template class BILU0<T,6>;

INSTANCE_TYPE(double)

#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BILU0<n>;


INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

@ -29,18 +29,15 @@
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>


namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
/// The preconditioner is applied via two exact triangular solves
template <unsigned int block_size>
class BILU0 : public Preconditioner<block_size>
template<class Scalar, unsigned int block_size>
class BILU0 : public Preconditioner<Scalar,block_size>
{
typedef Preconditioner<block_size> Base;
using Base = Preconditioner<Scalar,block_size>;

using Base::N;
using Base::Nb;
@ -53,11 +50,11 @@ class BILU0 : public Preconditioner<block_size>
using Base::err;

private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> LUmat{};
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
std::unique_ptr<BlockedMatrix<Scalar>> Lmat{}, Umat{};
#endif
std::vector<double> invDiagVals;
std::vector<Scalar> invDiagVals;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
@ -67,7 +64,7 @@ private:

bool opencl_ilu_parallel;

typedef struct {
struct GPU_storage {
cl::Buffer invDiagVals; // nnz values of diagonal blocks of the matrix, inverted
cl::Buffer diagIndex; // index of diagonal block of each row, used to differentiate between lower and upper triangular part
cl::Buffer rowsPerColor; // number of rows for every color
@ -80,7 +77,7 @@ private:
#else
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
};

GPU_storage s;

@ -93,21 +90,25 @@ public:
BILU0(bool opencl_ilu_parallel, int verbosity);

// analysis, extract parallelism if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;

// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
BlockedMatrix<Scalar>* jacMat) override;

// apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(const cl::Buffer& y, cl::Buffer& x) override;

std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
get_preconditioner_structure()
{
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)},
{LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
}

std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
@ -120,8 +121,6 @@ public:
}
};

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

#endif

@ -34,26 +34,25 @@

#include <sstream>

namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

using Opm::OpmLog;
using Dune::Timer;

template <unsigned int block_size>
BISAI<block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_)
template<class Scalar, unsigned int block_size>
BISAI<Scalar,block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
: Base(verbosity_)
{
#if CHOW_PATEL
OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
#endif
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel_, verbosity_);
bilu0 = std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
}

template <unsigned int block_size>
void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::
setOpencl(std::shared_ptr<cl::Context>& context_,
std::shared_ptr<cl::CommandQueue>& queue_)
{
context = context_;
queue = queue_;
@ -61,7 +60,9 @@ void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::s
bilu0->setOpencl(context, queue);
}

std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices){
std::vector<int>
buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices)
{
std::vector<int> aux(colPointers); // colPointers must be copied to this vector
std::vector<int> csrToCscOffsetMap(rowIndices.size()); // map must have the same size as the indices vector

@ -77,14 +78,15 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
return csrToCscOffsetMap;
}
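buildCsrToCscOffsetMap produces, for every nonzero, the offset that the same entry occupies in the other storage order, so that values laid out in CSR order can be addressed through a CSC-style traversal (the loop body itself is cut out of the hunk above). The following is an illustrative, self-contained reimplementation of that idea on a tiny 3x3 pattern, not the exact OPM code; names and the CSR-to-CSC direction are assumptions for the example.

// sketch: map each CSR nonzero offset to the offset of the same entry in CSC order
#include <cstdio>
#include <vector>

std::vector<int> csr_to_csc_offsets(const std::vector<int>& rowPtr,
                                    const std::vector<int>& colIdx,
                                    int numCols)
{
    std::vector<int> colCount(numCols, 0), colPtr(numCols + 1, 0);
    for (int c : colIdx) ++colCount[c];
    for (int c = 0; c < numCols; ++c) colPtr[c + 1] = colPtr[c] + colCount[c];

    std::vector<int> next(colPtr.begin(), colPtr.end() - 1); // next free CSC slot per column
    std::vector<int> map(colIdx.size());
    for (std::size_t r = 0; r + 1 < rowPtr.size(); ++r)
        for (int p = rowPtr[r]; p < rowPtr[r + 1]; ++p)
            map[p] = next[colIdx[p]]++;                      // CSR offset p -> CSC offset
    return map;
}

int main()
{
    // 3x3 pattern: row 0 = {0,2}, row 1 = {1}, row 2 = {0,2}
    std::vector<int> rowPtr{0, 2, 3, 5}, colIdx{0, 2, 1, 0, 2};
    for (int m : csr_to_csc_offsets(rowPtr, colIdx, 3))
        std::printf("%d ", m);                               // prints: 0 3 2 1 4
    std::printf("\n");
    return 0;
}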

template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
{
return analyze_matrix(mat, nullptr);
}

template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
@ -105,21 +107,22 @@ bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
}
}

template <unsigned int block_size>
void BISAI<block_size>::buildLowerSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
{
lower.subsystemPointers.assign(Nb + 1, 0);

Dune::Timer t_buildLowerSubsystemsStructures;

for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = diagIndex[tcol] + 1;
int lrow = colPointers[tcol + 1];
int nx = lrow - frow;
int nv = 0;

for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = sweep + 1; xid < nx; xid++){
for(int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = sweep + 1; xid < nx; xid++) {
for (int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++) {
if(rowIndices[ptr] == rowIndices[frow + xid]){
lower.nzIndices.push_back(csrToCscOffsetMap[ptr]);
lower.knownRhsIndices.push_back(csrToCscOffsetMap[frow + sweep]);
@ -133,29 +136,31 @@ void BISAI<block_size>::buildLowerSubsystemsStructures(){
lower.subsystemPointers[tcol + 1] = lower.subsystemPointers[tcol] + nv;
}

if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildLowerSubsystemsStructures time: " << t_buildLowerSubsystemsStructures.stop() << " s";
out << "BISAI buildLowerSubsystemsStructures time: "
<< t_buildLowerSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}

template <unsigned int block_size>
void BISAI<block_size>::buildUpperSubsystemsStructures(){
template<class Scalar, unsigned int block_size>
void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
{
upper.subsystemPointers.assign(Nb + 1, 0);

Dune::Timer t_buildUpperSubsystemsStructures;

for(int tcol = 0; tcol < Nb; tcol++){
for (int tcol = 0; tcol < Nb; tcol++) {
int frow = colPointers[tcol];
int lrow = diagIndex[tcol];
int nx = lrow - frow + 1;
int nv = 0;

for(int sweep = 0; sweep < nx - 1; sweep++){
for(int xid = 0; xid < nx; xid++){
for(int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++){
if(rowIndices[ptr] == rowIndices[lrow - xid]){
for (int sweep = 0; sweep < nx - 1; sweep++) {
for (int xid = 0; xid < nx; xid++) {
for (int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++) {
if (rowIndices[ptr] == rowIndices[lrow - xid]) {
upper.nzIndices.push_back(csrToCscOffsetMap[ptr]);
upper.knownRhsIndices.push_back(csrToCscOffsetMap[lrow - sweep]);
upper.unknownRhsIndices.push_back(csrToCscOffsetMap[lrow - xid]);
@ -168,15 +173,17 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
upper.subsystemPointers[tcol + 1] = upper.subsystemPointers[tcol] + nv;
}

if(verbosity >= 4){
if (verbosity >= 4) {
std::ostringstream out;
out << "BISAI buildUpperSubsystemsStructures time: " << t_buildUpperSubsystemsStructures.stop() << " s";
out << "BISAI buildUpperSubsystemsStructures time: "
<< t_buildUpperSubsystemsStructures.stop() << " s";
OpmLog::info(out.str());
}
}

template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
template<class Scalar, unsigned int block_size>
bool BISAI<Scalar,block_size>::
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
{
const unsigned int bs = block_size;

@ -199,48 +206,93 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
buildLowerSubsystemsStructures();
buildUpperSubsystemsStructures();

d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * colPointers.size());
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * rowIndices.size());
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * csrToCscOffsetMap.size());
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * diagIndex.size());
|
||||
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
|
||||
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
|
||||
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb * bs);
|
||||
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.subsystemPointers.size());
|
||||
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.subsystemPointers.size());
|
||||
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * colPointers.size());
|
||||
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * rowIndices.size());
|
||||
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * csrToCscOffsetMap.size());
|
||||
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * diagIndex.size());
|
||||
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * nnzb * bs * bs);
|
||||
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * nnzb * bs * bs);
|
||||
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * Nb * bs);
|
||||
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * lower.subsystemPointers.size());
|
||||
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * upper.subsystemPointers.size());
|
||||
|
||||
if(!lower.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.nzIndices.size());
|
||||
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.knownRhsIndices.size());
|
||||
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.unknownRhsIndices.size());
|
||||
if (!lower.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * lower.nzIndices.size());
|
||||
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * lower.knownRhsIndices.size());
|
||||
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * lower.unknownRhsIndices.size());
|
||||
}
|
||||
|
||||
if(!upper.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.nzIndices.size());
|
||||
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.knownRhsIndices.size());
|
||||
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.unknownRhsIndices.size());
|
||||
if (!upper.nzIndices.empty()) { // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * upper.nzIndices.size());
|
||||
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * upper.knownRhsIndices.size());
|
||||
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(int) * upper.unknownRhsIndices.size());
|
||||
}
|
||||
|
||||
events.resize(6);
|
||||
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0, colPointers.size() * sizeof(int), colPointers.data(), nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0, rowIndices.size() * sizeof(int), rowIndices.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0, csrToCscOffsetMap.size() * sizeof(int), csrToCscOffsetMap.data(), nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, diagIndex.size() * sizeof(int), diagIndex.data(), nullptr, &events[3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0, sizeof(int) * lower.subsystemPointers.size(), lower.subsystemPointers.data(), nullptr, &events[4]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0, sizeof(int) * upper.subsystemPointers.size(), upper.subsystemPointers.data(), nullptr, &events[5]);
|
||||
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0,
|
||||
colPointers.size() * sizeof(int),
|
||||
colPointers.data(), nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0,
|
||||
rowIndices.size() * sizeof(int),
|
||||
rowIndices.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0,
|
||||
csrToCscOffsetMap.size() * sizeof(int),
|
||||
csrToCscOffsetMap.data(), nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0,
|
||||
diagIndex.size() * sizeof(int),
|
||||
diagIndex.data(), nullptr, &events[3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0,
|
||||
sizeof(int) * lower.subsystemPointers.size(),
|
||||
lower.subsystemPointers.data(), nullptr, &events[4]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0,
|
||||
sizeof(int) * upper.subsystemPointers.size(),
|
||||
upper.subsystemPointers.data(), nullptr, &events[5]);
|
||||
|
||||
if(!lower.nzIndices.empty()){
|
||||
if (!lower.nzIndices.empty()) {
|
||||
events.resize(events.size() + 3);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0, sizeof(int) * lower.nzIndices.size(), lower.nzIndices.data(), nullptr, &events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.knownRhsIndices.size(), lower.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.unknownRhsIndices.size(), lower.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0,
|
||||
sizeof(int) * lower.nzIndices.size(),
|
||||
lower.nzIndices.data(), nullptr,
|
||||
&events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0,
|
||||
sizeof(int) * lower.knownRhsIndices.size(),
|
||||
lower.knownRhsIndices.data(), nullptr,
|
||||
&events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0,
|
||||
sizeof(int) * lower.unknownRhsIndices.size(),
|
||||
lower.unknownRhsIndices.data(), nullptr,
|
||||
&events[events.size() - 1]);
|
||||
}
|
||||
|
||||
if(!upper.nzIndices.empty()){
|
||||
if (!upper.nzIndices.empty()) {
|
||||
events.resize(events.size() + 3);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE, 0, sizeof(int) * upper.nzIndices.size(), upper.nzIndices.data(), nullptr, &events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.knownRhsIndices.size(), upper.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.unknownRhsIndices.size(), upper.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE,
|
||||
0, sizeof(int) * upper.nzIndices.size(),
|
||||
upper.nzIndices.data(), nullptr,
|
||||
&events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0,
|
||||
sizeof(int) * upper.knownRhsIndices.size(),
|
||||
upper.knownRhsIndices.data(), nullptr,
|
||||
&events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0,
|
||||
sizeof(int) * upper.unknownRhsIndices.size(),
|
||||
upper.unknownRhsIndices.data(), nullptr,
|
||||
&events[events.size() - 1]);
|
||||
}
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
@ -255,16 +307,24 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
|
||||
std::tie(d_LUvals, d_invDiagVals) = bilu0->get_preconditioner_data();
|
||||
|
||||
events.resize(2);
|
||||
err = queue->enqueueFillBuffer(d_invLvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[0]);
|
||||
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[1]);
|
||||
err = queue->enqueueFillBuffer(d_invLvals, 0, 0,
|
||||
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[0]);
|
||||
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0,
|
||||
sizeof(Scalar) * nnzb * bs * bs, nullptr, &events[1]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
|
||||
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
|
||||
OpenclKernels<Scalar>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
|
||||
d_lower.subsystemPointers, d_lower.nzIndices,
|
||||
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
|
||||
d_LUvals, d_invLvals, Nb);
|
||||
OpenclKernels<Scalar>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
|
||||
d_csrToCscOffsetMap, d_upper.subsystemPointers,
|
||||
d_upper.nzIndices, d_upper.unknownRhsIndices,
|
||||
d_upper.knownRhsIndices, d_LUvals,
|
||||
d_invDiagVals, d_invUvals, Nb);
|
||||
|
||||
if(verbosity >= 4){
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
@ -273,33 +333,34 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
|
||||
return true;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BISAI<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat)
|
||||
{
|
||||
return create_preconditioner(mat, nullptr);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
|
||||
// (to compensate for the unitary diagonal that is not
|
||||
// included in isaiL, for simplicity)
|
||||
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
|
||||
OpenclKernels<Scalar>::spmv(d_invLvals, d_rowIndices, d_colPointers,
|
||||
x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
|
||||
// (to compensate for the unitary diagonal that is not
|
||||
// included in isaiL, for simplicity)
|
||||
OpenclKernels<Scalar>::spmv(d_invUvals, d_rowIndices, d_colPointers,
|
||||
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
|
||||
}
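// Editor's note (illustrative sketch, not part of this patch): as the comments
// above state, applying the approximate inverse amounts to two sparse
// matrix-vector products; the lower factor is stored without its unit diagonal,
// so its application adds the input vector back in. A scalar CPU analogue of the
// two OpenclKernels::spmv calls (plain CSR, block structure omitted; std::vector
// is assumed to be available in this translation unit):
template<class Scalar>
void applyIsaiHostSketch(const std::vector<int>& rowPtr, const std::vector<int>& colIdx,
                         const std::vector<Scalar>& invLvals, const std::vector<Scalar>& invUvals,
                         const std::vector<Scalar>& x, std::vector<Scalar>& y)
{
    const int N = static_cast<int>(rowPtr.size()) - 1;
    std::vector<Scalar> t(N);
    for (int r = 0; r < N; ++r) {            // t = invL * x + x ("spmv with addition")
        Scalar sum = x[r];
        for (int p = rowPtr[r]; p < rowPtr[r + 1]; ++p) {
            sum += invLvals[p] * x[colIdx[p]];
        }
        t[r] = sum;
    }
    for (int r = 0; r < N; ++r) {            // y = invU * t (plain spmv)
        Scalar sum = 0;
        for (int p = rowPtr[r]; p < rowPtr[r + 1]; ++p) {
            sum += invUvals[p] * t[colIdx[p]];
        }
        y[r] = sum;
    }
}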
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class BISAI<n>;
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class BISAI<T,1>; \
|
||||
template class BISAI<T,2>; \
|
||||
template class BISAI<T,3>; \
|
||||
template class BISAI<T,4>; \
|
||||
template class BISAI<T,5>; \
|
||||
template class BISAI<T,6>;
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -26,19 +26,16 @@
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
class BlockedMatrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
|
||||
/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et al.
|
||||
template <unsigned int block_size>
|
||||
class BISAI : public Preconditioner<block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class BISAI : public Preconditioner<Scalar,block_size>
|
||||
{
|
||||
typedef Preconditioner<block_size> Base;
|
||||
using Base = Preconditioner<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -57,8 +54,8 @@ private:
|
||||
std::vector<int> rowIndices;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> csrToCscOffsetMap;
|
||||
std::vector<double> invLvals;
|
||||
std::vector<double> invUvals;
|
||||
std::vector<Scalar> invLvals;
|
||||
std::vector<Scalar> invUvals;
|
||||
|
||||
cl::Buffer d_colPointers;
|
||||
cl::Buffer d_rowIndices;
|
||||
@ -71,10 +68,10 @@ private:
|
||||
cl::Buffer d_invL_x;
|
||||
|
||||
bool opencl_ilu_parallel;
|
||||
std::unique_ptr<BILU0<block_size> > bilu0;
|
||||
std::unique_ptr<BILU0<Scalar,block_size>> bilu0;
|
||||
|
||||
/// Struct that holds the structure of the small subsystems for each column
|
||||
typedef struct{
|
||||
struct subsystemStructure {
|
||||
/// This vector holds the cumulative sum for the number of non-zero blocks for each subsystem.
|
||||
/// Works similarly to row and column pointers for the CSR and CSC matrix representations.
|
||||
std::vector<int> subsystemPointers;
|
||||
@ -88,15 +85,15 @@ private:
|
||||
std::vector<int> knownRhsIndices;
|
||||
/// This vector holds the indices of the unknown values of the right hand sides of the subsystems.
|
||||
std::vector<int> unknownRhsIndices;
|
||||
} subsystemStructure;
|
||||
};
|
||||
|
||||
/// GPU version of subsystemStructure
|
||||
typedef struct{
|
||||
struct subsystemStructureGPU {
|
||||
cl::Buffer subsystemPointers;
|
||||
cl::Buffer nzIndices;
|
||||
cl::Buffer knownRhsIndices;
|
||||
cl::Buffer unknownRhsIndices;
|
||||
} subsystemStructureGPU;
|
||||
};
|
||||
|
||||
subsystemStructure lower, upper;
|
||||
subsystemStructureGPU d_lower, d_upper;
|
||||
@ -113,15 +110,18 @@ public:
|
||||
BISAI(bool opencl_ilu_parallel, int verbosity);
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
|
||||
// analysis, extract parallelism
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat) override;
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat) override;
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
@ -132,7 +132,6 @@ public:
|
||||
/// in the csrToCscOffsetMap[i]-th position in the CSC representation.
|
||||
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
@ -34,37 +34,32 @@
|
||||
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
CPR<block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_) :
|
||||
Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
|
||||
: Base(verbosity_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
|
||||
bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
|
||||
diagIndices.resize(1);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
bilu0->setOpencl(context, queue);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
@ -75,8 +70,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
|
||||
return success;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
@ -88,8 +85,10 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat)
|
||||
return success;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
@ -108,8 +107,10 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *
|
||||
return result;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
@ -128,26 +129,30 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// return the absolute value of the element with the largest absolute value among the N elements
|
||||
double get_absmax(const double *data, const int N) {
|
||||
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
|
||||
template<class Scalar>
|
||||
Scalar get_absmax(const Scalar* data, const int N)
|
||||
{
|
||||
return std::abs(*std::max_element(data, data + N,
|
||||
[](Scalar a, Scalar b)
|
||||
{ return std::fabs(a) < std::fabs(b); }));
|
||||
}
|
||||
|
||||
|
||||
// solve A^T * x = b
|
||||
void solve_transposed_3x3(const double *A, const double *b, double *x) {
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
|
||||
{
|
||||
const int B = 3;
|
||||
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
|
||||
double t4 = A[0+0*B] * A[1+1*B];
|
||||
double t6 = A[0+0*B] * A[1+2*B];
|
||||
double t8 = A[0+1*B] * A[1+0*B];
|
||||
double t10 = A[0+2*B] * A[1+0*B];
|
||||
double t12 = A[0+1*B] * A[2+0*B];
|
||||
double t14 = A[0+2*B] * A[2+0*B];
|
||||
Scalar t4 = A[0+0*B] * A[1+1*B];
|
||||
Scalar t6 = A[0+0*B] * A[1+2*B];
|
||||
Scalar t8 = A[0+1*B] * A[1+0*B];
|
||||
Scalar t10 = A[0+2*B] * A[1+0*B];
|
||||
Scalar t12 = A[0+1*B] * A[2+0*B];
|
||||
Scalar t14 = A[0+2*B] * A[2+0*B];
|
||||
|
||||
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
|
||||
Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
|
||||
|
||||
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
|
||||
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
|
||||
@ -162,44 +167,49 @@ void solve_transposed_3x3(const double *A, const double *b, double *x) {
|
||||
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
|
||||
}
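// Editor's note (illustrative usage, not part of this patch): further below, in
// create_preconditioner_amg, solve_transposed_3x3 and get_absmax are combined to
// build the quasi-IMPES weights: the transposed 3x3 diagonal block is solved
// against a unit vector in the pressure component and the result is normalised
// by its largest magnitude. A standalone host-side example under those
// assumptions (the block values and the pressure index are made up here):
void exampleQuasiImpesWeightForOneCell()
{
    const double block[9] = { 2.0, 0.1, 0.0,     // one 3x3 diagonal block, row-major
                              0.3, 1.5, 0.2,
                              0.0, 0.4, 3.0 };
    const double rhs[3] = { 1.0, 0.0, 0.0 };     // unit vector in the (assumed) pressure component
    double w[3];

    solve_transposed_3x3(block, rhs, w);         // solve block^T * w = rhs
    const double scale = get_absmax(w, 3);       // largest |w_i|
    for (int i = 0; i < 3; ++i) {
        w[i] /= scale;                           // normalised row weights
    }
}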
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::init_opencl_buffers() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar, block_size>::init_opencl_buffers()
|
||||
{
|
||||
d_Amatrices.reserve(num_levels);
|
||||
d_Rmatrices.reserve(num_levels - 1);
|
||||
d_invDiags.reserve(num_levels - 1);
|
||||
for (Matrix& m : Amatrices) {
|
||||
for (Matrix<Scalar>& m : Amatrices) {
|
||||
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
}
|
||||
for (Matrix& m : Rmatrices) {
|
||||
for (Matrix<Scalar>& m : Rmatrices) {
|
||||
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
|
||||
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
|
||||
}
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::opencl_upload() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::opencl_upload()
|
||||
{
|
||||
d_mat->upload(queue.get(), mat);
|
||||
|
||||
err = CL_SUCCESS;
|
||||
events.resize(2 * Rmatrices.size() + 1);
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
|
||||
sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
|
||||
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
|
||||
sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
|
||||
nullptr, &events[2*i+1]);
|
||||
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
|
||||
sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
|
||||
nullptr, &events[2*i+2]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -212,9 +222,10 @@ void CPR<block_size>::opencl_upload() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::
|
||||
create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
this->mat = mat_;
|
||||
|
||||
coarse_vals.resize(nnzb);
|
||||
@ -222,8 +233,8 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
coarse_y.resize(Nb);
|
||||
weights.resize(N);
|
||||
|
||||
try{
|
||||
double rhs[] = {0, 0, 0};
|
||||
try {
|
||||
Scalar rhs[] = {0, 0, 0};
|
||||
rhs[pressure_idx] = 1;
|
||||
|
||||
// find diagonal index for each row
|
||||
@ -241,12 +252,12 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
// calculate weights for each row
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
// solve to find weights
|
||||
double *row_weights = weights.data() + block_size * row; // weights for this row
|
||||
Scalar* row_weights = weights.data() + block_size * row; // weights for this row
|
||||
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
|
||||
|
||||
// normalize weights for this row
|
||||
double abs_max = get_absmax(row_weights, block_size);
|
||||
for(unsigned int i = 0; i < block_size; i++){
|
||||
Scalar abs_max = get_absmax(row_weights, block_size);
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
row_weights[i] /= abs_max;
|
||||
}
|
||||
}
|
||||
@ -257,9 +268,9 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
double *block = mat->nnzValues + idx * block_size * block_size;
|
||||
double *row_weights = weights.data() + block_size * row;
|
||||
double value = 0.0;
|
||||
Scalar* block = mat->nnzValues + idx * block_size * block_size;
|
||||
Scalar* row_weights = weights.data() + block_size * row;
|
||||
Scalar value = 0.0;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
value += block[block_size * i + pressure_idx] * row_weights[i];
|
||||
}
|
||||
@ -276,10 +287,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
if (recalculate_aggregates) {
|
||||
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
|
||||
|
||||
typedef DuneMat::CreateIterator Iter;
|
||||
using Iter = typename DuneMat::CreateIterator;
|
||||
|
||||
// setup sparsity pattern
|
||||
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
|
||||
for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
|
||||
int start = mat->rowPointers[row.index()];
|
||||
int end = mat->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
@ -302,7 +313,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
Dune::Amg::SequentialInformation seqinfo;
|
||||
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
|
||||
|
||||
Opm::PropertyTree property_tree;
|
||||
PropertyTree property_tree;
|
||||
property_tree.put("alpha", 0.333333333333);
|
||||
|
||||
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
|
||||
@ -315,7 +326,7 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
num_pre_smooth_steps = c.getNoPreSmoothSteps();
|
||||
num_post_smooth_steps = c.getNoPostSmoothSteps();
|
||||
|
||||
dune_amg->build<OverlapFlags>(c);
|
||||
dune_amg->template build<OverlapFlags>(c);
|
||||
|
||||
analyzeHierarchy();
|
||||
analyzeAggregateMaps();
|
||||
@ -351,10 +362,10 @@ void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeHierarchy() {
|
||||
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::analyzeHierarchy()
|
||||
{
|
||||
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
|
||||
// store coarsest AMG level in umfpack format, also performs LU decomposition
|
||||
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
|
||||
@ -372,8 +383,8 @@ void CPR<block_size>::analyzeHierarchy() {
|
||||
|
||||
// matrixIter.dereference() returns MatrixAdapter
|
||||
// matrixIter.dereference().getmat() returns BCRSMat
|
||||
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for (int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
const auto& A = matrixIter.dereference().getmat();
|
||||
level_sizes[level] = A.N();
|
||||
diagIndices[level].reserve(A.N());
|
||||
@ -395,38 +406,38 @@ void CPR<block_size>::analyzeHierarchy() {
|
||||
}
|
||||
}
|
||||
|
||||
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
|
||||
BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
|
||||
Amatrices.back().colIndices);
|
||||
|
||||
// compute inverse diagonal values for current level
|
||||
invDiags.emplace_back(A.N());
|
||||
for (unsigned int row = 0; row < A.N(); ++row) {
|
||||
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeAggregateMaps() {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::analyzeAggregateMaps()
|
||||
{
|
||||
PcolIndices.resize(num_levels - 1);
|
||||
Rmatrices.clear();
|
||||
|
||||
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
|
||||
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
DuneAmg::AggregatesMap *map = *mapIter;
|
||||
typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
typename DuneAmg::AggregatesMap* map = *mapIter;
|
||||
|
||||
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
|
||||
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
|
||||
|
||||
// get indices for each row of P and R
|
||||
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
|
||||
std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
|
||||
PcolIndices[level].resize(level_sizes[level]);
|
||||
|
||||
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
|
||||
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
|
||||
using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
|
||||
for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
|
||||
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
|
||||
const long int diff = ai - map->begin();
|
||||
PcolIndices[level][diff] = *ai;
|
||||
@ -446,19 +457,20 @@ void CPR<block_size>::analyzeAggregateMaps() {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
|
||||
OpenclMatrix *A = &d_Amatrices[level];
|
||||
OpenclMatrix *R = &d_Rmatrices[level];
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
OpenclMatrix<Scalar>* A = &d_Amatrices[level];
|
||||
OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
|
||||
if (level == num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
|
||||
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
@ -470,7 +482,8 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
|
||||
umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
@ -486,34 +499,37 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &
|
||||
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
double jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
|
||||
x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
}
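// Editor's note (illustrative sketch, not part of this patch): the pre- and
// post-smoothing steps above pair the 'residual' and 'vmul' kernels into a
// damped-Jacobi sweep. A scalar CPU analogue using the standard update
// x += omega * D^{-1} * (y - A*x); whether the GPU kernel accumulates into x or
// overwrites it is not visible in this hunk, so that detail is an assumption.
// std::vector is assumed to be available in this translation unit.
template<class Scalar>
void dampedJacobiSweepSketch(const std::vector<int>& rowPtr, const std::vector<int>& colIdx,
                             const std::vector<Scalar>& vals, const std::vector<Scalar>& invDiag,
                             const std::vector<Scalar>& y, std::vector<Scalar>& x, Scalar omega)
{
    const int N = static_cast<int>(rowPtr.size()) - 1;
    std::vector<Scalar> t(N);
    for (int r = 0; r < N; ++r) {                 // t = y - A*x (the 'residual' kernel)
        Scalar ax = 0;
        for (int p = rowPtr[r]; p < rowPtr[r + 1]; ++p) {
            ax += vals[p] * x[colIdx[p]];
        }
        t[r] = y[r] - ax;
    }
    for (int r = 0; r < N; ++r) {                 // damped diagonal scaling (the 'vmul' kernel)
        x[r] += omega * invDiag[r] * t[r];
    }
}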
|
||||
|
||||
|
||||
// x = prec(y)
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
// 0-initialize u and x vectors
|
||||
events.resize(d_u.size() + 1);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
|
||||
sizeof(Scalar) * Nb, nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
|
||||
sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -522,16 +538,18 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
|
||||
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
}
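// Editor's note (illustrative sketch, not part of this patch): apply_amg above
// forms the full residual, collapses it to a scalar pressure residual with the
// quasi-IMPES weights, runs the AMG cycle on that system, and adds the coarse
// pressure correction back into the pressure component of x. A CPU sketch of the
// two weight-based transfer steps; the exact semantics of the GPU kernels are an
// assumption of this illustration.
template<class Scalar>
void fullToPressureRestrictionSketch(const std::vector<Scalar>& rs,      // blocked residual, size Nb*bs
                                     const std::vector<Scalar>& weights, // quasi-IMPES weights, size Nb*bs
                                     std::vector<Scalar>& coarse_y,      // scalar pressure residual, size Nb
                                     int Nb, int bs)
{
    for (int row = 0; row < Nb; ++row) {
        Scalar sum = 0;
        for (int i = 0; i < bs; ++i) {
            sum += weights[row * bs + i] * rs[row * bs + i];
        }
        coarse_y[row] = sum;
    }
}

template<class Scalar>
void addCoarsePressureCorrectionSketch(const std::vector<Scalar>& coarse_x, // scalar correction, size Nb
                                       std::vector<Scalar>& x,              // blocked solution, size Nb*bs
                                       int pressure_idx, int Nb, int bs)
{
    for (int row = 0; row < Nb; ++row) {
        x[row * bs + pressure_idx] += coarse_x[row];
    }
}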
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bilu0->apply(y, x);
|
||||
if (verbosity >= 4) {
|
||||
@ -549,20 +567,14 @@ void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class CPR<T,1>; \
|
||||
template class CPR<T,2>; \
|
||||
template class CPR<T,3>; \
|
||||
template class CPR<T,4>; \
|
||||
template class CPR<T,5>; \
|
||||
template class CPR<T,6>;
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class CPR<n>;
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -33,18 +33,15 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
class BlockedMatrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <unsigned int block_size>
|
||||
class CPR : public Preconditioner<block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class CPR : public Preconditioner<Scalar,block_size>
|
||||
{
|
||||
typedef Preconditioner<block_size> Base;
|
||||
using Base = Preconditioner<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -58,25 +55,25 @@ class CPR : public Preconditioner<block_size>
|
||||
|
||||
private:
|
||||
int num_levels;
|
||||
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<cl::Buffer> d_PcolIndices;
|
||||
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<cl::Buffer> d_invDiags;
|
||||
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
|
||||
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
|
||||
std::unique_ptr<cl::Buffer> d_weights; // the quasi-IMPES weights, used to extract pressure
|
||||
std::unique_ptr<OpenclMatrix> d_mat; // stores blocked matrix
|
||||
std::unique_ptr<OpenclMatrix<Scalar>> d_mat; // stores blocked matrix
|
||||
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
|
||||
BlockedMatrix *mat = nullptr; // input matrix, blocked
|
||||
std::unique_ptr<BILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
|
||||
BlockedMatrix<Scalar>* mat = nullptr; // input matrix, blocked
|
||||
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
|
||||
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
|
||||
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
|
||||
std::unique_ptr<DuneAmg> dune_amg;
|
||||
@ -91,7 +88,7 @@ private:
|
||||
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
|
||||
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
|
||||
|
||||
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
|
||||
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
|
||||
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
@ -112,32 +109,35 @@ private:
|
||||
|
||||
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
|
||||
|
||||
void create_preconditioner_amg(BlockedMatrix *mat);
|
||||
void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
|
||||
|
||||
public:
|
||||
CPR(bool opencl_ilu_parallel, int verbosity);
|
||||
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat) override;
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
|
||||
// applies blocked ilu0
|
||||
// also applies amg for pressure component
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat) override;
|
||||
};
|
||||
|
||||
// solve A^T * x = b
|
||||
// A should represent a 3x3 matrix
|
||||
// x and b are vectors with 3 elements
|
||||
void solve_transposed_3x3(const double *A, const double *b, double *x);
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -31,12 +31,19 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
|
||||
template<class Scalar>
|
||||
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue,
|
||||
Scalar* vals, int* cols, int* rows)
|
||||
{
|
||||
std::vector<cl::Event> events(3);
|
||||
|
||||
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
|
||||
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0,
|
||||
sizeof(Scalar) * block_size * block_size * nnzbs,
|
||||
vals, nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs,
|
||||
cols, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1),
|
||||
rows, nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -46,7 +53,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
|
||||
template<class Scalar>
|
||||
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix)
|
||||
{
|
||||
if (block_size != 1) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
|
||||
}
|
||||
@ -54,7 +63,9 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
|
||||
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
|
||||
}
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
|
||||
template<class Scalar>
|
||||
void OpenclMatrix<Scalar>::upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix)
|
||||
{
|
||||
if (matrix->block_size != block_size) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
|
||||
}
|
||||
@ -62,5 +73,7 @@ void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
|
||||
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
|
||||
}
|
||||
|
||||
template class OpenclMatrix<double>;
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
@ -29,28 +29,30 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
class Matrix;
|
||||
class BlockedMatrix;
|
||||
template<class Scalar> class Matrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This struct resembles a csr matrix, only doubles are supported
|
||||
/// The matrix data is stored in OpenCL Buffers
|
||||
class OpenclMatrix {
|
||||
template<class Scalar>
|
||||
class OpenclMatrix
|
||||
{
|
||||
public:
|
||||
|
||||
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
|
||||
: Nb(Nb_),
|
||||
Mb(Mb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_)
|
||||
{
|
||||
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
|
||||
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * block_size * block_size * nnzbs);
|
||||
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
|
||||
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
}
|
||||
|
||||
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
|
||||
void upload(cl::CommandQueue *queue, Matrix *matrix);
|
||||
void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
|
||||
void upload(cl::CommandQueue* queue, Scalar* vals, int* cols, int* rows);
|
||||
void upload(cl::CommandQueue* queue, Matrix<Scalar>* matrix);
|
||||
void upload(cl::CommandQueue* queue, BlockedMatrix<Scalar>* matrix);
|
||||
|
||||
cl::Buffer nnzValues;
|
||||
cl::Buffer colIndices;
|
||||
|
@ -30,61 +30,58 @@
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void Preconditioner<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
std::unique_ptr<Preconditioner<block_size>>
|
||||
Preconditioner<block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
std::unique_ptr<Preconditioner<Scalar,block_size>>
|
||||
Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
|
||||
{
|
||||
switch (type) {
|
||||
case Type::BILU0:
|
||||
return std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity);
|
||||
return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case Type::CPR:
|
||||
return std::make_unique<CPR<block_size> >(opencl_ilu_parallel, verbosity);
|
||||
return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case Type::BISAI:
|
||||
return std::make_unique<BISAI<block_size> >(opencl_ilu_parallel, verbosity);
|
||||
return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
}
|
||||
|
||||
OPM_THROW(std::logic_error,
|
||||
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
|
||||
}
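// Editor's note (illustrative usage, not part of this patch): a caller obtains a
// preconditioner through the factory above and then wires up the OpenCL handles.
// The context and queue are assumed to come from the surrounding solver backend,
// and Type is the nested enum handled in the switch above.
std::unique_ptr<Preconditioner<double, 3>>
makeCprPreconditionerExample(std::shared_ptr<cl::Context>& context,
                             std::shared_ptr<cl::CommandQueue>& queue)
{
    using Prec = Preconditioner<double, 3>;
    auto prec = Prec::create(Prec::Type::CPR,
                             /*opencl_ilu_parallel=*/true,
                             /*verbosity=*/1);
    prec->setOpencl(context, queue);   // CPR/BISAI also forward this to their nested BILU0
    return prec;
}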
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool Preconditioner<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
return analyze_matrix(mat);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool Preconditioner<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
return create_preconditioner(mat);
|
||||
}
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(Type, bool, int); \
|
||||
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
|
||||
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
|
||||
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class Preconditioner<T,1>; \
|
||||
template class Preconditioner<T,2>; \
|
||||
template class Preconditioner<T,3>; \
|
||||
template class Preconditioner<T,4>; \
|
||||
template class Preconditioner<T,5>; \
|
||||
template class Preconditioner<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} //namespace Accelerator
|
||||
} //namespace Opm
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -24,17 +24,13 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
class BlockedMatrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
template <unsigned int block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class Preconditioner
|
||||
{
|
||||
|
||||
protected:
|
||||
int N = 0; // number of rows of the matrix
|
||||
int Nb = 0; // number of blockrows of the matrix
|
||||
@ -65,7 +61,8 @@ public:
|
||||
virtual ~Preconditioner() = default;
|
||||
|
||||
// nested Preconditioners might need to override this
|
||||
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
virtual void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
|
||||
@ -73,16 +70,17 @@ public:
|
||||
// analyze matrix, e.g. the sparsity pattern
|
||||
// probably only called once
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
|
||||
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
|
||||
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat) = 0;
|
||||
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat);
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
|
||||
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat) = 0;
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat);
|
||||
};
|
||||
|
||||
} //namespace Accelerator
|
||||
} //namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
@ -18,52 +18,71 @@
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
// define static variables and kernels
|
||||
int OpenclKernels::verbosity;
|
||||
cl::CommandQueue *OpenclKernels::queue;
|
||||
std::vector<double> OpenclKernels::tmp;
|
||||
bool OpenclKernels::initialized = false;
|
||||
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
|
||||
template<class Scalar> int OpenclKernels<Scalar>::verbosity;
|
||||
template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
|
||||
template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
|
||||
template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
|
||||
template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
|
||||
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
|
||||
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
|
||||
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::init(cl::Context *context,
|
||||
cl::CommandQueue *queue_,
|
||||
std::vector<cl::Device>& devices, int verbosity_)
|
||||
{
|
||||
if (initialized) {
|
||||
OpmLog::debug("Warning OpenclKernels is already initialized");
|
||||
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
// actually creating the kernels
|
||||
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
|
||||
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
|
||||
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
|
||||
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
|
||||
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
initialized = true;
|
||||
} // end get_opencl_kernels()
|
||||
|
||||
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_dot;
|
||||
tmp.resize(num_work_groups);
|
||||
|
||||
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
|
||||
|
||||
double gpu_sum = 0.0;
|
||||
Scalar gpu_sum = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_sum += tmp[i];
|
||||
}
|
||||
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
|
||||
return gpu_sum;
|
||||
}
|
||||
|
||||
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_norm;
|
||||
tmp.resize(num_work_groups);
|
||||
|
||||
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
|
||||
|
||||
double gpu_norm = 0.0;
|
||||
Scalar gpu_norm = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_norm += tmp[i];
|
||||
}
|
||||
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
return gpu_norm;
|
||||
}
|
||||
|
||||
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
const double omega, const double beta, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
const Scalar omega, const Scalar beta, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
const cl::Buffer& x, cl::Buffer& b, int Nb,
|
||||
unsigned int block_size, bool reset, bool add)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
const cl::Buffer& x, cl::Buffer& b, int Nb,
|
||||
unsigned int block_size, bool reset, bool add)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_spmv;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
if (add) {
|
||||
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
} else {
|
||||
if (reset) {
|
||||
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
}
|
||||
|
||||
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& x, const cl::Buffer& rhs,
|
||||
cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& x, const cl::Buffer& rhs,
|
||||
cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_residual;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_ilu_apply1;
|
||||
|
||||
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
y, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
y, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
event.wait();
|
||||
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_ilu_apply2;
|
||||
|
||||
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
invDiagVals, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
invDiagVals, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
event.wait();
|
||||
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
|
||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
|
||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 128;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int num_hwarps_per_group = work_group_size / 16;
|
||||
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
|
||||
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
|
||||
Timer t_ilu_decomp;
|
||||
|
||||
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
firstRow, lastRow, rowIndices,
|
||||
vals, cols, rows,
|
||||
invDiagVals, diagIndex, rowsThisColor,
|
||||
cl::Local(lmem_per_work_group));
|
||||
firstRow, lastRow, rowIndices,
|
||||
vals, cols, rows,
|
||||
invDiagVals, diagIndex, rowsThisColor,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
|
||||
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
|
||||
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int total_work_items = num_std_wells * work_group_size;
|
||||
const unsigned int lmem1 = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem2 = sizeof(double) * dim_wells;
|
||||
const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
|
||||
const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
|
||||
Timer t_apply_stdwells;
|
||||
|
||||
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
|
||||
Timer t_isaiL;
|
||||
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
|
||||
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
|
||||
Timer t_isaiU;
|
||||
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
|
||||
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
template class OpenclKernels<double>;
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -26,10 +26,7 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
|
||||
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
|
||||
|
||||
template<class Scalar>
|
||||
class OpenclKernels
|
||||
{
|
||||
private:
|
||||
static int verbosity;
|
||||
static cl::CommandQueue *queue;
|
||||
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static bool initialized;
|
||||
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
|
||||
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
|
||||
@ -117,12 +115,12 @@ public:
|
||||
|
||||
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
|
||||
|
||||
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
|
||||
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
|
||||
static void scale(cl::Buffer& in, const double a, int N);
|
||||
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
|
||||
static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
|
||||
static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
|
||||
static void scale(cl::Buffer& in, const Scalar a, int N);
|
||||
static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
|
||||
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
|
||||
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
|
||||
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
|
||||
@ -150,7 +148,40 @@ public:
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
#if CHOW_PATEL
|
||||
#define DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
|
||||
#else
|
||||
#define DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
|
||||
#endif
|
||||
|
||||
#define DECLARE_INSTANCE(T) \
|
||||
DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::axpy_str; \
|
||||
template<> const std::string OpenclKernels<T>::scale_str; \
|
||||
template<> const std::string OpenclKernels<T>::vmul_str; \
|
||||
template<> const std::string OpenclKernels<T>::dot_1_str; \
|
||||
template<> const std::string OpenclKernels<T>::norm_str; \
|
||||
template<> const std::string OpenclKernels<T>::custom_str; \
|
||||
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
|
||||
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
|
||||
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
|
||||
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
|
||||
template<> const std::string OpenclKernels<T>::residual_str; \
|
||||
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
|
||||
template<> const std::string OpenclKernels<T>::isaiL_str; \
|
||||
template<> const std::string OpenclKernels<T>::isaiU_str;
|
||||
|
||||
DECLARE_INSTANCE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
@ -37,41 +37,50 @@
|
||||
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
|
||||
#define COPY_ROW_BY_ROW 0
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, bool opencl_ilu_parallel_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_parallel(opencl_ilu_parallel_) {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclSolverBackend<Scalar,block_size>::
|
||||
openclSolverBackend(int verbosity_,
|
||||
int maxit_,
|
||||
Scalar tolerance_,
|
||||
unsigned int platformID_,
|
||||
unsigned int deviceID_,
|
||||
bool opencl_ilu_parallel_,
|
||||
std::string linsolver)
|
||||
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
bool use_cpr, use_isai;
|
||||
|
||||
if (linsolver.compare("ilu0") == 0) {
|
||||
if (linsolver == "ilu0") {
|
||||
use_cpr = false;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
|
||||
} else if (linsolver == "cpr_quasiimpes") {
|
||||
use_cpr = true;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("isai") == 0) {
|
||||
} else if (linsolver == "isai") {
|
||||
use_cpr = false;
|
||||
use_isai = true;
|
||||
} else if (linsolver.compare("cpr_trueimpes") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support --linerar-solver=cpr_trueimpes");
|
||||
} else if (linsolver == "cpr_trueimpes") {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support "
|
||||
"--linear-solver=cpr_trueimpes");
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
|
||||
}
|
||||
|
||||
using PreconditionerType = typename Preconditioner<block_size>::Type;
|
||||
using PreconditionerType = Preconditioner<Scalar,block_size>;
|
||||
if (use_cpr) {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, opencl_ilu_parallel, verbosity);
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
} else if (use_isai) {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, opencl_ilu_parallel, verbosity);
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::BISAI,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
} else {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, opencl_ilu_parallel, verbosity);
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::BILU0,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
@ -103,7 +112,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
out.clear();
|
||||
|
||||
if (platforms.size() <= platformID) {
|
||||
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
|
||||
OPM_THROW(std::logic_error, "Error: Invalid OpenCL platform ID selected");
|
||||
} else {
|
||||
std::string platform_info;
|
||||
out << "Chosen:\n";
|
||||
@ -119,7 +128,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
|
||||
|
||||
if (devices.empty()) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
|
||||
OPM_THROW(std::logic_error, "Error openclSolver is selected but "
|
||||
"no OpenCL devices are found");
|
||||
}
|
||||
out << "Found " << devices.size() << " OpenCL devices" << "\n";
|
||||
|
||||
@ -203,8 +213,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
context = std::make_shared<cl::Context>(devices[0]);
|
||||
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
|
||||
|
||||
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
|
||||
|
||||
OpenclKernels<Scalar>::init(context.get(), queue.get(), devices, verbosity);
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||||
@ -217,26 +226,33 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, bool opencl_ilu_parallel_) :
|
||||
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclSolverBackend<Scalar,block_size>::
|
||||
openclSolverBackend(int verbosity_, int maxit_,
|
||||
Scalar tolerance_, bool opencl_ilu_parallel_)
|
||||
: Base(verbosity_, maxit_, tolerance_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
|
||||
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||
{
|
||||
float it;
|
||||
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
Scalar norm, norm_0;
|
||||
|
||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||
|
||||
@ -246,15 +262,15 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
// set initial values
|
||||
events.resize(5);
|
||||
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
|
||||
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
|
||||
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[0]);
|
||||
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(Scalar) * N, nullptr, &events[1]);
|
||||
rho = 1.0;
|
||||
alpha = 1.0;
|
||||
omega = 1.0;
|
||||
|
||||
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
|
||||
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
|
||||
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
|
||||
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
|
||||
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(Scalar) * N, nullptr, &events[3]);
|
||||
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -263,7 +279,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
|
||||
}
|
||||
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
norm_0 = norm;
|
||||
|
||||
if (verbosity > 1) {
|
||||
@ -277,11 +293,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
for (it = 0.5; it < maxit; it += 0.5) {
|
||||
rhop = rho;
|
||||
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
|
||||
rho = OpenclKernels<Scalar>::dot(d_rw, d_r, d_tmp, N);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho / rhop) * (alpha / omega);
|
||||
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
OpenclKernels<Scalar>::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -298,7 +314,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
|
||||
// v = A * pw
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
@ -306,20 +322,20 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_pw, d_v);
|
||||
}
|
||||
if(verbosity >= 3) {
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_well.stop();
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
|
||||
tmp1 = OpenclKernels<Scalar>::dot(d_rw, d_v, d_tmp, N);
|
||||
alpha = rho / tmp1;
|
||||
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<Scalar>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels<Scalar>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
@ -343,8 +359,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
|
||||
// t = A * s
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
if(verbosity >= 3){
|
||||
OpenclKernels<Scalar>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
t_well.start();
|
||||
@ -352,7 +368,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t);
|
||||
static_cast<WellContributionsOCL<Scalar>&>(wellContribs).apply(d_s, d_t);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -360,12 +376,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
|
||||
tmp1 = OpenclKernels<Scalar>::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels<Scalar>::dot(d_t, d_t, d_tmp, N);
|
||||
omega = tmp1 / tmp2;
|
||||
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<Scalar>::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels<Scalar>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels<Scalar>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
@ -382,7 +398,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
}
|
||||
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.iterations = std::min(it, static_cast<float>(maxit));
|
||||
res.reduction = norm / norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / it));
|
||||
res.elapsed = t_total.stop();
|
||||
@ -390,7 +406,8 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
out << "=== converged: " << res.converged << ", conv_rate: "
|
||||
<< res.conv_rate << ", time: " << res.elapsed <<
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
@ -405,9 +422,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
@ -437,22 +456,21 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
|
||||
mat = matrix;
|
||||
jacMat = jacMatrix;
|
||||
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
|
||||
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz);
|
||||
d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * nnz);
|
||||
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
|
||||
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
|
||||
@ -467,8 +485,10 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
copy_system_to_gpu()
|
||||
{
|
||||
Timer t;
|
||||
events.resize(5);
|
||||
|
||||
@ -476,18 +496,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
|
||||
size_row * sizeof(Scalar) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, vals_contiguous.data(),
|
||||
nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
|
||||
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0,
|
||||
sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0,
|
||||
sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
|
||||
sizeof(Scalar) * N, h_b, nullptr, &events[3]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[4]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -504,8 +531,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
update_system_on_gpu()
|
||||
{
|
||||
Timer t;
|
||||
events.resize(3);
|
||||
|
||||
@ -513,16 +542,21 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum,
|
||||
size_row * sizeof(Scalar) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, vals_contiguous.data(),
|
||||
nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0,
|
||||
sizeof(Scalar) * nnz, mat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0,
|
||||
sizeof(Scalar) * N, h_b, nullptr, &events[1]);
|
||||
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(Scalar) * N, nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
@ -538,9 +572,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclSolverBackend<Scalar,block_size>::
|
||||
analyze_matrix()
|
||||
{
|
||||
Timer t;
|
||||
|
||||
bool success;
|
||||
@ -560,9 +595,10 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
return success;
|
||||
} // end analyze_matrix()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
update_system(Scalar* vals, Scalar* b)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
mat->nnzValues = vals;
|
||||
@ -575,9 +611,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
||||
}
|
||||
} // end update_system()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::create_preconditioner() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclSolverBackend<Scalar,block_size>::
|
||||
create_preconditioner()
|
||||
{
|
||||
Timer t;
|
||||
|
||||
bool result;
|
||||
@ -594,9 +631,10 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
|
||||
return result;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
// actually solve
|
||||
@ -604,7 +642,8 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
|
||||
gpu_pbicgstab(wellContribs, res);
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
|
||||
oss << "openclSolverBackend::solve_system error: " << error.what()
|
||||
<< "(" << error.err() << ")\n";
|
||||
oss << getErrorString(error.err());
|
||||
// rethrow exception
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
@ -618,17 +657,17 @@ void openclSolverBackend<block_size>::solve_system(WellContributions &wellContri
|
||||
out << "openclSolver::solve_system(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
get_result(Scalar* x)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
|
||||
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(Scalar) * N, x);
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
@ -637,13 +676,13 @@ void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
SolverStatus openclSolverBackend<Scalar,block_size>::
|
||||
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(matrix, jacMatrix);
|
||||
@ -668,21 +707,14 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class openclSolverBackend<T,1>; \
|
||||
template class openclSolverBackend<T,2>; \
|
||||
template class openclSolverBackend<T,3>; \
|
||||
template class openclSolverBackend<T,4>; \
|
||||
template class openclSolverBackend<T,5>; \
|
||||
template class openclSolverBackend<T,6>;
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template openclSolverBackend<n>::openclSolverBackend( \
|
||||
int, int, double, unsigned int, unsigned int, bool, std::string); \
|
||||
template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
|
||||
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -27,16 +27,13 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend : public BdaSolver<block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class openclSolverBackend : public BdaSolver<Scalar,block_size>
|
||||
{
|
||||
typedef BdaSolver<block_size> Base;
|
||||
using Base = BdaSolver<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -50,8 +47,8 @@ class openclSolverBackend : public BdaSolver<block_size>
|
||||
using Base::initialized;
|
||||
|
||||
private:
|
||||
double *h_b = nullptr; // b vector, on host
|
||||
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
Scalar* h_b = nullptr; // b vector, on host
|
||||
std::vector<Scalar> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
|
||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||
cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
|
||||
@ -63,12 +60,12 @@ private:
|
||||
|
||||
bool useJacMatrix = false;
|
||||
|
||||
std::unique_ptr<Preconditioner<block_size> > prec;
|
||||
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
|
||||
// can perform blocked ILU0 and AMG on pressure component
|
||||
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
|
||||
bool analysis_done = false;
|
||||
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
|
||||
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
|
||||
bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
@ -76,12 +73,13 @@ private:
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
|
||||
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
|
||||
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);
|
||||
|
||||
/// Copy linear system to GPU
|
||||
void copy_system_to_gpu();
|
||||
@ -89,7 +87,7 @@ private:
|
||||
/// Reassign pointers, in case the addresses of the Dune variables have changed
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector b, contains N values
|
||||
void update_system(double *vals, double *b);
|
||||
void update_system(Scalar* vals, Scalar* b);
|
||||
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
void update_system_on_gpu();
|
||||
@ -106,11 +104,11 @@ private:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// could be empty
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(WellContributions &wellContribs, BdaResult &res);
|
||||
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);
|
||||
|
||||
public:
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::shared_ptr<cl::Context> context{};
|
||||
std::shared_ptr<cl::CommandQueue> queue{};
|
||||
|
||||
/// Construct a openclSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of openclSolver
|
||||
@ -121,11 +119,13 @@ public:
|
||||
/// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
|
||||
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
|
||||
/// only ilu0, cpr_quasiimpes and isai are supported
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
|
||||
bool opencl_ilu_parallel, std::string linsolver);
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance,
|
||||
unsigned int platformID, unsigned int deviceID,
|
||||
bool opencl_ilu_parallel, std::string linsolver);
|
||||
|
||||
/// For the CPR coarse solver
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit,
|
||||
Scalar tolerance, bool opencl_ilu_parallel);
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] matrix matrix A
|
||||
@ -134,8 +134,11 @@ public:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) override;
|
||||
|
||||
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
|
||||
/// Data is already on the GPU
|
||||
@ -143,19 +146,16 @@ public:
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
void get_result(Scalar* x) override;
|
||||
|
||||
/// Set OpenCL objects
|
||||
/// This class either creates them based on platformID and deviceID or receives them through this function
|
||||
/// \param[in] context the opencl context to be used
|
||||
/// \param[in] queue the opencl queue to be used
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue);
|
||||
}; // end class openclSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -25,93 +25,122 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Opm {
|
||||
|
||||
using Accelerator::OpenclKernels;
|
||||
|
||||
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::
|
||||
setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_)
|
||||
{
|
||||
this->context = context_;
|
||||
this->queue = queue_;
|
||||
}
|
||||
|
||||
|
||||
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
|
||||
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
|
||||
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
|
||||
{
|
||||
OpenclKernels<Scalar>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
|
||||
*d_Ccols_ocl, *d_Bcols_ocl,
|
||||
d_x, d_y, this->dim, this->dim_wells,
|
||||
*d_val_pointers_ocl, this->num_std_wells);
|
||||
}
|
||||
|
||||
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::apply_mswells(cl::Buffer d_x, cl::Buffer d_y)
|
||||
{
|
||||
if (h_x.empty()) {
|
||||
h_x.resize(N);
|
||||
h_y.resize(N);
|
||||
h_x.resize(this->N);
|
||||
h_y.resize(this->N);
|
||||
}
|
||||
|
||||
events.resize(2);
|
||||
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
|
||||
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
|
||||
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(Scalar) * this->N,
|
||||
h_x.data(), nullptr, &events[0]);
|
||||
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
|
||||
h_y.data(), nullptr, &events[1]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for (auto& well : multisegments) {
|
||||
for (auto& well : this->multisegments) {
|
||||
well->apply(h_x.data(), h_y.data());
|
||||
}
|
||||
|
||||
// copy vector y from CPU to GPU
|
||||
events.resize(1);
|
||||
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(Scalar) * this->N,
|
||||
h_y.data(), nullptr, &events[0]);
|
||||
events[0].wait();
|
||||
events.clear();
|
||||
}
|
||||
|
||||
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y){
|
||||
if(num_std_wells > 0){
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::apply(cl::Buffer d_x, cl::Buffer d_y)
|
||||
{
|
||||
if (this->num_std_wells > 0){
|
||||
apply_stdwells(d_x, d_y);
|
||||
}
|
||||
|
||||
if(num_ms_wells > 0){
|
||||
if (this->num_ms_wells > 0) {
|
||||
apply_mswells(d_x, d_y);
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributionsOCL::APIaddMatrix(MatrixType type,
|
||||
int* colIndices,
|
||||
double* values,
|
||||
unsigned int val_size)
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::
|
||||
APIaddMatrix(MatrixType type,
|
||||
int* colIndices,
|
||||
Scalar* values,
|
||||
unsigned int val_size)
|
||||
{
|
||||
if (!allocated) {
|
||||
if (!this->allocated) {
|
||||
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case MatrixType::C:
|
||||
events.resize(2);
|
||||
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE,
|
||||
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
|
||||
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
|
||||
values, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE,
|
||||
sizeof(int) * this->num_blocks_so_far,
|
||||
sizeof(int) * val_size, colIndices, nullptr, &events[1]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
break;
|
||||
|
||||
case MatrixType::D:
|
||||
events.resize(1);
|
||||
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE,
|
||||
sizeof(Scalar) * this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
|
||||
sizeof(Scalar) * this->dim_wells * this->dim_wells,
|
||||
values, nullptr, &events[0]);
|
||||
events[0].wait();
|
||||
events.clear();
|
||||
break;
|
||||
|
||||
case MatrixType::B:
|
||||
events.resize(2);
|
||||
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE,
|
||||
sizeof(Scalar) * this->num_blocks_so_far * this->dim * this->dim_wells,
|
||||
sizeof(Scalar) * val_size * this->dim * this->dim_wells,
|
||||
values, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE,
|
||||
sizeof(int) * this->num_blocks_so_far, sizeof(int) * val_size,
|
||||
colIndices, nullptr, &events[1]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
|
||||
if (num_std_wells_so_far == num_std_wells - 1) {
|
||||
val_pointers[num_std_wells] = num_blocks;
|
||||
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
|
||||
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
|
||||
this->val_pointers[this->num_std_wells] = this->num_blocks;
|
||||
events.resize(1);
|
||||
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0,
|
||||
sizeof(unsigned int) * (this->num_std_wells + 1),
|
||||
this->val_pointers.data(), nullptr, &events[0]);
|
||||
events[0].wait();
|
||||
events.clear();
|
||||
}
|
||||
@ -122,14 +151,21 @@ void WellContributionsOCL::APIaddMatrix(MatrixType type,
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributionsOCL::APIalloc()
|
||||
template<class Scalar>
|
||||
void WellContributionsOCL<Scalar>::APIalloc()
|
||||
{
|
||||
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
|
||||
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
|
||||
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
|
||||
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
|
||||
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
|
||||
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
|
||||
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * this->num_std_wells * this->dim_wells * this->dim_wells);
|
||||
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(Scalar) * this->num_blocks * this->dim * this->dim_wells);
|
||||
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
|
||||
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * this->num_blocks);
|
||||
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE,
|
||||
sizeof(unsigned int) * (this->num_std_wells + 1));
|
||||
}
|
||||
|
||||
} //namespace Opm
|
||||
template class WellContributionsOCL<double>;
|
||||
|
||||
} // namespace Opm
|
||||
|
@ -29,10 +29,10 @@
|
||||
#include <vector>
|
||||
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Opm {
|
||||
|
||||
class WellContributionsOCL : public WellContributions
|
||||
template<class Scalar>
|
||||
class WellContributionsOCL : public WellContributions<Scalar>
|
||||
{
|
||||
public:
|
||||
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
|
||||
@ -45,7 +45,10 @@ protected:
|
||||
/// Allocate memory for the StandardWells
|
||||
void APIalloc() override;
|
||||
|
||||
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
|
||||
using MatrixType = typename WellContributions<Scalar>::MatrixType;
|
||||
|
||||
void APIaddMatrix(MatrixType type, int* colIndices,
|
||||
Scalar* values, unsigned int val_size) override;
|
||||
|
||||
cl::Context* context;
|
||||
cl::CommandQueue* queue;
|
||||
@ -55,10 +58,10 @@ protected:
|
||||
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
|
||||
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
|
||||
|
||||
std::vector<double> h_x;
|
||||
std::vector<double> h_y;
|
||||
std::vector<Scalar> h_x;
|
||||
std::vector<Scalar> h_y;
|
||||
};
|
||||
|
||||
} //namespace Opm
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
@ -47,27 +47,28 @@
|
||||
#undef HIP_HAVE_CUDA_DEFINED
|
||||
#endif
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
rocalutionSolverBackend<block_size>::rocalutionSolverBackend(int verbosity_, int maxit_, double tolerance_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
rocalutionSolverBackend<Scalar,block_size>::
|
||||
rocalutionSolverBackend(int verbosity_, int maxit_, Scalar tolerance_)
|
||||
: Base(verbosity_, maxit_, tolerance_)
|
||||
{
|
||||
rocalution::init_rocalution();
|
||||
rocalution::info_rocalution();
|
||||
roc_solver = std::make_unique<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
|
||||
roc_prec = std::make_unique<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> >();
|
||||
using BCGS = rocalution::BiCGStab<Mat,Vec,Scalar>;
|
||||
roc_solver = std::make_unique<BCGS>();
|
||||
using ILU = rocalution::ILU<Mat,Vec,Scalar>;
|
||||
roc_prec = std::make_unique<ILU>();
|
||||
roc_solver->Verbose(0);
|
||||
roc_solver->Init(/*abs_tol=*/1e-15, tolerance, /*divergence_tol=*/1e3, maxit);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
rocalutionSolverBackend<Scalar,block_size>::~rocalutionSolverBackend()
|
||||
{
|
||||
// normally, these rocalution variables are destroyed after the destructor automatically,
|
||||
// but sometimes it segfaults, both with test_rocalutionSolver and with an actual case
|
||||
// release both variables here to prevent that segfault
|
||||
@ -76,9 +77,10 @@ rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
|
||||
rocalution::stop_rocalution();
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocalutionSolverBackend<Scalar,block_size>::
|
||||
initialize(BlockedMatrix<Scalar>* matrix)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
@ -94,15 +96,16 @@ void rocalutionSolverBackend<block_size>::initialize(BlockedMatrix *matrix) {
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocalutionSolverBackend<Scalar,block_size>::
|
||||
convert_matrix(BlockedMatrix<Scalar>* matrix)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
for(int i = 0; i < Nb+1; ++i){
|
||||
for (int i = 0; i < Nb+1; ++i) {
|
||||
tmp_rowpointers[i] = matrix->rowPointers[i];
|
||||
}
|
||||
for(int i = 0; i < nnzb; ++i){
|
||||
for (int i = 0; i < nnzb; ++i) {
|
||||
tmp_colindices[i] = matrix->colIndices[i];
|
||||
}
|
||||
|
||||
@ -112,7 +115,7 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
|
||||
// BCSR_IND_BASE == 0: rocalution expects column-major
|
||||
// BCSR_IND_BASE == 1: rocalution expects row-major
|
||||
if (BCSR_IND_BASE == 0) {
|
||||
for(int i = 0; i < nnzb; ++i){
|
||||
for (int i = 0; i < nnzb; ++i) {
|
||||
tmp_nnzvalues[i * block_size * block_size + 0] = matrix->nnzValues[i * block_size * block_size + 0];
|
||||
tmp_nnzvalues[i * block_size * block_size + 1] = matrix->nnzValues[i * block_size * block_size + 3];
|
||||
tmp_nnzvalues[i * block_size * block_size + 2] = matrix->nnzValues[i * block_size * block_size + 6];
|
||||
@ -131,11 +134,12 @@ void rocalutionSolverBackend<block_size>::convert_matrix(BlockedMatrix *matrix)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
template <unsigned int block_size>
|
||||
void rocalutionSolverBackend<block_size>::get_result(double *x) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocalutionSolverBackend<Scalar,block_size>::
|
||||
get_result(Scalar* x)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
std::copy(h_x.begin(), h_x.end(), x);
|
||||
@ -147,13 +151,13 @@ void rocalutionSolverBackend<block_size>::get_result(double *x) {
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
[[maybe_unused]] WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
SolverStatus rocalutionSolverBackend<Scalar,block_size>::
|
||||
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
[[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(matrix.get());
|
||||
@ -161,21 +165,20 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
|
||||
|
||||
tmp_rowpointers = new int[Nb+1];
|
||||
tmp_colindices = new int[nnzb];
|
||||
tmp_nnzvalues = new double[nnzb*block_size*block_size];
|
||||
tmp_nnzvalues = new Scalar[nnzb*block_size*block_size];
|
||||
|
||||
convert_matrix(matrix.get());
|
||||
|
||||
rocalution::LocalVector<double> roc_x;
|
||||
rocalution::LocalVector<double> roc_rhs;
|
||||
rocalution::LocalMatrix<double> roc_mat;
|
||||
Vec roc_x;
|
||||
Vec roc_rhs;
|
||||
Mat roc_mat;
|
||||
|
||||
// this also transfers ownership to the allocated memory to rocalution
|
||||
// and sets the tmp_* pointers to nullptr
|
||||
roc_mat.SetDataPtrBCSR(
|
||||
&tmp_rowpointers,
|
||||
&tmp_colindices,
|
||||
&tmp_nnzvalues,
|
||||
"matrix A", nnzb, Nb, Nb, block_size);
|
||||
roc_mat.SetDataPtrBCSR(&tmp_rowpointers,
|
||||
&tmp_colindices,
|
||||
&tmp_nnzvalues,
|
||||
"matrix A", nnzb, Nb, Nb, block_size);
|
||||
|
||||
roc_mat.MoveToAccelerator();
|
||||
roc_x.MoveToAccelerator();
|
||||
@ -196,7 +199,7 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
|
||||
// so it just calls ILU::Build() everytime
|
||||
roc_solver->ReBuildNumeric();
|
||||
|
||||
double norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
|
||||
Scalar norm_0 = roc_rhs.Norm(); // since the initial guess is a vector with 0s, initial error is norm(b)
|
||||
|
||||
// actually solve
|
||||
Dune::Timer t_solve;
|
||||
@ -215,7 +218,6 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction, 1.0 / res.iterations));
|
||||
res.converged = (roc_solver->GetSolverStatus() == 2);
|
||||
|
||||
|
||||
// copy solution vector to host vector
|
||||
// if roc_x could be reused, this should be removed here
|
||||
// and roc_x should be directly copied into x in get_result()
|
||||
@ -224,26 +226,25 @@ SolverStatus rocalutionSolverBackend<block_size>::solve_system(std::shared_ptr<B
|
||||
|
||||
if (verbosity >= 1) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / res.iterations << ", iterations: " << res.iterations;
|
||||
out << "=== converged: " << res.converged
|
||||
<< ", conv_rate: " << res.conv_rate
|
||||
<< ", time: " << res.elapsed <<
|
||||
", time per iteration: " << res.elapsed / res.iterations
|
||||
<< ", iterations: " << res.iterations;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class rocalutionSolverBackend<T,1>; \
|
||||
template class rocalutionSolverBackend<T,2>; \
|
||||
template class rocalutionSolverBackend<T,3>; \
|
||||
template class rocalutionSolverBackend<T,4>; \
|
||||
template class rocalutionSolverBackend<T,5>; \
|
||||
template class rocalutionSolverBackend<T,6>;
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template rocalutionSolverBackend<n>::rocalutionSolverBackend(int, int, double);
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -31,17 +31,14 @@ template<class Scalar> class LocalMatrix;
|
||||
template<class Scalar> class LocalVector;
|
||||
}
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This class implements a rocalution based linear solver solver on GPU
|
||||
/// It uses ilu0-bicgstab
|
||||
template <unsigned int block_size>
|
||||
class rocalutionSolverBackend : public BdaSolver<block_size>
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class rocalutionSolverBackend : public BdaSolver<Scalar,block_size>
|
||||
{
|
||||
typedef BdaSolver<block_size> Base;
|
||||
using Base = BdaSolver<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -55,31 +52,34 @@ class rocalutionSolverBackend : public BdaSolver<block_size>
|
||||
using Base::initialized;
|
||||
|
||||
private:
|
||||
std::vector<double> h_x; // store solution vector on host
|
||||
std::vector<Scalar> h_x; // store solution vector on host
|
||||
int *tmp_rowpointers; // store matrix on host, this pointer is given to and freed by rocalution
|
||||
int *tmp_colindices; // store matrix on host, this pointer is given to and freed by rocalution
|
||||
double *tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
|
||||
Scalar* tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
|
||||
|
||||
std::unique_ptr<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_prec;
|
||||
std::unique_ptr<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_solver;
|
||||
using Mat = rocalution::LocalMatrix<Scalar>;
|
||||
using Vec = rocalution::LocalVector<Scalar>;
|
||||
|
||||
std::unique_ptr<rocalution::ILU<Mat,Vec,Scalar>> roc_prec;
|
||||
std::unique_ptr<rocalution::BiCGStab<Mat,Vec,Scalar>> roc_solver;
|
||||
|
||||
/// Initialize sizes and allocate memory
|
||||
/// \param[in] matrix matrix A
|
||||
void initialize(BlockedMatrix *matrix);
|
||||
void initialize(BlockedMatrix<Scalar>* matrix);
|
||||
|
||||
/// Convert matrix to rocalution format
|
||||
/// copy matrix to raw pointers, which are given to and freed by rocalution
|
||||
/// \param[in] matrix matrix A
|
||||
void convert_matrix(BlockedMatrix *matrix);
|
||||
void convert_matrix(BlockedMatrix<Scalar>* matrix);
|
||||
|
||||
public:
|
||||
|
||||
/// Construct a rocalutionSolver
|
||||
/// also initialize rocalution library and rocalution variables
|
||||
/// \param[in] linear_solver_verbosity verbosity of rocalutionSolver
|
||||
/// \param[in] maxit maximum number of iterations for rocalutionSolver
|
||||
/// \param[in] tolerance required relative tolerance for rocalutionSolver
|
||||
rocalutionSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
|
||||
rocalutionSolverBackend(int linear_solver_verbosity,
|
||||
int maxit, Scalar tolerance);
|
||||
|
||||
/// Destroy a rocalutionSolver, and free memory
|
||||
~rocalutionSolverBackend();
|
||||
@ -91,17 +91,19 @@ public:
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
Scalar* b,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) override;
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
void get_result(Scalar* x) override;
|
||||
|
||||
}; // end class rocalutionSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -93,20 +93,20 @@
|
||||
extern std::shared_ptr<std::thread> copyThread;
|
||||
#endif //HAVE_OPENMP
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
rocsparseSolverBackend<Scalar,block_size>::
|
||||
rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
|
||||
unsigned int platformID_, unsigned int deviceID_)
|
||||
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
|
||||
{
|
||||
int numDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (static_cast<int>(deviceID) >= numDevices) {
|
||||
OPM_THROW(std::runtime_error, "Error chosen too high HIP device ID");
|
||||
OPM_THROW(std::runtime_error, "Invalid HIP device ID");
|
||||
}
|
||||
HIP_CHECK(hipSetDevice(deviceID));
|
||||
|
||||
@ -126,45 +126,45 @@ rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int m
|
||||
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
rocsparseSolverBackend<block_size>::~rocsparseSolverBackend() {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
rocsparseSolverBackend<Scalar,block_size>::~rocsparseSolverBackend()
|
||||
{
|
||||
hipError_t hipstatus = hipStreamSynchronize(stream);
|
||||
if(hipstatus != hipSuccess){
|
||||
if (hipstatus != hipSuccess) {
|
||||
OpmLog::error("Could not synchronize with hipStream");
|
||||
}
|
||||
hipstatus = hipStreamDestroy(stream);
|
||||
if(hipstatus != hipSuccess){
|
||||
if (hipstatus != hipSuccess) {
|
||||
OpmLog::error("Could not destroy hipStream");
|
||||
}
|
||||
rocsparse_status status1 = rocsparse_destroy_handle(handle);
|
||||
if(status1 != rocsparse_status_success){
|
||||
if (status1 != rocsparse_status_success) {
|
||||
OpmLog::error("Could not destroy rocsparse handle");
|
||||
}
|
||||
rocblas_status status2 = rocblas_destroy_handle(blas_handle);
|
||||
if(status2 != rocblas_status_success){
|
||||
if (status2 != rocblas_status_success) {
|
||||
OpmLog::error("Could not destroy rocblas handle");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellContributions& wellContribs,
|
||||
BdaResult& res)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocsparseSolverBackend<Scalar,block_size>::
|
||||
gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
float it = 0.5;
|
||||
double rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
double zero = 0.0;
|
||||
double one = 1.0;
|
||||
double mone = -1.0;
|
||||
Scalar rho, rhop, beta, alpha, nalpha, omega, nomega, tmp1, tmp2;
|
||||
Scalar norm, norm_0;
|
||||
Scalar zero = 0.0;
|
||||
Scalar one = 1.0;
|
||||
Scalar mone = -1.0;
|
||||
|
||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||
|
||||
// set stream here, the WellContributions object is destroyed every linear solve
|
||||
// the number of wells can change every linear solve
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).setStream(stream);
|
||||
}
|
||||
|
||||
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
|
||||
@ -253,8 +253,8 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
|
||||
}
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_pw, d_v);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
@ -312,15 +312,15 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
d_s, &zero, d_t));
|
||||
#endif
|
||||
if(verbosity >= 3){
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
t_spmv.stop();
|
||||
t_well.start();
|
||||
}
|
||||
|
||||
// apply wellContributions
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
|
||||
if (wellContribs.getNumWells() > 0) {
|
||||
static_cast<WellContributionsRocsparse<Scalar>&>(wellContribs).apply(d_s, d_t);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
@ -360,8 +360,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
|
||||
|
||||
if (verbosity >= 1) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
out << "=== converged: " << res.converged
|
||||
<< ", conv_rate: " << res.conv_rate
|
||||
<< ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed / it
|
||||
<< ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
@ -375,9 +378,11 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocsparseSolverBackend<Scalar,block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
@ -390,12 +395,14 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
|
||||
out << "Initializing GPU, matrix size: "
|
||||
<< Nb << " blockrows, nnzb: " << nnzb << "\n";
|
||||
if (useJacMatrix) {
|
||||
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
|
||||
}
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
|
||||
out << "Maxit: " << maxit
|
||||
<< std::scientific << ", tolerance: " << tolerance << "\n"
|
||||
<< "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
@ -403,26 +410,26 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
|
||||
mat = matrix;
|
||||
jacMat = jacMatrix;
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_p, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_pw, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_s, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_v, sizeof(Scalar) * N));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_Arows, sizeof(rocsparse_int) * (Nb + 1)));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Acols, sizeof(rocsparse_int) * nnzb));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(double) * nnz));
|
||||
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(double) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Avals, sizeof(Scalar) * nnz));
|
||||
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
|
||||
|
||||
if (useJacMatrix) {
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
|
||||
} else { // preconditioner matrix is same
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(double) * nnzbs_prec * block_size * block_size));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
|
||||
d_Mcols = d_Acols;
|
||||
d_Mrows = d_Arows;
|
||||
}
|
||||
@ -430,26 +437,43 @@ void rocsparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatri
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocsparseSolverBackend<Scalar,block_size>::
|
||||
copy_system_to_gpu(Scalar *b)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices, sizeof(rocsparse_int) * nnzb, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Arows, mat->rowPointers,
|
||||
sizeof(rocsparse_int) * (Nb + 1),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Acols, mat->colIndices,
|
||||
sizeof(rocsparse_int) * nnzb,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues,
|
||||
sizeof(Scalar) * nnz,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
if (useJacMatrix) {
|
||||
#if HAVE_OPENMP
|
||||
if(omp_get_max_threads() > 1)
|
||||
copyThread->join();
|
||||
if (omp_get_max_threads() > 1) {
|
||||
copyThread->join();
|
||||
}
|
||||
#endif
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices, sizeof(rocsparse_int) * nnzbs_prec, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
|
||||
sizeof(rocsparse_int) * (Nb + 1),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
|
||||
sizeof(rocsparse_int) * nnzbs_prec,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
|
||||
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
|
||||
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
@ -459,29 +483,36 @@ void rocsparseSolverBackend<block_size>::copy_system_to_gpu(double *b) {
|
||||
std::ostringstream out;
|
||||
out << "-----rocsparseSolver::copy_system_to_gpu(): " << t.elapsed() << " s\n";
|
||||
out << "---rocsparseSolver::cum copy: " << c_copy << " s";
|
||||
OpmLog::info(out.str());
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
template <unsigned int block_size>
|
||||
void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void rocsparseSolverBackend<Scalar,block_size>::
|
||||
update_system_on_gpu(Scalar* b)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(double) * nnz, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemsetAsync(d_x, 0, sizeof(double) * N, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, sizeof(double) * N, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Avals, mat->nnzValues, sizeof(Scalar) * nnz,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, N* sizeof(Scalar),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
if (useJacMatrix) {
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() > 1)
|
||||
copyThread->join();
|
||||
if (omp_get_max_threads() > 1) {
|
||||
copyThread->join();
|
||||
}
|
||||
#endif
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues, sizeof(double) * nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
|
||||
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(double) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
|
||||
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
@ -493,8 +524,10 @@ void rocsparseSolverBackend<block_size>::update_system_on_gpu(double *b) {
|
||||
}
|
||||
} // end update_system_on_gpu()

template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::analyze_matrix() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
analyze_matrix()
{
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
Timer t;

@ -523,7 +556,8 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));

d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
d_bufferSize = std::max(d_bufferSize_M,
std::max(d_bufferSize_L, d_bufferSize_U));

HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));

@ -571,9 +605,10 @@ bool rocsparseSolverBackend<block_size>::analyze_matrix() {
return true;
} // end analyze_matrix()

template <unsigned int block_size>
bool rocsparseSolverBackend<block_size>::create_preconditioner() {
template<class Scalar, unsigned int block_size>
bool rocsparseSolverBackend<Scalar,block_size>::
create_preconditioner()
{
Timer t;

bool result = true;
@ -598,9 +633,10 @@ bool rocsparseSolverBackend<block_size>::create_preconditioner() {
return result;
} // end create_preconditioner()

template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res)
{
Timer t;

// actually solve
@ -612,17 +648,18 @@ void rocsparseSolverBackend<block_size>::solve_system(WellContributions &wellCon
out << "rocsparseSolver::solve_system(): " << t.stop() << " s";
OpmLog::info(out.str());
}

} // end solve_system()

// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void rocsparseSolverBackend<block_size>::get_result(double *x) {
template<class Scalar, unsigned int block_size>
void rocsparseSolverBackend<Scalar,block_size>::
get_result(Scalar* x)
{
Timer t;

HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(x, d_x, sizeof(Scalar) * N,
hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream)); // always wait, caller might want to use x immediately

if (verbosity >= 3) {
@ -632,13 +669,13 @@ void rocsparseSolverBackend<block_size>::get_result(double *x) {
}
} // end get_result()

template <unsigned int block_size>
SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
template<class Scalar, unsigned int block_size>
SolverStatus rocsparseSolverBackend<Scalar,block_size>::
solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res)
{
if (initialized == false) {
initialize(matrix, jacMatrix);
@ -662,19 +699,14 @@ SolverStatus rocsparseSolverBackend<block_size>::solve_system(std::shared_ptr<Bl
return SolverStatus::BDA_SOLVER_SUCCESS;
}

#define INSTANTIATE_TYPE(T) \
template class rocsparseSolverBackend<T,1>; \
template class rocsparseSolverBackend<T,2>; \
template class rocsparseSolverBackend<T,3>; \
template class rocsparseSolverBackend<T,4>; \
template class rocsparseSolverBackend<T,5>; \
template class rocsparseSolverBackend<T,6>;

#define INSTANTIATE_BDA_FUNCTIONS(n) \
template rocsparseSolverBackend<n>::rocsparseSolverBackend( \
int, int, double, unsigned int, unsigned int);
INSTANTIATE_TYPE(double)

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);

#undef INSTANTIATE_BDA_FUNCTIONS

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator
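
The INSTANTIATE_TYPE macro above batches the explicit instantiations of the templated backend for block sizes 1 through 6 for a given scalar type. As a hedged illustration of what this layout enables (not something this commit does), adding another precision would be a single extra line, provided the backend's rocSPARSE/HIP calls had single-precision counterparts:

// Illustrative only; the commit instantiates double.
INSTANTIATE_TYPE(double)
// INSTANTIATE_TYPE(float)   // hypothetical; would require float-capable versions of the rocsparse_dbsrsv_* calls used above
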
@ -31,16 +31,13 @@

#include <hip/hip_version.h>

namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {

/// This class implements a rocsparse-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<block_size>
template<class Scalar, unsigned int block_size>
class rocsparseSolverBackend : public BdaSolver<Scalar,block_size>
{
typedef BdaSolver<block_size> Base;
using Base = BdaSolver<Scalar,block_size>;

using Base::N;
using Base::Nb;
@ -54,14 +51,13 @@ class rocsparseSolverBackend : public BdaSolver<block_size>
using Base::initialized;

private:

double c_copy = 0.0; // cummulative timer measuring the total time it takes to transfer the data to the GPU

bool useJacMatrix = false;

bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M

rocsparse_direction dir = rocsparse_direction_row;
@ -77,31 +73,31 @@ private:

rocsparse_int *d_Arows, *d_Mrows;
rocsparse_int *d_Acols, *d_Mcols;
double *d_Avals, *d_Mvals;
double *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
double *d_pw, *d_s, *d_t, *d_v;
Scalar *d_Avals, *d_Mvals;
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
Scalar *d_pw, *d_s, *d_t, *d_v;
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
int ver;
char rev[64];

/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res);

/// Initialize GPU and allocate memory
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
void initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix);

/// Copy linear system to GPU
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *b);
void copy_system_to_gpu(Scalar* b);

/// Update linear system to GPU
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *b);
void update_system_on_gpu(Scalar* b);

/// Analyze sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -114,16 +110,20 @@ private:
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res);
void solve_system(WellContributions<Scalar>& wellContribs, BdaResult& res);

public:
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver
/// \param[in] maxit maximum number of iterations for openclSolver
/// \param[in] tolerance required relative tolerance for openclSolver
/// Construct a rocsparseSolver
/// \param[in] linear_solver_verbosity verbosity of rocsparseSolver
/// \param[in] maxit maximum number of iterations for rocsparseSolver
/// \param[in] tolerance required relative tolerance for rocsparseSolver
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
rocsparseSolverBackend(int linear_solver_verbosity,
int maxit,
Scalar tolerance,
unsigned int platformID,
unsigned int deviceID);

/// For the CPR coarse solver
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
@ -138,8 +138,11 @@ public:
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
Scalar* b,
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
WellContributions<Scalar>& wellContribs,
BdaResult& res) override;

/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@ -147,13 +150,10 @@ public:

/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
void get_result(Scalar* x) override;

}; // end class rocsparseSolverBackend

} // namespace Accelerator
} // namespace Opm
} // namespace Opm::Accelerator

#endif

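As a usage sketch only (not part of the commit): constructing the templated backend directly now fixes both the scalar type and the block size at compile time. The block size 3 and all parameter values below are hypothetical, and the project include is elided:

// Hedged sketch, not project code; header include elided.
using Backend = Opm::Accelerator::rocsparseSolverBackend<double, 3>;
Backend solver(/*linear_solver_verbosity=*/1,
               /*maxit=*/200,
               /*tolerance=*/1e-2,
               /*platformID=*/0,
               /*deviceID=*/0);
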
@ -56,17 +56,17 @@ namespace Opm

#ifdef __HIP__
/// HIP kernel to apply the standard wellcontributions
__global__ void stdwell_apply(
const double *Cnnzs,
const double *Dnnzs,
const double *Bnnzs,
const unsigned *Ccols,
const unsigned *Bcols,
const double *x,
double *y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
template<class Scalar>
__global__ void stdwell_apply(const Scalar* Cnnzs,
const Scalar* Dnnzs,
const Scalar* Bnnzs,
const unsigned* Ccols,
const unsigned* Bcols,
const Scalar* x,
Scalar* y,
const unsigned dim,
const unsigned dim_wells,
const unsigned *val_pointers)
{
unsigned wgId = blockIdx.x;
unsigned wiId = threadIdx.x;
@ -76,16 +76,16 @@ __global__ void stdwell_apply(
unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
unsigned c = wiId % dim;
unsigned r = (wiId/dim) % dim_wells;
double temp;
Scalar temp;

extern __shared__ double localSum[];
double *z1 = localSum + gridDim.x;
double *z2 = z1 + dim_wells;
extern __shared__ Scalar localSum[];
Scalar* z1 = localSum + gridDim.x;
Scalar* z2 = z1 + dim_wells;

localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
if (wiId < numActiveWorkItems) {
unsigned b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
while (b < valSize + val_pointers[wgId]) {
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
@ -99,14 +99,14 @@ __global__ void stdwell_apply(
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
if (wiId < valsPerBlock){
for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}

if(c == 0 && wiId < valsPerBlock){
for(unsigned i = dim - 1; i > 0; --i){
if (c == 0 && wiId < valsPerBlock){
for (unsigned i = dim - 1; i > 0; --i) {
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
@ -117,7 +117,7 @@ __global__ void stdwell_apply(

if(wiId < dim_wells){
temp = 0.0;
for(unsigned i = 0; i < dim_wells; ++i){
for (unsigned i = 0; i < dim_wells; ++i) {
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
@ -125,10 +125,10 @@ __global__ void stdwell_apply(

__syncthreads();

if(wiId < dim*valSize){
if (wiId < dim*valSize){
temp = 0.0;
unsigned bb = wiId/dim + val_pointers[wgId];
for (unsigned j = 0; j < dim_wells; ++j){
for (unsigned j = 0; j < dim_wells; ++j) {
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}

@ -138,17 +138,26 @@ __global__ void stdwell_apply(
}
#endif

void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
[[maybe_unused]] double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_stdwells([[maybe_unused]] Scalar* d_x,
[[maybe_unused]] Scalar* d_y)
{
#ifdef __HIP__
unsigned gridDim = num_std_wells;
unsigned blockDim = 64;
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(Scalar); // shared memory for localSum, z1 and z2
// dim3(N) will create a vector {N, 1, 1}
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
d_x, d_y, dim, dim_wells, d_val_pointers_hip
stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(d_Cnnzs_hip,
d_Dnnzs_hip,
d_Bnnzs_hip,
d_Ccols_hip,
d_Bcols_hip,
d_x,
d_y,
dim,
dim_wells,
d_val_pointers_hip
);
HIP_CHECK(hipStreamSynchronize(stream));
#else
@ -156,67 +165,89 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
#endif
}

void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply_mswells(Scalar* d_x, Scalar* d_y)
{
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
h_x.resize(this->N);
h_y.resize(this->N);
}

HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(Scalar) * this->N, hipMemcpyDeviceToHost, stream));
HIP_CHECK(hipStreamSynchronize(stream));

// actually apply MultisegmentWells
for (auto& well : multisegments) {
for (auto& well : this->multisegments) {
well->apply(h_x.data(), h_y.data());
}

// copy vector y from CPU to GPU
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipStreamSynchronize(stream));
}

void WellContributionsRocsparse::apply(double *d_x, double *d_y){
if(num_std_wells > 0){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
apply(Scalar* d_x, Scalar* d_y)
{
if (this->num_std_wells > 0) {
apply_stdwells(d_x, d_y);
}

if(num_ms_wells > 0){
if (this->num_ms_wells > 0) {
apply_mswells(d_x, d_y);
}
}

void WellContributionsRocsparse::setStream(hipStream_t stream_){
template<class Scalar>
void WellContributionsRocsparse<Scalar>::setStream(hipStream_t stream_)
{
stream = stream_;
}

void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
template<class Scalar>
void WellContributionsRocsparse<Scalar>::
APIaddMatrix(MatrixType type,
int* colIndices,
Scalar* values,
unsigned int val_size)
{
if (!allocated) {
if (!this->allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}

switch (type) {
case MatrixType::C:
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Cnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Ccols_hip) * val_size,
hipMemcpyHostToDevice, stream));
break;

case MatrixType::D:
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + this->num_std_wells_so_far * this->dim_wells * this->dim_wells,
values, sizeof(d_Dnnzs_hip) * this->dim_wells * this->dim_wells,
hipMemcpyHostToDevice, stream));
break;

case MatrixType::B:
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + this->num_blocks_so_far * this->dim * this->dim_wells,
values, sizeof(d_Bnnzs_hip) * val_size * this->dim * this->dim_wells,
hipMemcpyHostToDevice, stream));
HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + this->num_blocks_so_far, colIndices,
sizeof(d_Bcols_hip) * val_size,
hipMemcpyHostToDevice, stream));

val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
this->val_pointers[this->num_std_wells_so_far] = this->num_blocks_so_far;
if (this->num_std_wells_so_far == this->num_std_wells - 1) {
this->val_pointers[this->num_std_wells] = this->num_blocks;
HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, this->val_pointers.data(),
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1),
hipMemcpyHostToDevice, stream));
}
break;

@ -226,14 +257,21 @@ void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
HIP_CHECK(hipStreamSynchronize(stream));
}

void WellContributionsRocsparse::APIalloc()
template<class Scalar>
void WellContributionsRocsparse<Scalar>::APIalloc()
{
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip,
sizeof(d_Cnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip,
sizeof(d_Dnnzs_hip) * this->num_std_wells * this->dim_wells * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip,
sizeof(d_Bnnzs_hip) * this->num_blocks * this->dim * this->dim_wells));
HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * this->num_blocks));
HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip,
sizeof(d_val_pointers_hip) * (this->num_std_wells + 1)));
}

} //namespace Opm
template class WellContributionsRocsparse<double>;

} // namespace Opm
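
A minimal standalone sketch of how the kernel's shared-memory budget now scales with the scalar type; the formula is the one used in apply_stdwells() above, while dim_wells = 4 is a hypothetical value chosen only for illustration:

#include <cstdio>

int main()
{
    // (blockDim + 2 * dim_wells) * sizeof(Scalar), as in apply_stdwells() above
    unsigned blockDim = 64;
    unsigned dim_wells = 4; // hypothetical value
    std::printf("double: %zu bytes\n", (blockDim + 2 * dim_wells) * sizeof(double)); // 576 bytes
    std::printf("float:  %zu bytes\n", (blockDim + 2 * dim_wells) * sizeof(float));  // 288 bytes
    return 0;
}
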
@ -26,33 +26,35 @@

#include <vector>

namespace Opm {

namespace Opm
{

class WellContributionsRocsparse : public WellContributions
template<class Scalar>
class WellContributionsRocsparse : public WellContributions<Scalar>
{
private:
hipStream_t stream;

public:
void apply_stdwells(double *d_x, double *d_y);
void apply_mswells(double *d_x, double *d_y);
void apply(double *d_x, double *d_y);
void apply_stdwells(Scalar* d_x, Scalar* d_y);
void apply_mswells(Scalar* d_x, Scalar* d_y);
void apply(Scalar* d_x, Scalar* d_y);
void setStream(hipStream_t stream);

protected:
/// Allocate memory for the StandardWells
void APIalloc() override;

void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
using MatrixType = typename WellContributions<Scalar>::MatrixType;

double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
void APIaddMatrix(MatrixType type, int* colIndices,
Scalar* values, unsigned int val_size) override;

Scalar *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
unsigned *d_Ccols_hip, *d_Bcols_hip;
unsigned *d_val_pointers_hip;

std::vector<double> h_x;
std::vector<double> h_y;
std::vector<Scalar> h_x;
std::vector<Scalar> h_y;
};

} //namespace Opm
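
A side effect of templating the base class is visible throughout these hunks: members inherited from WellContributions<Scalar> now live in a dependent base, so they have to be reached through this-> and dependent types have to be re-exported with typename. A generic, self-contained illustration of the rule (not project code):

template<class T>
struct Base {
    enum class MatrixType { B, C, D };
    int N = 0;
};

template<class T>
struct Derived : Base<T> {
    // A type taken from a dependent base must be named with 'typename' ...
    using MatrixType = typename Base<T>::MatrixType;
    // ... and inherited members need 'this->' to be found at instantiation time.
    int size() const { return this->N; }
};
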
@ -90,7 +90,7 @@ struct EnableTerminalOutput {

namespace Opm {

#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif

/// Class for handling the blackoil well model.
@ -287,7 +287,7 @@ class WellContributions;

#if COMPILE_BDA_BRIDGE
// accumulate the contributions of all Wells in the WellContributions object
void getWellContributions(WellContributions& x) const;
void getWellContributions(WellContributions<Scalar>& x) const;
#endif

// apply well model with scaling of alpha

@ -1568,7 +1568,7 @@ namespace Opm {
template<typename TypeTag>
void
BlackoilWellModel<TypeTag>::
getWellContributions(WellContributions& wellContribs) const
getWellContributions(WellContributions<Scalar>& wellContribs) const
{
// prepare for StandardWells
wellContribs.setBlockSize(StandardWell<TypeTag>::Indices::numEq, StandardWell<TypeTag>::numStaticWellEq);

@ -202,7 +202,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
#if COMPILE_BDA_BRIDGE
template<class Scalar, int numWellEq, int numEq>
void MultisegmentWellEquations<Scalar,numWellEq,numEq>::
extract(WellContributions& wellContribs) const
extract(WellContributions<Scalar>& wellContribs) const
{
unsigned int Mb = duneB_.N(); // number of blockrows in duneB_, duneC_ and duneD_
unsigned int BnumBlocks = duneB_.nonzeroes();

@ -39,7 +39,7 @@ namespace Opm
template<class Scalar, int numWellEq, int numEq> class MultisegmentWellEquationAccess;
template<class Scalar> class MultisegmentWellGeneric;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -105,7 +105,7 @@ public:

#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(WellContributions& wellContribs) const;
void extract(WellContributions<Scalar>& wellContribs) const;
#endif

//! \brief Add the matrices of this well to the sparse matrix adapter.

@ -198,7 +198,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
template<class Scalar, int numEq>
void StandardWellEquations<Scalar,numEq>::
extract(const int numStaticWellEq,
WellContributions& wellContribs) const
WellContributions<Scalar>& wellContribs) const
{
std::vector<int> colIndices;
std::vector<Scalar> nnzValues;
@ -216,7 +216,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::C,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::C,
colIndices.data(), nnzValues.data(), duneC_.nonzeroes());

// invDuneD
@ -229,7 +229,7 @@ extract(const int numStaticWellEq,
nnzValues.emplace_back(invDuneD_[0][0][i][j]);
}
}
wellContribs.addMatrix(WellContributions::MatrixType::D,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::D,
colIndices.data(), nnzValues.data(), 1);

// duneB
@ -245,7 +245,7 @@ extract(const int numStaticWellEq,
}
}
}
wellContribs.addMatrix(WellContributions::MatrixType::B,
wellContribs.addMatrix(WellContributions<Scalar>::MatrixType::B,
colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
}
#endif

@ -37,7 +37,7 @@ namespace Opm
template<class Scalar> class ParallelWellInfo;
template<class Scalar, int numEq> class StandardWellEquationAccess;
#if COMPILE_BDA_BRIDGE
class WellContributions;
template<class Scalar> class WellContributions;
#endif
template<class Scalar> class WellInterfaceGeneric;
template<class Scalar> class WellState;
@ -102,7 +102,7 @@ public:
#if COMPILE_BDA_BRIDGE
//! \brief Add the matrices of this well to the WellContributions object.
void extract(const int numStaticWellEq,
WellContributions& wellContribs) const;
WellContributions<Scalar>& wellContribs) const;
#endif

//! \brief Add the matrices of this well to the sparse matrix adapter.

@ -38,7 +38,7 @@ class ConvergenceReport;
class DeferredLogger;
class Schedule;
class SummaryState;
class WellContributions;
template<class Scalar> class WellContributions;
template<class FluidSystem, class Indices> class WellInterfaceIndices;
template<class Scalar> class WellState;

@ -272,7 +272,7 @@ computeBhpAtThpLimitProd(const std::function<std::vector<Scalar>(const Scalar)>&
"find bhp-point where production becomes non-zero for well " + well_.name());
return std::nullopt;
}
const std::array<Scalar, 2> range {controls.bhp_limit, *bhp_max};
const std::array<Scalar, 2> range {static_cast<Scalar>(controls.bhp_limit), *bhp_max};
return this->computeBhpAtThpLimit(frates, fbhp, range, deferred_logger);
}

@ -518,9 +518,9 @@ computeBhpAtThpLimitInjImpl(const std::function<std::vector<Scalar>(const Scalar

// Get the flo samples, add extra samples at low rates and bhp
// limit point if necessary.
std::vector<Scalar> flo_samples = table.getFloAxis();
std::vector<double> flo_samples = table.getFloAxis();
if (flo_samples[0] > 0.0) {
const Scalar f0 = flo_samples[0];
const double f0 = flo_samples[0];
flo_samples.insert(flo_samples.begin(), { f0/20.0, f0/10.0, f0/5.0, f0/2.0 });
}
const Scalar flo_bhp_limit = flo(frates(controls.bhp_limit));

@ -123,7 +123,7 @@ testCusparseSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -138,7 +138,7 @@ testCusparseSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Mat
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto wellContribs = Opm::WellContributions<double>::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

@ -120,7 +120,7 @@ testOpenclSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>&
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -135,7 +135,7 @@ testOpenclSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matri
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto wellContribs = Opm::WellContributions<double>::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different

@ -96,7 +96,7 @@ testRocalutionSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix,
Dune::InverseOperatorResult result;

Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create(accelerator_mode, true);
auto wellContribs = Opm::WellContributions<double>::create(accelerator_mode, true);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,

@ -127,7 +127,7 @@ testRocsparseSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -142,7 +142,7 @@ testRocsparseSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>,
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("rocsparse", true);
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
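
Pulled together from the test changes above, a hedged sketch of the updated calling convention; the block sizes passed to setBlockSize, the second argument of create, and the surrounding variable names are illustrative placeholders, not values taken from this commit:

// Hedged sketch, not project code; types and includes elided.
auto wellContribs = Opm::WellContributions<double>::create("rocsparse", true);
wellContribs->setBlockSize(3, 4); // as done in BlackoilWellModel::getWellContributions(), values hypothetical
bridge.solve_system(&matrix, &jacMatrix, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
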