Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)

Add CPR preconditioner for openclSolver and change raw pointers to vector for Matrix

parent 44faf036c9
commit 6465cf9cbb
@ -100,6 +100,7 @@ if(OPENCL_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/CPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
|
||||
@ -107,11 +108,11 @@ if(OPENCL_FOUND)
|
||||
endif()
|
||||
if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR HAVE_AMGCL)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Matrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||
endif()
|
||||
if(HAVE_FPGA)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
|
||||
@ -247,10 +248,10 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/linalg/bda/BdaSolver.hpp
|
||||
opm/simulators/linalg/bda/BILU0.hpp
|
||||
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
||||
opm/simulators/linalg/bda/CPR.hpp
|
||||
opm/simulators/linalg/bda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
||||
opm/simulators/linalg/bda/FPGAMatrix.hpp
|
||||
opm/simulators/linalg/bda/FPGABILU0.hpp
|
||||
opm/simulators/linalg/bda/FPGASolverBackend.hpp
|
||||
opm/simulators/linalg/bda/FPGAUtils.hpp
|
||||
@ -260,6 +261,7 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/linalg/bda/openclKernels.hpp
|
||||
opm/simulators/linalg/bda/openclSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/openclWellContributions.hpp
|
||||
opm/simulators/linalg/bda/Matrix.hpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
|
||||
opm/simulators/linalg/bda/WellContributions.hpp
|
||||
opm/simulators/linalg/amgcpr.hh
|
||||
|
@ -146,7 +146,8 @@ namespace Opm
|
||||
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
||||
std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver));
|
||||
}
|
||||
#else
|
||||
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
|
||||
|
@ -122,7 +122,6 @@ public:
|
||||
instance().doAddCreator(type, creator);
|
||||
}
|
||||
|
||||
private:
|
||||
using CriterionBase
|
||||
= Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<Matrix, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
@ -150,6 +149,7 @@ private:
|
||||
criterion.setMinAggregateSize(prm.get<int>("minaggsize", 4));
|
||||
return criterion;
|
||||
}
|
||||
private:
|
||||
|
||||
/// Helper struct to explicitly overload amgSmootherArgs() version for
|
||||
/// ParallelOverlappingILU0, since in-class specialization is not allowed.
|
||||
|
@ -205,7 +205,7 @@ BILU0<block_size>::~BILU0()
|
||||
Timer t_copyToGpu;
|
||||
|
||||
events.resize(1);
|
||||
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
|
||||
std::call_once(pattern_uploaded, [&](){
|
||||
// find the positions of each diagonal block
|
||||
@ -219,9 +219,9 @@ BILU0<block_size>::~BILU0()
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(4);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
});
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
|
@ -63,7 +63,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
double tolerance,
|
||||
[[maybe_unused]] unsigned int platformID,
|
||||
unsigned int deviceID,
|
||||
[[maybe_unused]] std::string opencl_ilu_reorder)
|
||||
[[maybe_unused]] std::string opencl_ilu_reorder,
|
||||
[[maybe_unused]] std::string linsolver)
|
||||
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
|
||||
{
|
||||
if (accelerator_mode.compare("cusparse") == 0) {
|
||||
@ -88,7 +89,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
||||
}
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
@ -168,27 +169,28 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
|
||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||
// sparsity pattern should stay the same
|
||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||
template <class BridgeMatrix>
|
||||
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::getSparsityPattern(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
int sum_nnzs = 0;
|
||||
|
||||
// convert colIndices and rowPointers
|
||||
if (h_rows.empty()) {
|
||||
h_rows.emplace_back(0);
|
||||
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
int size_row = 0;
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
h_cols.emplace_back(c.index());
|
||||
size_row++;
|
||||
}
|
||||
sum_nnzs += size_row;
|
||||
h_rows.emplace_back(sum_nnzs);
|
||||
}
|
||||
h_rows.clear();
|
||||
h_cols.clear();
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
|
||||
// convert colIndices and rowPointers
|
||||
h_rows.emplace_back(0);
|
||||
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
int size_row = 0;
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
h_cols.emplace_back(c.index());
|
||||
size_row++;
|
||||
}
|
||||
sum_nnzs += size_row;
|
||||
h_rows.emplace_back(sum_nnzs);
|
||||
}
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
|
||||
}
|
||||
} // end getSparsityPattern()
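A minimal standalone sketch of the same CSR extraction, assuming a toy row-major pattern in place of the Dune::BCRSMatrix (the ToyPattern name is illustrative, not OPM API):

    #include <cassert>
    #include <vector>

    int main() {
        // each inner vector lists the column indices present in that block row
        std::vector<std::vector<int>> ToyPattern = {{0, 1}, {1}, {0, 2}};

        std::vector<int> h_rows, h_cols;
        int sum_nnzs = 0;
        h_rows.emplace_back(0);                       // row 0 starts at offset 0
        for (const auto& row : ToyPattern) {
            for (int col : row) {
                h_cols.emplace_back(col);             // record the column index
            }
            sum_nnzs += static_cast<int>(row.size());
            h_rows.emplace_back(sum_nnzs);            // running prefix sum of row sizes
        }

        // same sanity check as the bridge: last row pointer equals the nonzero count
        assert(h_rows.back() == static_cast<int>(h_cols.size()));
        return 0;
    }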
|
||||
|
||||
|
@ -54,7 +54,9 @@ public:
|
||||
/// \param[in] platformID the OpenCL platform ID to be used
|
||||
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
||||
/// \param[in] linsolver copy of cmdline argument --linsolver
|
||||
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
|
||||
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
@ -75,6 +77,12 @@ public:
|
||||
return use_gpu;
|
||||
}
|
||||
|
||||
/// Store sparsity pattern into vectors
|
||||
/// \param[in] mat input matrix, probably BCRSMatrix
|
||||
/// \param[out] h_rows rowpointers
|
||||
/// \param[out] h_cols columnindices
|
||||
static void getSparsityPattern(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
|
||||
|
||||
/// Initialize the WellContributions object with opencl context and queue
|
||||
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
|
||||
/// \param[in] wellContribs container to hold all WellContributions
|
||||
|
@ -76,6 +76,7 @@ namespace Accelerator {
|
||||
/// \param[in] tolerance required relative tolerance for solver
|
||||
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
|
||||
/// \param[in] deviceID the device to be used
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
||||
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace Opm
|
||||
@ -502,6 +503,8 @@ INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
|
||||
#endif // HAVE_FPGA
|
||||
|
@ -22,9 +22,15 @@
|
||||
|
||||
#if HAVE_FPGA
|
||||
#include <vector>
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
class Matrix;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
opm/simulators/linalg/bda/CPR.cpp (new file, 519 lines)
@ -0,0 +1,519 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/istl/umfpack.hh>
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/CPR.hpp>
|
||||
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) :
|
||||
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
{}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::init(int Nb_, int nnzb_, std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
this->Nb = Nb_;
|
||||
this->nnzb = nnzb_;
|
||||
this->N = Nb_ * block_size;
|
||||
this->nnz = nnzb_ * block_size * block_size;
|
||||
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
coarse_vals.resize(nnzb);
|
||||
coarse_x.resize(Nb);
|
||||
coarse_y.resize(Nb);
|
||||
weights.resize(N);
|
||||
diagIndices.resize(1);
|
||||
}
|
||||
|
||||
|
||||
double get_absmax(const double *data, const int B) {
|
||||
double abs_max = 0.0;
|
||||
for (int i = 0; i < B; ++i) {
|
||||
if (std::fabs(data[i]) > abs_max) {
|
||||
abs_max = std::fabs(data[i]);
|
||||
}
|
||||
}
|
||||
return abs_max;
|
||||
}
|
||||
|
||||
|
||||
// solve A^T * x = b
|
||||
void solve_transposed_3x3(const double *A, const double *b, double *x) {
|
||||
const int B = 3;
|
||||
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
|
||||
double t4 = A[0+0*B] * A[1+1*B];
|
||||
double t6 = A[0+0*B] * A[1+2*B];
|
||||
double t8 = A[0+1*B] * A[1+0*B];
|
||||
double t10 = A[0+2*B] * A[1+0*B];
|
||||
double t12 = A[0+1*B] * A[2+0*B];
|
||||
double t14 = A[0+2*B] * A[2+0*B];
|
||||
|
||||
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
|
||||
|
||||
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
|
||||
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
|
||||
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
|
||||
|
||||
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
|
||||
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
|
||||
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
|
||||
|
||||
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
|
||||
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
|
||||
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
|
||||
}
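As a sanity check of the column-major indexing trick above (reading [r+c*B] instead of [r*B+c]), a small standalone example, assuming an arbitrary nonsingular 3x3 block: it builds the transpose explicitly, applies ordinary Cramer's rule, and confirms that A^T * x reproduces b:

    #include <cassert>
    #include <cmath>

    int main() {
        const int B = 3;
        const double A[9] = {4, 1, 0,   2, 5, 1,   0, 3, 6};   // row-major A
        const double b[3] = {0, 1, 0};                         // 1 at the pressure index, as in CPR
        double At[9], x[3];

        for (int r = 0; r < B; ++r)                            // explicit transpose
            for (int c = 0; c < B; ++c)
                At[r * B + c] = A[c * B + r];

        // ordinary 3x3 Cramer's rule on At (same result as solve_transposed_3x3 on A)
        const double d = At[0]*(At[4]*At[8]-At[5]*At[7])
                       - At[1]*(At[3]*At[8]-At[5]*At[6])
                       + At[2]*(At[3]*At[7]-At[4]*At[6]);
        x[0] = (b[0]*(At[4]*At[8]-At[5]*At[7]) - At[1]*(b[1]*At[8]-At[5]*b[2]) + At[2]*(b[1]*At[7]-At[4]*b[2])) / d;
        x[1] = (At[0]*(b[1]*At[8]-At[5]*b[2]) - b[0]*(At[3]*At[8]-At[5]*At[6]) + At[2]*(At[3]*b[2]-b[1]*At[6])) / d;
        x[2] = (At[0]*(At[4]*b[2]-b[1]*At[7]) - At[1]*(At[3]*b[2]-b[1]*At[6]) + b[0]*(At[3]*At[7]-At[4]*At[6])) / d;

        // residual check: A^T * x should reproduce b
        for (int r = 0; r < B; ++r) {
            double s = 0.0;
            for (int c = 0; c < B; ++c) s += A[c * B + r] * x[c];
            assert(std::fabs(s - b[r]) < 1e-12);
        }
        return 0;
    }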
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat_) {
|
||||
this->mat = mat_;
|
||||
|
||||
try{
|
||||
double rhs[] = {0, 0, 0};
|
||||
rhs[pressure_idx] = 1;
|
||||
|
||||
// find diagonal index for each row
|
||||
if (diagIndices[0].empty()) {
|
||||
diagIndices[0].resize(Nb);
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
|
||||
assert(candidate != mat->colIndices + end);
|
||||
diagIndices[0][row] = candidate - mat->colIndices;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate weights for each row
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
// solve to find weights
|
||||
double *row_weights = weights.data() + block_size * row; // weights for this row
|
||||
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
|
||||
|
||||
// normalize weights for this row
|
||||
double abs_max = get_absmax(row_weights, block_size);
|
||||
for(unsigned int i = 0; i < block_size; i++){
|
||||
row_weights[i] /= abs_max;
|
||||
}
|
||||
}
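A small standalone illustration of this normalization step, with made-up weights (plain arithmetic, not OPM code): dividing by the absolute maximum makes the largest weight +/-1.

    #include <algorithm>
    #include <cassert>
    #include <cmath>

    int main() {
        const int block_size = 3;
        double row_weights[3] = {0.5, -2.0, 1.0};

        double abs_max = 0.0;                       // same reduction as get_absmax()
        for (int i = 0; i < block_size; ++i) {
            abs_max = std::max(abs_max, std::fabs(row_weights[i]));
        }
        for (int i = 0; i < block_size; ++i) {
            row_weights[i] /= abs_max;
        }
        assert(std::fabs(row_weights[1] + 1.0) < 1e-12);   // -2.0 / 2.0
        assert(std::fabs(row_weights[0] - 0.25) < 1e-12);  //  0.5 / 2.0
        return 0;
    }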
|
||||
|
||||
// transform blocks to scalars
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
double *block = mat->nnzValues + idx * block_size * block_size;
|
||||
double *row_weights = weights.data() + block_size * row;
|
||||
double value = 0.0;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
value += block[block_size * i + pressure_idx] * row_weights[i];
|
||||
}
|
||||
coarse_vals[idx] = value;
|
||||
}
|
||||
}
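For clarity, the same block-to-scalar reduction on one hand-written 3x3 block with made-up weights (standalone arithmetic, not OPM API): the pressure column of the block is contracted with the row weights to give one coarse scalar.

    #include <cassert>
    #include <cmath>

    int main() {
        const unsigned int block_size = 3;
        const int pressure_idx = 1;                       // same hardcoded choice as CPR
        // one block, row-major: block[row * block_size + col]
        const double block[9] = {2.0, 1.0, 0.5,
                                 0.3, 4.0, 0.2,
                                 0.1, 0.6, 3.0};
        const double row_weights[3] = {1.0, -0.25, 0.5};  // already normalized by abs-max

        double value = 0.0;
        for (unsigned int i = 0; i < block_size; ++i) {
            // pick the pressure column of the block, weighted per equation
            value += block[block_size * i + pressure_idx] * row_weights[i];
        }
        // 1.0*1.0 + 4.0*(-0.25) + 0.6*0.5 = 0.3
        assert(std::fabs(value - 0.3) < 1e-12);
        return 0;
    }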
|
||||
|
||||
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
|
||||
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
|
||||
if (recalculate_aggregates) {
|
||||
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
|
||||
|
||||
typedef DuneMat::CreateIterator Iter;
|
||||
|
||||
// setup sparsity pattern
|
||||
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
|
||||
int start = mat->rowPointers[row.index()];
|
||||
int end = mat->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
row.insert(col);
|
||||
}
|
||||
}
|
||||
|
||||
// set values
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
|
||||
Dune::Amg::SequentialInformation seqinfo;
|
||||
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
|
||||
|
||||
Opm::PropertyTree property_tree;
|
||||
property_tree.put("alpha", 0.333333333333);
|
||||
|
||||
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
|
||||
// Yet a SymmetricDependency is used in AMGCPR
|
||||
// An UnSymmetricCriterion is also available
|
||||
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
|
||||
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
const Criterion c = Opm::PreconditionerFactory<MatrixOperator, Dune::Amg::SequentialInformation>::amgCriterion(property_tree);
|
||||
|
||||
dune_amg->build<OverlapFlags>(c);
|
||||
|
||||
analyzeHierarchy();
|
||||
analyzeAggregateMaps();
|
||||
|
||||
recalculate_aggregates = false;
|
||||
} else {
|
||||
// update values of coarsest level in AMG
|
||||
// this works because that level is actually a reference to the DuneMat held by dune_coarse
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
// update the rest of the AMG hierarchy
|
||||
dune_amg->recalculateGalerkin(OverlapFlags());
|
||||
analyzeHierarchy();
|
||||
}
|
||||
|
||||
// initialize OpenclMatrices and Buffers if needed
|
||||
std::call_once(opencl_buffers_allocated, [&](){
|
||||
d_Amatrices.reserve(num_levels);
|
||||
d_Pmatrices.reserve(num_levels - 1);
|
||||
d_Rmatrices.reserve(num_levels - 1);
|
||||
d_invDiags.reserve(num_levels-1);
|
||||
for (Matrix& m : Amatrices) {
|
||||
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs);
|
||||
}
|
||||
for (Matrix& m : Pmatrices) {
|
||||
d_Pmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
}
|
||||
for (Matrix& m : Rmatrices) {
|
||||
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
}
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_mat = std::make_unique<OpenclMatrix<block_size> >(context.get(), Nb, Nb, nnzb);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
});
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
d_mat->upload(queue.get(), mat);
|
||||
|
||||
err = CL_SUCCESS;
|
||||
events.resize(Pmatrices.size() + 1);
|
||||
for (unsigned int i = 0; i < Pmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[i]);
|
||||
}
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[Pmatrices.size()]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
for (unsigned int i = 0; i < Pmatrices.size(); ++i) {
|
||||
d_Pmatrices[i].upload(queue.get(), &Pmatrices[i]);
|
||||
}
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
|
||||
}
|
||||
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "Caught exception: " << ex.what() << std::endl;
|
||||
throw; // rethrow the original exception instead of a sliced copy
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeHierarchy() {
|
||||
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
|
||||
num_levels = dune_amg->levels();
|
||||
level_sizes.resize(num_levels);
|
||||
diagIndices.resize(num_levels);
|
||||
|
||||
Amatrices.reserve(num_levels);
|
||||
Pmatrices.reserve(num_levels - 1); // coarsest level does not need one
|
||||
Rmatrices.reserve(num_levels - 1);
|
||||
invDiags.reserve(num_levels);
|
||||
|
||||
Amatrices.clear();
|
||||
invDiags.clear();
|
||||
|
||||
// matrixIter.dereference() returns MatrixAdapter
|
||||
// matrixIter.dereference().getmat() returns BCRSMat
|
||||
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
const auto& A = matrixIter.dereference().getmat();
|
||||
level_sizes[level] = A.N();
|
||||
diagIndices[level].reserve(A.N());
|
||||
|
||||
// extract matrix A
|
||||
Amatrices.emplace_back(A.N(), A.nonzeroes());
|
||||
// contiguous copy is not possible
|
||||
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
|
||||
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
|
||||
int nnz_idx = 0;
|
||||
const bool fillDiagIndices = diagIndices[level].empty();
|
||||
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
|
||||
if (fillDiagIndices && r.index() == c.index()) {
|
||||
diagIndices[level].emplace_back(nnz_idx);
|
||||
}
|
||||
nnz_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
Opm::BdaBridge<DuneMat, DuneVec, 1>::getSparsityPattern(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
|
||||
|
||||
// compute inverse diagonal values for current level
|
||||
invDiags.emplace_back(A.N());
|
||||
for (unsigned int row = 0; row < A.N(); ++row) {
|
||||
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
}
|
||||
}
|
||||
}
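The diagonal bookkeeping above can be illustrated on a tiny hand-written CSR matrix (toy data, not OPM code): find the diagonal position of each row and store its inverse.

    #include <cassert>
    #include <cmath>
    #include <vector>

    int main() {
        // 3x3 CSR matrix: rows {4,1}, {5}, {2,6} with diagonals 4, 5, 6
        const std::vector<int>    rowPointers = {0, 2, 3, 5};
        const std::vector<int>    colIndices  = {0, 1,  1,  0, 2};
        const std::vector<double> nnzValues   = {4., 1., 5., 2., 6.};
        const int N = 3;

        std::vector<int> diagIndex(N);
        std::vector<double> invDiag(N);
        for (int row = 0; row < N; ++row) {
            for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; ++idx) {
                if (colIndices[idx] == row) {           // diagonal entry of this row
                    diagIndex[row] = idx;
                    invDiag[row] = 1.0 / nnzValues[idx];
                }
            }
        }
        assert(diagIndex[1] == 2);
        assert(std::fabs(invDiag[2] - 1.0 / 6.0) < 1e-12);
        return 0;
    }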
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeAggregateMaps() {
|
||||
|
||||
Pmatrices.clear();
|
||||
Rmatrices.clear();
|
||||
|
||||
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
|
||||
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
DuneAmg::AggregatesMap *map = *mapIter;
|
||||
|
||||
// get indices for each row of P and R
|
||||
std::vector<std::set<long int> > indicesP(level_sizes[level]);
|
||||
std::vector<std::set<long int> > indicesR(level_sizes[level+1]);
|
||||
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
|
||||
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
|
||||
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
|
||||
const long int diff = ai - map->begin();
|
||||
indicesP[diff].insert(*ai);
|
||||
indicesR[*ai].insert(diff);
|
||||
}
|
||||
}
|
||||
|
||||
Pmatrices.emplace_back(level_sizes[level], level_sizes[level+1], level_sizes[level]);
|
||||
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
|
||||
std::fill(Pmatrices.back().nnzValues.begin(), Pmatrices.back().nnzValues.end(), 1.0);
|
||||
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
|
||||
|
||||
// set sparsity pattern of P
|
||||
int col_idx = 0;
|
||||
Pmatrices.back().rowPointers[0] = 0;
|
||||
for (unsigned int i = 0; i < indicesP.size(); ++i) {
|
||||
Pmatrices.back().rowPointers[i + 1] = Pmatrices.back().rowPointers[i] + indicesP[i].size();
|
||||
for (auto it = indicesP[i].begin(); it != indicesP[i].end(); ++it) {
|
||||
Pmatrices.back().colIndices[col_idx] = *it;
|
||||
col_idx++;
|
||||
}
|
||||
}
|
||||
// set sparsity pattern of R
|
||||
col_idx = 0;
|
||||
Rmatrices.back().rowPointers[0] = 0;
|
||||
for (unsigned int i = 0; i < indicesR.size(); ++i) {
|
||||
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
|
||||
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
|
||||
Rmatrices.back().colIndices[col_idx] = *it;
|
||||
col_idx++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
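A minimal sketch of how an aggregates map becomes the (all-ones) prolongation pattern, assuming the map is just a vector that assigns each fine row to an aggregate (toy data, not the Dune AggregatesMap type):

    #include <cassert>
    #include <set>
    #include <vector>

    int main() {
        const std::vector<int> aggregates = {0, 0, 1, 1, 1};    // 5 fine rows, 2 aggregates
        const int n_fine = 5;

        std::vector<std::set<long int>> indicesP(n_fine);
        for (int fine = 0; fine < n_fine; ++fine) {
            indicesP[fine].insert(aggregates[fine]);            // P has one entry per fine row
        }

        std::vector<int> rowPointers(n_fine + 1, 0);
        std::vector<int> colIndices;
        for (int i = 0; i < n_fine; ++i) {
            rowPointers[i + 1] = rowPointers[i] + static_cast<int>(indicesP[i].size());
            for (long int col : indicesP[i]) colIndices.push_back(static_cast<int>(col));
        }
        // values of P are all 1.0, so only the pattern matters
        assert(rowPointers.back() == n_fine);
        assert(colIndices == (std::vector<int>{0, 0, 1, 1, 1}));
        return 0;
    }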
|
||||
|
||||
|
||||
void solve_coarse_umfpack(const Matrix *A, std::vector<double> &y, std::vector<double> &x) {
|
||||
const int N = A->N;
|
||||
const int nnzs = A->nnzs;
|
||||
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
|
||||
DuneMat B(N, N, nnzs, DuneMat::row_wise);
|
||||
|
||||
typedef DuneMat::CreateIterator Iter;
|
||||
|
||||
// setup sparsity pattern
|
||||
for(Iter row = B.createbegin(); row != B.createend(); ++row){
|
||||
int start = A->rowPointers[row.index()];
|
||||
int end = A->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = A->colIndices[idx];
|
||||
row.insert(col);
|
||||
}
|
||||
}
|
||||
|
||||
// set values
|
||||
for (int row = 0; row < N; ++row) {
|
||||
int start = A->rowPointers[row];
|
||||
int end = A->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = A->colIndices[idx];
|
||||
B[row][col] = A->nnzValues[idx];
|
||||
}
|
||||
}
|
||||
|
||||
// create umfpack object
|
||||
Dune::UMFPack<DuneMat> umfpack(B, 0);
|
||||
|
||||
umfpack.apply(x.data(), y.data());
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
|
||||
OpenclMatrix<1> *A = &d_Amatrices[level];
|
||||
OpenclMatrix<1> *P = &d_Pmatrices[level];
|
||||
OpenclMatrix<1> *R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
|
||||
if (level == num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueReadBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
|
||||
}
|
||||
|
||||
solve_coarse_umfpack(&Amatrices[level], h_y, h_x);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
return;
|
||||
}
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
cl::Buffer& t = d_t[level];
|
||||
cl::Buffer& f = d_f[level];
|
||||
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
double jacobi_damping = 0.72; // default value in amgcl: 0.72
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels::spmv(P->nnzValues, P->colIndices, P->rowPointers, u, x, Ncur, 1, false);
|
||||
|
||||
// postsmooth
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
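The pre-/post-smoothing above is one damped Jacobi sweep, x += omega * D^{-1} * (y - A*x); a tiny CPU version on a hand-written 2x2 system (illustrative only) mirrors the residual() + vmul() pair:

    #include <cassert>
    #include <cmath>
    #include <vector>

    int main() {
        const int n = 2;
        const double A[4] = {4.0, 1.0,
                             2.0, 5.0};
        const std::vector<double> y = {1.0, 0.0};
        std::vector<double> x = {0.0, 0.0};
        const double omega = 0.72;                      // same default as amgcl

        std::vector<double> t(n);
        for (int r = 0; r < n; ++r) {                   // t = y - A*x   (residual kernel)
            double s = 0.0;
            for (int c = 0; c < n; ++c) s += A[r * n + c] * x[c];
            t[r] = y[r] - s;
        }
        for (int r = 0; r < n; ++r) {                   // x += omega * invDiag * t   (vmul kernel)
            x[r] += omega * (1.0 / A[r * n + r]) * t[r];
        }

        assert(std::fabs(x[0] - 0.72 / 4.0) < 1e-12);   // 0.18
        assert(std::fabs(x[1]) < 1e-12);
        return 0;
    }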
|
||||
|
||||
|
||||
// x = prec(y)
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
|
||||
// 0-initialize u and x vectors
|
||||
events.resize(d_u.size() + 1);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueFillBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueFillBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels::move_to_coarse(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels::move_to_fine(*d_coarse_x, x, pressure_idx, Nb);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template CPR<n>::CPR(int, ILUReorder); \
|
||||
template void CPR<n>::init(int, int, std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
|
||||
template void CPR<n>::apply(const cl::Buffer&, cl::Buffer&); \
|
||||
template void CPR<n>::create_preconditioner(BlockedMatrix<n> *mat);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
|
opm/simulators/linalg/bda/CPR.hpp (new file, 117 lines)
@ -0,0 +1,117 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef CPR_HPP
|
||||
#define CPR_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <dune/istl/paamg/matrixhierarchy.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <unsigned int block_size>
|
||||
class CPR
|
||||
{
|
||||
|
||||
private:
|
||||
int N; // number of rows of the matrix
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzb; // number of blocks of the matrix
|
||||
int verbosity;
|
||||
|
||||
int num_levels;
|
||||
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix> Amatrices, Pmatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<OpenclMatrix<1> > d_Amatrices, d_Pmatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<cl::Buffer> d_invDiags;
|
||||
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
|
||||
std::unique_ptr<cl::Buffer> d_rs; // residual, used before extracting the pressure
|
||||
std::unique_ptr<cl::Buffer> d_weights; // the quasi-IMPES weights, used to extract the pressure
|
||||
std::unique_ptr<OpenclMatrix<block_size> > d_mat; // stores blocked matrix
|
||||
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
BlockedMatrix<block_size> *mat = nullptr; // input matrix, blocked
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
|
||||
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
|
||||
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
|
||||
std::unique_ptr<DuneAmg> dune_amg;
|
||||
std::unique_ptr<DuneMat> dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy
|
||||
std::shared_ptr<MatrixOperator> dune_op; // operator, input to Dune AMG
|
||||
std::vector<int> level_sizes; // size of each level in the AMG hierarchy
|
||||
std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
|
||||
bool always_recalculate_aggregates = false; // OPM reuses the aggregates by default
|
||||
bool recalculate_aggregates = true; // only recalculate the aggregates when true
|
||||
const int pressure_idx = 1; // hardcoded to mimic OPM
|
||||
|
||||
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
|
||||
ILUReorder opencl_ilu_reorder; // reordering strategy for ILU0 in coarse solver
|
||||
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
void analyzeHierarchy();
|
||||
|
||||
// Analyze the aggregateMaps from the AMG hierarchy
|
||||
// These can be reused, so only use when recalculate_aggregates is true
|
||||
void analyzeAggregateMaps();
|
||||
|
||||
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
|
||||
|
||||
public:
|
||||
|
||||
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
void init(int Nb, int nnzb, std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x);
|
||||
|
||||
void create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <config.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/material/common/Unused.hpp>
|
||||
|
@ -17,10 +17,13 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace Opm
|
||||
@ -28,6 +31,32 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
template <unsigned int block_size>
|
||||
void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
|
||||
std::vector<cl::Event> events(3);
|
||||
|
||||
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "OpenclMatrix OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, Matrix *matrix) {
|
||||
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void OpenclMatrix<block_size>::upload(cl::CommandQueue *queue, BlockedMatrix<block_size> *matrix) {
|
||||
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
|
||||
}
|
||||
|
||||
/*Sort a row of matrix elements from a CSR-format.*/
|
||||
void sortRow(int *colIndices, double *data, int left, int right) {
|
||||
int l = left;
|
||||
@ -57,6 +86,7 @@ void sortRow(int *colIndices, double *data, int left, int right) {
|
||||
|
||||
}
|
||||
|
||||
#if HAVE_FPGA
|
||||
/*
|
||||
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
|
||||
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
|
||||
@ -247,6 +277,20 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class OpenclMatrix<n>;
|
||||
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
@ -22,36 +22,77 @@
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
class Matrix;
|
||||
|
||||
/// This class resembles a blocked CSR (BSR) matrix; only doubles are supported
/// The matrix data is stored in OpenCL Buffers
|
||||
template <unsigned int block_size>
|
||||
class OpenclMatrix {
|
||||
public:
|
||||
|
||||
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_)
|
||||
: Nb(Nb_),
|
||||
Mb(Mb_),
|
||||
nnzbs(nnzbs_)
|
||||
{
|
||||
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
|
||||
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
|
||||
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
}
|
||||
|
||||
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
|
||||
void upload(cl::CommandQueue *queue, Matrix *matrix);
|
||||
void upload(cl::CommandQueue *queue, BlockedMatrix<block_size> *matrix);
|
||||
|
||||
cl::Buffer nnzValues;
|
||||
cl::Buffer colIndices;
|
||||
cl::Buffer rowPointers;
|
||||
int Nb, Mb;
|
||||
int nnzbs;
|
||||
};
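For reference, a host-only sketch of the element counts behind the three buffer allocations in this constructor (illustrative numbers; no OpenCL is required to follow the sizes):

    #include <cstddef>
    #include <iostream>

    int main() {
        const unsigned int block_size = 3;
        const int Nb = 100, nnzbs = 500;
        // element counts behind the three cl::Buffer allocations in the constructor
        const std::size_t n_vals = static_cast<std::size_t>(block_size) * block_size * nnzbs; // 4500 doubles
        const std::size_t n_cols = nnzbs;                                                     // 500 ints
        const std::size_t n_rows = Nb + 1;                                                    // 101 ints
        std::cout << sizeof(double) * n_vals << " "   // bytes for nnzValues
                  << sizeof(int) * n_cols << " "      // bytes for colIndices
                  << sizeof(int) * n_rows << "\n";    // bytes for rowPointers
        return 0;
    }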
|
||||
|
||||
|
||||
/// This struct resembles a csr matrix, only doubles are supported
|
||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||
class Matrix {
|
||||
|
||||
public:
|
||||
|
||||
/// Allocate Matrix and data arrays with given sizes
|
||||
/// Allocate square Matrix and data arrays with given sizes
|
||||
/// \param[in] N number of rows
|
||||
/// \param[in] nnzs number of nonzeros
|
||||
Matrix(int N_, int nnzs_)
|
||||
: nnzValues(new double[nnzs_]),
|
||||
colIndices(new int[nnzs_]),
|
||||
rowPointers(new int[N_+1]),
|
||||
N(N_),
|
||||
: N(N_),
|
||||
nnzs(nnzs_)
|
||||
{}
|
||||
|
||||
/// All constructors allocate new memory, so always delete here
|
||||
~Matrix(){
|
||||
delete[] nnzValues;
|
||||
delete[] colIndices;
|
||||
delete[] rowPointers;
|
||||
{
|
||||
nnzValues.resize(nnzs);
|
||||
colIndices.resize(nnzs);
|
||||
rowPointers.resize(N+1);
|
||||
}
|
||||
|
||||
/// Allocate rectangular Matrix and data arrays with given sizes
|
||||
/// \param[in] N number of rows
|
||||
/// \param[in] M number of columns
|
||||
/// \param[in] nnzs number of nonzeros
|
||||
Matrix(int N_, int M_, int nnzs_)
|
||||
: N(N_),
|
||||
M(M_),
|
||||
nnzs(nnzs_)
|
||||
{
|
||||
nnzValues.resize(nnzs);
|
||||
colIndices.resize(nnzs);
|
||||
rowPointers.resize(N+1);
|
||||
}
|
||||
|
||||
#if HAVE_FPGA
|
||||
/// Converts this matrix to the dataformat used by the FPGA.
|
||||
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
|
||||
/// The purpose of this format is to allow the data to be streamable.
|
||||
@ -71,11 +112,12 @@ public:
|
||||
int toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
|
||||
#endif
|
||||
|
||||
double *nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
int N;
|
||||
std::vector<double> nnzValues;
|
||||
std::vector<int> colIndices;
|
||||
std::vector<int> rowPointers;
|
||||
int N, M;
|
||||
int nnzs;
|
||||
};
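A minimal sketch of what the pointer-to-vector change buys, using an illustrative stand-in type (ToyMatrix is not the OPM class): the arrays are sized in the constructor, released automatically, and copying no longer risks a double free.

    #include <vector>

    class ToyMatrix {                     // illustrative stand-in for bda::Matrix
    public:
        ToyMatrix(int N_, int M_, int nnzs_)
            : N(N_), M(M_), nnzs(nnzs_)
        {
            nnzValues.resize(nnzs);
            colIndices.resize(nnzs);
            rowPointers.resize(N + 1);
        }
        // no user-defined destructor: the vectors clean up, and the implicit
        // copy/move operations are correct by construction

        std::vector<double> nnzValues;
        std::vector<int> colIndices;
        std::vector<int> rowPointers;
        int N, M;
        int nnzs;
    };

    int main() {
        ToyMatrix A(4, 4, 10);
        ToyMatrix B = A;                  // safe copy; with raw new[]/delete[] this would double-free
        return B.rowPointers.size() == 5 ? 0 : 1;
    }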
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
@ -45,8 +46,15 @@ std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const u
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::move_to_coarse_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::move_to_fine_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
|
||||
@ -79,14 +87,28 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
add_kernel_string(sources, axpy_s);
|
||||
const std::string& scale_s = get_scale_string();
|
||||
add_kernel_string(sources, scale_s);
|
||||
const std::string& vmul_s = get_vmul_string();
|
||||
add_kernel_string(sources, vmul_s);
|
||||
const std::string& dot_1_s = get_dot_1_string();
|
||||
add_kernel_string(sources, dot_1_s);
|
||||
const std::string& norm_s = get_norm_string();
|
||||
add_kernel_string(sources, norm_s);
|
||||
const std::string& custom_s = get_custom_string();
|
||||
add_kernel_string(sources, custom_s);
|
||||
const std::string& spmv_blocked_s = get_spmv_blocked_string();
|
||||
const std::string& move_to_coarse_s = get_move_to_coarse_string();
|
||||
add_kernel_string(sources, move_to_coarse_s);
|
||||
const std::string& move_to_fine_s = get_move_to_fine_string();
|
||||
add_kernel_string(sources, move_to_fine_s);
|
||||
const std::string& spmv_blocked_s = get_blocked_matrix_operation_string(matrix_operation::spmv_op);
|
||||
add_kernel_string(sources, spmv_blocked_s);
|
||||
const std::string& spmv_s = get_matrix_operation_string(matrix_operation::spmv_op, true);
|
||||
add_kernel_string(sources, spmv_s);
|
||||
const std::string& spmv_noreset_s = get_matrix_operation_string(matrix_operation::spmv_op, false);
|
||||
add_kernel_string(sources, spmv_noreset_s);
|
||||
const std::string& residual_blocked_s = get_blocked_matrix_operation_string(matrix_operation::residual_op);
|
||||
add_kernel_string(sources, residual_blocked_s);
|
||||
const std::string& residual_s = get_matrix_operation_string(matrix_operation::residual_op);
|
||||
add_kernel_string(sources, residual_s);
|
||||
#if CHOW_PATEL
|
||||
bool ilu_operate_on_full_matrix = false;
|
||||
#else
|
||||
@ -114,8 +136,15 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
move_to_coarse_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "move_to_coarse")));
|
||||
move_to_fine_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "move_to_fine")));
|
||||
spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
|
||||
spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
|
||||
residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
|
||||
residual_k.reset(new residual_kernel_type(cl::Kernel(program, "residual")));
|
||||
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
|
||||
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
|
||||
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
|
||||
@ -219,6 +248,23 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t_vmul;
|
||||
|
||||
cl::Event event = (*vmul_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), alpha, in1, in2, out, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels vmul() time: " << t_vmul.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
@ -236,15 +282,58 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const do
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
|
||||
void OpenclKernels::move_to_coarse(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t;
|
||||
|
||||
cl::Event event = (*move_to_coarse_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), fine_y, weights, coarse_y, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels move_to_coarse() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::move_to_fine(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t;
|
||||
|
||||
cl::Event event = (*move_to_fine_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), coarse_x, fine_x, pressure_idx, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels move_to_fine() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
Timer t_spmv;
|
||||
cl::Event event;
|
||||
|
||||
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
if (block_size > 1) {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
if (reset) {
|
||||
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -254,6 +343,29 @@ void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer&
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
Timer t_residual;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels residual_blocked() time: " << t_residual.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
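The spmv()/residual() wrappers above only choose and enqueue the matching kernel. For the scalar (block_size == 1) case their semantics are the plain CSR operations produced by the generated kernels further down; the following CPU reference is an illustration only, not part of this commit, and uses hypothetical names:

#include <vector>

// out = A * x            (spmv, reset == true)
// out += A * x           (spmv_noreset, reset == false)
void spmv_reference(const std::vector<double>& vals, const std::vector<int>& cols,
                    const std::vector<int>& rows, const std::vector<double>& x,
                    std::vector<double>& out, bool reset)
{
    const int N = static_cast<int>(rows.size()) - 1;
    for (int row = 0; row < N; ++row) {
        double sum = 0.0;
        for (int j = rows[row]; j < rows[row + 1]; ++j) {
            sum += vals[j] * x[cols[j]];
        }
        out[row] = reset ? sum : out[row] + sum;
    }
}

// out = rhs - A * x      (residual)
void residual_reference(const std::vector<double>& vals, const std::vector<int>& cols,
                        const std::vector<int>& rows, const std::vector<double>& x,
                        const std::vector<double>& rhs, std::vector<double>& out)
{
    const int N = static_cast<int>(rows.size()) - 1;
    for (int row = 0; row < N; ++row) {
        double sum = 0.0;
        for (int j = rows[row]; j < rows[row + 1]; ++j) {
            sum += vals[j] * x[cols[j]];
        }
        out[row] = rhs[row] - sum;
    }
}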
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
@ -395,6 +507,27 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
)";
}

// multiply vector with another vector, element-wise
std::string OpenclKernels::get_vmul_string() {
return R"(
__kernel void vmul(
const double alpha,
__global double const *in1,
__global double const *in2,
__global double *out,
const int N)
{
unsigned int NUM_THREADS = get_global_size(0);
int idx = get_global_id(0);

while(idx < N){
out[idx] += alpha * in1[idx] * in2[idx];
idx += NUM_THREADS;
}
}
)";
}

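For reference, the vmul kernel above is a grid-stride loop around a fused multiply-accumulate; a CPU equivalent (illustration only, not part of this commit) would be:

#include <vector>

// out[i] += alpha * in1[i] * in2[i]   -- CPU equivalent of the vmul kernel above
void vmul_reference(double alpha, const std::vector<double>& in1,
                    const std::vector<double>& in2, std::vector<double>& out)
{
    for (std::size_t i = 0; i < out.size(); ++i) {
        out[i] += alpha * in1[i] * in2[i];
    }
}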
// returns partial sums, instead of the final dot product
std::string OpenclKernels::get_dot_1_string() {
@ -506,15 +639,77 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
)";
}

std::string OpenclKernels::get_spmv_blocked_string() {

// transform blocked vector to scalar vector using pressure-weights
// every workitem handles one blockrow
std::string OpenclKernels::get_move_to_coarse_string() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
__kernel void move_to_coarse(
__global const double *fine_y,
__global const double *weights,
__global double *coarse_y,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int block_size = 3;
unsigned int target_block_row = get_global_id(0);

while(target_block_row < Nb){
double sum = 0.0;
unsigned int idx = block_size * target_block_row;
for (unsigned int i = 0; i < block_size; ++i) {
sum += fine_y[idx + i] * weights[idx + i];
}
coarse_y[target_block_row] = sum;
target_block_row += NUM_THREADS;
}
}
)";
}

// add the coarse pressure solution back to the finer, complete solution
// every workitem handles one blockrow
std::string OpenclKernels::get_move_to_fine_string() {
return R"(
__kernel void move_to_fine(
__global const double *coarse_x,
__global double *fine_x,
const unsigned int pressure_idx,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int block_size = 3;
unsigned int target_block_row = get_global_id(0);

while(target_block_row < Nb){
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
target_block_row += NUM_THREADS;
}
}
)";
}

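The two kernels above implement the restriction and prolongation steps used by the new CPR preconditioner: move_to_coarse collapses each 3-component block of the fine-level vector into one scalar pressure value using the pressure weights, and move_to_fine adds the coarse pressure solution back into the pressure component of the blocked fine-level vector. A CPU sketch of the same operations (illustration only, block size fixed to 3 as in the kernels, names hypothetical):

#include <vector>

// coarse_y[r] = sum_i fine_y[3*r + i] * weights[3*r + i]   -- restriction (move_to_coarse)
void move_to_coarse_reference(const std::vector<double>& fine_y, const std::vector<double>& weights,
                              std::vector<double>& coarse_y, unsigned int Nb)
{
    const unsigned int block_size = 3;
    for (unsigned int r = 0; r < Nb; ++r) {
        double sum = 0.0;
        for (unsigned int i = 0; i < block_size; ++i) {
            sum += fine_y[block_size * r + i] * weights[block_size * r + i];
        }
        coarse_y[r] = sum;
    }
}

// fine_x[3*r + pressure_idx] += coarse_x[r]   -- prolongation (move_to_fine)
void move_to_fine_reference(const std::vector<double>& coarse_x, std::vector<double>& fine_x,
                            unsigned int pressure_idx, unsigned int Nb)
{
    const unsigned int block_size = 3;
    for (unsigned int r = 0; r < Nb; ++r) {
        fine_x[block_size * r + pressure_idx] += coarse_x[r];
    }
}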
/// either b = mat * x
/// or res = rhs - mat * x
std::string OpenclKernels::get_blocked_matrix_operation_string(matrix_operation op) {
std::string s;
if (op == matrix_operation::spmv_op) {
s += "__kernel void spmv_blocked(";
} else {
s += "__kernel void residual_blocked(";
}
s += R"(__global const double *vals,
__global const int *cols,
__global const int *rows,
const int Nb,
__global const double *x,
__global double *b,
)";
if (op == matrix_operation::residual_op) {
s += "__global const double *rhs,";
}
s += R"(
__global double *out,
const unsigned int block_size,
__local double *tmp)
{
@ -566,15 +761,97 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe

if(lane < bs){
unsigned int row = target_block_row*bs + lane;
b[row] = tmp[lane];
)";
if (op == matrix_operation::spmv_op) {
s += " out[row] = tmp[lane];";
} else {
s += " out[row] = rhs[row] - tmp[lane];";
}
s += R"(
}
target_block_row += num_warps_in_grid;
}
}
)";
}
)";
return s;
}

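get_blocked_matrix_operation_string() assembles two kernels from shared pieces; the only differences are the kernel name (spmv_blocked vs residual_blocked), the extra rhs argument for the residual variant, and the final write (out[row] = tmp[lane] vs out[row] = rhs[row] - tmp[lane]). Semantically both compute a blocked sparse matrix-vector product over BSR data. The CPU reference below is an illustration only, not part of this commit; it assumes the usual row-major storage inside each bs x bs block, since the kernel body itself is collapsed in this hunk:

#include <vector>

// Blocked (BSR) reference for the generated kernels:
//   spmv_op:     out = mat * x            (pass rhs == nullptr)
//   residual_op: out = rhs - mat * x      (pass a valid rhs)
// Assumes row-major layout of each bs x bs block.
void blocked_matrix_operation_reference(const std::vector<double>& vals, const std::vector<int>& cols,
                                        const std::vector<int>& rows, const std::vector<double>& x,
                                        const double* rhs, std::vector<double>& out,
                                        int Nb, unsigned int bs)
{
    for (int br = 0; br < Nb; ++br) {                       // block row
        for (unsigned int lane = 0; lane < bs; ++lane) {    // row inside the block
            double sum = 0.0;
            for (int blk = rows[br]; blk < rows[br + 1]; ++blk) {
                const int bc = cols[blk];                   // block column
                for (unsigned int j = 0; j < bs; ++j) {
                    sum += vals[blk * bs * bs + lane * bs + j] * x[bc * bs + j];
                }
            }
            const unsigned int row = br * bs + lane;
            out[row] = rhs ? rhs[row] - sum : sum;
        }
    }
}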
/// either b = mat * x
/// or res = rhs - mat * x
std::string OpenclKernels::get_matrix_operation_string(matrix_operation op, bool spmv_reset) {
std::string s;
if (op == matrix_operation::spmv_op) {
if (spmv_reset) {
s += "__kernel void spmv(";
} else {
s += "__kernel void spmv_noreset(";
}
} else {
s += "__kernel void residual(";
}
s += R"(__global const double *vals,
__global const int *cols,
__global const int *rows,
const int N,
__global const double *x,
)";
if (op == matrix_operation::residual_op) {
s += "__global const double *rhs,";
}
s += R"(
__global double *out,
__local double *tmp)
{
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_workgroups = get_num_groups(0);

int row = idx_b;

while (row < N) {
int rowStart = rows[row];
int rowEnd = rows[row+1];
int rowLength = rowEnd - rowStart;
double local_sum = 0.0;
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
int col = cols[j];
local_sum += vals[j] * x[col];
}

tmp[idx_t] = local_sum;
barrier(CLK_LOCAL_MEM_FENCE);

int offset = bsize / 2;
while(offset > 0) {
if (idx_t < offset) {
tmp[idx_t] += tmp[idx_t + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
offset = offset / 2;
}

if (idx_t == 0) {
)";
if (op == matrix_operation::spmv_op) {
if (spmv_reset) {
s += " out[row] = tmp[idx_t];";
} else {
s += " out[row] += tmp[idx_t];";
}
} else {
s += " out[row] = rhs[row] - tmp[idx_t];";
}
s += R"(
}
row += num_workgroups;
}
}
)";
return s;
}

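The scalar kernel assembled above assigns one work-group per matrix row: every work-item accumulates a strided partial sum over the row's nonzeros, and the partial sums are then combined by a binary tree reduction in local memory. The loop below mimics that reduction on the CPU (illustration only, not part of this commit); like the kernel, it assumes the group size is a power of two:

#include <vector>

// Emulates the per-row reduction of the scalar spmv/residual kernels:
// 'bsize' work-items each hold a strided partial sum in tmp[]; the tree
// reduction leaves the row total in tmp[0]. bsize must be a power of two.
double reduce_row_reference(const std::vector<double>& vals, const std::vector<int>& cols,
                            const std::vector<double>& x, int rowStart, int rowEnd, int bsize)
{
    std::vector<double> tmp(bsize, 0.0);
    for (int idx_t = 0; idx_t < bsize; ++idx_t) {            // strided partial sums ("work-items")
        for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
            tmp[idx_t] += vals[j] * x[cols[j]];
        }
    }
    for (int offset = bsize / 2; offset > 0; offset /= 2) {  // tree reduction
        for (int idx_t = 0; idx_t < offset; ++idx_t) {
            tmp[idx_t] += tmp[idx_t + offset];
        }
    }
    return tmp[0];                                           // = sum over the row
}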
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(

@ -30,8 +30,14 @@ namespace Opm
namespace Accelerator
{

using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
@ -55,12 +61,24 @@ private:
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;

enum matrix_operation {
spmv_op,
residual_op
};

static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > move_to_coarse_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > move_to_fine_k;
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
static std::unique_ptr<spmv_kernel_type> spmv_k;
static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
static std::unique_ptr<residual_kernel_type> residual_k;
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
@ -75,6 +93,10 @@ private:
/// a = a * alpha
static std::string get_scale_string();

/// multiply vector with another vector and a scalar, element-wise
/// add result to a third vector
static std::string get_vmul_string();

/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
static std::string get_dot_1_string();
@ -88,11 +110,20 @@ private:
/// p = (p - omega * v) * beta + r
static std::string get_custom_string();

/// Transform blocked vector to scalar vector using pressure-weights
static std::string get_move_to_coarse_string();

/// Add the coarse pressure solution back to the finer, complete solution
static std::string get_move_to_fine_string();

/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
static std::string get_spmv_blocked_string();
/// or
/// res = rhs - (mat * x)
static std::string get_blocked_matrix_operation_string(matrix_operation op);
static std::string get_matrix_operation_string(matrix_operation op, bool spmv_reset = true);

/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
@ -127,8 +158,13 @@ public:
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
static void move_to_coarse(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void move_to_fine(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);

static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);

@ -45,8 +45,20 @@ using Opm::OpmLog;
using Dune::Timer;

template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {

if (linsolver.compare("ilu0") == 0) {
use_cpr = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
use_cpr = true;
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linsolver, " + linsolver);
}

bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
if (use_cpr) {
cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_reorder);
}

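The constructor accepts exactly two values for the --linsolver argument: "ilu0" (plain BILU0) and "cpr_quasiimpes" (BILU0 plus the new CPR preconditioner); anything else throws. A standalone sketch of the same selection logic, useful as a reference when wiring further options (illustration only, not part of this commit):

#include <stdexcept>
#include <string>

// Mirrors the --linsolver handling above: only "ilu0" and "cpr_quasiimpes" are accepted.
inline bool use_cpr_from_linsolver(const std::string& linsolver)
{
    if (linsolver == "ilu0") {
        return false;               // BILU0 only
    }
    if (linsolver == "cpr_quasiimpes") {
        return true;                // BILU0 + CPR
    }
    throw std::logic_error("Error unknown value for argument --linsolver, " + linsolver);
}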
std::ostringstream out;
try {
@ -191,6 +203,18 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
}
}

template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) :
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), use_cpr(false), opencl_ilu_reorder(opencl_ilu_reorder_)
{
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
}

template <unsigned int block_size>
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
context = context_;
queue = queue_;
}

template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
@ -252,12 +276,16 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

// pw = prec(p)
t_prec.start();
prec->apply(d_p, d_pw);
bilu0->apply(d_p, d_pw);
t_prec.stop();

if (use_cpr) {
cpr->apply(d_p, d_pw);
}

// v = A * pw
t_spmv.start();
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
t_spmv.stop();

// apply wellContributions
@ -283,12 +311,16 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

// s = prec(r)
t_prec.start();
prec->apply(d_r, d_s);
bilu0->apply(d_r, d_s);
t_prec.stop();

if (use_cpr) {
cpr->apply(d_r, d_s);
}

// t = A * s
t_spmv.start();
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
t_spmv.stop();

// apply wellContributions
@ -358,8 +390,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.clear();

try {
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
bilu0->setOpenCLContext(context.get());
bilu0->setOpenCLQueue(queue.get());

if (use_cpr) {
cpr->init(Nb, nnzb, context, queue);
}

#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
@ -411,7 +447,6 @@ void openclSolverBackend<block_size>::finalize() {
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
#endif
delete prec;
} // end finalize()

template <unsigned int block_size>
@ -492,14 +527,14 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyse_matrix() {
Timer t;

bool success = prec->init(mat.get());
bool success = bilu0->init(mat.get());

if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
} else {
toOrder = prec->getToOrder();
fromOrder = prec->getFromOrder();
rmat = prec->getRMat();
toOrder = bilu0->getToOrder();
fromOrder = bilu0->getFromOrder();
rmat = bilu0->getRMat();
}

if (verbosity > 2) {
@ -539,7 +574,14 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
Timer t;

bool result = prec->create_preconditioner(mat.get());
bool result = bilu0->create_preconditioner(mat.get());
if (use_cpr) {
if (opencl_ilu_reorder == ILUReorder::NONE) {
cpr->create_preconditioner(mat.get());
} else {
cpr->create_preconditioner(bilu0->getRMat());
}
}

if (verbosity > 2) {
std::ostringstream out;
@ -624,8 +666,11 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
}

#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder); \
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, ILUReorder); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);

INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

@ -27,6 +27,7 @@
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <opm/simulators/linalg/bda/CPR.hpp>

#include <tuple>

@ -35,12 +36,14 @@ namespace Opm
namespace Accelerator
{

template <unsigned int block_size>
class CPR;

/// This class implements a opencl-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;

using Base::N;
using Base::Nb;
@ -66,7 +69,10 @@ private:

std::vector<cl::Device> devices;

Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
std::unique_ptr<CPR<block_size> > cpr; // Constrained Pressure Residual preconditioner
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
bool use_cpr; // allow to enable CPR
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
@ -178,7 +184,13 @@ public:
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, ILUReorder opencl_ilu_reorder);
/// \param[in] linsolver indicating the preconditioner, equal to the --linsolver cmdline argument
/// only ilu0 and cpr_quasiimpes are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
ILUReorder opencl_ilu_reorder, std::string linsolver);

/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);

/// Destroy a openclSolver, and free memory
~openclSolverBackend();
@ -200,10 +212,20 @@ public:
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;

/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
// SolverStatus solve_system(BdaResult &res);

/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

/// Set OpenCL objects
/// This class either creates them based on platformID and deviceID or receives them through this function
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);

}; // end class openclSolverBackend

} // namespace Accelerator