opm-simulators/opm/simulators/linalg/bda/opencl/CPR.cpp

544 lines
20 KiB
C++
Raw Normal View History

/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <dune/common/shared_ptr.hh>
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
#include <opm/simulators/linalg/PropertyTree.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
2021-11-23 02:44:18 -06:00
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
2022-02-01 09:51:32 -06:00
#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
2021-11-25 06:40:30 -06:00
{
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
diagIndices.resize(1);
2021-11-25 06:40:30 -06:00
}
template <unsigned int block_size>
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
context = context_;
queue = queue_;
bilu0->setOpencl(context, queue);
2021-11-25 06:40:30 -06:00
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_);
2021-11-25 06:40:30 -06:00
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success;
}
2021-11-25 06:40:30 -06:00
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
Dune::Timer t_bilu0;
2021-11-25 06:40:30 -06:00
bool result = bilu0->create_preconditioner(mat_);
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
2021-11-25 06:40:30 -06:00
}
return result;
}
2021-11-23 04:33:30 -06:00
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
}
// solve A^T * x = b
void solve_transposed_3x3(const double *A, const double *b, double *x) {
const int B = 3;
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
double t4 = A[0+0*B] * A[1+1*B];
double t6 = A[0+0*B] * A[1+2*B];
double t8 = A[0+1*B] * A[1+0*B];
double t10 = A[0+2*B] * A[1+0*B];
double t12 = A[0+1*B] * A[2+0*B];
double t14 = A[0+2*B] * A[2+0*B];
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
}
2021-11-23 10:12:31 -06:00
template <unsigned int block_size>
void CPR<block_size>::init_opencl_buffers() {
d_Amatrices.reserve(num_levels);
d_Rmatrices.reserve(num_levels - 1);
d_invDiags.reserve(num_levels - 1);
for (Matrix& m : Amatrices) {
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
}
for (Matrix& m : Rmatrices) {
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
2021-11-23 10:12:31 -06:00
}
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
}
template <unsigned int block_size>
void CPR<block_size>::opencl_upload() {
d_mat->upload(queue.get(), mat);
err = CL_SUCCESS;
events.resize(2 * Rmatrices.size() + 1);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
2021-11-23 10:12:31 -06:00
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
2021-11-23 10:12:31 -06:00
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
}
}
template <unsigned int block_size>
2021-11-25 06:40:30 -06:00
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
this->mat = mat_;
coarse_vals.resize(nnzb);
coarse_x.resize(Nb);
coarse_y.resize(Nb);
weights.resize(N);
try{
double rhs[] = {0, 0, 0};
rhs[pressure_idx] = 1;
// find diagonal index for each row
if (diagIndices[0].empty()) {
diagIndices[0].resize(Nb);
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
assert(candidate != mat->colIndices + end);
diagIndices[0][row] = candidate - mat->colIndices;
}
}
// calculate weights for each row
for (int row = 0; row < Nb; ++row) {
// solve to find weights
double *row_weights = weights.data() + block_size * row; // weights for this row
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
// normalize weights for this row
double abs_max = get_absmax(row_weights, block_size);
for(unsigned int i = 0; i < block_size; i++){
row_weights[i] /= abs_max;
}
}
2021-11-23 03:52:09 -06:00
// extract pressure
// transform blocks to scalars to create scalar linear system
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
double *block = mat->nnzValues + idx * block_size * block_size;
double *row_weights = weights.data() + block_size * row;
double value = 0.0;
for (unsigned int i = 0; i < block_size; ++i) {
value += block[block_size * i + pressure_idx] * row_weights[i];
}
coarse_vals[idx] = value;
}
}
#if HAVE_MPI
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
#else
2021-12-09 01:57:40 -06:00
using Communication = Dune::Amg::SequentialInformation;
#endif
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
if (recalculate_aggregates) {
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
typedef DuneMat::CreateIterator Iter;
// setup sparsity pattern
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
int start = mat->rowPointers[row.index()];
int end = mat->rowPointers[row.index() + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
row.insert(col);
}
}
// set values
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
(*dune_coarse)[row][col] = coarse_vals[idx];
}
}
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
2021-12-09 01:57:40 -06:00
#if DUNE_VERSION_NEWER(DUNE_ISTL, 2, 7)
Dune::Amg::SequentialInformation seqinfo;
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
2021-12-09 01:57:40 -06:00
#else
dune_amg = std::make_unique<DuneAmg>(*dune_op);
#endif
Opm::PropertyTree property_tree;
property_tree.put("alpha", 0.333333333333);
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
// Yet a SymmetricDependency is used in AMGCPR
// An UnSymmetricCriterion is also available
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
const Criterion c = Opm::PreconditionerFactory<MatrixOperator, Dune::Amg::SequentialInformation>::amgCriterion(property_tree);
num_pre_smooth_steps = c.getNoPreSmoothSteps();
num_post_smooth_steps = c.getNoPostSmoothSteps();
dune_amg->build<OverlapFlags>(c);
analyzeHierarchy();
analyzeAggregateMaps();
recalculate_aggregates = false;
} else {
// update values of coarsest level in AMG
// this works because that level is actually a reference to the DuneMat held by dune_coarse
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
(*dune_coarse)[row][col] = coarse_vals[idx];
}
}
// update the rest of the AMG hierarchy
dune_amg->recalculateGalerkin(OverlapFlags());
analyzeHierarchy();
}
// initialize OpenclMatrices and Buffers if needed
2021-11-23 10:12:31 -06:00
auto init_func = std::bind(&CPR::init_opencl_buffers, this);
std::call_once(opencl_buffers_allocated, init_func);
// upload matrices and vectors to GPU
2021-11-23 10:12:31 -06:00
opencl_upload();
} catch (const std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << std::endl;
throw ex;
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeHierarchy() {
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
2021-12-01 04:47:29 -06:00
// store coarsest AMG level in umfpack format, also performs LU decomposition
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
num_levels = dune_amg->levels();
level_sizes.resize(num_levels);
diagIndices.resize(num_levels);
Amatrices.reserve(num_levels);
Rmatrices.reserve(num_levels - 1); // coarsest level does not need one
invDiags.reserve(num_levels);
Amatrices.clear();
invDiags.clear();
// matrixIter.dereference() returns MatrixAdapter
// matrixIter.dereference().getmat() returns BCRSMat
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
const auto& A = matrixIter.dereference().getmat();
level_sizes[level] = A.N();
diagIndices[level].reserve(A.N());
// extract matrix A
Amatrices.emplace_back(A.N(), A.nonzeroes());
// contiguous copy is not possible
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
int nnz_idx = 0;
const bool fillDiagIndices = diagIndices[level].empty();
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
for (auto c = r->begin(); c != r->end(); ++c) {
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
if (fillDiagIndices && r.index() == c.index()) {
diagIndices[level].emplace_back(nnz_idx);
}
nnz_idx++;
}
}
2021-11-24 02:56:54 -06:00
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
// compute inverse diagonal values for current level
invDiags.emplace_back(A.N());
for (unsigned int row = 0; row < A.N(); ++row) {
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
}
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeAggregateMaps() {
PcolIndices.resize(num_levels - 1);
Rmatrices.clear();
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
DuneAmg::AggregatesMap *map = *mapIter;
2021-11-26 07:09:58 -06:00
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
// get indices for each row of P and R
2021-11-26 07:09:58 -06:00
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
PcolIndices[level].resize(level_sizes[level]);
2021-11-26 07:09:58 -06:00
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
const long int diff = ai - map->begin();
PcolIndices[level][diff] = *ai;
2021-11-26 07:09:58 -06:00
indicesR[*ai].emplace_back(diff);
}
}
int col_idx = 0;
2021-11-26 07:09:58 -06:00
// set sparsity pattern of R
Rmatrices.back().rowPointers[0] = 0;
for (unsigned int i = 0; i < indicesR.size(); ++i) {
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
2021-11-26 07:09:58 -06:00
Rmatrices.back().colIndices[col_idx++] = *it;
}
}
}
}
template <unsigned int block_size>
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
OpenclMatrix *A = &d_Amatrices[level];
OpenclMatrix *R = &d_Rmatrices[level];
int Ncur = A->Nb;
if (level == num_levels - 1) {
// solve coarsest level
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
events.resize(1);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
}
2021-12-01 04:47:29 -06:00
// solve coarsest level using umfpack
umfpack.apply(h_x.data(), h_y.data());
events.resize(1);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
return;
}
int Nnext = d_Amatrices[level+1].Nb;
cl::Buffer& t = d_t[level];
cl::Buffer& f = d_f[level];
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
// presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
// move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
}
// x = prec(y)
template <unsigned int block_size>
2021-11-25 06:40:30 -06:00
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
// 0-initialize u and x vectors
events.resize(d_u.size() + 1);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
for (unsigned int i = 0; i < d_u.size(); ++i) {
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
2021-11-23 04:16:42 -06:00
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
2021-11-23 04:16:42 -06:00
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
}
2021-11-25 06:40:30 -06:00
template <unsigned int block_size>
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
Dune::Timer t_bilu0;
2021-11-25 06:40:30 -06:00
bilu0->apply(y, x);
if (verbosity >= 4) {
std::ostringstream out;
out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
apply_amg(y, x);
if (verbosity >= 4) {
std::ostringstream out;
out << "CPR apply amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
2021-11-25 06:40:30 -06:00
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class CPR<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm