opm-simulators/opm/simulators/linalg/bda/BILU0.hpp
2021-10-05 10:49:21 +02:00

152 lines
5.5 KiB
C++

/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef BILU0_HPP
#define BILU0_HPP
#include <mutex>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
// FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
// if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
// if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
namespace bda
{
/// This class implementa a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix
template <unsigned int block_size>
class BILU0
{
private:
int N; // number of rows of the matrix
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
#endif
double *invDiagVals = nullptr;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
std::vector<int> toOrder, fromOrder;
int numColors;
int verbosity;
std::once_flag pattern_uploaded;
ILUReorder opencl_ilu_reorder;
typedef struct {
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
cl::Buffer rowsPerColor;
#if CHOW_PATEL
cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows;
#else
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale;
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
int work_group_size = 0;
int total_work_items = 0;
int lmem_per_work_group = 0;
ChowPatelIlu chowPatelIlu;
void chow_patel_decomposition();
public:
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
~BILU0();
// analysis
bool init(BlockedMatrix<block_size> *mat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, x = prec(y)
void apply(cl::Buffer& y, cl::Buffer& x);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
void setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
);
int* getToOrder()
{
return toOrder.data();
}
int* getFromOrder()
{
return fromOrder.data();
}
BlockedMatrix<block_size>* getRMat()
{
return rmat.get();
}
};
} // end namespace bda
#endif