mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #3750 from ducbueno/isai-ilu
Adds IncompleteSAI preconditioner to openclSolver
This commit is contained in:
@@ -244,7 +244,6 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
|
||||
@@ -115,6 +115,16 @@ public:
|
||||
return rmat.get();
|
||||
}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
|
||||
{
|
||||
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
|
||||
}
|
||||
|
||||
std::pair<cl::Buffer, cl::Buffer> get_preconditioner_data()
|
||||
{
|
||||
return std::make_pair(s.LUvals, s.invDiagVals);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
|
||||
275
opm/simulators/linalg/bda/BISAI.cpp
Normal file
275
opm/simulators/linalg/bda/BISAI.cpp
Normal file
@@ -0,0 +1,275 @@
|
||||
/*
|
||||
Copyright 2022 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/BISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
BISAI<block_size>::BISAI(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
||||
Preconditioner<block_size>(verbosity_)
|
||||
{
|
||||
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder_, verbosity_);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
bilu0->setOpencl(context, queue);
|
||||
}
|
||||
|
||||
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices){
|
||||
std::vector<int> aux(colPointers); // colPointers must be copied to this vector
|
||||
std::vector<int> csrToCscOffsetMap(rowIndices.size()); // map must have the same size as the indices vector
|
||||
|
||||
for(unsigned int row = 0; row < colPointers.size() - 1; row++){
|
||||
for(int jj = colPointers[row]; jj < colPointers[row+1]; jj++){
|
||||
int col = rowIndices[jj];
|
||||
int dest = aux[col];
|
||||
csrToCscOffsetMap[dest] = jj;
|
||||
aux[col]++;
|
||||
}
|
||||
}
|
||||
|
||||
return csrToCscOffsetMap;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
this->N = mat->Nb * bs;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * bs * bs;
|
||||
this->nnzb = mat->nnzbs;
|
||||
|
||||
bilu0->analyze_matrix(mat);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::buildLowerSubsystemsStructures(){
|
||||
lower.subsystemPointers.assign(Nb + 1, 0);
|
||||
|
||||
Dune::Timer t_buildLowerSubsystemsStructures;
|
||||
|
||||
for(int tcol = 0; tcol < Nb; tcol++){
|
||||
int frow = diagIndex[tcol] + 1;
|
||||
int lrow = colPointers[tcol + 1];
|
||||
int nx = lrow - frow;
|
||||
int nv = 0;
|
||||
|
||||
for(int sweep = 0; sweep < nx - 1; sweep++){
|
||||
for(int xid = sweep + 1; xid < nx; xid++){
|
||||
for(int ptr = diagIndex[rowIndices[frow + sweep]] + 1; ptr < colPointers[rowIndices[frow + sweep + 1]]; ptr++){
|
||||
if(rowIndices[ptr] == rowIndices[frow + xid]){
|
||||
lower.nzIndices.push_back(csrToCscOffsetMap[ptr]);
|
||||
lower.knownRhsIndices.push_back(csrToCscOffsetMap[frow + sweep]);
|
||||
lower.unknownRhsIndices.push_back(csrToCscOffsetMap[frow + xid]);
|
||||
nv++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lower.subsystemPointers[tcol + 1] = lower.subsystemPointers[tcol] + nv;
|
||||
}
|
||||
|
||||
if(verbosity >= 4){
|
||||
std::ostringstream out;
|
||||
out << "BISAI buildLowerSubsystemsStructures time: " << t_buildLowerSubsystemsStructures.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::buildUpperSubsystemsStructures(){
|
||||
upper.subsystemPointers.assign(Nb + 1, 0);
|
||||
|
||||
Dune::Timer t_buildUpperSubsystemsStructures;
|
||||
|
||||
for(int tcol = 0; tcol < Nb; tcol++){
|
||||
int frow = colPointers[tcol];
|
||||
int lrow = diagIndex[tcol];
|
||||
int nx = lrow - frow + 1;
|
||||
int nv = 0;
|
||||
|
||||
for(int sweep = 0; sweep < nx - 1; sweep++){
|
||||
for(int xid = 0; xid < nx; xid++){
|
||||
for(int ptr = colPointers[rowIndices[lrow - sweep]]; ptr < diagIndex[rowIndices[lrow - sweep]]; ptr++){
|
||||
if(rowIndices[ptr] == rowIndices[lrow - xid]){
|
||||
upper.nzIndices.push_back(csrToCscOffsetMap[ptr]);
|
||||
upper.knownRhsIndices.push_back(csrToCscOffsetMap[lrow - sweep]);
|
||||
upper.unknownRhsIndices.push_back(csrToCscOffsetMap[lrow - xid]);
|
||||
nv++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
upper.subsystemPointers[tcol + 1] = upper.subsystemPointers[tcol] + nv;
|
||||
}
|
||||
|
||||
if(verbosity >= 4){
|
||||
std::ostringstream out;
|
||||
out << "BISAI buildUpperSubsystemsStructures time: " << t_buildUpperSubsystemsStructures.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
if (bs != 3) {
|
||||
OPM_THROW(std::logic_error, "Creation of ISAI preconditioner on GPU only supports block_size = 3");
|
||||
}
|
||||
|
||||
Dune::Timer t_preconditioner;
|
||||
|
||||
bilu0->create_preconditioner(mat);
|
||||
|
||||
std::call_once(initialize, [&]() {
|
||||
std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
|
||||
|
||||
csrToCscOffsetMap = buildCsrToCscOffsetMap(colPointers, rowIndices);
|
||||
buildLowerSubsystemsStructures();
|
||||
buildUpperSubsystemsStructures();
|
||||
|
||||
d_colPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * colPointers.size());
|
||||
d_rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * rowIndices.size());
|
||||
d_csrToCscOffsetMap = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * csrToCscOffsetMap.size());
|
||||
d_diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * diagIndex.size());
|
||||
d_invLvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
|
||||
d_invUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnzb * bs * bs);
|
||||
d_invL_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb * bs);
|
||||
d_lower.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.subsystemPointers.size());
|
||||
d_upper.subsystemPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.subsystemPointers.size());
|
||||
|
||||
if(!lower.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_lower.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.nzIndices.size());
|
||||
d_lower.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.knownRhsIndices.size());
|
||||
d_lower.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * lower.unknownRhsIndices.size());
|
||||
}
|
||||
|
||||
if(!upper.nzIndices.empty()){ // knownRhsIndices and unknownRhsIndices will also be empty if nzIndices is empty
|
||||
d_upper.nzIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.nzIndices.size());
|
||||
d_upper.knownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.knownRhsIndices.size());
|
||||
d_upper.unknownRhsIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * upper.unknownRhsIndices.size());
|
||||
}
|
||||
|
||||
events.resize(6);
|
||||
err = queue->enqueueWriteBuffer(d_colPointers, CL_FALSE, 0, colPointers.size() * sizeof(int), colPointers.data(), nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(d_rowIndices, CL_FALSE, 0, rowIndices.size() * sizeof(int), rowIndices.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(d_csrToCscOffsetMap, CL_FALSE, 0, csrToCscOffsetMap.size() * sizeof(int), csrToCscOffsetMap.data(), nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(d_diagIndex, CL_FALSE, 0, diagIndex.size() * sizeof(int), diagIndex.data(), nullptr, &events[3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.subsystemPointers, CL_FALSE, 0, sizeof(int) * lower.subsystemPointers.size(), lower.subsystemPointers.data(), nullptr, &events[4]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.subsystemPointers, CL_FALSE, 0, sizeof(int) * upper.subsystemPointers.size(), upper.subsystemPointers.data(), nullptr, &events[5]);
|
||||
|
||||
if(!lower.nzIndices.empty()){
|
||||
events.resize(events.size() + 3);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.nzIndices, CL_FALSE, 0, sizeof(int) * lower.nzIndices.size(), lower.nzIndices.data(), nullptr, &events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.knownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.knownRhsIndices.size(), lower.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_lower.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * lower.unknownRhsIndices.size(), lower.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
|
||||
}
|
||||
|
||||
if(!upper.nzIndices.empty()){
|
||||
events.resize(events.size() + 3);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.nzIndices, CL_FALSE, 0, sizeof(int) * upper.nzIndices.size(), upper.nzIndices.data(), nullptr, &events[events.size() - 3]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.knownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.knownRhsIndices.size(), upper.knownRhsIndices.data(), nullptr, &events[events.size() - 2]);
|
||||
err |= queue->enqueueWriteBuffer(d_upper.unknownRhsIndices, CL_FALSE, 0, sizeof(int) * upper.unknownRhsIndices.size(), upper.unknownRhsIndices.data(), nullptr, &events[events.size() - 1]);
|
||||
}
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BISAI OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
});
|
||||
|
||||
std::tie(d_LUvals, d_invDiagVals) = bilu0->get_preconditioner_data();
|
||||
|
||||
events.resize(2);
|
||||
err = queue->enqueueFillBuffer(d_invLvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[0]);
|
||||
err |= queue->enqueueFillBuffer(d_invUvals, 0, 0, sizeof(double) * nnzb * bs * bs, nullptr, &events[1]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
|
||||
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
|
||||
d_invDiagVals, d_invUvals, Nb);
|
||||
|
||||
if(verbosity >= 4){
|
||||
std::ostringstream out;
|
||||
out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
|
||||
// (to compensate for the unitary diagonal that is not
|
||||
// included in isaiL, for simplicity)
|
||||
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
|
||||
}
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class BISAI<n>;
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
}
|
||||
}
|
||||
156
opm/simulators/linalg/bda/BISAI.hpp
Normal file
156
opm/simulators/linalg/bda/BISAI.hpp
Normal file
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
Copyright 2022 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BISAI_HPP
|
||||
#define BISAI_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend;
|
||||
|
||||
class BlockedMatrix;
|
||||
|
||||
/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
|
||||
/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
|
||||
template <unsigned int block_size>
|
||||
class BISAI : public Preconditioner<block_size>
|
||||
{
|
||||
typedef Preconditioner<block_size> Base;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
using Base::context;
|
||||
using Base::queue;
|
||||
using Base::events;
|
||||
using Base::err;
|
||||
|
||||
private:
|
||||
std::once_flag initialize;
|
||||
|
||||
std::vector<int> colPointers;
|
||||
std::vector<int> rowIndices;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> csrToCscOffsetMap;
|
||||
std::vector<double> invLvals;
|
||||
std::vector<double> invUvals;
|
||||
|
||||
cl::Buffer d_colPointers;
|
||||
cl::Buffer d_rowIndices;
|
||||
cl::Buffer d_csrToCscOffsetMap;
|
||||
cl::Buffer d_diagIndex;
|
||||
cl::Buffer d_LUvals;
|
||||
cl::Buffer d_invDiagVals;
|
||||
cl::Buffer d_invLvals;
|
||||
cl::Buffer d_invUvals;
|
||||
cl::Buffer d_invL_x;
|
||||
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
std::unique_ptr<BILU0<block_size> > bilu0;
|
||||
|
||||
/// Struct that holds the structure of the small subsystems for each column
|
||||
typedef struct{
|
||||
/// This vector holds the cumulative sum for the number of non-zero blocks for each subsystem.
|
||||
/// Works similarly to row and column pointers for the CSR and CSC matrix representations.
|
||||
std::vector<int> subsystemPointers;
|
||||
/// This vector holds the indices of the non-zero blocks for the target subsystem. These blocks are
|
||||
/// the ones that are present in the shadow set of the non-zero blocks of column j of the main matrix,
|
||||
/// as described in section 2.3 of the paper. The amount of non-zero blocks for j-th subsystem is
|
||||
/// given by subsystemPointers[j+1] - subsystemPointers[j].
|
||||
std::vector<int> nzIndices;
|
||||
/// This vector holds the indices of the already known values of the right hand sides of the subsystems.
|
||||
/// Its purpose is to aid in the parallel solution of the subsystems.
|
||||
std::vector<int> knownRhsIndices;
|
||||
/// This vector holds the indices of the unknown values of the right hand sides of the subsystems.
|
||||
std::vector<int> unknownRhsIndices;
|
||||
} subsystemStructure;
|
||||
|
||||
/// GPU version of subsystemStructure
|
||||
typedef struct{
|
||||
cl::Buffer subsystemPointers;
|
||||
cl::Buffer nzIndices;
|
||||
cl::Buffer knownRhsIndices;
|
||||
cl::Buffer unknownRhsIndices;
|
||||
} subsystemStructureGPU;
|
||||
|
||||
subsystemStructure lower, upper;
|
||||
subsystemStructureGPU d_lower, d_upper;
|
||||
|
||||
/// An approximate inverse for L is computed by solving a small lower triangular system for each column of the main matrix.
|
||||
/// This function finds the structure of each of these subsystems and fills the 'lower' struct.
|
||||
void buildLowerSubsystemsStructures();
|
||||
|
||||
/// An approximate inverse for U is computed by solving a small upper triangular system for each column of the main matrix.
|
||||
/// This function finds the structure of each of theses subsystems and fills the 'upper' struct.
|
||||
void buildUpperSubsystemsStructures();
|
||||
|
||||
public:
|
||||
BISAI(ILUReorder opencl_ilu_reorder, int verbosity);
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
|
||||
// analysis, find reordering if specified
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
int* getToOrder() override
|
||||
{
|
||||
return bilu0->getToOrder();
|
||||
}
|
||||
|
||||
int* getFromOrder() override
|
||||
{
|
||||
return bilu0->getFromOrder();
|
||||
}
|
||||
|
||||
BlockedMatrix* getRMat() override
|
||||
{
|
||||
return bilu0->getRMat();
|
||||
}
|
||||
};
|
||||
|
||||
/// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.
|
||||
/// The map works as follows: if an element 'e' of the matrix is in the i-th position in the CSR representation, it will be
|
||||
/// in the csrToCscOffsetMap[i]-th position in the CSC representation.
|
||||
std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
@@ -23,6 +23,7 @@
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/BISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/CPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
@@ -44,6 +45,8 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
|
||||
return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity);
|
||||
} else if (type == PreconditionerType::CPR) {
|
||||
return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder);
|
||||
} else if (type == PreconditionerType::BISAI) {
|
||||
return std::make_unique<Opm::Accelerator::BISAI<block_size> >(opencl_ilu_reorder, verbosity);
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Invalid PreconditionerType");
|
||||
}
|
||||
|
||||
@@ -54,7 +54,8 @@ protected:
|
||||
public:
|
||||
enum PreconditionerType {
|
||||
BILU0,
|
||||
CPR
|
||||
CPR,
|
||||
BISAI
|
||||
};
|
||||
|
||||
static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
80
opm/simulators/linalg/bda/opencl/kernels/isaiL.cl
Normal file
80
opm/simulators/linalg/bda/opencl/kernels/isaiL.cl
Normal file
@@ -0,0 +1,80 @@
|
||||
__kernel void block_sub(__global double *mat1, __global double *mat2, __global double *result)
|
||||
{
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
const unsigned int row = lane % bs;
|
||||
const unsigned int col = (lane / bs) % bs;
|
||||
result[bs * row + col] = mat1[bs * row + col] - mat2[bs * row + col];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void block_mult_sub_isai(__global double *a, __global double *b, __global double *c)
|
||||
{
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
const unsigned int row = lane % bs;
|
||||
const unsigned int col = (lane / bs) % bs;
|
||||
double temp = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < bs; k++) {
|
||||
temp += b[bs * row + k] * c[bs * k + col];
|
||||
}
|
||||
|
||||
a[bs * row + col] -= temp;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void isaiL(__global const int *diagIndex,
|
||||
__global const int *colPtr,
|
||||
__global const int *mapping,
|
||||
__global const int *nvc,
|
||||
__global const int *luIdxs,
|
||||
__global const int *xxIdxs,
|
||||
__global const int *dxIdxs,
|
||||
__global const double *LU,
|
||||
__global double *invL,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int idx_b = get_group_id(0);
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int idx = get_global_id(0);
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int num_threads = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = num_threads / warpsize;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize / bs / bs;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
unsigned int tcol = idx / warpsize;
|
||||
|
||||
while(tcol < Nb){
|
||||
const unsigned int frow = diagIndex[tcol] + 1;
|
||||
const unsigned int lrow = colPtr[tcol + 1];
|
||||
const unsigned int nx = lrow - frow;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
for(unsigned int xid = 0; xid < nx; xid++){
|
||||
unsigned int xpos = mapping[frow + xid];
|
||||
block_sub(invL + xpos * bs * bs, LU + xpos * bs * bs, invL + xpos * bs * bs);
|
||||
}
|
||||
|
||||
for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
|
||||
block_mult_sub_isai(invL + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invL + dxIdxs[v] * bs * bs);
|
||||
}
|
||||
}
|
||||
|
||||
tcol += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
84
opm/simulators/linalg/bda/opencl/kernels/isaiU.cl
Normal file
84
opm/simulators/linalg/bda/opencl/kernels/isaiU.cl
Normal file
@@ -0,0 +1,84 @@
|
||||
__kernel void block_add(__global double *mat1, __global double *mat2, __global double *result)
|
||||
{
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
const unsigned int row = lane % bs;
|
||||
const unsigned int col = (lane / bs) % bs;
|
||||
result[bs * row + col] = mat1[bs * row + col] + mat2[bs * row + col];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void block_mult_isai(__global double *mat1, __global double *mat2, __global double *result)
|
||||
{
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
const unsigned int row = lane % bs;
|
||||
const unsigned int col = (lane / bs) % bs;
|
||||
double temp = 0.0;
|
||||
|
||||
for (unsigned int k = 0; k < bs; k++) {
|
||||
temp += mat1[bs * row + k] * mat2[bs * k + col];
|
||||
}
|
||||
|
||||
result[bs * row + col] = temp;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void isaiU(__global const int *diagIndex,
|
||||
__global const int *colPtr,
|
||||
__global const int *rowIndices,
|
||||
__global const int *mapping,
|
||||
__global const int *nvc,
|
||||
__global const int *luIdxs,
|
||||
__global const int *xxIdxs,
|
||||
__global const int *dxIdxs,
|
||||
__global const double *LU,
|
||||
__global const double *invDiagVals,
|
||||
__global double *invU,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int idx_b = get_group_id(0);
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int idx = get_global_id(0);
|
||||
const unsigned int bs = 3;
|
||||
const unsigned int num_threads = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = num_threads / warpsize;
|
||||
const unsigned int num_active_threads = (warpsize / bs / bs) * bs * bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize / bs / bs;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
unsigned int tcol = idx / warpsize;
|
||||
|
||||
while(tcol < Nb){
|
||||
const unsigned int frow = colPtr[tcol];
|
||||
const unsigned int lrow = diagIndex[tcol];
|
||||
const unsigned int nx = lrow - frow + 1;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
block_add(invU + lrow * bs * bs, invDiagVals + tcol * bs * bs, invU + lrow * bs * bs);
|
||||
|
||||
for(unsigned int v = nvc[tcol]; v < nvc[tcol + 1]; v++){
|
||||
block_mult_sub_isai(invU + xxIdxs[v] * bs * bs, LU + luIdxs[v] * bs * bs, invU + dxIdxs[v] * bs * bs);
|
||||
}
|
||||
|
||||
for(unsigned int xid = 1; xid < nx; xid++){
|
||||
unsigned int xpos = mapping[lrow - xid];
|
||||
block_mult_isai(invDiagVals + rowIndices[lrow - xid] * bs * bs, invU + xpos * bs * bs, invU + xpos * bs * bs);
|
||||
}
|
||||
}
|
||||
|
||||
tcol += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
68
opm/simulators/linalg/bda/opencl/kernels/spmv_blocked_add.cl
Normal file
68
opm/simulators/linalg/bda/opencl/kernels/spmv_blocked_add.cl
Normal file
@@ -0,0 +1,68 @@
|
||||
/// b = mat * x
|
||||
/// algorithm based on:
|
||||
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
__kernel void spmv_blocked_add(
|
||||
__global const double *vals,
|
||||
__global const int *cols,
|
||||
__global const int *rows,
|
||||
const int Nb,
|
||||
__global const double *x,
|
||||
__global double *out,
|
||||
const unsigned int block_size,
|
||||
__local double *tmp)
|
||||
{
|
||||
const unsigned int warpsize = 32;
|
||||
const unsigned int bsize = get_local_size(0);
|
||||
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
unsigned int idx = idx_b * bsize + idx_t;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
// for 3x3 blocks:
|
||||
// num_active_threads: 27
|
||||
// num_blocks_per_warp: 3
|
||||
|
||||
while(target_block_row < Nb){
|
||||
unsigned int first_block = rows[target_block_row];
|
||||
unsigned int last_block = rows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
double local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
double x_elem = x[cols[block]*bs + c];
|
||||
double A_elem = vals[block*bs*bs + c + r*bs];
|
||||
local_out += x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for(unsigned int offset = 3; offset <= 24; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
unsigned int row = target_block_row*bs + lane;
|
||||
out[row] = tmp[lane];
|
||||
out[row] += x[row];
|
||||
}
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
@@ -52,6 +52,7 @@ std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, c
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
|
||||
@@ -61,7 +62,8 @@ std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
|
||||
std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
|
||||
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
|
||||
|
||||
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
@@ -90,6 +92,7 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
sources.emplace_back(add_coarse_pressure_correction_str);
|
||||
sources.emplace_back(prolongate_vector_str);
|
||||
sources.emplace_back(spmv_blocked_str);
|
||||
sources.emplace_back(spmv_blocked_add_str);
|
||||
sources.emplace_back(spmv_str);
|
||||
sources.emplace_back(spmv_noreset_str);
|
||||
sources.emplace_back(residual_blocked_str);
|
||||
@@ -104,6 +107,8 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
sources.emplace_back(stdwell_apply_str);
|
||||
sources.emplace_back(stdwell_apply_no_reorder_str);
|
||||
sources.emplace_back(ILU_decomp_str);
|
||||
sources.emplace_back(isaiL_str);
|
||||
sources.emplace_back(isaiU_str);
|
||||
|
||||
cl::Program program = cl::Program(*context, sources);
|
||||
program.build(devices);
|
||||
@@ -122,6 +127,7 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
|
||||
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
|
||||
spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
spmv_blocked_add_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked_add")));
|
||||
spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
|
||||
spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
|
||||
residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
|
||||
@@ -131,6 +137,8 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
|
||||
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
|
||||
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
|
||||
isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
|
||||
isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
|
||||
|
||||
initialized = true;
|
||||
} // end get_opencl_kernels()
|
||||
@@ -311,7 +319,7 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset, bool add)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@@ -321,7 +329,11 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, c
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
if (add) {
|
||||
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
} else {
|
||||
if (reset) {
|
||||
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
@@ -460,5 +472,44 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
|
||||
Timer t_isaiL;
|
||||
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels isaiL() time: " << t_isaiL.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
|
||||
Timer t_isaiU;
|
||||
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels isaiU() time: " << t_isaiU.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
@@ -31,9 +31,9 @@ namespace Accelerator
|
||||
{
|
||||
|
||||
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
|
||||
const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
|
||||
using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
@@ -52,6 +52,10 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
|
||||
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
|
||||
using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
|
||||
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
|
||||
|
||||
class OpenclKernels
|
||||
{
|
||||
@@ -71,6 +75,7 @@ private:
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
|
||||
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
|
||||
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_add_k;
|
||||
static std::unique_ptr<spmv_kernel_type> spmv_k;
|
||||
static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
|
||||
static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
|
||||
@@ -80,6 +85,8 @@ private:
|
||||
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
|
||||
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
|
||||
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
|
||||
static std::unique_ptr<isaiL_kernel_type> isaiL_k;
|
||||
static std::unique_ptr<isaiU_kernel_type> isaiU_k;
|
||||
|
||||
OpenclKernels(){}; // disable instantiation
|
||||
|
||||
@@ -94,6 +101,7 @@ public:
|
||||
static const std::string add_coarse_pressure_correction_str;
|
||||
static const std::string prolongate_vector_str;
|
||||
static const std::string spmv_blocked_str;
|
||||
static const std::string spmv_blocked_add_str;
|
||||
static const std::string spmv_str;
|
||||
static const std::string spmv_noreset_str;
|
||||
static const std::string residual_blocked_str;
|
||||
@@ -108,6 +116,8 @@ public:
|
||||
static const std::string stdwell_apply_str;
|
||||
static const std::string stdwell_apply_no_reorder_str;
|
||||
static const std::string ILU_decomp_str;
|
||||
static const std::string isaiL_str;
|
||||
static const std::string isaiU_str;
|
||||
|
||||
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
|
||||
|
||||
@@ -120,7 +130,7 @@ public:
|
||||
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
|
||||
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
|
||||
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
|
||||
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
|
||||
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
|
||||
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
|
||||
|
||||
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
@@ -139,6 +149,13 @@ public:
|
||||
static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
|
||||
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
|
||||
|
||||
static void isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb);
|
||||
|
||||
static void isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
|
||||
@@ -47,11 +47,17 @@ using Dune::Timer;
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
|
||||
|
||||
bool use_cpr;
|
||||
bool use_cpr, use_isai;
|
||||
|
||||
if (linsolver.compare("ilu0") == 0) {
|
||||
use_cpr = false;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
|
||||
use_cpr = true;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("isai") == 0) {
|
||||
use_cpr = false;
|
||||
use_isai = true;
|
||||
} else if (linsolver.compare("cpr_trueimpes") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support --linsolver=cpr_trueimpes");
|
||||
} else {
|
||||
@@ -61,6 +67,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
|
||||
if (use_cpr) {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder);
|
||||
} else if (use_isai) {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_reorder);
|
||||
} else {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder);
|
||||
}
|
||||
|
||||
@@ -84,10 +84,15 @@ setupPropertyTree(FlowLinearSolverParameters p, // Note: copying the parameters
|
||||
return setupILU(conf, p);
|
||||
}
|
||||
|
||||
// Same configuration as ILU0.
|
||||
if (conf == "isai") {
|
||||
return setupISAI(conf, p);
|
||||
}
|
||||
|
||||
// No valid configuration option found.
|
||||
OPM_THROW(std::invalid_argument,
|
||||
conf << " is not a valid setting for --linear-solver-configuration."
|
||||
<< " Please use ilu0, cpr, cpr_trueimpes, or cpr_quasiimpes");
|
||||
<< " Please use ilu0, cpr, cpr_trueimpes, cpr_quasiimpes or isai");
|
||||
}
|
||||
|
||||
PropertyTree
|
||||
@@ -187,5 +192,20 @@ setupILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParamet
|
||||
}
|
||||
|
||||
|
||||
PropertyTree
|
||||
setupISAI([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p)
|
||||
{
|
||||
using namespace std::string_literals;
|
||||
PropertyTree prm;
|
||||
prm.put("tol", p.linear_solver_reduction_);
|
||||
prm.put("maxiter", p.linear_solver_maxiter_);
|
||||
prm.put("verbosity", p.linear_solver_verbosity_);
|
||||
prm.put("solver", "bicgstab"s);
|
||||
prm.put("preconditioner.type", "ParOverILU0"s);
|
||||
prm.put("preconditioner.relaxation", p.ilu_relaxation_);
|
||||
prm.put("preconditioner.ilulevel", p.ilu_fillin_level_);
|
||||
return prm;
|
||||
}
|
||||
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
@@ -35,6 +35,7 @@ PropertyTree setupPropertyTree(FlowLinearSolverParameters p,
|
||||
PropertyTree setupCPR(const std::string& conf, const FlowLinearSolverParameters& p);
|
||||
PropertyTree setupAMG(const std::string& conf, const FlowLinearSolverParameters& p);
|
||||
PropertyTree setupILU(const std::string& conf, const FlowLinearSolverParameters& p);
|
||||
PropertyTree setupISAI(const std::string& conf, const FlowLinearSolverParameters& p);
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
|
||||
Reference in New Issue
Block a user