mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #3672 from Tongdongq/add-cpr-opencl
Add CPR preconditioner to openclSolver
This commit is contained in:
@@ -100,18 +100,21 @@ if(OPENCL_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/CPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/OpenclMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
|
||||
endif()
|
||||
if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR HAVE_AMGCL)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Matrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||
endif()
|
||||
if(HAVE_FPGA)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
|
||||
@@ -145,6 +148,7 @@ list (APPEND TEST_SOURCE_FILES
|
||||
tests/test_timer.cpp
|
||||
tests/test_invert.cpp
|
||||
tests/test_stoppedwells.cpp
|
||||
tests/test_solvetransposed3x3.cpp
|
||||
tests/test_relpermdiagnostics.cpp
|
||||
tests/test_norne_pvt.cpp
|
||||
tests/test_wellprodindexcalculator.cpp
|
||||
@@ -247,10 +251,10 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/linalg/bda/BdaSolver.hpp
|
||||
opm/simulators/linalg/bda/BILU0.hpp
|
||||
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
||||
opm/simulators/linalg/bda/CPR.hpp
|
||||
opm/simulators/linalg/bda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
||||
opm/simulators/linalg/bda/FPGAMatrix.hpp
|
||||
opm/simulators/linalg/bda/FPGABILU0.hpp
|
||||
opm/simulators/linalg/bda/FPGASolverBackend.hpp
|
||||
opm/simulators/linalg/bda/FPGAUtils.hpp
|
||||
@@ -258,8 +262,11 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/linalg/bda/ILUReorder.hpp
|
||||
opm/simulators/linalg/bda/opencl.hpp
|
||||
opm/simulators/linalg/bda/openclKernels.hpp
|
||||
opm/simulators/linalg/bda/OpenclMatrix.hpp
|
||||
opm/simulators/linalg/bda/opencl/Preconditioner.hpp
|
||||
opm/simulators/linalg/bda/openclSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/openclWellContributions.hpp
|
||||
opm/simulators/linalg/bda/Matrix.hpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
|
||||
opm/simulators/linalg/bda/WellContributions.hpp
|
||||
opm/simulators/linalg/amgcpr.hh
|
||||
|
||||
@@ -146,7 +146,8 @@ namespace Opm
|
||||
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
||||
std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver));
|
||||
}
|
||||
#else
|
||||
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
|
||||
|
||||
@@ -122,7 +122,6 @@ public:
|
||||
instance().doAddCreator(type, creator);
|
||||
}
|
||||
|
||||
private:
|
||||
using CriterionBase
|
||||
= Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<Matrix, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
@@ -150,6 +149,7 @@ private:
|
||||
criterion.setMinAggregateSize(prm.get<int>("minaggsize", 4));
|
||||
return criterion;
|
||||
}
|
||||
private:
|
||||
|
||||
/// Helper struct to explicitly overload amgSmootherArgs() version for
|
||||
/// ParallelOverlappingILU0, since in-class specialization is not allowed.
|
||||
|
||||
@@ -40,279 +40,251 @@ using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
||||
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
{
|
||||
#if CHOW_PATEL
|
||||
chowPatelIlu.setVerbosity(verbosity);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::~BILU0()
|
||||
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
{
|
||||
delete[] invDiagVals;
|
||||
}
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
this->N = mat->Nb * block_size;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * block_size * block_size;
|
||||
this->nnzb = mat->nnzbs;
|
||||
|
||||
this->N = mat->Nb * block_size;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * block_size * block_size;
|
||||
this->nnzbs = mat->nnzbs;
|
||||
std::vector<int> CSCRowIndices;
|
||||
std::vector<int> CSCColPointers;
|
||||
|
||||
int *CSCRowIndices = nullptr;
|
||||
int *CSCColPointers = nullptr;
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
LUmat = std::make_unique<BlockedMatrix>(*mat);
|
||||
} else {
|
||||
toOrder.resize(Nb);
|
||||
fromOrder.resize(Nb);
|
||||
CSCRowIndices.resize(nnzb);
|
||||
CSCColPointers.resize(Nb + 1);
|
||||
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rmat);
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
||||
} else {
|
||||
toOrder.resize(Nb);
|
||||
fromOrder.resize(Nb);
|
||||
CSCRowIndices = new int[nnzbs];
|
||||
CSCColPointers = new int[Nb + 1];
|
||||
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
|
||||
|
||||
Timer t_convert;
|
||||
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
|
||||
if(verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
Timer t_convert;
|
||||
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
Timer t_analysis;
|
||||
std::ostringstream out;
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
out << "BILU0 reordering strategy: none\n";
|
||||
// numColors = 1;
|
||||
// rowsPerColor.emplace_back(Nb);
|
||||
numColors = Nb;
|
||||
for(int i = 0; i < Nb; ++i){
|
||||
rowsPerColor.emplace_back(1);
|
||||
}
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||
}
|
||||
if(verbosity >= 1){
|
||||
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||
Timer t_analysis;
|
||||
std::ostringstream out;
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
out << "BILU0 reordering strategy: none\n";
|
||||
// numColors = 1;
|
||||
// rowsPerColor.emplace_back(Nb);
|
||||
numColors = Nb;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
rowsPerColor.emplace_back(1);
|
||||
}
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||
}
|
||||
if (verbosity >= 1) {
|
||||
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||
}
|
||||
#if CHOW_PATEL
|
||||
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||
#endif
|
||||
OpmLog::info(out.str());
|
||||
OpmLog::info(out.str());
|
||||
|
||||
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
delete[] CSCRowIndices;
|
||||
delete[] CSCColPointers;
|
||||
}
|
||||
|
||||
diagIndex.resize(mat->Nb);
|
||||
invDiagVals = new double[mat->Nb * bs * bs];
|
||||
diagIndex.resize(mat->Nb);
|
||||
invDiagVals.resize(mat->Nb * bs * bs);
|
||||
|
||||
#if CHOW_PATEL
|
||||
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
#endif
|
||||
|
||||
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
|
||||
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
||||
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
||||
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
||||
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
|
||||
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
|
||||
#if CHOW_PATEL
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
|
||||
#else
|
||||
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
|
||||
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
|
||||
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
|
||||
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
|
||||
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
|
||||
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
|
||||
#endif
|
||||
|
||||
events.resize(2);
|
||||
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
|
||||
events.resize(2);
|
||||
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
|
||||
|
||||
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
|
||||
for (int i = 0; i < numColors; ++i) {
|
||||
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
}
|
||||
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
|
||||
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
|
||||
for (int i = 0; i < numColors; ++i) {
|
||||
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
}
|
||||
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end init()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
auto *m = mat;
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
auto *m = mat;
|
||||
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
m = rmat.get();
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
||||
Timer t_copy;
|
||||
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
|
||||
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 memcpy: " << t_copy.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
#if CHOW_PATEL
|
||||
chowPatelIlu.decomposition(queue, context,
|
||||
LUmat.get(), Lmat.get(), Umat.get(),
|
||||
invDiagVals, diagIndex,
|
||||
s.diagIndex, s.invDiagVals,
|
||||
s.Lvals, s.Lcols, s.Lrows,
|
||||
s.Uvals, s.Ucols, s.Urows);
|
||||
#else
|
||||
Timer t_copyToGpu;
|
||||
|
||||
events.resize(1);
|
||||
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
|
||||
std::call_once(pattern_uploaded, [&](){
|
||||
// find the positions of each diagonal block
|
||||
// must be done after reordering
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int rowStart = LUmat->rowPointers[row];
|
||||
int rowEnd = LUmat->rowPointers[row+1];
|
||||
|
||||
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
|
||||
assert(candidate != LUmat->colIndices + rowEnd);
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(4);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
});
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
m = rmat.get();
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Timer t_decomposition;
|
||||
std::ostringstream out;
|
||||
cl::Event event;
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
const unsigned int firstRow = rowsPerColorPrefix[color];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color+1];
|
||||
if (verbosity >= 4) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
#endif // CHOW_PATEL
|
||||
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
const double relaxation = 0.9;
|
||||
cl::Event event;
|
||||
Timer t_apply;
|
||||
|
||||
for(int color = 0; color < numColors; ++color){
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
for(int color = numColors-1; color >= 0; --color){
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// apply relaxation
|
||||
OpenclKernels::scale(x, relaxation, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 apply: " << t_apply.stop() << " s";
|
||||
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
||||
Timer t_copy;
|
||||
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 memcpy: " << t_copy.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
#if CHOW_PATEL
|
||||
chowPatelIlu.decomposition(queue, context,
|
||||
LUmat.get(), Lmat.get(), Umat.get(),
|
||||
invDiagVals.data(), diagIndex,
|
||||
s.diagIndex, s.invDiagVals,
|
||||
s.Lvals, s.Lcols, s.Lrows,
|
||||
s.Uvals, s.Ucols, s.Urows);
|
||||
#else
|
||||
Timer t_copyToGpu;
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
|
||||
std::call_once(pattern_uploaded, [&]() {
|
||||
// find the positions of each diagonal block
|
||||
// must be done after reordering
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int rowStart = LUmat->rowPointers[row];
|
||||
int rowEnd = LUmat->rowPointers[row + 1];
|
||||
|
||||
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
|
||||
assert(candidate != LUmat->colIndices + rowEnd);
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(4);
|
||||
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
});
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Timer t_decomposition;
|
||||
std::ostringstream out;
|
||||
cl::Event event;
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
const unsigned int firstRow = rowsPerColorPrefix[color];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
|
||||
if (verbosity >= 4) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
#endif // CHOW_PATEL
|
||||
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLContext(cl::Context *context_) {
|
||||
this->context = context_;
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
|
||||
this->queue = queue_;
|
||||
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
const double relaxation = 0.9;
|
||||
cl::Event event;
|
||||
Timer t_apply;
|
||||
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int color = numColors - 1; color >= 0; --color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// apply relaxation
|
||||
OpenclKernels::scale(x, relaxation, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 apply: " << t_apply.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template BILU0<n>::BILU0(ILUReorder, int); \
|
||||
template BILU0<n>::~BILU0(); \
|
||||
template bool BILU0<n>::init(BlockedMatrix<n>*); \
|
||||
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
|
||||
template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
|
||||
template void BILU0<n>::setOpenCLContext(cl::Context*); \
|
||||
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class BILU0<n>;
|
||||
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||
|
||||
|
||||
@@ -35,89 +36,86 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
/// This class implementa a Blocked ILU0 preconditioner
|
||||
/// The decomposition is done on CPU, and reorders the rows of the matrix
|
||||
template <unsigned int block_size>
|
||||
class BILU0
|
||||
{
|
||||
/// This class implements a Blocked ILU0 preconditioner
|
||||
/// The decomposition is done on CPU, and reorders the rows of the matrix
|
||||
template <unsigned int block_size>
|
||||
class BILU0 : public Preconditioner<block_size>
|
||||
{
|
||||
typedef Preconditioner<block_size> Base;
|
||||
|
||||
private:
|
||||
int N; // number of rows of the matrix
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzbs; // number of blocks of the matrix
|
||||
std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
using Base::context;
|
||||
using Base::queue;
|
||||
using Base::events;
|
||||
using Base::err;
|
||||
|
||||
private:
|
||||
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
|
||||
#if CHOW_PATEL
|
||||
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
|
||||
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
|
||||
#endif
|
||||
double *invDiagVals = nullptr;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
||||
std::vector<int> toOrder, fromOrder;
|
||||
int numColors;
|
||||
int verbosity;
|
||||
std::once_flag pattern_uploaded;
|
||||
std::vector<double> invDiagVals;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
||||
std::vector<int> toOrder, fromOrder;
|
||||
int numColors;
|
||||
std::once_flag pattern_uploaded;
|
||||
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
|
||||
typedef struct {
|
||||
cl::Buffer invDiagVals;
|
||||
cl::Buffer diagIndex;
|
||||
cl::Buffer rowsPerColor;
|
||||
typedef struct {
|
||||
cl::Buffer invDiagVals;
|
||||
cl::Buffer diagIndex;
|
||||
cl::Buffer rowsPerColor;
|
||||
#if CHOW_PATEL
|
||||
cl::Buffer Lvals, Lcols, Lrows;
|
||||
cl::Buffer Uvals, Ucols, Urows;
|
||||
cl::Buffer Lvals, Lcols, Lrows;
|
||||
cl::Buffer Uvals, Ucols, Urows;
|
||||
#else
|
||||
cl::Buffer LUvals, LUcols, LUrows;
|
||||
cl::Buffer LUvals, LUcols, LUrows;
|
||||
#endif
|
||||
} GPU_storage;
|
||||
} GPU_storage;
|
||||
|
||||
GPU_storage s;
|
||||
cl::Context *context;
|
||||
cl::CommandQueue *queue;
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
GPU_storage s;
|
||||
|
||||
#if CHOW_PATEL
|
||||
ChowPatelIlu<block_size> chowPatelIlu;
|
||||
ChowPatelIlu<block_size> chowPatelIlu;
|
||||
#endif
|
||||
|
||||
public:
|
||||
public:
|
||||
|
||||
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
|
||||
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
|
||||
|
||||
~BILU0();
|
||||
// analysis, find reordering if specified
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
|
||||
// analysis
|
||||
bool init(BlockedMatrix<block_size> *mat);
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x);
|
||||
int* getToOrder() override
|
||||
{
|
||||
return toOrder.data();
|
||||
}
|
||||
|
||||
void setOpenCLContext(cl::Context *context);
|
||||
void setOpenCLQueue(cl::CommandQueue *queue);
|
||||
int* getFromOrder() override
|
||||
{
|
||||
return fromOrder.data();
|
||||
}
|
||||
|
||||
int* getToOrder()
|
||||
{
|
||||
return toOrder.data();
|
||||
}
|
||||
BlockedMatrix* getRMat() override
|
||||
{
|
||||
return rmat.get();
|
||||
}
|
||||
|
||||
int* getFromOrder()
|
||||
{
|
||||
return fromOrder.data();
|
||||
}
|
||||
|
||||
BlockedMatrix<block_size>* getRMat()
|
||||
{
|
||||
return rmat.get();
|
||||
}
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
@@ -63,7 +63,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
double tolerance,
|
||||
[[maybe_unused]] unsigned int platformID,
|
||||
unsigned int deviceID,
|
||||
[[maybe_unused]] std::string opencl_ilu_reorder)
|
||||
[[maybe_unused]] std::string opencl_ilu_reorder,
|
||||
[[maybe_unused]] std::string linsolver)
|
||||
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
|
||||
{
|
||||
if (accelerator_mode.compare("cusparse") == 0) {
|
||||
@@ -88,7 +89,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
||||
}
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
|
||||
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
@@ -168,29 +169,26 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
|
||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||
// sparsity pattern should stay the same
|
||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||
template <class BridgeMatrix>
|
||||
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
int sum_nnzs = 0;
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
|
||||
h_rows.clear();
|
||||
h_cols.clear();
|
||||
|
||||
// convert colIndices and rowPointers
|
||||
if (h_rows.empty()) {
|
||||
h_rows.emplace_back(0);
|
||||
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
int size_row = 0;
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
h_cols.emplace_back(c.index());
|
||||
size_row++;
|
||||
}
|
||||
sum_nnzs += size_row;
|
||||
h_rows.emplace_back(sum_nnzs);
|
||||
}
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
|
||||
h_rows.emplace_back(0);
|
||||
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
h_cols.emplace_back(c.index());
|
||||
}
|
||||
h_rows.emplace_back(h_cols.size());
|
||||
}
|
||||
} // end getSparsityPattern()
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
@@ -220,7 +218,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
|
||||
if (h_rows.capacity() == 0) {
|
||||
h_rows.reserve(Nb+1);
|
||||
h_cols.reserve(nnzb);
|
||||
getSparsityPattern(*mat, h_rows, h_cols);
|
||||
copySparsityPatternFromISTL(*mat, h_rows, h_cols);
|
||||
}
|
||||
|
||||
Dune::Timer t_zeros;
|
||||
|
||||
@@ -54,7 +54,9 @@ public:
|
||||
/// \param[in] platformID the OpenCL platform ID to be used
|
||||
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
||||
/// \param[in] linsolver copy of cmdline argument --linsolver
|
||||
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
|
||||
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
@@ -75,6 +77,12 @@ public:
|
||||
return use_gpu;
|
||||
}
|
||||
|
||||
/// Store sparsity pattern into vectors
|
||||
/// \param[in] mat input matrix, probably BCRSMatrix
|
||||
/// \param[out] h_rows rowpointers
|
||||
/// \param[out] h_cols columnindices
|
||||
static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
|
||||
|
||||
/// Initialize the WellContributions object with opencl context and queue
|
||||
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
|
||||
/// \param[in] wellContribs container to hold all WellContributions
|
||||
|
||||
@@ -76,6 +76,7 @@ namespace Accelerator {
|
||||
/// \param[in] tolerance required relative tolerance for solver
|
||||
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
|
||||
/// \param[in] deviceID the device to be used
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
||||
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
|
||||
|
||||
@@ -26,7 +26,9 @@
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
@@ -37,8 +39,7 @@ using Opm::OpmLog;
|
||||
|
||||
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
||||
|
||||
template <unsigned int block_size>
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right) {
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
|
||||
const unsigned int bs = block_size;
|
||||
int l = left;
|
||||
int r = right;
|
||||
@@ -63,17 +64,16 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right) {
|
||||
} while (l < r);
|
||||
|
||||
if (left < r)
|
||||
sortBlockedRow<bs>(colIndices, data, left, r);
|
||||
sortBlockedRow(colIndices, data, left, r, bs);
|
||||
|
||||
if (right > l)
|
||||
sortBlockedRow<bs>(colIndices, data, l, right);
|
||||
sortBlockedRow(colIndices, data, l, right, bs);
|
||||
}
|
||||
|
||||
|
||||
// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
|
||||
// a = a - (b * c)
|
||||
template <unsigned int block_size>
|
||||
void blockMultSub(double *a, double *b, double *c)
|
||||
void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
|
||||
{
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
for (unsigned int col = 0; col < block_size; col++) {
|
||||
@@ -88,8 +88,7 @@ void blockMultSub(double *a, double *b, double *c)
|
||||
|
||||
/*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
|
||||
|
||||
template <unsigned int block_size>
|
||||
void blockMult(double *mat1, double *mat2, double *resMat) {
|
||||
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
for (unsigned int col = 0; col < block_size; col++) {
|
||||
double temp = 0;
|
||||
@@ -104,8 +103,7 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
|
||||
#if HAVE_FPGA
|
||||
|
||||
/*Subtract two blocks from one another element by element*/
|
||||
template <unsigned int block_size>
|
||||
void blockSub(double *mat1, double *mat2, double *resMat) {
|
||||
void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
for (unsigned int col = 0; col < block_size; col++) {
|
||||
resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
|
||||
@@ -114,8 +112,7 @@ void blockSub(double *mat1, double *mat2, double *resMat) {
|
||||
}
|
||||
|
||||
/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
|
||||
template <unsigned int block_size>
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size) {
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
if (resetRes) {
|
||||
resVect[row] = 0.0;
|
||||
@@ -128,8 +125,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo
|
||||
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::countUnblockedNnzs() {
|
||||
int BlockedMatrix::countUnblockedNnzs() {
|
||||
int numNnzsOverThreshold = 0;
|
||||
int totalNnzs = rowPointers[Nb];
|
||||
for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
|
||||
@@ -144,8 +140,7 @@ int BlockedMatrix<block_size>::countUnblockedNnzs() {
|
||||
* Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
|
||||
* If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
|
||||
*/
|
||||
template <unsigned int block_size>
|
||||
void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
|
||||
void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) {
|
||||
const unsigned int bs = block_size;
|
||||
int valIndex = 0, nnzsPerRow;
|
||||
|
||||
@@ -183,8 +178,7 @@ void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
|
||||
|
||||
/*Optimized version*/
|
||||
// ub* prefixes indicate unblocked data
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||
int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
|
||||
{
|
||||
@@ -223,8 +217,7 @@ int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isU
|
||||
// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
|
||||
// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
|
||||
// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
|
||||
int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor,
|
||||
int rowsPerColorLimit, int columnsPerColorLimit,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
|
||||
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
|
||||
@@ -470,41 +463,5 @@ void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::
|
||||
#endif // HAVE_FPGA
|
||||
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template void sortBlockedRow<n>(int *, double *, int, int); \
|
||||
template void blockMultSub<n>(double *, double *, double *); \
|
||||
template void blockMult<n>(double *, double *, double *); \
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
#if HAVE_FPGA
|
||||
#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \
|
||||
template void blockSub<n>(double *, double *, double *); \
|
||||
template void blockVectMult<n>(double *, double *, double, double *, bool); \
|
||||
template int BlockedMatrix<n>::toRDF(int, int *, bool, \
|
||||
std::vector<std::vector<int> >& , int, int *, \
|
||||
std::vector<std::vector<double> >&, short int *, unsigned char *, int *, int *); \
|
||||
template int BlockedMatrix<n>::findPartitionColumns(int, int *, \
|
||||
int, int, \
|
||||
std::vector<std::vector<int> >& , int *, int *, \
|
||||
std::vector<std::vector<int> >& , int *, int *, \
|
||||
std::vector<std::vector<int> >& , int *, int *);
|
||||
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
|
||||
|
||||
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
|
||||
#endif // HAVE_FPGA
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
@@ -17,14 +17,14 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BLOCKED_MATRIX_HPP
|
||||
#define BLOCKED_MATRIX_HPP
|
||||
#ifndef OPM_BLOCKED_MATRIX_HPP
|
||||
#define OPM_BLOCKED_MATRIX_HPP
|
||||
|
||||
#if HAVE_FPGA
|
||||
#include <vector>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#endif
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
@@ -33,7 +33,6 @@ namespace Accelerator
|
||||
|
||||
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
|
||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||
template<unsigned int block_size>
|
||||
class BlockedMatrix
|
||||
{
|
||||
|
||||
@@ -42,12 +41,14 @@ public:
|
||||
/// Allocate BlockedMatrix and data arrays with given sizes
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
BlockedMatrix(int Nb_, int nnzbs_)
|
||||
: nnzValues(new double[nnzbs_*block_size*block_size]),
|
||||
colIndices(new int[nnzbs_*block_size*block_size]),
|
||||
/// \param[in] block_size the number of rows and columns for each block
|
||||
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
|
||||
: nnzValues(new double[nnzbs_*block_size_*block_size_]),
|
||||
colIndices(new int[nnzbs_*block_size_*block_size_]),
|
||||
rowPointers(new int[Nb_+1]),
|
||||
Nb(Nb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_),
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(true)
|
||||
{}
|
||||
@@ -55,11 +56,12 @@ public:
|
||||
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
|
||||
/// \param[in] M matrix to be copied
|
||||
BlockedMatrix(const BlockedMatrix& M)
|
||||
: nnzValues(new double[M.nnzbs*block_size*block_size]),
|
||||
: nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
|
||||
colIndices(M.colIndices),
|
||||
rowPointers(M.rowPointers),
|
||||
Nb(M.Nb),
|
||||
nnzbs(M.nnzbs),
|
||||
block_size(M.block_size),
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(false)
|
||||
{}
|
||||
@@ -67,15 +69,17 @@ public:
|
||||
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
/// \param[in] block_size the number of rows and columns for each block
|
||||
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
|
||||
/// \param[in] colIndices array of column indices, contains nnzb entries
|
||||
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
|
||||
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
||||
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
||||
: nnzValues(nnzValues_),
|
||||
colIndices(colIndices_),
|
||||
rowPointers(rowPointers_),
|
||||
Nb(Nb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_),
|
||||
deleteNnzs(false),
|
||||
deleteSparsity(false)
|
||||
{}
|
||||
@@ -117,6 +121,7 @@ public:
|
||||
int *rowPointers;
|
||||
int Nb;
|
||||
int nnzbs;
|
||||
unsigned int block_size;
|
||||
bool deleteNnzs;
|
||||
bool deleteSparsity;
|
||||
};
|
||||
@@ -127,31 +132,45 @@ public:
|
||||
/// \param[inout] data
|
||||
/// \param[in] left lower index of data of row
|
||||
/// \param[in] right upper index of data of row
|
||||
template <unsigned int block_size>
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right);
|
||||
/// \param[in] block_size size of blocks in the row
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
|
||||
|
||||
/// Multiply and subtract blocks
|
||||
/// a = a - (b * c)
|
||||
/// \param[inout] a block to be subtracted from
|
||||
/// \param[in] b input block
|
||||
/// \param[in] c input block
|
||||
template <unsigned int block_size>
|
||||
void blockMultSub(double *a, double *b, double *c);
|
||||
/// \param[in] block_size size of block
|
||||
void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
|
||||
|
||||
/// Perform a 3x3 matrix-matrix multiplication on two blocks
|
||||
/// Perform a matrix-matrix multiplication on two blocks
|
||||
/// resMat = mat1 * mat2
|
||||
/// \param[in] mat1 input block 1
|
||||
/// \param[in] mat2 input block 2
|
||||
/// \param[inout] resMat output block
|
||||
template <unsigned int block_size>
|
||||
void blockMult(double *mat1, double *mat2, double *resMat);
|
||||
/// \param[out] resMat output block
|
||||
/// \param[in] block_size size of block
|
||||
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
|
||||
|
||||
|
||||
#if HAVE_FPGA
|
||||
template <unsigned int block_size>
|
||||
void blockSub(double *mat1, double *mat2, double *resMat);
|
||||
/// Perform a matrix-matrix subtraction on two blocks, element-wise
|
||||
/// resMat = mat1 - mat2
|
||||
/// \param[in] mat1 input block 1
|
||||
/// \param[in] mat2 input block 2
|
||||
/// \param[out] resMat output block
|
||||
/// \param[in] block_size size of block
|
||||
void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size);
|
||||
|
||||
template <unsigned int block_size>
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
|
||||
/// Perform a matrix-vector multiplication
|
||||
/// resVect = mat * vect
|
||||
/// resVect += mat * vect
|
||||
/// \param[in] mat input matrix
|
||||
/// \param[in] vect input vector
|
||||
/// \param[in] scale multiply output with this factor
|
||||
/// \param[inout] resVect output vector
|
||||
/// \param[in] resetRes if true, overwrite resVect, otherwise add to it
|
||||
/// \param[in] block_size size of block
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size);
|
||||
|
||||
/// Convert a blocked inverse diagonal to the FPGA format.
|
||||
/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
|
||||
|
||||
535
opm/simulators/linalg/bda/CPR.cpp
Normal file
535
opm/simulators/linalg/bda/CPR.cpp
Normal file
@@ -0,0 +1,535 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/CPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
|
||||
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) :
|
||||
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
{
|
||||
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
|
||||
diagIndices.resize(1);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
bilu0->setOpencl(context, queue);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
|
||||
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_);
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
mat = mat_;
|
||||
} else {
|
||||
mat = bilu0->getRMat();
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// return the absolute value of the N elements for which the absolute value is highest
|
||||
double get_absmax(const double *data, const int N) {
|
||||
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
|
||||
}
|
||||
|
||||
|
||||
// solve A^T * x = b
|
||||
void solve_transposed_3x3(const double *A, const double *b, double *x) {
|
||||
const int B = 3;
|
||||
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
|
||||
double t4 = A[0+0*B] * A[1+1*B];
|
||||
double t6 = A[0+0*B] * A[1+2*B];
|
||||
double t8 = A[0+1*B] * A[1+0*B];
|
||||
double t10 = A[0+2*B] * A[1+0*B];
|
||||
double t12 = A[0+1*B] * A[2+0*B];
|
||||
double t14 = A[0+2*B] * A[2+0*B];
|
||||
|
||||
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
|
||||
|
||||
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
|
||||
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
|
||||
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
|
||||
|
||||
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
|
||||
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
|
||||
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
|
||||
|
||||
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
|
||||
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
|
||||
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::init_opencl_buffers() {
|
||||
d_Amatrices.reserve(num_levels);
|
||||
d_Rmatrices.reserve(num_levels - 1);
|
||||
d_invDiags.reserve(num_levels - 1);
|
||||
for (Matrix& m : Amatrices) {
|
||||
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
}
|
||||
for (Matrix& m : Rmatrices) {
|
||||
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
|
||||
|
||||
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
|
||||
}
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::opencl_upload() {
|
||||
d_mat->upload(queue.get(), mat);
|
||||
|
||||
err = CL_SUCCESS;
|
||||
events.resize(2 * Rmatrices.size() + 1);
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
|
||||
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
|
||||
this->mat = mat_;
|
||||
|
||||
coarse_vals.resize(nnzb);
|
||||
coarse_x.resize(Nb);
|
||||
coarse_y.resize(Nb);
|
||||
weights.resize(N);
|
||||
|
||||
try{
|
||||
double rhs[] = {0, 0, 0};
|
||||
rhs[pressure_idx] = 1;
|
||||
|
||||
// find diagonal index for each row
|
||||
if (diagIndices[0].empty()) {
|
||||
diagIndices[0].resize(Nb);
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
|
||||
assert(candidate != mat->colIndices + end);
|
||||
diagIndices[0][row] = candidate - mat->colIndices;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate weights for each row
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
// solve to find weights
|
||||
double *row_weights = weights.data() + block_size * row; // weights for this row
|
||||
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
|
||||
|
||||
// normalize weights for this row
|
||||
double abs_max = get_absmax(row_weights, block_size);
|
||||
for(unsigned int i = 0; i < block_size; i++){
|
||||
row_weights[i] /= abs_max;
|
||||
}
|
||||
}
|
||||
|
||||
// extract pressure
|
||||
// transform blocks to scalars to create scalar linear system
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
double *block = mat->nnzValues + idx * block_size * block_size;
|
||||
double *row_weights = weights.data() + block_size * row;
|
||||
double value = 0.0;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
value += block[block_size * i + pressure_idx] * row_weights[i];
|
||||
}
|
||||
coarse_vals[idx] = value;
|
||||
}
|
||||
}
|
||||
|
||||
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
|
||||
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
|
||||
if (recalculate_aggregates) {
|
||||
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
|
||||
|
||||
typedef DuneMat::CreateIterator Iter;
|
||||
|
||||
// setup sparsity pattern
|
||||
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
|
||||
int start = mat->rowPointers[row.index()];
|
||||
int end = mat->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
row.insert(col);
|
||||
}
|
||||
}
|
||||
|
||||
// set values
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
|
||||
Dune::Amg::SequentialInformation seqinfo;
|
||||
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
|
||||
|
||||
Opm::PropertyTree property_tree;
|
||||
property_tree.put("alpha", 0.333333333333);
|
||||
|
||||
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
|
||||
// Yet a SymmetricDependency is used in AMGCPR
|
||||
// An UnSymmetricCriterion is also available
|
||||
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
|
||||
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
const Criterion c = Opm::PreconditionerFactory<MatrixOperator, Dune::Amg::SequentialInformation>::amgCriterion(property_tree);
|
||||
num_pre_smooth_steps = c.getNoPreSmoothSteps();
|
||||
num_post_smooth_steps = c.getNoPostSmoothSteps();
|
||||
|
||||
dune_amg->build<OverlapFlags>(c);
|
||||
|
||||
analyzeHierarchy();
|
||||
analyzeAggregateMaps();
|
||||
|
||||
recalculate_aggregates = false;
|
||||
} else {
|
||||
// update values of coarsest level in AMG
|
||||
// this works because that level is actually a reference to the DuneMat held by dune_coarse
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
// update the rest of the AMG hierarchy
|
||||
dune_amg->recalculateGalerkin(OverlapFlags());
|
||||
analyzeHierarchy();
|
||||
}
|
||||
|
||||
// initialize OpenclMatrices and Buffers if needed
|
||||
auto init_func = std::bind(&CPR::init_opencl_buffers, this);
|
||||
std::call_once(opencl_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
opencl_upload();
|
||||
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "Caught exception: " << ex.what() << std::endl;
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeHierarchy() {
|
||||
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
|
||||
// store coarsest AMG level in umfpack format, also performs LU decomposition
|
||||
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
|
||||
|
||||
num_levels = dune_amg->levels();
|
||||
level_sizes.resize(num_levels);
|
||||
diagIndices.resize(num_levels);
|
||||
|
||||
Amatrices.reserve(num_levels);
|
||||
Rmatrices.reserve(num_levels - 1); // coarsest level does not need one
|
||||
invDiags.reserve(num_levels);
|
||||
|
||||
Amatrices.clear();
|
||||
invDiags.clear();
|
||||
|
||||
// matrixIter.dereference() returns MatrixAdapter
|
||||
// matrixIter.dereference().getmat() returns BCRSMat
|
||||
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
const auto& A = matrixIter.dereference().getmat();
|
||||
level_sizes[level] = A.N();
|
||||
diagIndices[level].reserve(A.N());
|
||||
|
||||
// extract matrix A
|
||||
Amatrices.emplace_back(A.N(), A.nonzeroes());
|
||||
// contiguous copy is not possible
|
||||
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
|
||||
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
|
||||
int nnz_idx = 0;
|
||||
const bool fillDiagIndices = diagIndices[level].empty();
|
||||
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
|
||||
if (fillDiagIndices && r.index() == c.index()) {
|
||||
diagIndices[level].emplace_back(nnz_idx);
|
||||
}
|
||||
nnz_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
|
||||
|
||||
// compute inverse diagonal values for current level
|
||||
invDiags.emplace_back(A.N());
|
||||
for (unsigned int row = 0; row < A.N(); ++row) {
|
||||
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::analyzeAggregateMaps() {
|
||||
|
||||
PcolIndices.resize(num_levels - 1);
|
||||
Rmatrices.clear();
|
||||
|
||||
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
|
||||
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
DuneAmg::AggregatesMap *map = *mapIter;
|
||||
|
||||
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
|
||||
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
|
||||
|
||||
// get indices for each row of P and R
|
||||
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
|
||||
PcolIndices[level].resize(level_sizes[level]);
|
||||
|
||||
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
|
||||
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
|
||||
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
|
||||
const long int diff = ai - map->begin();
|
||||
PcolIndices[level][diff] = *ai;
|
||||
indicesR[*ai].emplace_back(diff);
|
||||
}
|
||||
}
|
||||
|
||||
int col_idx = 0;
|
||||
// set sparsity pattern of R
|
||||
Rmatrices.back().rowPointers[0] = 0;
|
||||
for (unsigned int i = 0; i < indicesR.size(); ++i) {
|
||||
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
|
||||
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
|
||||
Rmatrices.back().colIndices[col_idx++] = *it;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
|
||||
OpenclMatrix *A = &d_Amatrices[level];
|
||||
OpenclMatrix *R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
|
||||
if (level == num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
|
||||
}
|
||||
|
||||
// solve coarsest level using umfpack
|
||||
umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
return;
|
||||
}
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
cl::Buffer& t = d_t[level];
|
||||
cl::Buffer& f = d_f[level];
|
||||
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
double jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// x = prec(y)
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
|
||||
// 0-initialize u and x vectors
|
||||
events.resize(d_u.size() + 1);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
|
||||
Dune::Timer t_bilu0;
|
||||
bilu0->apply(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
apply_amg(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "CPR apply amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template class CPR<n>;
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
|
||||
163
opm/simulators/linalg/bda/CPR.hpp
Normal file
163
opm/simulators/linalg/bda/CPR.hpp
Normal file
@@ -0,0 +1,163 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_CPR_HPP
|
||||
#define OPM_CPR_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <dune/istl/paamg/matrixhierarchy.hh>
|
||||
#include <dune/istl/umfpack.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
|
||||
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend;
|
||||
|
||||
class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <unsigned int block_size>
|
||||
class CPR : public Preconditioner<block_size>
|
||||
{
|
||||
typedef Preconditioner<block_size> Base;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
using Base::context;
|
||||
using Base::queue;
|
||||
using Base::events;
|
||||
using Base::err;
|
||||
|
||||
private:
|
||||
int num_levels;
|
||||
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<cl::Buffer> d_PcolIndices;
|
||||
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<cl::Buffer> d_invDiags;
|
||||
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
|
||||
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
|
||||
std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
|
||||
std::unique_ptr<OpenclMatrix> d_mat; // stores blocked matrix
|
||||
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
|
||||
BlockedMatrix *mat = nullptr; // input matrix, blocked
|
||||
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
|
||||
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
|
||||
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
|
||||
std::unique_ptr<DuneAmg> dune_amg;
|
||||
std::unique_ptr<DuneMat> dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy
|
||||
std::shared_ptr<MatrixOperator> dune_op; // operator, input to Dune AMG
|
||||
std::vector<int> level_sizes; // size of each level in the AMG hierarchy
|
||||
std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
|
||||
Dune::UMFPack<DuneMat> umfpack; // dune/istl/umfpack object used to solve the coarsest level of AMG
|
||||
bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
|
||||
bool recalculate_aggregates = true; // only rerecalculate if true
|
||||
const int pressure_idx = 1; // hardcoded to mimic OPM
|
||||
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
|
||||
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
|
||||
|
||||
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
|
||||
ILUReorder opencl_ilu_reorder; // reordering strategy for ILU0 in coarse solver
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
void analyzeHierarchy();
|
||||
|
||||
// Analyze the aggregateMaps from the AMG hierarchy
|
||||
// These can be reused, so only use when recalculate_aggregates is true
|
||||
void analyzeAggregateMaps();
|
||||
|
||||
// Initialize and allocate matrices and vectors
|
||||
void init_opencl_buffers();
|
||||
|
||||
// Copy matrices and vectors to GPU
|
||||
void opencl_upload();
|
||||
|
||||
// apply pressure correction to vector
|
||||
void apply_amg(const cl::Buffer& y, cl::Buffer& x);
|
||||
|
||||
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
|
||||
|
||||
void create_preconditioner_amg(BlockedMatrix *mat);
|
||||
|
||||
public:
|
||||
|
||||
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
|
||||
// applies blocked ilu0
|
||||
// also applies amg for pressure component
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
|
||||
int* getToOrder() override
|
||||
{
|
||||
return bilu0->getToOrder();
|
||||
}
|
||||
|
||||
int* getFromOrder() override
|
||||
{
|
||||
return bilu0->getFromOrder();
|
||||
}
|
||||
|
||||
BlockedMatrix* getRMat() override
|
||||
{
|
||||
return bilu0->getRMat();
|
||||
}
|
||||
};
|
||||
|
||||
// solve A^T * x = b
|
||||
// A should represent a 3x3 matrix
|
||||
// x and b are vectors with 3 elements
|
||||
void solve_transposed_3x3(const double *A, const double *b, double *x);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
||||
@@ -482,7 +482,7 @@ __kernel void chow_patel_ilu_sweep(
|
||||
template <unsigned int block_size>
|
||||
void ChowPatelIlu<block_size>::decomposition(
|
||||
cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
|
||||
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
|
||||
BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
|
||||
double *invDiagVals, std::vector<int>& diagIndex,
|
||||
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
|
||||
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
|
||||
@@ -569,7 +569,7 @@ void ChowPatelIlu<block_size>::decomposition(
|
||||
} else {
|
||||
Lmat->colIndices[num_blocks_L] = j;
|
||||
// multiply block of L with corresponding diag block
|
||||
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
|
||||
blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs, bs);
|
||||
num_blocks_L++;
|
||||
}
|
||||
}
|
||||
@@ -671,7 +671,7 @@ void ChowPatelIlu<block_size>::decomposition(
|
||||
}
|
||||
if (colL == rowU2) {
|
||||
// Aij -= (Lik * Ukj)
|
||||
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
|
||||
blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs, bs);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -713,7 +713,7 @@ void ChowPatelIlu<block_size>::decomposition(
|
||||
|
||||
if (rowU == colL) {
|
||||
// Aij -= (Lik * Ukj)
|
||||
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
|
||||
blockMultSub(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs, bs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -721,7 +721,7 @@ void ChowPatelIlu<block_size>::decomposition(
|
||||
double ujj[bs*bs];
|
||||
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
|
||||
// Lij_new = (Aij - sum) / Ujj
|
||||
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
|
||||
blockMult(&aij[0], &ujj[0], Ltmp + ij * bs * bs, bs);
|
||||
}
|
||||
}
|
||||
// 1st sweep writes to Ltmp
|
||||
|
||||
@@ -82,7 +82,7 @@ public:
|
||||
/// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
|
||||
void decomposition(
|
||||
cl::CommandQueue *queue, cl::Context *context,
|
||||
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
|
||||
BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
|
||||
double *invDiagVals, std::vector<int>& diagIndex,
|
||||
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
|
||||
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
|
||||
|
||||
@@ -59,7 +59,7 @@ FPGABILU0<block_size>::~FPGABILU0()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
bool FPGABILU0<block_size>::init(BlockedMatrix *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
@@ -91,8 +91,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
}
|
||||
|
||||
Timer t_analysis;
|
||||
rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||
LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
|
||||
rMat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
|
||||
LUMat = std::make_unique<BlockedMatrix>(*rMat);
|
||||
std::ostringstream out;
|
||||
if (level_scheduling) {
|
||||
out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
|
||||
@@ -117,7 +117,7 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
|
||||
int blockDiagSize = 0;
|
||||
// This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
|
||||
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
int doneRows = 0;
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
|
||||
@@ -187,8 +187,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
|
||||
diagIndex.resize(mat->Nb, 0);
|
||||
invDiagVals = new double[mat->Nb * bs * bs];
|
||||
LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
LMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
|
||||
UMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
|
||||
resultPointers[0] = (void *) colorSizes.data();
|
||||
resultPointers[1] = (void *) PIndicesAddr.data();
|
||||
resultPointers[2] = (void *) nnzValues.data();
|
||||
@@ -232,11 +232,11 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
||||
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
@@ -286,7 +286,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
|
||||
|
||||
LSize++;
|
||||
// calculate the pivot of this row
|
||||
blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
|
||||
blockMult(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0], bs);
|
||||
|
||||
memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
|
||||
|
||||
@@ -296,7 +296,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
|
||||
// subtract that row scaled by the pivot from this row.
|
||||
while (ik < iRowEnd && jk < jRowEnd) {
|
||||
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
|
||||
blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
|
||||
blockMultSub(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs, bs);
|
||||
ik++;
|
||||
jk++;
|
||||
} else {
|
||||
@@ -402,8 +402,8 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int); \
|
||||
template FPGABILU0<n>::~FPGABILU0(); \
|
||||
template bool FPGABILU0<n>::init(BlockedMatrix<n> *); \
|
||||
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *); \
|
||||
template bool FPGABILU0<n>::init(BlockedMatrix*); \
|
||||
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix *);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
|
||||
@@ -45,8 +45,8 @@ private:
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzbs; // number of blocks of the matrix
|
||||
std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
|
||||
std::unique_ptr<BlockedMatrix> LMat = nullptr, UMat = nullptr, LUMat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix> rMat = nullptr; // reordered mat
|
||||
double *invDiagVals = nullptr;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> toOrder, fromOrder;
|
||||
@@ -82,10 +82,10 @@ public:
|
||||
~FPGABILU0();
|
||||
|
||||
// analysis (optional)
|
||||
bool init(BlockedMatrix<block_size> *mat);
|
||||
bool init(BlockedMatrix *mat);
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||
bool create_preconditioner(BlockedMatrix *mat);
|
||||
|
||||
int* getToOrder()
|
||||
{
|
||||
@@ -97,7 +97,7 @@ public:
|
||||
return fromOrder.data();
|
||||
}
|
||||
|
||||
BlockedMatrix<block_size>* getRMat()
|
||||
BlockedMatrix* getRMat()
|
||||
{
|
||||
return rMat.get();
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
#include <config.h>
|
||||
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/material/common/Unused.hpp>
|
||||
@@ -261,7 +262,7 @@ void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double
|
||||
|
||||
// allocate host memory for matrices and vectors
|
||||
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
|
||||
mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
|
||||
mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
|
||||
|
||||
@@ -56,8 +56,8 @@ private:
|
||||
bool level_scheduling = false;
|
||||
|
||||
// LUMat will shallow copy rowPointers and colIndices of mat/rMat
|
||||
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
|
||||
BlockedMatrix<block_size> *rMat = nullptr;
|
||||
std::unique_ptr<BlockedMatrix> mat = nullptr;
|
||||
BlockedMatrix *rMat = nullptr;
|
||||
std::unique_ptr<Preconditioner> prec = nullptr;
|
||||
|
||||
// vectors with data processed by the preconditioner (input to the kernel)
|
||||
|
||||
@@ -17,10 +17,13 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace Opm
|
||||
@@ -57,6 +60,7 @@ void sortRow(int *colIndices, double *data, int left, int right) {
|
||||
|
||||
}
|
||||
|
||||
#if HAVE_FPGA
|
||||
/*
|
||||
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
|
||||
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
|
||||
@@ -247,6 +251,8 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
@@ -17,8 +17,8 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef FPGA_MATRIX_HEADER_INCLUDED
|
||||
#define FPGA_MATRIX_HEADER_INCLUDED
|
||||
#ifndef OPM_MATRIX_HEADER_INCLUDED
|
||||
#define OPM_MATRIX_HEADER_INCLUDED
|
||||
|
||||
#include <vector>
|
||||
|
||||
@@ -33,25 +33,30 @@ class Matrix {
|
||||
|
||||
public:
|
||||
|
||||
/// Allocate Matrix and data arrays with given sizes
|
||||
/// Allocate square Matrix and data arrays with given sizes
|
||||
/// \param[in] N number of rows
|
||||
/// \param[in] nnzs number of nonzeros
|
||||
Matrix(int N_, int nnzs_)
|
||||
: nnzValues(new double[nnzs_]),
|
||||
colIndices(new int[nnzs_]),
|
||||
rowPointers(new int[N_+1]),
|
||||
N(N_),
|
||||
: N(N_),
|
||||
M(N_),
|
||||
nnzs(nnzs_)
|
||||
{}
|
||||
|
||||
/// All constructors allocate new memory, so always delete here
|
||||
~Matrix(){
|
||||
delete[] nnzValues;
|
||||
delete[] colIndices;
|
||||
delete[] rowPointers;
|
||||
{
|
||||
nnzValues.resize(nnzs);
|
||||
colIndices.resize(nnzs);
|
||||
rowPointers.resize(N+1);
|
||||
}
|
||||
|
||||
/// Allocate rectangular Matrix and data arrays with given sizes
|
||||
/// \param[in] N number of rows
|
||||
/// \param[in] M number of columns
|
||||
/// \param[in] nnzs number of nonzeros
|
||||
Matrix(int N_, int M_, int nnzs_)
|
||||
: Matrix(N_, nnzs_)
|
||||
{
|
||||
M = M_;
|
||||
}
|
||||
|
||||
#if HAVE_FPGA
|
||||
/// Converts this matrix to the dataformat used by the FPGA.
|
||||
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
|
||||
/// The purpose of this format is to allow the data to be streamable.
|
||||
@@ -71,11 +76,12 @@ public:
|
||||
int toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
|
||||
#endif
|
||||
|
||||
double *nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
int N;
|
||||
std::vector<double> nnzValues;
|
||||
std::vector<int> colIndices;
|
||||
std::vector<int> rowPointers;
|
||||
int N, M;
|
||||
int nnzs;
|
||||
};
|
||||
|
||||
@@ -84,4 +90,4 @@ void sortRow(int *colIndices, double *data, int left, int right);
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
#endif // FPGA_MATRIX_HEADER_INCLUDED
|
||||
#endif // OPM_MATRIX_HEADER_INCLUDED
|
||||
66
opm/simulators/linalg/bda/OpenclMatrix.cpp
Normal file
66
opm/simulators/linalg/bda/OpenclMatrix.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
|
||||
std::vector<cl::Event> events(3);
|
||||
|
||||
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
|
||||
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "OpenclMatrix OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
|
||||
if (block_size != 1) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
|
||||
}
|
||||
|
||||
void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
|
||||
if (matrix->block_size != block_size) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
|
||||
}
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
66
opm/simulators/linalg/bda/OpenclMatrix.hpp
Normal file
66
opm/simulators/linalg/bda/OpenclMatrix.hpp
Normal file
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_OPENCLMATRIX_HEADER_INCLUDED
|
||||
#define OPM_OPENCLMATRIX_HEADER_INCLUDED
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
class Matrix;
|
||||
class BlockedMatrix;
|
||||
|
||||
/// This struct resembles a csr matrix, only doubles are supported
|
||||
/// The matrix data is stored in OpenCL Buffers
|
||||
class OpenclMatrix {
|
||||
public:
|
||||
|
||||
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
|
||||
: Nb(Nb_),
|
||||
Mb(Mb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_)
|
||||
{
|
||||
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
|
||||
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
|
||||
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
|
||||
}
|
||||
|
||||
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
|
||||
void upload(cl::CommandQueue *queue, Matrix *matrix);
|
||||
void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
|
||||
|
||||
cl::Buffer nnzValues;
|
||||
cl::Buffer colIndices;
|
||||
cl::Buffer rowPointers;
|
||||
int Nb, Mb;
|
||||
int nnzbs;
|
||||
unsigned int block_size;
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
#endif // OPM_OPENCLMATRIX_HEADER_INCLUDED
|
||||
@@ -177,10 +177,10 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
|
||||
/* Reorder a matrix by a specified input order.
|
||||
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
|
||||
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
|
||||
assert(mat->block_size == rmat->block_size);
|
||||
|
||||
template <unsigned int block_size>
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat) {
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int bs = mat->block_size;
|
||||
int rIndex = 0;
|
||||
int i, k;
|
||||
unsigned int j;
|
||||
@@ -204,7 +204,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
|
||||
}
|
||||
// re-sort the column indices of every row.
|
||||
for (i = 0; i < mat->Nb; i++) {
|
||||
sortBlockedRow<bs>(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1);
|
||||
sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,7 +370,6 @@ void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template int colorBlockedNodes<n>(int, const int *, const int *, const int *, const int *, std::vector<int>&, int, int); \
|
||||
template void reorderBlockedMatrixByPattern<n>(BlockedMatrix<n> *, int *, int *, BlockedMatrix<n> *); \
|
||||
template void reorderBlockedVectorByPattern<n>(int, double*, int*, double*); \
|
||||
template void findGraphColoring<n>(const int *, const int *, const int *, const int *, int, int, int, int *, int *, int *, std::vector<int>&); \
|
||||
|
||||
|
||||
@@ -52,9 +52,8 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
|
||||
/// \param[in] mat matrix to be reordered
|
||||
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rMat reordered Matrix
|
||||
template <unsigned int block_size>
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat);
|
||||
/// \param[inout] rMat reordered Matrix
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
|
||||
|
||||
/// Compute reorder mapping from the color that each node has received
|
||||
/// The toOrder, fromOrder and iters arrays must be allocated already
|
||||
|
||||
@@ -21,6 +21,11 @@
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <string>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
/// Translate OpenCL error codes to strings
|
||||
/// Integer - String combinations are defined in CL/cl.h
|
||||
/// \param[in] error error code
|
||||
@@ -95,3 +100,6 @@ std::string getErrorString(cl_int error)
|
||||
default: return "UNKNOWN_CL_CODE";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
@@ -33,7 +33,16 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
/// Translate OpenCL error codes to strings
|
||||
/// Integer - String combinations are defined in CL/cl.h
|
||||
/// \param[in] error error code
|
||||
std::string getErrorString(cl_int error);
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
|
||||
68
opm/simulators/linalg/bda/opencl/Preconditioner.cpp
Normal file
68
opm/simulators/linalg/bda/opencl/Preconditioner.cpp
Normal file
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <memory>
|
||||
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/CPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder) {
|
||||
if (type == PreconditionerType::BILU0) {
|
||||
return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity);
|
||||
} else if (type == PreconditionerType::CPR) {
|
||||
return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder);
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Invalid PreconditionerType");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
|
||||
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
INSTANTIATE_BDA_FUNCTIONS(5);
|
||||
INSTANTIATE_BDA_FUNCTIONS(6);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} //namespace Accelerator
|
||||
} //namespace Opm
|
||||
|
||||
86
opm/simulators/linalg/bda/opencl/Preconditioner.hpp
Normal file
86
opm/simulators/linalg/bda/opencl/Preconditioner.hpp
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_PRECONDITIONER_HEADER_INCLUDED
|
||||
#define OPM_PRECONDITIONER_HEADER_INCLUDED
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
|
||||
class BlockedMatrix;
|
||||
|
||||
template <unsigned int block_size>
|
||||
class Preconditioner
|
||||
{
|
||||
|
||||
protected:
|
||||
int N = 0; // number of rows of the matrix
|
||||
int Nb = 0; // number of blockrows of the matrix
|
||||
int nnz = 0; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzb = 0; // number of blocks of the matrix
|
||||
int verbosity = 0;
|
||||
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
|
||||
Preconditioner(int verbosity_) :
|
||||
verbosity(verbosity_)
|
||||
{};
|
||||
|
||||
public:
|
||||
enum PreconditionerType {
|
||||
BILU0,
|
||||
CPR
|
||||
};
|
||||
|
||||
static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
// nested Preconditioners might need to override this
|
||||
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
|
||||
|
||||
// analyze matrix, e.g. the sparsity pattern
|
||||
// probably only called once
|
||||
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
|
||||
|
||||
// get reordering mappings
|
||||
virtual int* getToOrder() = 0;
|
||||
virtual int* getFromOrder() = 0;
|
||||
|
||||
// get reordered matrix
|
||||
virtual BlockedMatrix* getRMat() = 0;
|
||||
};
|
||||
|
||||
} //namespace Accelerator
|
||||
} //namespace Opm
|
||||
|
||||
#endif
|
||||
@@ -22,6 +22,7 @@
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/openclKernels.hpp>
|
||||
@@ -45,8 +46,16 @@ std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const u
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
|
||||
@@ -60,7 +69,7 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
void add_kernel_string(cl::Program::Sources &sources, const std::string &source) {
|
||||
void add_kernel_source(cl::Program::Sources &sources, const std::string &source) {
|
||||
sources.emplace_back(source);
|
||||
}
|
||||
|
||||
@@ -75,33 +84,49 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
verbosity = verbosity_;
|
||||
|
||||
cl::Program::Sources sources;
|
||||
const std::string& axpy_s = get_axpy_string();
|
||||
add_kernel_string(sources, axpy_s);
|
||||
const std::string& scale_s = get_scale_string();
|
||||
add_kernel_string(sources, scale_s);
|
||||
const std::string& dot_1_s = get_dot_1_string();
|
||||
add_kernel_string(sources, dot_1_s);
|
||||
const std::string& norm_s = get_norm_string();
|
||||
add_kernel_string(sources, norm_s);
|
||||
const std::string& custom_s = get_custom_string();
|
||||
add_kernel_string(sources, custom_s);
|
||||
const std::string& spmv_blocked_s = get_spmv_blocked_string();
|
||||
add_kernel_string(sources, spmv_blocked_s);
|
||||
const std::string& axpy_s = get_axpy_source();
|
||||
add_kernel_source(sources, axpy_s);
|
||||
const std::string& scale_s = get_scale_source();
|
||||
add_kernel_source(sources, scale_s);
|
||||
const std::string& vmul_s = get_vmul_source();
|
||||
add_kernel_source(sources, vmul_s);
|
||||
const std::string& dot_1_s = get_dot_1_source();
|
||||
add_kernel_source(sources, dot_1_s);
|
||||
const std::string& norm_s = get_norm_source();
|
||||
add_kernel_source(sources, norm_s);
|
||||
const std::string& custom_s = get_custom_source();
|
||||
add_kernel_source(sources, custom_s);
|
||||
const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source();
|
||||
add_kernel_source(sources, full_to_pressure_restriction_s);
|
||||
const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
|
||||
add_kernel_source(sources, add_coarse_pressure_correction_s);
|
||||
const std::string& prolongate_vector_s = get_prolongate_vector_source();
|
||||
add_kernel_source(sources, prolongate_vector_s);
|
||||
const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
|
||||
add_kernel_source(sources, spmv_blocked_s);
|
||||
const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
|
||||
add_kernel_source(sources, spmv_s);
|
||||
const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false);
|
||||
add_kernel_source(sources, spmv_noreset_s);
|
||||
const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op);
|
||||
add_kernel_source(sources, residual_blocked_s);
|
||||
const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op);
|
||||
add_kernel_source(sources, residual_s);
|
||||
#if CHOW_PATEL
|
||||
bool ilu_operate_on_full_matrix = false;
|
||||
#else
|
||||
bool ilu_operate_on_full_matrix = true;
|
||||
#endif
|
||||
const std::string& ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
|
||||
add_kernel_string(sources, ILU_apply1_s);
|
||||
const std::string& ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
|
||||
add_kernel_string(sources, ILU_apply2_s);
|
||||
const std::string& stdwell_apply_s = get_stdwell_apply_string(true);
|
||||
add_kernel_string(sources, stdwell_apply_s);
|
||||
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
|
||||
add_kernel_string(sources, stdwell_apply_no_reorder_s);
|
||||
const std::string& ilu_decomp_s = get_ilu_decomp_string();
|
||||
add_kernel_string(sources, ilu_decomp_s);
|
||||
const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix);
|
||||
add_kernel_source(sources, ILU_apply1_s);
|
||||
const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix);
|
||||
add_kernel_source(sources, ILU_apply2_s);
|
||||
const std::string& stdwell_apply_s = get_stdwell_apply_source(true);
|
||||
add_kernel_source(sources, stdwell_apply_s);
|
||||
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false);
|
||||
add_kernel_source(sources, stdwell_apply_no_reorder_s);
|
||||
const std::string& ilu_decomp_s = get_ilu_decomp_source();
|
||||
add_kernel_source(sources, ilu_decomp_s);
|
||||
|
||||
cl::Program program = cl::Program(*context, sources);
|
||||
program.build(devices);
|
||||
@@ -114,8 +139,16 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
|
||||
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
|
||||
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
|
||||
spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
|
||||
spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
|
||||
residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
|
||||
residual_k.reset(new residual_kernel_type(cl::Kernel(program, "residual")));
|
||||
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
|
||||
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
|
||||
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
|
||||
@@ -219,6 +252,23 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t_vmul;
|
||||
|
||||
cl::Event event = (*vmul_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), alpha, in1, in2, out, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels vmul() time: " << t_vmul.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
@@ -236,15 +286,75 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const do
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
|
||||
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t;
|
||||
|
||||
cl::Event event = (*full_to_pressure_restriction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), fine_y, weights, coarse_y, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels full_to_pressure_restriction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t;
|
||||
|
||||
cl::Event event = (*add_coarse_pressure_correction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), coarse_x, fine_x, pressure_idx, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
Timer t;
|
||||
|
||||
cl::Event event = (*prolongate_vector_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, cols, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels prolongate_vector() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
Timer t_spmv;
|
||||
cl::Event event;
|
||||
|
||||
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
if (block_size > 1) {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
if (reset) {
|
||||
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@@ -254,6 +364,29 @@ void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer&
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
Timer t_residual;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "OpenclKernels residual_blocked() time: " << t_residual.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
|
||||
{
|
||||
@@ -356,7 +489,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
}
|
||||
|
||||
|
||||
std::string OpenclKernels::get_axpy_string() {
|
||||
std::string OpenclKernels::get_axpy_source() {
|
||||
return R"(
|
||||
__kernel void axpy(
|
||||
__global double *in,
|
||||
@@ -377,7 +510,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
|
||||
|
||||
// scale vector with scalar
|
||||
std::string OpenclKernels::get_scale_string() {
|
||||
std::string OpenclKernels::get_scale_source() {
|
||||
return R"(
|
||||
__kernel void scale(
|
||||
__global double *vec,
|
||||
@@ -395,9 +528,31 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
)";
|
||||
}
|
||||
|
||||
// multiply vector with another vector and a scalar, element-wise
|
||||
// add result to a third vector
|
||||
std::string OpenclKernels::get_vmul_source() {
|
||||
return R"(
|
||||
__kernel void vmul(
|
||||
const double alpha,
|
||||
__global double const *in1,
|
||||
__global double const *in2,
|
||||
__global double *out,
|
||||
const int N)
|
||||
{
|
||||
unsigned int NUM_THREADS = get_global_size(0);
|
||||
int idx = get_global_id(0);
|
||||
|
||||
while(idx < N){
|
||||
out[idx] += alpha * in1[idx] * in2[idx];
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
|
||||
// returns partial sums, instead of the final dot product
|
||||
std::string OpenclKernels::get_dot_1_string() {
|
||||
std::string OpenclKernels::get_dot_1_source() {
|
||||
return R"(
|
||||
__kernel void dot_1(
|
||||
__global double *in1,
|
||||
@@ -440,7 +595,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
|
||||
// returns partial sums, instead of the final norm
|
||||
// the square root must be computed on CPU
|
||||
std::string OpenclKernels::get_norm_string() {
|
||||
std::string OpenclKernels::get_norm_source() {
|
||||
return R"(
|
||||
__kernel void norm(
|
||||
__global double *in,
|
||||
@@ -481,7 +636,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
|
||||
|
||||
// p = (p - omega * v) * beta + r
|
||||
std::string OpenclKernels::get_custom_string() {
|
||||
std::string OpenclKernels::get_custom_source() {
|
||||
return R"(
|
||||
__kernel void custom(
|
||||
__global double *p,
|
||||
@@ -506,15 +661,98 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
)";
|
||||
}
|
||||
|
||||
std::string OpenclKernels::get_spmv_blocked_string() {
|
||||
|
||||
// transform blocked vector to scalar vector using pressure-weights
|
||||
// every workitem handles one blockrow
|
||||
std::string OpenclKernels::get_full_to_pressure_restriction_source() {
|
||||
return R"(
|
||||
__kernel void spmv_blocked(
|
||||
__global const double *vals,
|
||||
__kernel void full_to_pressure_restriction(
|
||||
__global const double *fine_y,
|
||||
__global const double *weights,
|
||||
__global double *coarse_y,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = get_global_id(0);
|
||||
|
||||
while(target_block_row < Nb){
|
||||
double sum = 0.0;
|
||||
unsigned int idx = block_size * target_block_row;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
sum += fine_y[idx + i] * weights[idx + i];
|
||||
}
|
||||
coarse_y[target_block_row] = sum;
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
// add the coarse pressure solution back to the finer, complete solution
|
||||
// every workitem handles one blockrow
|
||||
std::string OpenclKernels::get_add_coarse_pressure_correction_source() {
|
||||
return R"(
|
||||
__kernel void add_coarse_pressure_correction(
|
||||
__global const double *coarse_x,
|
||||
__global double *fine_x,
|
||||
const unsigned int pressure_idx,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = get_global_id(0);
|
||||
|
||||
while(target_block_row < Nb){
|
||||
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
// prolongate vector during amg cycle
|
||||
// every workitem handles one row
|
||||
std::string OpenclKernels::get_prolongate_vector_source() {
|
||||
return R"(
|
||||
__kernel void prolongate_vector(
|
||||
__global const double *in,
|
||||
__global double *out,
|
||||
__global const int *cols,
|
||||
const unsigned int N)
|
||||
{
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
unsigned int row = get_global_id(0);
|
||||
|
||||
while(row < N){
|
||||
out[row] += in[cols[row]];
|
||||
row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
|
||||
/// either b = mat * x
|
||||
/// or res = rhs - mat * x
|
||||
std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) {
|
||||
std::string s;
|
||||
if (op == matrix_operation::spmv_op) {
|
||||
s += "__kernel void spmv_blocked(";
|
||||
} else {
|
||||
s += "__kernel void residual_blocked(";
|
||||
}
|
||||
s += R"(__global const double *vals,
|
||||
__global const int *cols,
|
||||
__global const int *rows,
|
||||
const int Nb,
|
||||
__global const double *x,
|
||||
__global double *b,
|
||||
)";
|
||||
if (op == matrix_operation::residual_op) {
|
||||
s += "__global const double *rhs,";
|
||||
}
|
||||
s += R"(
|
||||
__global double *out,
|
||||
const unsigned int block_size,
|
||||
__local double *tmp)
|
||||
{
|
||||
@@ -566,17 +804,99 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
|
||||
if(lane < bs){
|
||||
unsigned int row = target_block_row*bs + lane;
|
||||
b[row] = tmp[lane];
|
||||
)";
|
||||
if (op == matrix_operation::spmv_op) {
|
||||
s += " out[row] = tmp[lane];";
|
||||
} else {
|
||||
s += " out[row] = rhs[row] - tmp[lane];";
|
||||
}
|
||||
s += R"(
|
||||
}
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
)";
|
||||
)";
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
/// either b = mat * x
|
||||
/// or res = rhs - mat * x
|
||||
std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) {
|
||||
std::string s;
|
||||
if (op == matrix_operation::spmv_op) {
|
||||
if (spmv_reset) {
|
||||
s += "__kernel void spmv(";
|
||||
} else {
|
||||
s += "__kernel void spmv_noreset(";
|
||||
}
|
||||
} else {
|
||||
s += "__kernel void residual(";
|
||||
}
|
||||
s += R"(__global const double *vals,
|
||||
__global const int *cols,
|
||||
__global const int *rows,
|
||||
const int N,
|
||||
__global const double *x,
|
||||
)";
|
||||
if (op == matrix_operation::residual_op) {
|
||||
s += "__global const double *rhs,";
|
||||
}
|
||||
s += R"(
|
||||
__global double *out,
|
||||
__local double *tmp)
|
||||
{
|
||||
const unsigned int bsize = get_local_size(0);
|
||||
const unsigned int idx_b = get_global_id(0) / bsize;
|
||||
const unsigned int idx_t = get_local_id(0);
|
||||
const unsigned int num_workgroups = get_num_groups(0);
|
||||
|
||||
int row = idx_b;
|
||||
|
||||
while (row < N) {
|
||||
int rowStart = rows[row];
|
||||
int rowEnd = rows[row+1];
|
||||
int rowLength = rowEnd - rowStart;
|
||||
double local_sum = 0.0;
|
||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||
int col = cols[j];
|
||||
local_sum += vals[j] * x[col];
|
||||
}
|
||||
|
||||
tmp[idx_t] = local_sum;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int offset = bsize / 2;
|
||||
while(offset > 0) {
|
||||
if (idx_t < offset) {
|
||||
tmp[idx_t] += tmp[idx_t + offset];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
offset = offset / 2;
|
||||
}
|
||||
|
||||
if (idx_t == 0) {
|
||||
)";
|
||||
if (op == matrix_operation::spmv_op) {
|
||||
if (spmv_reset) {
|
||||
s += " out[row] = tmp[idx_t];";
|
||||
} else {
|
||||
s += " out[row] += tmp[idx_t];";
|
||||
}
|
||||
} else {
|
||||
s += " out[row] = rhs[row] - tmp[idx_t];";
|
||||
}
|
||||
s += R"(
|
||||
}
|
||||
row += num_workgroups;
|
||||
}
|
||||
}
|
||||
)";
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
|
||||
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
|
||||
std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) {
|
||||
std::string s = R"(
|
||||
__kernel void ILU_apply1(
|
||||
__global const double *LUvals,
|
||||
@@ -652,7 +972,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
}
|
||||
|
||||
|
||||
std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
|
||||
std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) {
|
||||
std::string s = R"(
|
||||
__kernel void ILU_apply2(
|
||||
__global const double *LUvals,
|
||||
@@ -735,7 +1055,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
return s;
|
||||
}
|
||||
|
||||
std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
|
||||
std::string OpenclKernels::get_stdwell_apply_source(bool reorder) {
|
||||
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
|
||||
std::string s = "__kernel void " + kernel_name + R"((
|
||||
__global const double *Cnnzs,
|
||||
@@ -830,7 +1150,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
|
||||
}
|
||||
|
||||
|
||||
std::string OpenclKernels::get_ilu_decomp_string() {
|
||||
std::string OpenclKernels::get_ilu_decomp_source() {
|
||||
return R"(
|
||||
|
||||
// a = a - (b * c)
|
||||
|
||||
@@ -30,8 +30,14 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
|
||||
using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
|
||||
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
|
||||
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
@@ -55,12 +61,25 @@ private:
|
||||
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static bool initialized;
|
||||
|
||||
enum matrix_operation {
|
||||
spmv_op,
|
||||
residual_op
|
||||
};
|
||||
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
|
||||
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
|
||||
static std::unique_ptr<spmv_kernel_type> spmv_k;
|
||||
static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
|
||||
static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
|
||||
static std::unique_ptr<residual_kernel_type> residual_k;
|
||||
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
|
||||
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
|
||||
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
|
||||
@@ -69,54 +88,70 @@ private:
|
||||
|
||||
/// Generate string with axpy kernel
|
||||
/// a = a + alpha * b
|
||||
static std::string get_axpy_string();
|
||||
static std::string get_axpy_source();
|
||||
|
||||
/// Generate string with scale kernel
|
||||
/// a = a * alpha
|
||||
static std::string get_scale_string();
|
||||
static std::string get_scale_source();
|
||||
|
||||
/// multiply vector with another vector and a scalar, element-wise
|
||||
/// add result to a third vector
|
||||
static std::string get_vmul_source();
|
||||
|
||||
/// returns partial sums, instead of the final dot product
|
||||
/// partial sums are added on CPU
|
||||
static std::string get_dot_1_string();
|
||||
static std::string get_dot_1_source();
|
||||
|
||||
/// returns partial sums, instead of the final norm
|
||||
/// the square root must be computed on CPU
|
||||
static std::string get_norm_string();
|
||||
static std::string get_norm_source();
|
||||
|
||||
/// Generate string with custom kernel
|
||||
/// This kernel combines some ilubicgstab vector operations into 1
|
||||
/// p = (p - omega * v) * beta + r
|
||||
static std::string get_custom_string();
|
||||
static std::string get_custom_source();
|
||||
|
||||
/// Transform blocked vector to scalar vector using pressure-weights
|
||||
static std::string get_full_to_pressure_restriction_source();
|
||||
|
||||
/// Add the coarse pressure solution back to the finer, complete solution
|
||||
static std::string get_add_coarse_pressure_correction_source();
|
||||
|
||||
/// Prolongate a vector during the AMG cycle
|
||||
static std::string get_prolongate_vector_source();
|
||||
|
||||
/// b = mat * x
|
||||
/// algorithm based on:
|
||||
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
static std::string get_spmv_blocked_string();
|
||||
/// or
|
||||
/// res = rhs - (mat * x)
|
||||
static std::string get_blocked_matrix_operation_source(matrix_operation op);
|
||||
static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true);
|
||||
|
||||
/// ILU apply part 1: forward substitution
|
||||
/// solves L*x=y where L is a lower triangular sparse blocked matrix
|
||||
/// this L can be it's own BSR matrix (if full_matrix is false),
|
||||
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
|
||||
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
|
||||
static std::string get_ILU_apply1_string(bool full_matrix);
|
||||
static std::string get_ILU_apply1_source(bool full_matrix);
|
||||
|
||||
/// ILU apply part 2: backward substitution
|
||||
/// solves U*x=y where U is an upper triangular sparse blocked matrix
|
||||
/// this U can be it's own BSR matrix (if full_matrix is false),
|
||||
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
|
||||
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
|
||||
static std::string get_ILU_apply2_string(bool full_matrix);
|
||||
static std::string get_ILU_apply2_source(bool full_matrix);
|
||||
|
||||
/// Generate string with the stdwell_apply kernels
|
||||
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
|
||||
/// the x/y vector is reordered, use toOrder to address that
|
||||
/// \param[in] reorder whether the matrix is reordered or not
|
||||
static std::string get_stdwell_apply_string(bool reorder);
|
||||
static std::string get_stdwell_apply_source(bool reorder);
|
||||
|
||||
/// Generate string with the exact ilu decomposition kernel
|
||||
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
|
||||
static std::string get_ilu_decomp_string();
|
||||
static std::string get_ilu_decomp_source();
|
||||
|
||||
OpenclKernels(){}; // disable instantiation
|
||||
|
||||
@@ -127,8 +162,14 @@ public:
|
||||
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
|
||||
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
|
||||
static void scale(cl::Buffer& in, const double a, int N);
|
||||
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
|
||||
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
|
||||
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
|
||||
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
|
||||
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
|
||||
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
|
||||
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
|
||||
|
||||
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
|
||||
|
||||
|
||||
@@ -45,8 +45,25 @@ using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
|
||||
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
|
||||
|
||||
bool use_cpr;
|
||||
if (linsolver.compare("ilu0") == 0) {
|
||||
use_cpr = false;
|
||||
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
|
||||
use_cpr = true;
|
||||
} else if (linsolver.compare("cpr_trueimpes") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error openclSolver does not support --linsolver=cpr_trueimpes");
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for argument --linsolver, " + linsolver);
|
||||
}
|
||||
|
||||
using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
|
||||
if (use_cpr) {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder);
|
||||
} else {
|
||||
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder);
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
try {
|
||||
@@ -191,6 +208,19 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) :
|
||||
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
{
|
||||
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
|
||||
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_reorder, /*use_amg=*/false);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
openclSolverBackend<block_size>::~openclSolverBackend() {
|
||||
@@ -257,7 +287,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
// v = A * pw
|
||||
t_spmv.start();
|
||||
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
t_spmv.stop();
|
||||
|
||||
// apply wellContributions
|
||||
@@ -288,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
// t = A * s
|
||||
t_spmv.start();
|
||||
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
t_spmv.stop();
|
||||
|
||||
// apply wellContributions
|
||||
@@ -332,7 +362,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n";
|
||||
out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n";
|
||||
out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
|
||||
out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n";
|
||||
out << "openclSolver::rest: " << t_rest.elapsed() << " s\n";
|
||||
@@ -358,13 +388,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
out.clear();
|
||||
|
||||
try {
|
||||
prec->setOpenCLContext(context.get());
|
||||
prec->setOpenCLQueue(queue.get());
|
||||
prec->setOpencl(context, queue);
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
vals_contiguous = new double[N];
|
||||
vals_contiguous.resize(nnz);
|
||||
#endif
|
||||
mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
|
||||
mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
|
||||
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
@@ -408,10 +437,6 @@ void openclSolverBackend<block_size>::finalize() {
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
delete[] rb;
|
||||
}
|
||||
#if COPY_ROW_BY_ROW
|
||||
delete[] vals_contiguous;
|
||||
#endif
|
||||
delete prec;
|
||||
} // end finalize()
|
||||
|
||||
template <unsigned int block_size>
|
||||
@@ -423,10 +448,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
@@ -463,10 +488,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
|
||||
#else
|
||||
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
|
||||
#endif
|
||||
@@ -489,29 +514,34 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::analyse_matrix() {
|
||||
bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
Timer t;
|
||||
|
||||
bool success = prec->init(mat.get());
|
||||
// bool success = bilu0->init(mat.get());
|
||||
bool success = prec->analyze_matrix(mat.get());
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
rmat = mat.get();
|
||||
} else {
|
||||
// toOrder = bilu0->getToOrder();
|
||||
// fromOrder = bilu0->getFromOrder();
|
||||
// rmat = bilu0->getRMat();
|
||||
toOrder = prec->getToOrder();
|
||||
fromOrder = prec->getFromOrder();
|
||||
rmat = prec->getRMat();
|
||||
}
|
||||
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::analyse_matrix(): " << t.stop() << " s";
|
||||
out << "openclSolver::analyze_matrix(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
analysis_done = true;
|
||||
|
||||
return success;
|
||||
} // end analyse_matrix()
|
||||
} // end analyze_matrix()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
@@ -603,7 +633,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
if (!analyze_matrix()) {
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
@@ -624,8 +654,11 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template openclSolverBackend<n>::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder); \
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template openclSolverBackend<n>::openclSolverBackend( \
|
||||
int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \
|
||||
template openclSolverBackend<n>::openclSolverBackend(int, int, double, ILUReorder); \
|
||||
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
#include <opm/simulators/linalg/bda/BILU0.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
#include <tuple>
|
||||
|
||||
namespace Opm
|
||||
@@ -35,12 +37,14 @@ namespace Opm
|
||||
namespace Accelerator
|
||||
{
|
||||
|
||||
template <unsigned int block_size>
|
||||
class CPR;
|
||||
|
||||
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
|
||||
template <unsigned int block_size>
|
||||
class openclSolverBackend : public BdaSolver<block_size>
|
||||
{
|
||||
typedef BdaSolver<block_size> Base;
|
||||
typedef BILU0<block_size> Preconditioner;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@@ -55,7 +59,7 @@ class openclSolverBackend : public BdaSolver<block_size>
|
||||
|
||||
private:
|
||||
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
|
||||
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
|
||||
|
||||
// OpenCL variables must be reusable, they are initialized in initialize()
|
||||
cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU
|
||||
@@ -66,11 +70,13 @@ private:
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
|
||||
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
|
||||
std::unique_ptr<Preconditioner<block_size> > prec;
|
||||
// can perform blocked ILU0 and AMG on pressure component
|
||||
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
|
||||
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
||||
bool analysis_done = false;
|
||||
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
|
||||
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
|
||||
std::unique_ptr<BlockedMatrix> mat = nullptr; // original matrix
|
||||
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
|
||||
ILUReorder opencl_ilu_reorder; // reordering strategy
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
@@ -154,9 +160,9 @@ private:
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
void update_system_on_gpu();
|
||||
|
||||
/// Analyse sparsity pattern to extract parallelism
|
||||
/// Analyze sparsity pattern to extract parallelism
|
||||
/// \return true iff analysis was successful
|
||||
bool analyse_matrix();
|
||||
bool analyze_matrix();
|
||||
|
||||
/// Perform ilu0-decomposition
|
||||
/// \return true iff decomposition was successful
|
||||
@@ -178,7 +184,13 @@ public:
|
||||
/// \param[in] platformID the OpenCL platform to be used
|
||||
/// \param[in] deviceID the device to be used
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, ILUReorder opencl_ilu_reorder);
|
||||
/// \param[in] linsolver indicating the preconditioner, equal to the --linsolver cmdline argument
|
||||
/// only ilu0 and cpr_quasiimpes are supported
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
|
||||
ILUReorder opencl_ilu_reorder, std::string linsolver);
|
||||
|
||||
/// For the CPR coarse solver
|
||||
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
/// Destroy a openclSolver, and free memory
|
||||
~openclSolverBackend();
|
||||
@@ -200,10 +212,20 @@ public:
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
|
||||
/// Data is already on the GPU
|
||||
// SolverStatus solve_system(BdaResult &res);
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
/// Set OpenCL objects
|
||||
/// This class either creates them based on platformID and deviceID or receives them through this function
|
||||
/// \param[in] context the opencl context to be used
|
||||
/// \param[in] queue the opencl queue to be used
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
}; // end class openclSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
|
||||
@@ -76,15 +76,16 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
|
||||
const std::string opencl_ilu_reorder("none"); // unused
|
||||
const int platformID = 0; // unused
|
||||
const int deviceID = 0;
|
||||
const std::string gpu_mode("cusparse");
|
||||
const std::string accelerator_mode("cusparse");
|
||||
const std::string fpga_bitstream("empty"); // unused
|
||||
const std::string linsolver("ilu0");
|
||||
Dune::InverseOperatorResult result;
|
||||
|
||||
Vector x(rhs.size());
|
||||
auto wellContribs = Opm::WellContributions::create("cusparse", false);
|
||||
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
|
||||
try {
|
||||
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
|
||||
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
|
||||
|
||||
bridge->solve_system(&matrix, rhs, *wellContribs, result);
|
||||
bridge->get_result(x);
|
||||
|
||||
@@ -75,15 +75,16 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
|
||||
const std::string opencl_ilu_reorder("none");
|
||||
const int platformID = 0;
|
||||
const int deviceID = 0;
|
||||
const std::string gpu_mode("opencl");
|
||||
const std::string accelerator_mode("opencl");
|
||||
const std::string fpga_bitstream("empty"); // unused
|
||||
const std::string linsolver("ilu0");
|
||||
Dune::InverseOperatorResult result;
|
||||
|
||||
Vector x(rhs.size());
|
||||
auto wellContribs = Opm::WellContributions::create("opencl", false);
|
||||
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
|
||||
try {
|
||||
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
|
||||
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
|
||||
} catch (const std::logic_error& error) {
|
||||
BOOST_WARN_MESSAGE(true, error.what());
|
||||
throw PlatformInitException(error.what());
|
||||
@@ -100,8 +101,8 @@ void test3(const pt::ptree& prm)
|
||||
{
|
||||
const int bz = 3;
|
||||
auto sol = testOpenclSolver<bz>(prm, "matr33.txt", "rhs3.txt");
|
||||
Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-1.30307e-2, -3.58263e-6, 1.13836e-9},
|
||||
{-1.25425e-3, -1.4167e-4, -3.2213e-3},
|
||||
Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-0.0131626, -3.58263e-6, 1.13836e-9},
|
||||
{-1.25425e-3, -1.4167e-4, -0.0029366},
|
||||
{-4.5436e-4, 1.28682e-5, 4.7644e-6}};
|
||||
BOOST_REQUIRE_EQUAL(sol.size(), expected.size());
|
||||
for (size_t i = 0; i < sol.size(); ++i) {
|
||||
|
||||
82
tests/test_solvetransposed3x3.cpp
Normal file
82
tests/test_solvetransposed3x3.cpp
Normal file
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#define BOOST_TEST_MODULE SolveTransposed3x3
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include <boost/test/floating_point_comparison.hpp>
|
||||
|
||||
#include <dune/istl/bcrsmatrix.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/CPR.hpp>
|
||||
|
||||
BOOST_AUTO_TEST_CASE(testsolvetransposed3x3)
|
||||
{
|
||||
const unsigned numTests = 3;
|
||||
const unsigned blockSize = 3;
|
||||
|
||||
std::vector<std::vector<double> > A = {{4, 2, 1,
|
||||
3, 4, 2,
|
||||
2, 4, 3},
|
||||
{1, 2, 4,
|
||||
1, 3, 2,
|
||||
2, 4, 2},
|
||||
{1, 2, 2,
|
||||
1, 3, 5,
|
||||
3, 2, 4}};
|
||||
|
||||
std::vector<std::vector<double> > b = {{0, 1, 0},
|
||||
{1, 3, 5},
|
||||
{2, 4, 5}};
|
||||
|
||||
std::vector<std::vector<double> > x_expected = {{-0.5, 1, -0.5},
|
||||
{ 1, 1, -0.5},
|
||||
{ 1.3, 0.4, 0.1}};
|
||||
|
||||
for (unsigned testId = 0; testId < numTests; ++testId) {
|
||||
std::vector<double> x(blockSize);
|
||||
Opm::Accelerator::solve_transposed_3x3(A[testId].data(), b[testId].data(), x.data());
|
||||
|
||||
for (unsigned i = 0; i < blockSize; ++i) {
|
||||
BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
|
||||
}
|
||||
}
|
||||
|
||||
// retest cases using Dune methods
|
||||
using Mat3 = Dune::FieldMatrix<double, 3, 3>;
|
||||
using Vec3 = Dune::FieldVector<double, 3>;
|
||||
for (unsigned testId = 0; testId < numTests; ++testId) {
|
||||
Mat3 a3 = {{ A[testId][0], A[testId][1], A[testId][2] },
|
||||
{ A[testId][3], A[testId][4], A[testId][5] },
|
||||
{ A[testId][6], A[testId][7], A[testId][8] } };
|
||||
Vec3 y({b[testId][0], b[testId][1], b[testId][2]});
|
||||
Mat3 b3 = a3;
|
||||
|
||||
// b3 = inv(a3^T)
|
||||
Dune::FMatrixHelp::invertMatrix_retTransposed(a3, b3);
|
||||
|
||||
// x = b3 * y
|
||||
Vec3 x = Dune::FMatrixHelp::mult(b3, y);
|
||||
|
||||
for (unsigned i = 0; i < blockSize; ++i) {
|
||||
BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user