Merge pull request #3672 from Tongdongq/add-cpr-opencl

Add CPR preconditioner to openclSolver
This commit is contained in:
Markus Blatt
2021-12-08 19:44:59 +01:00
committed by GitHub
35 changed files with 2014 additions and 541 deletions

View File

@@ -100,18 +100,21 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/ChowPatelIlu.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/CPR.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/OpenclMatrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
endif()
if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR HAVE_AMGCL)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Matrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
endif()
if(HAVE_FPGA)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
@@ -145,6 +148,7 @@ list (APPEND TEST_SOURCE_FILES
tests/test_timer.cpp
tests/test_invert.cpp
tests/test_stoppedwells.cpp
tests/test_solvetransposed3x3.cpp
tests/test_relpermdiagnostics.cpp
tests/test_norne_pvt.cpp
tests/test_wellprodindexcalculator.cpp
@@ -247,10 +251,10 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/BdaSolver.hpp
opm/simulators/linalg/bda/BILU0.hpp
opm/simulators/linalg/bda/BlockedMatrix.hpp
opm/simulators/linalg/bda/CPR.hpp
opm/simulators/linalg/bda/cuda_header.hpp
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
opm/simulators/linalg/bda/ChowPatelIlu.hpp
opm/simulators/linalg/bda/FPGAMatrix.hpp
opm/simulators/linalg/bda/FPGABILU0.hpp
opm/simulators/linalg/bda/FPGASolverBackend.hpp
opm/simulators/linalg/bda/FPGAUtils.hpp
@@ -258,8 +262,11 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/ILUReorder.hpp
opm/simulators/linalg/bda/opencl.hpp
opm/simulators/linalg/bda/openclKernels.hpp
opm/simulators/linalg/bda/OpenclMatrix.hpp
opm/simulators/linalg/bda/opencl/Preconditioner.hpp
opm/simulators/linalg/bda/openclSolverBackend.hpp
opm/simulators/linalg/bda/openclWellContributions.hpp
opm/simulators/linalg/bda/Matrix.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/amgcpr.hh

View File

@@ -146,7 +146,8 @@ namespace Opm
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver));
}
#else
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {

View File

@@ -122,7 +122,6 @@ public:
instance().doAddCreator(type, creator);
}
private:
using CriterionBase
= Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<Matrix, Dune::Amg::FirstDiagonal>>;
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
@@ -150,6 +149,7 @@ private:
criterion.setMinAggregateSize(prm.get<int>("minaggsize", 4));
return criterion;
}
private:
/// Helper struct to explicitly overload amgSmootherArgs() version for
/// ParallelOverlappingILU0, since in-class specialization is not allowed.

View File

@@ -40,279 +40,251 @@ using Dune::Timer;
template <unsigned int block_size>
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
{
#if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity);
#endif
}
template <unsigned int block_size>
BILU0<block_size>::~BILU0()
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
{
delete[] invDiagVals;
}
const unsigned int bs = block_size;
template <unsigned int block_size>
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
{
const unsigned int bs = block_size;
this->N = mat->Nb * block_size;
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * block_size * block_size;
this->nnzb = mat->nnzbs;
this->N = mat->Nb * block_size;
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * block_size * block_size;
this->nnzbs = mat->nnzbs;
std::vector<int> CSCRowIndices;
std::vector<int> CSCColPointers;
int *CSCRowIndices = nullptr;
int *CSCColPointers = nullptr;
if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix>(*mat);
} else {
toOrder.resize(Nb);
fromOrder.resize(Nb);
CSCRowIndices.resize(nnzb);
CSCColPointers.resize(Nb + 1);
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rmat);
if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
} else {
toOrder.resize(Nb);
fromOrder.resize(Nb);
CSCRowIndices = new int[nnzbs];
CSCColPointers = new int[Nb + 1];
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
}
Timer t_analysis;
std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for(int i = 0; i < Nb; ++i){
rowsPerColor.emplace_back(1);
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
if(verbosity >= 1){
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
Timer t_analysis;
std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for (int i = 0; i < Nb; ++i) {
rowsPerColor.emplace_back(1);
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
if (verbosity >= 1) {
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
}
#if CHOW_PATEL
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
#endif
OpmLog::info(out.str());
OpmLog::info(out.str());
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] CSCRowIndices;
delete[] CSCColPointers;
}
diagIndex.resize(mat->Nb);
invDiagVals = new double[mat->Nb * bs * bs];
diagIndex.resize(mat->Nb);
invDiagVals.resize(mat->Nb * bs * bs);
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
#endif
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
#if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1));
#else
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs);
s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs);
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif
events.resize(2);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals, nullptr, &events[0]);
events.resize(2);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
return true;
} // end init()
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{
const unsigned int bs = block_size;
auto *m = mat;
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
{
const unsigned int bs = block_size;
auto *m = mat;
if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get();
Timer t_reorder;
reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
}
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy;
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 memcpy: " << t_copy.stop() << " s";
OpmLog::info(out.str());
}
#if CHOW_PATEL
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals, diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
#else
Timer t_copyToGpu;
events.resize(1);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get();
Timer t_reorder;
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
OpmLog::info(out.str());
}
Timer t_decomposition;
std::ostringstream out;
cl::Event event;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
}
if (verbosity >= 3) {
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
#endif // CHOW_PATEL
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
Timer t_apply;
for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
}
for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
}
// apply relaxation
OpenclKernels::scale(x, relaxation, N);
if (verbosity >= 4) {
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
}
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy;
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 memcpy: " << t_copy.stop() << " s";
OpmLog::info(out.str());
}
#if CHOW_PATEL
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals.data(), diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
#else
Timer t_copyToGpu;
events.resize(1);
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&]() {
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row + 1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
}
if (verbosity >= 3) {
std::ostringstream out;
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
OpmLog::info(out.str());
}
Timer t_decomposition;
std::ostringstream out;
cl::Event event;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
}
if (verbosity >= 3) {
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
#endif // CHOW_PATEL
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::setOpenCLContext(cl::Context *context_) {
this->context = context_;
}
template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
this->queue = queue_;
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
Timer t_apply;
for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
}
for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
}
// apply relaxation
OpenclKernels::scale(x, relaxation, N);
if (verbosity >= 4) {
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str());
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class BILU0<n>;
INSTANTIATE_BDA_FUNCTIONS(1);

View File

@@ -27,6 +27,7 @@
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
@@ -35,89 +36,86 @@ namespace Opm
namespace Accelerator
{
/// This class implementa a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix
template <unsigned int block_size>
class BILU0
{
/// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix
template <unsigned int block_size>
class BILU0 : public Preconditioner<block_size>
{
typedef Preconditioner<block_size> Base;
private:
int N; // number of rows of the matrix
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::context;
using Base::queue;
using Base::events;
using Base::err;
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
#endif
double *invDiagVals = nullptr;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
std::vector<int> toOrder, fromOrder;
int numColors;
int verbosity;
std::once_flag pattern_uploaded;
std::vector<double> invDiagVals;
std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
std::vector<int> toOrder, fromOrder;
int numColors;
std::once_flag pattern_uploaded;
ILUReorder opencl_ilu_reorder;
ILUReorder opencl_ilu_reorder;
typedef struct {
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
cl::Buffer rowsPerColor;
typedef struct {
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
cl::Buffer rowsPerColor;
#if CHOW_PATEL
cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows;
cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows;
#else
cl::Buffer LUvals, LUcols, LUrows;
cl::Buffer LUvals, LUcols, LUrows;
#endif
} GPU_storage;
} GPU_storage;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
GPU_storage s;
#if CHOW_PATEL
ChowPatelIlu<block_size> chowPatelIlu;
ChowPatelIlu<block_size> chowPatelIlu;
#endif
public:
public:
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
~BILU0();
// analysis, find reordering if specified
bool analyze_matrix(BlockedMatrix *mat) override;
// analysis
bool init(BlockedMatrix<block_size> *mat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x);
int* getToOrder() override
{
return toOrder.data();
}
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
int* getFromOrder() override
{
return fromOrder.data();
}
int* getToOrder()
{
return toOrder.data();
}
BlockedMatrix* getRMat() override
{
return rmat.get();
}
int* getFromOrder()
{
return fromOrder.data();
}
BlockedMatrix<block_size>* getRMat()
{
return rmat.get();
}
};
};
} // namespace Accelerator
} // namespace Opm

View File

@@ -63,7 +63,8 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
double tolerance,
[[maybe_unused]] unsigned int platformID,
unsigned int deviceID,
[[maybe_unused]] std::string opencl_ilu_reorder)
[[maybe_unused]] std::string opencl_ilu_reorder,
[[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
{
if (accelerator_mode.compare("cusparse") == 0) {
@@ -88,7 +89,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder));
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver));
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
@@ -168,29 +169,26 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
// sparsity pattern should stay the same
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
int sum_nnzs = 0;
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
h_rows.clear();
h_cols.clear();
// convert colIndices and rowPointers
if (h_rows.empty()) {
h_rows.emplace_back(0);
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
int size_row = 0;
for (auto c = r->begin(); c != r->end(); ++c) {
h_cols.emplace_back(c.index());
size_row++;
}
sum_nnzs += size_row;
h_rows.emplace_back(sum_nnzs);
}
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
h_rows.emplace_back(0);
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
for (auto c = r->begin(); c != r->end(); ++c) {
h_cols.emplace_back(c.index());
}
h_rows.emplace_back(h_cols.size());
}
} // end getSparsityPattern()
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
@@ -220,7 +218,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
if (h_rows.capacity() == 0) {
h_rows.reserve(Nb+1);
h_cols.reserve(nnzb);
getSparsityPattern(*mat, h_rows, h_cols);
copySparsityPatternFromISTL(*mat, h_rows, h_cols);
}
Dune::Timer t_zeros;

View File

@@ -54,7 +54,9 @@ public:
/// \param[in] platformID the OpenCL platform ID to be used
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
/// \param[in] linsolver copy of cmdline argument --linsolver
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver);
/// Solve linear system, A*x = b
@@ -75,6 +77,12 @@ public:
return use_gpu;
}
/// Store sparsity pattern into vectors
/// \param[in] mat input matrix, probably BCRSMatrix
/// \param[out] h_rows rowpointers
/// \param[out] h_cols columnindices
static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions

View File

@@ -76,6 +76,7 @@ namespace Accelerator {
/// \param[in] tolerance required relative tolerance for solver
/// \param[in] platformID the OpenCL platform to be used, only used in openclSolver
/// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};

View File

@@ -26,7 +26,9 @@
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
namespace Opm
{
@@ -37,8 +39,7 @@ using Opm::OpmLog;
/*Sort a row of matrix elements from a blocked CSR-format.*/
template <unsigned int block_size>
void sortBlockedRow(int *colIndices, double *data, int left, int right) {
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
const unsigned int bs = block_size;
int l = left;
int r = right;
@@ -63,17 +64,16 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right) {
} while (l < r);
if (left < r)
sortBlockedRow<bs>(colIndices, data, left, r);
sortBlockedRow(colIndices, data, left, r, bs);
if (right > l)
sortBlockedRow<bs>(colIndices, data, l, right);
sortBlockedRow(colIndices, data, l, right, bs);
}
// LUMat->nnzValues[ik] = LUMat->nnzValues[ik] - (pivot * LUMat->nnzValues[jk]) in ilu decomposition
// a = a - (b * c)
template <unsigned int block_size>
void blockMultSub(double *a, double *b, double *c)
void blockMultSub(double *a, double *b, double *c, unsigned int block_size)
{
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
@@ -88,8 +88,7 @@ void blockMultSub(double *a, double *b, double *c)
/*Perform a 3x3 matrix-matrix multiplicationj on two blocks*/
template <unsigned int block_size>
void blockMult(double *mat1, double *mat2, double *resMat) {
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
double temp = 0;
@@ -104,8 +103,7 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
#if HAVE_FPGA
/*Subtract two blocks from one another element by element*/
template <unsigned int block_size>
void blockSub(double *mat1, double *mat2, double *resMat) {
void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size) {
for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) {
resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
@@ -114,8 +112,7 @@ void blockSub(double *mat1, double *mat2, double *resMat) {
}
/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
template <unsigned int block_size>
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size) {
for (unsigned int row = 0; row < block_size; row++) {
if (resetRes) {
resVect[row] = 0.0;
@@ -128,8 +125,7 @@ void blockVectMult(double *mat, double *vect, double scale, double *resVect, boo
template <unsigned int block_size>
int BlockedMatrix<block_size>::countUnblockedNnzs() {
int BlockedMatrix::countUnblockedNnzs() {
int numNnzsOverThreshold = 0;
int totalNnzs = rowPointers[Nb];
for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
@@ -144,8 +140,7 @@ int BlockedMatrix<block_size>::countUnblockedNnzs() {
* Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
* If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
*/
template <unsigned int block_size>
void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) {
const unsigned int bs = block_size;
int valIndex = 0, nnzsPerRow;
@@ -183,8 +178,7 @@ void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
/*Optimized version*/
// ub* prefixes indicate unblocked data
template <unsigned int block_size>
int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
{
@@ -223,8 +217,7 @@ int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isU
// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
template <unsigned int block_size>
int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor,
int rowsPerColorLimit, int columnsPerColorLimit,
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
@@ -470,41 +463,5 @@ void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::
#endif // HAVE_FPGA
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template void sortBlockedRow<n>(int *, double *, int, int); \
template void blockMultSub<n>(double *, double *, double *); \
template void blockMult<n>(double *, double *, double *); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
#if HAVE_FPGA
#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \
template void blockSub<n>(double *, double *, double *); \
template void blockVectMult<n>(double *, double *, double, double *, bool); \
template int BlockedMatrix<n>::toRDF(int, int *, bool, \
std::vector<std::vector<int> >& , int, int *, \
std::vector<std::vector<double> >&, short int *, unsigned char *, int *, int *); \
template int BlockedMatrix<n>::findPartitionColumns(int, int *, \
int, int, \
std::vector<std::vector<int> >& , int *, int *, \
std::vector<std::vector<int> >& , int *, int *, \
std::vector<std::vector<int> >& , int *, int *);
INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
#endif // HAVE_FPGA
} // namespace Accelerator
} // namespace Opm

View File

@@ -17,14 +17,14 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef BLOCKED_MATRIX_HPP
#define BLOCKED_MATRIX_HPP
#ifndef OPM_BLOCKED_MATRIX_HPP
#define OPM_BLOCKED_MATRIX_HPP
#if HAVE_FPGA
#include <vector>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#endif
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
namespace Opm
{
@@ -33,7 +33,6 @@ namespace Accelerator
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
template<unsigned int block_size>
class BlockedMatrix
{
@@ -42,12 +41,14 @@ public:
/// Allocate BlockedMatrix and data arrays with given sizes
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks
BlockedMatrix(int Nb_, int nnzbs_)
: nnzValues(new double[nnzbs_*block_size*block_size]),
colIndices(new int[nnzbs_*block_size*block_size]),
/// \param[in] block_size the number of rows and columns for each block
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_)
: nnzValues(new double[nnzbs_*block_size_*block_size_]),
colIndices(new int[nnzbs_*block_size_*block_size_]),
rowPointers(new int[Nb_+1]),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(true),
deleteSparsity(true)
{}
@@ -55,11 +56,12 @@ public:
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
/// \param[in] M matrix to be copied
BlockedMatrix(const BlockedMatrix& M)
: nnzValues(new double[M.nnzbs*block_size*block_size]),
: nnzValues(new double[M.nnzbs*M.block_size*M.block_size]),
colIndices(M.colIndices),
rowPointers(M.rowPointers),
Nb(M.Nb),
nnzbs(M.nnzbs),
block_size(M.block_size),
deleteNnzs(true),
deleteSparsity(false)
{}
@@ -67,15 +69,17 @@ public:
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks
/// \param[in] block_size the number of rows and columns for each block
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
/// \param[in] colIndices array of column indices, contains nnzb entries
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
BlockedMatrix(int Nb_, int nnzbs_, unsigned int block_size_, double *nnzValues_, int *colIndices_, int *rowPointers_)
: nnzValues(nnzValues_),
colIndices(colIndices_),
rowPointers(rowPointers_),
Nb(Nb_),
nnzbs(nnzbs_),
block_size(block_size_),
deleteNnzs(false),
deleteSparsity(false)
{}
@@ -117,6 +121,7 @@ public:
int *rowPointers;
int Nb;
int nnzbs;
unsigned int block_size;
bool deleteNnzs;
bool deleteSparsity;
};
@@ -127,31 +132,45 @@ public:
/// \param[inout] data
/// \param[in] left lower index of data of row
/// \param[in] right upper index of data of row
template <unsigned int block_size>
void sortBlockedRow(int *colIndices, double *data, int left, int right);
/// \param[in] block_size size of blocks in the row
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
/// Multiply and subtract blocks
/// a = a - (b * c)
/// \param[inout] a block to be subtracted from
/// \param[in] b input block
/// \param[in] c input block
template <unsigned int block_size>
void blockMultSub(double *a, double *b, double *c);
/// \param[in] block_size size of block
void blockMultSub(double *a, double *b, double *c, unsigned int block_size);
/// Perform a 3x3 matrix-matrix multiplication on two blocks
/// Perform a matrix-matrix multiplication on two blocks
/// resMat = mat1 * mat2
/// \param[in] mat1 input block 1
/// \param[in] mat2 input block 2
/// \param[inout] resMat output block
template <unsigned int block_size>
void blockMult(double *mat1, double *mat2, double *resMat);
/// \param[out] resMat output block
/// \param[in] block_size size of block
void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size);
#if HAVE_FPGA
template <unsigned int block_size>
void blockSub(double *mat1, double *mat2, double *resMat);
/// Perform a matrix-matrix subtraction on two blocks, element-wise
/// resMat = mat1 - mat2
/// \param[in] mat1 input block 1
/// \param[in] mat2 input block 2
/// \param[out] resMat output block
/// \param[in] block_size size of block
void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size);
template <unsigned int block_size>
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
/// Perform a matrix-vector multiplication
/// resVect = mat * vect
/// resVect += mat * vect
/// \param[in] mat input matrix
/// \param[in] vect input vector
/// \param[in] scale multiply output with this factor
/// \param[inout] resVect output vector
/// \param[in] resetRes if true, overwrite resVect, otherwise add to it
/// \param[in] block_size size of block
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size);
/// Convert a blocked inverse diagonal to the FPGA format.
/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.

View File

@@ -0,0 +1,535 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <dune/common/shared_ptr.hh>
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
#include <opm/simulators/linalg/PropertyTree.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/CPR.hpp>
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
namespace Opm
{
namespace Accelerator
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
{
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
diagIndices.resize(1);
}
template <unsigned int block_size>
void CPR<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
context = context_;
queue = queue_;
bilu0->setOpencl(context, queue);
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_);
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_);
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
return result;
}
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
}
// solve A^T * x = b
void solve_transposed_3x3(const double *A, const double *b, double *x) {
const int B = 3;
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
double t4 = A[0+0*B] * A[1+1*B];
double t6 = A[0+0*B] * A[1+2*B];
double t8 = A[0+1*B] * A[1+0*B];
double t10 = A[0+2*B] * A[1+0*B];
double t12 = A[0+1*B] * A[2+0*B];
double t14 = A[0+2*B] * A[2+0*B];
double d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); //determinant
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
}
template <unsigned int block_size>
void CPR<block_size>::init_opencl_buffers() {
d_Amatrices.reserve(num_levels);
d_Rmatrices.reserve(num_levels - 1);
d_invDiags.reserve(num_levels - 1);
for (Matrix& m : Amatrices) {
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
}
for (Matrix& m : Rmatrices) {
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.N);
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M); // create a cl::Buffer
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(double) * m.M);
}
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_mat = std::make_unique<OpenclMatrix>(context.get(), Nb, Nb, nnzb, block_size);
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * Nb);
}
template <unsigned int block_size>
void CPR<block_size>::opencl_upload() {
d_mat->upload(queue.get(), mat);
err = CL_SUCCESS;
events.resize(2 * Rmatrices.size() + 1);
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0, sizeof(double) * N, weights.data(), nullptr, &events[0]);
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0, sizeof(double) * Amatrices[i].N, invDiags[i].data(), nullptr, &events[2*i+1]);
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0, sizeof(int) * Amatrices[i].N, PcolIndices[i].data(), nullptr, &events[2*i+2]);
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
}
}
template <unsigned int block_size>
void CPR<block_size>::create_preconditioner_amg(BlockedMatrix *mat_) {
this->mat = mat_;
coarse_vals.resize(nnzb);
coarse_x.resize(Nb);
coarse_y.resize(Nb);
weights.resize(N);
try{
double rhs[] = {0, 0, 0};
rhs[pressure_idx] = 1;
// find diagonal index for each row
if (diagIndices[0].empty()) {
diagIndices[0].resize(Nb);
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
assert(candidate != mat->colIndices + end);
diagIndices[0][row] = candidate - mat->colIndices;
}
}
// calculate weights for each row
for (int row = 0; row < Nb; ++row) {
// solve to find weights
double *row_weights = weights.data() + block_size * row; // weights for this row
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
// normalize weights for this row
double abs_max = get_absmax(row_weights, block_size);
for(unsigned int i = 0; i < block_size; i++){
row_weights[i] /= abs_max;
}
}
// extract pressure
// transform blocks to scalars to create scalar linear system
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
double *block = mat->nnzValues + idx * block_size * block_size;
double *row_weights = weights.data() + block_size * row;
double value = 0.0;
for (unsigned int i = 0; i < block_size; ++i) {
value += block[block_size * i + pressure_idx] * row_weights[i];
}
coarse_vals[idx] = value;
}
}
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
if (recalculate_aggregates) {
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
typedef DuneMat::CreateIterator Iter;
// setup sparsity pattern
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
int start = mat->rowPointers[row.index()];
int end = mat->rowPointers[row.index() + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
row.insert(col);
}
}
// set values
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
(*dune_coarse)[row][col] = coarse_vals[idx];
}
}
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
Dune::Amg::SequentialInformation seqinfo;
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
Opm::PropertyTree property_tree;
property_tree.put("alpha", 0.333333333333);
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
// Yet a SymmetricDependency is used in AMGCPR
// An UnSymmetricCriterion is also available
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
const Criterion c = Opm::PreconditionerFactory<MatrixOperator, Dune::Amg::SequentialInformation>::amgCriterion(property_tree);
num_pre_smooth_steps = c.getNoPreSmoothSteps();
num_post_smooth_steps = c.getNoPostSmoothSteps();
dune_amg->build<OverlapFlags>(c);
analyzeHierarchy();
analyzeAggregateMaps();
recalculate_aggregates = false;
} else {
// update values of coarsest level in AMG
// this works because that level is actually a reference to the DuneMat held by dune_coarse
for (int row = 0; row < Nb; ++row) {
int start = mat->rowPointers[row];
int end = mat->rowPointers[row + 1];
for (int idx = start; idx < end; ++idx) {
int col = mat->colIndices[idx];
(*dune_coarse)[row][col] = coarse_vals[idx];
}
}
// update the rest of the AMG hierarchy
dune_amg->recalculateGalerkin(OverlapFlags());
analyzeHierarchy();
}
// initialize OpenclMatrices and Buffers if needed
auto init_func = std::bind(&CPR::init_opencl_buffers, this);
std::call_once(opencl_buffers_allocated, init_func);
// upload matrices and vectors to GPU
opencl_upload();
} catch (const std::exception& ex) {
std::cerr << "Caught exception: " << ex.what() << std::endl;
throw ex;
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeHierarchy() {
const DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
// store coarsest AMG level in umfpack format, also performs LU decomposition
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
num_levels = dune_amg->levels();
level_sizes.resize(num_levels);
diagIndices.resize(num_levels);
Amatrices.reserve(num_levels);
Rmatrices.reserve(num_levels - 1); // coarsest level does not need one
invDiags.reserve(num_levels);
Amatrices.clear();
invDiags.clear();
// matrixIter.dereference() returns MatrixAdapter
// matrixIter.dereference().getmat() returns BCRSMat
DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
const auto& A = matrixIter.dereference().getmat();
level_sizes[level] = A.N();
diagIndices[level].reserve(A.N());
// extract matrix A
Amatrices.emplace_back(A.N(), A.nonzeroes());
// contiguous copy is not possible
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
int nnz_idx = 0;
const bool fillDiagIndices = diagIndices[level].empty();
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
for (auto c = r->begin(); c != r->end(); ++c) {
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
if (fillDiagIndices && r.index() == c.index()) {
diagIndices[level].emplace_back(nnz_idx);
}
nnz_idx++;
}
}
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
// compute inverse diagonal values for current level
invDiags.emplace_back(A.N());
for (unsigned int row = 0; row < A.N(); ++row) {
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
}
}
}
template <unsigned int block_size>
void CPR<block_size>::analyzeAggregateMaps() {
PcolIndices.resize(num_levels - 1);
Rmatrices.clear();
const DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
DuneAmg::AggregatesMap *map = *mapIter;
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
// get indices for each row of P and R
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
PcolIndices[level].resize(level_sizes[level]);
using AggregateIterator = DuneAmg::AggregatesMap::const_iterator;
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
const long int diff = ai - map->begin();
PcolIndices[level][diff] = *ai;
indicesR[*ai].emplace_back(diff);
}
}
int col_idx = 0;
// set sparsity pattern of R
Rmatrices.back().rowPointers[0] = 0;
for (unsigned int i = 0; i < indicesR.size(); ++i) {
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
Rmatrices.back().colIndices[col_idx++] = *it;
}
}
}
}
template <unsigned int block_size>
void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x) {
OpenclMatrix *A = &d_Amatrices[level];
OpenclMatrix *R = &d_Rmatrices[level];
int Ncur = A->Nb;
if (level == num_levels - 1) {
// solve coarsest level
std::vector<double> h_y(Ncur), h_x(Ncur, 0);
events.resize(1);
err = queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * Ncur, h_y.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
}
// solve coarsest level using umfpack
umfpack.apply(h_x.data(), h_y.data());
events.resize(1);
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0, sizeof(double) * Ncur, h_x.data(), nullptr, &events[0]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
return;
}
int Nnext = d_Amatrices[level+1].Nb;
cl::Buffer& t = d_t[level];
cl::Buffer& f = d_f[level];
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
// presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
// move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
}
// x = prec(y)
template <unsigned int block_size>
void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
// 0-initialize u and x vectors
events.resize(d_u.size() + 1);
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0, sizeof(double) * Nb, nullptr, &events[0]);
for (unsigned int i = 0; i < d_u.size(); ++i) {
err |= queue->enqueueFillBuffer(d_u[i], 0, 0, sizeof(double) * Rmatrices[i].N, nullptr, &events[i + 1]);
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
}
template <unsigned int block_size>
void CPR<block_size>::apply(const cl::Buffer& y, cl::Buffer& x) {
Dune::Timer t_bilu0;
bilu0->apply(y, x);
if (verbosity >= 4) {
std::ostringstream out;
out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
apply_amg(y, x);
if (verbosity >= 4) {
std::ostringstream out;
out << "CPR apply amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template class CPR<n>;
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} // namespace Accelerator
} // namespace Opm

View File

@@ -0,0 +1,163 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CPR_HPP
#define OPM_CPR_HPP
#include <mutex>
#include <dune/istl/paamg/matrixhierarchy.hh>
#include <dune/istl/umfpack.hh>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
namespace Opm
{
namespace Accelerator
{
template <unsigned int block_size>
class openclSolverBackend;
class BlockedMatrix;
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
template <unsigned int block_size>
class CPR : public Preconditioner<block_size>
{
typedef Preconditioner<block_size> Base;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::context;
using Base::queue;
using Base::events;
using Base::err;
private:
int num_levels;
std::vector<double> weights, coarse_vals, coarse_x, coarse_y;
std::vector<Matrix> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<OpenclMatrix> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
std::vector<cl::Buffer> d_PcolIndices;
std::vector<std::vector<double> > invDiags; // inverse of diagonal of Amatrices
std::vector<cl::Buffer> d_invDiags;
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
std::unique_ptr<cl::Buffer> d_weights; // the quasiimpes weights, used to extract pressure
std::unique_ptr<OpenclMatrix> d_mat; // stores blocked matrix
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
std::unique_ptr<BILU0<block_size> > bilu0; // Blocked ILU0 preconditioner
BlockedMatrix *mat = nullptr; // input matrix, blocked
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1> >;
using DuneVec = Dune::BlockVector<Dune::FieldVector<double, 1> >;
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
std::unique_ptr<DuneAmg> dune_amg;
std::unique_ptr<DuneMat> dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy
std::shared_ptr<MatrixOperator> dune_op; // operator, input to Dune AMG
std::vector<int> level_sizes; // size of each level in the AMG hierarchy
std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
Dune::UMFPack<DuneMat> umfpack; // dune/istl/umfpack object used to solve the coarsest level of AMG
bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
bool recalculate_aggregates = true; // only rerecalculate if true
const int pressure_idx = 1; // hardcoded to mimic OPM
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
ILUReorder opencl_ilu_reorder; // reordering strategy for ILU0 in coarse solver
// Analyze the AMG hierarchy build by Dune
void analyzeHierarchy();
// Analyze the aggregateMaps from the AMG hierarchy
// These can be reused, so only use when recalculate_aggregates is true
void analyzeAggregateMaps();
// Initialize and allocate matrices and vectors
void init_opencl_buffers();
// Copy matrices and vectors to GPU
void opencl_upload();
// apply pressure correction to vector
void apply_amg(const cl::Buffer& y, cl::Buffer& x);
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
void create_preconditioner_amg(BlockedMatrix *mat);
public:
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
bool analyze_matrix(BlockedMatrix *mat) override;
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
// applies blocked ilu0
// also applies amg for pressure component
void apply(const cl::Buffer& y, cl::Buffer& x) override;
bool create_preconditioner(BlockedMatrix *mat) override;
int* getToOrder() override
{
return bilu0->getToOrder();
}
int* getFromOrder() override
{
return bilu0->getFromOrder();
}
BlockedMatrix* getRMat() override
{
return bilu0->getRMat();
}
};
// solve A^T * x = b
// A should represent a 3x3 matrix
// x and b are vectors with 3 elements
void solve_transposed_3x3(const double *A, const double *b, double *x);
} // namespace Accelerator
} // namespace Opm
#endif

View File

@@ -482,7 +482,7 @@ __kernel void chow_patel_ilu_sweep(
template <unsigned int block_size>
void ChowPatelIlu<block_size>::decomposition(
cl::CommandQueue *queue, [[maybe_unused]] cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,
@@ -569,7 +569,7 @@ void ChowPatelIlu<block_size>::decomposition(
} else {
Lmat->colIndices[num_blocks_L] = j;
// multiply block of L with corresponding diag block
blockMult<bs>(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs);
blockMult(LUmat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, Lmat->nnzValues + num_blocks_L * bs * bs, bs);
num_blocks_L++;
}
}
@@ -671,7 +671,7 @@ void ChowPatelIlu<block_size>::decomposition(
}
if (colL == rowU2) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs);
blockMultSub(&aij[0], Lmat->nnzValues + jk * bs * bs, Ut->nnzValues + k * bs * bs, bs);
}
}
}
@@ -713,7 +713,7 @@ void ChowPatelIlu<block_size>::decomposition(
if (rowU == colL) {
// Aij -= (Lik * Ukj)
blockMultSub<bs>(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs);
blockMultSub(&aij[0], Lmat->nnzValues + k * bs * bs , Ut->nnzValues + jk * bs * bs, bs);
}
}
@@ -721,7 +721,7 @@ void ChowPatelIlu<block_size>::decomposition(
double ujj[bs*bs];
inverter(Ut->nnzValues + (Ut->rowPointers[j+1] - 1) * bs * bs, &ujj[0]);
// Lij_new = (Aij - sum) / Ujj
blockMult<bs>(&aij[0], &ujj[0], Ltmp + ij * bs * bs);
blockMult(&aij[0], &ujj[0], Ltmp + ij * bs * bs, bs);
}
}
// 1st sweep writes to Ltmp

View File

@@ -82,7 +82,7 @@ public:
/// This function calls gpu_decomposition() if CHOW_PATEL_GPU is set
void decomposition(
cl::CommandQueue *queue, cl::Context *context,
BlockedMatrix<block_size> *LUmat, BlockedMatrix<block_size> *Lmat, BlockedMatrix<block_size> *Umat,
BlockedMatrix *LUmat, BlockedMatrix *Lmat, BlockedMatrix *Umat,
double *invDiagVals, std::vector<int>& diagIndex,
cl::Buffer& d_diagIndex, cl::Buffer& d_invDiagVals,
cl::Buffer& d_Lvals, cl::Buffer& d_Lcols, cl::Buffer& d_Lrows,

View File

@@ -59,7 +59,7 @@ FPGABILU0<block_size>::~FPGABILU0()
template <unsigned int block_size>
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
bool FPGABILU0<block_size>::init(BlockedMatrix *mat)
{
const unsigned int bs = block_size;
@@ -91,8 +91,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
}
Timer t_analysis;
rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
rMat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
LUMat = std::make_unique<BlockedMatrix>(*rMat);
std::ostringstream out;
if (level_scheduling) {
out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
@@ -117,7 +117,7 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
int blockDiagSize = 0;
// This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
int doneRows = 0;
for (int c = 0; c < numColors; c++) {
for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
@@ -187,8 +187,8 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
diagIndex.resize(mat->Nb, 0);
invDiagVals = new double[mat->Nb * bs * bs];
LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
LMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
UMat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size);
resultPointers[0] = (void *) colorSizes.data();
resultPointers[1] = (void *) PIndicesAddr.data();
resultPointers[2] = (void *) nnzValues.data();
@@ -232,11 +232,11 @@ bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
template <unsigned int block_size>
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{
const unsigned int bs = block_size;
Timer t_reorder;
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get());
if (verbosity >= 3) {
std::ostringstream out;
@@ -286,7 +286,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
LSize++;
// calculate the pivot of this row
blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
blockMult(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0], bs);
memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
@@ -296,7 +296,7 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
// subtract that row scaled by the pivot from this row.
while (ik < iRowEnd && jk < jRowEnd) {
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
blockMultSub(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs, bs);
ik++;
jk++;
} else {
@@ -402,8 +402,8 @@ bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int); \
template FPGABILU0<n>::~FPGABILU0(); \
template bool FPGABILU0<n>::init(BlockedMatrix<n> *); \
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *); \
template bool FPGABILU0<n>::init(BlockedMatrix*); \
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix *);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@@ -45,8 +45,8 @@ private:
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
std::unique_ptr<BlockedMatrix> LMat = nullptr, UMat = nullptr, LUMat = nullptr;
std::shared_ptr<BlockedMatrix> rMat = nullptr; // reordered mat
double *invDiagVals = nullptr;
std::vector<int> diagIndex;
std::vector<int> toOrder, fromOrder;
@@ -82,10 +82,10 @@ public:
~FPGABILU0();
// analysis (optional)
bool init(BlockedMatrix<block_size> *mat);
bool init(BlockedMatrix *mat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<block_size> *mat);
bool create_preconditioner(BlockedMatrix *mat);
int* getToOrder()
{
@@ -97,7 +97,7 @@ public:
return fromOrder.data();
}
BlockedMatrix<block_size>* getRMat()
BlockedMatrix* getRMat()
{
return rMat.get();
}

View File

@@ -20,6 +20,7 @@
#include <config.h>
#include <cmath>
#include <cstring>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/material/common/Unused.hpp>
@@ -261,7 +262,7 @@ void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double
// allocate host memory for matrices and vectors
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
std::ostringstream oss;
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \

View File

@@ -56,8 +56,8 @@ private:
bool level_scheduling = false;
// LUMat will shallow copy rowPointers and colIndices of mat/rMat
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
BlockedMatrix<block_size> *rMat = nullptr;
std::unique_ptr<BlockedMatrix> mat = nullptr;
BlockedMatrix *rMat = nullptr;
std::unique_ptr<Preconditioner> prec = nullptr;
// vectors with data processed by the preconditioner (input to the kernel)

View File

@@ -17,10 +17,13 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace Opm
@@ -57,6 +60,7 @@ void sortRow(int *colIndices, double *data, int left, int right) {
}
#if HAVE_FPGA
/*
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
@@ -247,6 +251,8 @@ int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
return 0;
}
#endif
} // namespace Accelerator
} // namespace Opm

View File

@@ -17,8 +17,8 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FPGA_MATRIX_HEADER_INCLUDED
#define FPGA_MATRIX_HEADER_INCLUDED
#ifndef OPM_MATRIX_HEADER_INCLUDED
#define OPM_MATRIX_HEADER_INCLUDED
#include <vector>
@@ -33,25 +33,30 @@ class Matrix {
public:
/// Allocate Matrix and data arrays with given sizes
/// Allocate square Matrix and data arrays with given sizes
/// \param[in] N number of rows
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int nnzs_)
: nnzValues(new double[nnzs_]),
colIndices(new int[nnzs_]),
rowPointers(new int[N_+1]),
N(N_),
: N(N_),
M(N_),
nnzs(nnzs_)
{}
/// All constructors allocate new memory, so always delete here
~Matrix(){
delete[] nnzValues;
delete[] colIndices;
delete[] rowPointers;
{
nnzValues.resize(nnzs);
colIndices.resize(nnzs);
rowPointers.resize(N+1);
}
/// Allocate rectangular Matrix and data arrays with given sizes
/// \param[in] N number of rows
/// \param[in] M number of columns
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int M_, int nnzs_)
: Matrix(N_, nnzs_)
{
M = M_;
}
#if HAVE_FPGA
/// Converts this matrix to the dataformat used by the FPGA.
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
/// The purpose of this format is to allow the data to be streamable.
@@ -71,11 +76,12 @@ public:
int toRDF(int numColors, std::vector<int>& nodesPerColor,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
#endif
double *nnzValues;
int *colIndices;
int *rowPointers;
int N;
std::vector<double> nnzValues;
std::vector<int> colIndices;
std::vector<int> rowPointers;
int N, M;
int nnzs;
};
@@ -84,4 +90,4 @@ void sortRow(int *colIndices, double *data, int left, int right);
} // namespace Accelerator
} // namespace Opm
#endif // FPGA_MATRIX_HEADER_INCLUDED
#endif // OPM_MATRIX_HEADER_INCLUDED

View File

@@ -0,0 +1,66 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/OpenclMatrix.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp>
namespace Opm
{
namespace Accelerator
{
void OpenclMatrix::upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows) {
std::vector<cl::Event> events(3);
cl_int err = queue->enqueueWriteBuffer(nnzValues, CL_FALSE, 0, sizeof(double) * block_size * block_size * nnzbs, vals, nullptr, &events[0]);
err |= queue->enqueueWriteBuffer(colIndices, CL_FALSE, 0, sizeof(int) * nnzbs, cols, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(rowPointers, CL_FALSE, 0, sizeof(int) * (Nb + 1), rows, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "OpenclMatrix OpenCL enqueueWriteBuffer error");
}
}
void OpenclMatrix::upload(cl::CommandQueue *queue, Matrix *matrix) {
if (block_size != 1) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
upload(queue, matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data());
}
void OpenclMatrix::upload(cl::CommandQueue *queue, BlockedMatrix *matrix) {
if (matrix->block_size != block_size) {
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to OpenclMatrix with different block_size");
}
upload(queue, matrix->nnzValues, matrix->colIndices, matrix->rowPointers);
}
} // namespace Accelerator
} // namespace Opm

View File

@@ -0,0 +1,66 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_OPENCLMATRIX_HEADER_INCLUDED
#define OPM_OPENCLMATRIX_HEADER_INCLUDED
#include <vector>
#include <opm/simulators/linalg/bda/opencl.hpp>
namespace Opm
{
namespace Accelerator
{
class Matrix;
class BlockedMatrix;
/// This struct resembles a csr matrix, only doubles are supported
/// The matrix data is stored in OpenCL Buffers
class OpenclMatrix {
public:
OpenclMatrix(cl::Context *context, int Nb_, int Mb_, int nnzbs_, unsigned int block_size_)
: Nb(Nb_),
Mb(Mb_),
nnzbs(nnzbs_),
block_size(block_size_)
{
nnzValues = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * block_size * block_size * nnzbs);
colIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzbs);
rowPointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
}
void upload(cl::CommandQueue *queue, double *vals, int *cols, int *rows);
void upload(cl::CommandQueue *queue, Matrix *matrix);
void upload(cl::CommandQueue *queue, BlockedMatrix *matrix);
cl::Buffer nnzValues;
cl::Buffer colIndices;
cl::Buffer rowPointers;
int Nb, Mb;
int nnzbs;
unsigned int block_size;
};
} // namespace Accelerator
} // namespace Opm
#endif // OPM_OPENCLMATRIX_HEADER_INCLUDED

View File

@@ -177,10 +177,10 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
/* Reorder a matrix by a specified input order.
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
assert(mat->block_size == rmat->block_size);
template <unsigned int block_size>
void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat) {
const unsigned int bs = block_size;
const unsigned int bs = mat->block_size;
int rIndex = 0;
int i, k;
unsigned int j;
@@ -204,7 +204,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
}
// re-sort the column indices of every row.
for (i = 0; i < mat->Nb; i++) {
sortBlockedRow<bs>(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1);
sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
}
}
@@ -370,7 +370,6 @@ void csrPatternToCsc(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template int colorBlockedNodes<n>(int, const int *, const int *, const int *, const int *, std::vector<int>&, int, int); \
template void reorderBlockedMatrixByPattern<n>(BlockedMatrix<n> *, int *, int *, BlockedMatrix<n> *); \
template void reorderBlockedVectorByPattern<n>(int, double*, int*, double*); \
template void findGraphColoring<n>(const int *, const int *, const int *, const int *, int, int, int, int *, int *, int *, std::vector<int>&); \

View File

@@ -52,9 +52,8 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
/// \param[in] mat matrix to be reordered
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rMat reordered Matrix
template <unsigned int block_size>
void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder, int *fromOrder, BlockedMatrix<block_size> *rmat);
/// \param[inout] rMat reordered Matrix
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
/// Compute reorder mapping from the color that each node has received
/// The toOrder, fromOrder and iters arrays must be allocated already

View File

@@ -21,6 +21,11 @@
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <string>
namespace Opm
{
namespace Accelerator
{
/// Translate OpenCL error codes to strings
/// Integer - String combinations are defined in CL/cl.h
/// \param[in] error error code
@@ -95,3 +100,6 @@ std::string getErrorString(cl_int error)
default: return "UNKNOWN_CL_CODE";
}
}
} // namespace Accelerator
} // namespace Opm

View File

@@ -33,7 +33,16 @@
#include <string>
namespace Opm
{
namespace Accelerator
{
/// Translate OpenCL error codes to strings
/// Integer - String combinations are defined in CL/cl.h
/// \param[in] error error code
std::string getErrorString(cl_int error);
} // namespace Accelerator
} // namespace Opm

View File

@@ -0,0 +1,68 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <opm/simulators/linalg/bda/CPR.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
namespace Opm
{
namespace Accelerator
{
template <unsigned int block_size>
void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
context = context_;
queue = queue_;
}
template <unsigned int block_size>
std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder) {
if (type == PreconditionerType::BILU0) {
return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity);
} else if (type == PreconditionerType::CPR) {
return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder);
} else {
OPM_THROW(std::logic_error, "Invalid PreconditionerType");
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
INSTANTIATE_BDA_FUNCTIONS(5);
INSTANTIATE_BDA_FUNCTIONS(6);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace Accelerator
} //namespace Opm

View File

@@ -0,0 +1,86 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_PRECONDITIONER_HEADER_INCLUDED
#define OPM_PRECONDITIONER_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
namespace Opm
{
namespace Accelerator
{
class BlockedMatrix;
template <unsigned int block_size>
class Preconditioner
{
protected:
int N = 0; // number of rows of the matrix
int Nb = 0; // number of blockrows of the matrix
int nnz = 0; // number of nonzeroes of the matrix (scalar)
int nnzb = 0; // number of blocks of the matrix
int verbosity = 0;
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::vector<cl::Event> events;
cl_int err;
Preconditioner(int verbosity_) :
verbosity(verbosity_)
{};
public:
enum PreconditionerType {
BILU0,
CPR
};
static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder);
// nested Preconditioners might need to override this
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
// apply preconditioner, x = prec(y)
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
// analyze matrix, e.g. the sparsity pattern
// probably only called once
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
// create/update preconditioner, probably used every linear solve
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
// get reordering mappings
virtual int* getToOrder() = 0;
virtual int* getFromOrder() = 0;
// get reordered matrix
virtual BlockedMatrix* getRMat() = 0;
};
} //namespace Accelerator
} //namespace Opm
#endif

View File

@@ -22,6 +22,7 @@
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
@@ -45,8 +46,16 @@ std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const u
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
@@ -60,7 +69,7 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
return A / B + (A % B > 0);
}
void add_kernel_string(cl::Program::Sources &sources, const std::string &source) {
void add_kernel_source(cl::Program::Sources &sources, const std::string &source) {
sources.emplace_back(source);
}
@@ -75,33 +84,49 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
verbosity = verbosity_;
cl::Program::Sources sources;
const std::string& axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
const std::string& scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
const std::string& dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
const std::string& norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
const std::string& custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
const std::string& spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
const std::string& axpy_s = get_axpy_source();
add_kernel_source(sources, axpy_s);
const std::string& scale_s = get_scale_source();
add_kernel_source(sources, scale_s);
const std::string& vmul_s = get_vmul_source();
add_kernel_source(sources, vmul_s);
const std::string& dot_1_s = get_dot_1_source();
add_kernel_source(sources, dot_1_s);
const std::string& norm_s = get_norm_source();
add_kernel_source(sources, norm_s);
const std::string& custom_s = get_custom_source();
add_kernel_source(sources, custom_s);
const std::string& full_to_pressure_restriction_s = get_full_to_pressure_restriction_source();
add_kernel_source(sources, full_to_pressure_restriction_s);
const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
add_kernel_source(sources, add_coarse_pressure_correction_s);
const std::string& prolongate_vector_s = get_prolongate_vector_source();
add_kernel_source(sources, prolongate_vector_s);
const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
add_kernel_source(sources, spmv_blocked_s);
const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
add_kernel_source(sources, spmv_s);
const std::string& spmv_noreset_s = get_matrix_operation_source(matrix_operation::spmv_op, false);
add_kernel_source(sources, spmv_noreset_s);
const std::string& residual_blocked_s = get_blocked_matrix_operation_source(matrix_operation::residual_op);
add_kernel_source(sources, residual_blocked_s);
const std::string& residual_s = get_matrix_operation_source(matrix_operation::residual_op);
add_kernel_source(sources, residual_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
const std::string& ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
const std::string& ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
const std::string& stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
const std::string& ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
const std::string& ILU_apply1_s = get_ILU_apply1_source(ilu_operate_on_full_matrix);
add_kernel_source(sources, ILU_apply1_s);
const std::string& ILU_apply2_s = get_ILU_apply2_source(ilu_operate_on_full_matrix);
add_kernel_source(sources, ILU_apply2_s);
const std::string& stdwell_apply_s = get_stdwell_apply_source(true);
add_kernel_source(sources, stdwell_apply_s);
const std::string& stdwell_apply_no_reorder_s = get_stdwell_apply_source(false);
add_kernel_source(sources, stdwell_apply_no_reorder_s);
const std::string& ilu_decomp_s = get_ilu_decomp_source();
add_kernel_source(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
@@ -114,8 +139,16 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
residual_blocked_k.reset(new residual_blocked_kernel_type(cl::Kernel(program, "residual_blocked")));
residual_k.reset(new residual_kernel_type(cl::Kernel(program, "residual")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
@@ -219,6 +252,23 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
}
}
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_vmul;
cl::Event event = (*vmul_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), alpha, in1, in2, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels vmul() time: " << t_vmul.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
{
const unsigned int work_group_size = 32;
@@ -236,15 +286,75 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const do
}
}
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t;
cl::Event event = (*full_to_pressure_restriction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), fine_y, weights, coarse_y, Nb);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels full_to_pressure_restriction() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t;
cl::Event event = (*add_coarse_pressure_correction_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), coarse_x, fine_x, pressure_idx, Nb);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t;
cl::Event event = (*prolongate_vector_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, cols, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels prolongate_vector() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (block_size > 1) {
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} else {
if (reset) {
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
}
}
if (verbosity >= 4) {
event.wait();
@@ -254,6 +364,29 @@ void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer&
}
}
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_residual;
cl::Event event;
if (block_size > 1) {
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels residual_blocked() time: " << t_residual.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
@@ -356,7 +489,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
}
std::string OpenclKernels::get_axpy_string() {
std::string OpenclKernels::get_axpy_source() {
return R"(
__kernel void axpy(
__global double *in,
@@ -377,7 +510,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
// scale vector with scalar
std::string OpenclKernels::get_scale_string() {
std::string OpenclKernels::get_scale_source() {
return R"(
__kernel void scale(
__global double *vec,
@@ -395,9 +528,31 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
)";
}
// multiply vector with another vector and a scalar, element-wise
// add result to a third vector
std::string OpenclKernels::get_vmul_source() {
return R"(
__kernel void vmul(
const double alpha,
__global double const *in1,
__global double const *in2,
__global double *out,
const int N)
{
unsigned int NUM_THREADS = get_global_size(0);
int idx = get_global_id(0);
while(idx < N){
out[idx] += alpha * in1[idx] * in2[idx];
idx += NUM_THREADS;
}
}
)";
}
// returns partial sums, instead of the final dot product
std::string OpenclKernels::get_dot_1_string() {
std::string OpenclKernels::get_dot_1_source() {
return R"(
__kernel void dot_1(
__global double *in1,
@@ -440,7 +595,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
std::string OpenclKernels::get_norm_string() {
std::string OpenclKernels::get_norm_source() {
return R"(
__kernel void norm(
__global double *in,
@@ -481,7 +636,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
// p = (p - omega * v) * beta + r
std::string OpenclKernels::get_custom_string() {
std::string OpenclKernels::get_custom_source() {
return R"(
__kernel void custom(
__global double *p,
@@ -506,15 +661,98 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
)";
}
std::string OpenclKernels::get_spmv_blocked_string() {
// transform blocked vector to scalar vector using pressure-weights
// every workitem handles one blockrow
std::string OpenclKernels::get_full_to_pressure_restriction_source() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
__kernel void full_to_pressure_restriction(
__global const double *fine_y,
__global const double *weights,
__global double *coarse_y,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int block_size = 3;
unsigned int target_block_row = get_global_id(0);
while(target_block_row < Nb){
double sum = 0.0;
unsigned int idx = block_size * target_block_row;
for (unsigned int i = 0; i < block_size; ++i) {
sum += fine_y[idx + i] * weights[idx + i];
}
coarse_y[target_block_row] = sum;
target_block_row += NUM_THREADS;
}
}
)";
}
// add the coarse pressure solution back to the finer, complete solution
// every workitem handles one blockrow
std::string OpenclKernels::get_add_coarse_pressure_correction_source() {
return R"(
__kernel void add_coarse_pressure_correction(
__global const double *coarse_x,
__global double *fine_x,
const unsigned int pressure_idx,
const unsigned int Nb)
{
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int block_size = 3;
unsigned int target_block_row = get_global_id(0);
while(target_block_row < Nb){
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
target_block_row += NUM_THREADS;
}
}
)";
}
// prolongate vector during amg cycle
// every workitem handles one row
std::string OpenclKernels::get_prolongate_vector_source() {
return R"(
__kernel void prolongate_vector(
__global const double *in,
__global double *out,
__global const int *cols,
const unsigned int N)
{
const unsigned int NUM_THREADS = get_global_size(0);
unsigned int row = get_global_id(0);
while(row < N){
out[row] += in[cols[row]];
row += NUM_THREADS;
}
}
)";
}
/// either b = mat * x
/// or res = rhs - mat * x
std::string OpenclKernels::get_blocked_matrix_operation_source(matrix_operation op) {
std::string s;
if (op == matrix_operation::spmv_op) {
s += "__kernel void spmv_blocked(";
} else {
s += "__kernel void residual_blocked(";
}
s += R"(__global const double *vals,
__global const int *cols,
__global const int *rows,
const int Nb,
__global const double *x,
__global double *b,
)";
if (op == matrix_operation::residual_op) {
s += "__global const double *rhs,";
}
s += R"(
__global double *out,
const unsigned int block_size,
__local double *tmp)
{
@@ -566,17 +804,99 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
if(lane < bs){
unsigned int row = target_block_row*bs + lane;
b[row] = tmp[lane];
)";
if (op == matrix_operation::spmv_op) {
s += " out[row] = tmp[lane];";
} else {
s += " out[row] = rhs[row] - tmp[lane];";
}
s += R"(
}
target_block_row += num_warps_in_grid;
}
}
)";
)";
return s;
}
/// either b = mat * x
/// or res = rhs - mat * x
std::string OpenclKernels::get_matrix_operation_source(matrix_operation op, bool spmv_reset) {
std::string s;
if (op == matrix_operation::spmv_op) {
if (spmv_reset) {
s += "__kernel void spmv(";
} else {
s += "__kernel void spmv_noreset(";
}
} else {
s += "__kernel void residual(";
}
s += R"(__global const double *vals,
__global const int *cols,
__global const int *rows,
const int N,
__global const double *x,
)";
if (op == matrix_operation::residual_op) {
s += "__global const double *rhs,";
}
s += R"(
__global double *out,
__local double *tmp)
{
const unsigned int bsize = get_local_size(0);
const unsigned int idx_b = get_global_id(0) / bsize;
const unsigned int idx_t = get_local_id(0);
const unsigned int num_workgroups = get_num_groups(0);
int row = idx_b;
while (row < N) {
int rowStart = rows[row];
int rowEnd = rows[row+1];
int rowLength = rowEnd - rowStart;
double local_sum = 0.0;
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
int col = cols[j];
local_sum += vals[j] * x[col];
}
tmp[idx_t] = local_sum;
barrier(CLK_LOCAL_MEM_FENCE);
int offset = bsize / 2;
while(offset > 0) {
if (idx_t < offset) {
tmp[idx_t] += tmp[idx_t + offset];
}
barrier(CLK_LOCAL_MEM_FENCE);
offset = offset / 2;
}
if (idx_t == 0) {
)";
if (op == matrix_operation::spmv_op) {
if (spmv_reset) {
s += " out[row] = tmp[idx_t];";
} else {
s += " out[row] += tmp[idx_t];";
}
} else {
s += " out[row] = rhs[row] - tmp[idx_t];";
}
s += R"(
}
row += num_workgroups;
}
}
)";
return s;
}
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply1_source(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
@@ -652,7 +972,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
}
std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply2_source(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
@@ -735,7 +1055,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
return s;
}
std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
std::string OpenclKernels::get_stdwell_apply_source(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
@@ -830,7 +1150,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
}
std::string OpenclKernels::get_ilu_decomp_string() {
std::string OpenclKernels::get_ilu_decomp_source() {
return R"(
// a = a - (b * c)

View File

@@ -30,8 +30,14 @@ namespace Opm
namespace Accelerator
{
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
@@ -55,12 +61,25 @@ private:
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
enum matrix_operation {
spmv_op,
residual_op
};
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
static std::unique_ptr<spmv_kernel_type> spmv_k;
static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
static std::unique_ptr<residual_blocked_kernel_type> residual_blocked_k;
static std::unique_ptr<residual_kernel_type> residual_k;
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
@@ -69,54 +88,70 @@ private:
/// Generate string with axpy kernel
/// a = a + alpha * b
static std::string get_axpy_string();
static std::string get_axpy_source();
/// Generate string with scale kernel
/// a = a * alpha
static std::string get_scale_string();
static std::string get_scale_source();
/// multiply vector with another vector and a scalar, element-wise
/// add result to a third vector
static std::string get_vmul_source();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
static std::string get_dot_1_string();
static std::string get_dot_1_source();
/// returns partial sums, instead of the final norm
/// the square root must be computed on CPU
static std::string get_norm_string();
static std::string get_norm_source();
/// Generate string with custom kernel
/// This kernel combines some ilubicgstab vector operations into 1
/// p = (p - omega * v) * beta + r
static std::string get_custom_string();
static std::string get_custom_source();
/// Transform blocked vector to scalar vector using pressure-weights
static std::string get_full_to_pressure_restriction_source();
/// Add the coarse pressure solution back to the finer, complete solution
static std::string get_add_coarse_pressure_correction_source();
/// Prolongate a vector during the AMG cycle
static std::string get_prolongate_vector_source();
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
static std::string get_spmv_blocked_string();
/// or
/// res = rhs - (mat * x)
static std::string get_blocked_matrix_operation_source(matrix_operation op);
static std::string get_matrix_operation_source(matrix_operation op, bool spmv_reset = true);
/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
/// this L can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
static std::string get_ILU_apply1_string(bool full_matrix);
static std::string get_ILU_apply1_source(bool full_matrix);
/// ILU apply part 2: backward substitution
/// solves U*x=y where U is an upper triangular sparse blocked matrix
/// this U can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
static std::string get_ILU_apply2_string(bool full_matrix);
static std::string get_ILU_apply2_source(bool full_matrix);
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
/// the x/y vector is reordered, use toOrder to address that
/// \param[in] reorder whether the matrix is reordered or not
static std::string get_stdwell_apply_string(bool reorder);
static std::string get_stdwell_apply_source(bool reorder);
/// Generate string with the exact ilu decomposition kernel
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
static std::string get_ilu_decomp_string();
static std::string get_ilu_decomp_source();
OpenclKernels(){}; // disable instantiation
@@ -127,8 +162,14 @@ public:
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);

View File

@@ -45,8 +45,25 @@ using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
bool use_cpr;
if (linsolver.compare("ilu0") == 0) {
use_cpr = false;
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
use_cpr = true;
} else if (linsolver.compare("cpr_trueimpes") == 0) {
OPM_THROW(std::logic_error, "Error openclSolver does not support --linsolver=cpr_trueimpes");
} else {
OPM_THROW(std::logic_error, "Error unknown value for argument --linsolver, " + linsolver);
}
using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
if (use_cpr) {
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder);
} else {
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder);
}
std::ostringstream out;
try {
@@ -191,6 +208,19 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
}
}
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) :
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_reorder(opencl_ilu_reorder_)
{
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_);
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_reorder, /*use_amg=*/false);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
context = context_;
queue = queue_;
}
template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
@@ -257,7 +287,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// v = A * pw
t_spmv.start();
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@@ -288,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// t = A * s
t_spmv.start();
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@@ -332,7 +362,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
if (verbosity >= 4) {
std::ostringstream out;
out << "openclSolver::ilu_apply: " << t_prec.elapsed() << " s\n";
out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n";
out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
out << "openclSolver::spmv: " << t_spmv.elapsed() << " s\n";
out << "openclSolver::rest: " << t_rest.elapsed() << " s\n";
@@ -358,13 +388,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.clear();
try {
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
prec->setOpencl(context, queue);
#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
vals_contiguous.resize(nnz);
#endif
mat.reset(new BlockedMatrix<block_size>(Nb, nnzb, vals, cols, rows));
mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -408,10 +437,6 @@ void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
#endif
delete prec;
} // end finalize()
template <unsigned int block_size>
@@ -423,10 +448,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
@@ -463,10 +488,10 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i];
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
#else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
@@ -489,29 +514,34 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyse_matrix() {
bool openclSolverBackend<block_size>::analyze_matrix() {
Timer t;
bool success = prec->init(mat.get());
// bool success = bilu0->init(mat.get());
bool success = prec->analyze_matrix(mat.get());
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
} else {
// toOrder = bilu0->getToOrder();
// fromOrder = bilu0->getFromOrder();
// rmat = bilu0->getRMat();
toOrder = prec->getToOrder();
fromOrder = prec->getFromOrder();
rmat = prec->getRMat();
}
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::analyse_matrix(): " << t.stop() << " s";
out << "openclSolver::analyze_matrix(): " << t.stop() << " s";
OpmLog::info(out.str());
}
analysis_done = true;
return success;
} // end analyse_matrix()
} // end analyze_matrix()
template <unsigned int block_size>
@@ -603,7 +633,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
if (analysis_done == false) {
if (!analyse_matrix()) {
if (!analyze_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
@@ -624,8 +654,11 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, unsigned int, unsigned int, ILUReorder); \
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, ILUReorder); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@@ -28,6 +28,8 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
#include <tuple>
namespace Opm
@@ -35,12 +37,14 @@ namespace Opm
namespace Accelerator
{
template <unsigned int block_size>
class CPR;
/// This class implements a opencl-based ilu0-bicgstab solver on GPU
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;
using Base::N;
using Base::Nb;
@@ -55,7 +59,7 @@ class openclSolverBackend : public BdaSolver<block_size>
private:
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU
@@ -66,11 +70,13 @@ private:
std::vector<cl::Device> devices;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
std::unique_ptr<Preconditioner<block_size> > prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
std::unique_ptr<BlockedMatrix> mat = nullptr; // original matrix
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events;
cl_int err;
@@ -154,9 +160,9 @@ private:
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();
/// Analyse sparsity pattern to extract parallelism
/// Analyze sparsity pattern to extract parallelism
/// \return true iff analysis was successful
bool analyse_matrix();
bool analyze_matrix();
/// Perform ilu0-decomposition
/// \return true iff decomposition was successful
@@ -178,7 +184,13 @@ public:
/// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, ILUReorder opencl_ilu_reorder);
/// \param[in] linsolver indicating the preconditioner, equal to the --linsolver cmdline argument
/// only ilu0 and cpr_quasiimpes are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
ILUReorder opencl_ilu_reorder, std::string linsolver);
/// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
/// Destroy a openclSolver, and free memory
~openclSolverBackend();
@@ -200,10 +212,20 @@ public:
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
// SolverStatus solve_system(BdaResult &res);
/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
/// Set OpenCL objects
/// This class either creates them based on platformID and deviceID or receives them through this function
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator

View File

@@ -76,15 +76,16 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
const std::string opencl_ilu_reorder("none"); // unused
const int platformID = 0; // unused
const int deviceID = 0;
const std::string gpu_mode("cusparse");
const std::string accelerator_mode("cusparse");
const std::string fpga_bitstream("empty"); // unused
const std::string linsolver("ilu0");
Dune::InverseOperatorResult result;
Vector x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
bridge->solve_system(&matrix, rhs, *wellContribs, result);
bridge->get_result(x);

View File

@@ -75,15 +75,16 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
const std::string opencl_ilu_reorder("none");
const int platformID = 0;
const int deviceID = 0;
const std::string gpu_mode("opencl");
const std::string accelerator_mode("opencl");
const std::string fpga_bitstream("empty"); // unused
const std::string linsolver("ilu0");
Dune::InverseOperatorResult result;
Vector x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver);
} catch (const std::logic_error& error) {
BOOST_WARN_MESSAGE(true, error.what());
throw PlatformInitException(error.what());
@@ -100,8 +101,8 @@ void test3(const pt::ptree& prm)
{
const int bz = 3;
auto sol = testOpenclSolver<bz>(prm, "matr33.txt", "rhs3.txt");
Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-1.30307e-2, -3.58263e-6, 1.13836e-9},
{-1.25425e-3, -1.4167e-4, -3.2213e-3},
Dune::BlockVector<Dune::FieldVector<double, bz>> expected {{-0.0131626, -3.58263e-6, 1.13836e-9},
{-1.25425e-3, -1.4167e-4, -0.0029366},
{-4.5436e-4, 1.28682e-5, 4.7644e-6}};
BOOST_REQUIRE_EQUAL(sol.size(), expected.size());
for (size_t i = 0; i < sol.size(); ++i) {

View File

@@ -0,0 +1,82 @@
/*
Copyright 2021 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#define BOOST_TEST_MODULE SolveTransposed3x3
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/bda/CPR.hpp>
BOOST_AUTO_TEST_CASE(testsolvetransposed3x3)
{
const unsigned numTests = 3;
const unsigned blockSize = 3;
std::vector<std::vector<double> > A = {{4, 2, 1,
3, 4, 2,
2, 4, 3},
{1, 2, 4,
1, 3, 2,
2, 4, 2},
{1, 2, 2,
1, 3, 5,
3, 2, 4}};
std::vector<std::vector<double> > b = {{0, 1, 0},
{1, 3, 5},
{2, 4, 5}};
std::vector<std::vector<double> > x_expected = {{-0.5, 1, -0.5},
{ 1, 1, -0.5},
{ 1.3, 0.4, 0.1}};
for (unsigned testId = 0; testId < numTests; ++testId) {
std::vector<double> x(blockSize);
Opm::Accelerator::solve_transposed_3x3(A[testId].data(), b[testId].data(), x.data());
for (unsigned i = 0; i < blockSize; ++i) {
BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
}
}
// retest cases using Dune methods
using Mat3 = Dune::FieldMatrix<double, 3, 3>;
using Vec3 = Dune::FieldVector<double, 3>;
for (unsigned testId = 0; testId < numTests; ++testId) {
Mat3 a3 = {{ A[testId][0], A[testId][1], A[testId][2] },
{ A[testId][3], A[testId][4], A[testId][5] },
{ A[testId][6], A[testId][7], A[testId][8] } };
Vec3 y({b[testId][0], b[testId][1], b[testId][2]});
Mat3 b3 = a3;
// b3 = inv(a3^T)
Dune::FMatrixHelp::invertMatrix_retTransposed(a3, b3);
// x = b3 * y
Vec3 x = Dune::FMatrixHelp::mult(b3, y);
for (unsigned i = 0; i < blockSize; ++i) {
BOOST_CHECK_CLOSE(x[i], x_expected[testId][i], 1e-6);
}
}
}