Merge pull request #4136 from Tongdongq/remove_reordering

Remove reordering for opencl
Markus Blatt 2022-10-19 15:30:01 +02:00 committed by GitHub
commit 3e680e41f5
35 changed files with 258 additions and 625 deletions

View File

@ -293,7 +293,6 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/FPGASolverBackend.hpp opm/simulators/linalg/bda/FPGASolverBackend.hpp
opm/simulators/linalg/bda/FPGAUtils.hpp opm/simulators/linalg/bda/FPGAUtils.hpp
opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/Reorder.hpp
opm/simulators/linalg/bda/ILUReorder.hpp
opm/simulators/linalg/bda/opencl/opencl.hpp opm/simulators/linalg/bda/opencl/opencl.hpp
opm/simulators/linalg/bda/opencl/openclKernels.hpp opm/simulators/linalg/bda/opencl/openclKernels.hpp
opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp

View File

@ -129,7 +129,7 @@ struct OpenclPlatformId {
using type = UndefinedProperty; using type = UndefinedProperty;
}; };
template<class TypeTag, class MyTypeTag> template<class TypeTag, class MyTypeTag>
struct OpenclIluReorder { struct OpenclIluParallel {
using type = UndefinedProperty; using type = UndefinedProperty;
}; };
template<class TypeTag, class MyTypeTag> template<class TypeTag, class MyTypeTag>
@ -231,8 +231,8 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr int value = 0; static constexpr int value = 0;
}; };
template<class TypeTag> template<class TypeTag>
struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> { struct OpenclIluParallel<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr auto value = ""; // note: default value is chosen depending on the solver used static constexpr bool value = true; // note: false should only be used in debug
}; };
template<class TypeTag> template<class TypeTag>
struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> { struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
@ -268,7 +268,7 @@ namespace Opm
int cpr_max_ell_iter_; int cpr_max_ell_iter_;
int cpr_reuse_setup_; int cpr_reuse_setup_;
int cpr_reuse_interval_; int cpr_reuse_interval_;
std::string opencl_ilu_reorder_; bool opencl_ilu_parallel_;
std::string fpga_bitstream_; std::string fpga_bitstream_;
template <class TypeTag> template <class TypeTag>
@ -295,7 +295,7 @@ namespace Opm
accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); opencl_ilu_parallel_ = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel);
fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
} }
@ -322,7 +322,7 @@ namespace Opm
EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]'"); EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]'");
EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly."); EWOMS_REGISTER_PARAM(TypeTag, bool, OpenclIluParallel, "Parallelize ILU decomposition and application on GPU. Default: true");
EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'"); EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
} }
@ -346,7 +346,7 @@ namespace Opm
accelerator_mode_ = "none"; accelerator_mode_ = "none";
bda_device_id_ = 0; bda_device_id_ = 0;
opencl_platform_id_ = 0; opencl_platform_id_ = 0;
opencl_ilu_reorder_ = ""; // note: the default value is chosen depending on the solver used opencl_ilu_parallel_ = true;
fpga_bitstream_ = ""; fpga_bitstream_ = "";
} }
}; };
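For readers tracking the option rename: a small hedged sketch (not part of this commit) of how the old --opencl-ilu-reorder strings relate to the new boolean --opencl-ilu-parallel flag; the helper name is hypothetical.

    #include <stdexcept>
    #include <string>

    // Hypothetical helper: the removed strategies level_scheduling and graph_coloring
    // both produced a parallel ILU; the new flag keeps only level scheduling (true)
    // or a fully sequential ILU (false, previously "none", intended for debugging).
    bool openclIluParallelFromLegacyString(const std::string& reorder)
    {
        if (reorder.empty() || reorder == "level_scheduling" || reorder == "graph_coloring")
            return true;
        if (reorder == "none")
            return false;
        throw std::logic_error("unknown --opencl-ilu-reorder value: " + reorder);
    }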

View File

@ -181,12 +181,12 @@ BdaSolverInfo(const std::string& accelerator_mode,
const double tolerance, const double tolerance,
const int platformID, const int platformID,
const int deviceID, const int deviceID,
const std::string& opencl_ilu_reorder, const bool opencl_ilu_parallel,
const std::string& linsolver) const std::string& linsolver)
: bridge_(std::make_unique<Bridge>(accelerator_mode, fpga_bitstream, : bridge_(std::make_unique<Bridge>(accelerator_mode, fpga_bitstream,
linear_solver_verbosity, maxit, linear_solver_verbosity, maxit,
tolerance, platformID, deviceID, tolerance, platformID, deviceID,
opencl_ilu_reorder, linsolver)) opencl_ilu_parallel, linsolver))
, accelerator_mode_(accelerator_mode) , accelerator_mode_(accelerator_mode)
{} {}

View File

@ -127,7 +127,7 @@ struct BdaSolverInfo
const double tolerance, const double tolerance,
const int platformID, const int platformID,
const int deviceID, const int deviceID,
const std::string& opencl_ilu_reorder, const bool opencl_ilu_parallel,
const std::string& linsolver); const std::string& linsolver);
~BdaSolverInfo(); ~BdaSolverInfo();
@ -259,7 +259,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter); const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); const bool opencl_ilu_parallel = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolver); std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolver);
@ -270,7 +270,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
tolerance, tolerance,
platformID, platformID,
deviceID, deviceID,
opencl_ilu_reorder, opencl_ilu_parallel,
linsolver); linsolver);
} }
#else #else

View File

@ -53,7 +53,6 @@ namespace Opm
using Opm::Accelerator::BdaResult; using Opm::Accelerator::BdaResult;
using Opm::Accelerator::BdaSolver; using Opm::Accelerator::BdaSolver;
using Opm::Accelerator::SolverStatus; using Opm::Accelerator::SolverStatus;
using Opm::Accelerator::ILUReorder;
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
@ -62,7 +61,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
double tolerance, double tolerance,
[[maybe_unused]] unsigned int platformID, [[maybe_unused]] unsigned int platformID,
unsigned int deviceID, unsigned int deviceID,
[[maybe_unused]] std::string opencl_ilu_reorder, [[maybe_unused]] bool opencl_ilu_parallel,
[[maybe_unused]] std::string linsolver) [[maybe_unused]] std::string linsolver)
: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_) : verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
{ {
@ -76,36 +75,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
} else if (accelerator_mode.compare("opencl") == 0) { } else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL #if HAVE_OPENCL
use_gpu = true; use_gpu = true;
ILUReorder ilu_reorder; backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver));
if (opencl_ilu_reorder == "") {
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else if (opencl_ilu_reorder == "none") {
ilu_reorder = Opm::Accelerator::ILUReorder::NONE;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new Opm::Accelerator::openclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, ilu_reorder, linsolver));
#else #else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif #endif
} else if (accelerator_mode.compare("fpga") == 0) { } else if (accelerator_mode.compare("fpga") == 0) {
#if HAVE_FPGA #if HAVE_FPGA
use_fpga = true; use_fpga = true;
ILUReorder ilu_reorder; backend.reset(new Opm::Accelerator::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, opencl_ilu_parallel));
if (opencl_ilu_reorder == "") {
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = Opm::Accelerator::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = Opm::Accelerator::ILUReorder::GRAPH_COLORING;
} else {
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new Opm::Accelerator::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
#else #else
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake"); OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
#endif #endif

View File

@ -23,8 +23,6 @@
#include "dune/istl/solver.hh" // for struct InverseOperatorResult #include "dune/istl/solver.hh" // for struct InverseOperatorResult
#include <opm/simulators/linalg/bda/BdaSolver.hpp> #include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
namespace Opm namespace Opm
{ {
@ -32,7 +30,6 @@ namespace Opm
class WellContributions; class WellContributions;
typedef Dune::InverseOperatorResult InverseOperatorResult; typedef Dune::InverseOperatorResult InverseOperatorResult;
using Opm::Accelerator::ILUReorder;
/// BdaBridge acts as interface between opm-simulators with the BdaSolvers /// BdaBridge acts as interface between opm-simulators with the BdaSolvers
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
@ -60,10 +57,10 @@ public:
/// \param[in] tolerance required relative tolerance for BdaSolver /// \param[in] tolerance required relative tolerance for BdaSolver
/// \param[in] platformID the OpenCL platform ID to be used /// \param[in] platformID the OpenCL platform ID to be used
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver copy of cmdline argument --linear-solver /// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder, std::string linsolver); unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
/// Solve linear system, A*x = b /// Solve linear system, A*x = b

View File

@ -1,39 +0,0 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef ILUREORDER_HEADER_INCLUDED
#define ILUREORDER_HEADER_INCLUDED
namespace Opm
{
namespace Accelerator
{
// Level Scheduling respects the dependencies in the original matrix, and behaves like Dune and cusparse
// Graph Coloring is more aggressive and is likely to increase the number of linearizations and linear iterations needed to converge significantly, but can still be faster on GPU because it results in more parallelism
enum class ILUReorder {
LEVEL_SCHEDULING,
GRAPH_COLORING,
NONE
};
} // namespace Accelerator
} // namespace Opm
#endif

View File

@ -77,7 +77,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int row = 0; row < Mb; ++row) { for (unsigned int row = 0; row < Mb; ++row) {
// for every block in the row // for every block in the row
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) { for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = getColIdx(Bcols[blockID]); unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim_wells; ++j) { for (unsigned int j = 0; j < dim_wells; ++j) {
double temp = 0.0; double temp = 0.0;
for (unsigned int k = 0; k < dim; ++k) { for (unsigned int k = 0; k < dim; ++k) {
@ -97,7 +97,7 @@ void MultisegmentWellContribution::apply(double *h_x, double *h_y)
for (unsigned int row = 0; row < Mb; ++row) { for (unsigned int row = 0; row < Mb; ++row) {
// for every block in the row // for every block in the row
for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) { for (unsigned int blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
unsigned int colIdx = getColIdx(Bcols[blockID]); unsigned int colIdx = Bcols[blockID];
for (unsigned int j = 0; j < dim; ++j) { for (unsigned int j = 0; j < dim; ++j) {
double temp = 0.0; double temp = 0.0;
for (unsigned int k = 0; k < dim_wells; ++k) { for (unsigned int k = 0; k < dim_wells; ++k) {
@ -116,20 +116,5 @@ void MultisegmentWellContribution::setCudaStream(cudaStream_t stream_)
} }
#endif #endif
unsigned int MultisegmentWellContribution::getColIdx(unsigned int idx)
{
if (reorder) {
return toOrder[idx];
} else {
return idx;
}
}
void MultisegmentWellContribution::setReordering(int *toOrder_, bool reorder_)
{
this->toOrder = toOrder_;
this->reorder = reorder_;
}
} //namespace Opm } //namespace Opm
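To illustrate what dropping getColIdx() changes in the apply loops above: a standalone sketch of one blocked row of the z1 = B * x product, now indexing x directly through Bcols. This is a simplified host-side sketch; names mirror the diff, and a row-major layout per block is assumed.

    #include <vector>

    // Simplified sketch: accumulate one blocked row of z1 = B * x, where B is a BSR
    // matrix with blocks of size dim_wells x dim. The column index is used directly;
    // the removed toOrder[] indirection (getColIdx) no longer applies.
    void blockRowApply(unsigned row, unsigned dim, unsigned dim_wells,
                       const std::vector<unsigned>& Brows,
                       const std::vector<unsigned>& Bcols,
                       const std::vector<double>& Bvals,
                       const std::vector<double>& x,
                       std::vector<double>& z1)
    {
        for (unsigned blockID = Brows[row]; blockID < Brows[row + 1]; ++blockID) {
            const unsigned colIdx = Bcols[blockID];           // was: getColIdx(Bcols[blockID])
            for (unsigned j = 0; j < dim_wells; ++j) {
                double temp = 0.0;
                for (unsigned k = 0; k < dim; ++k)
                    temp += Bvals[blockID * dim * dim_wells + j * dim + k] * x[colIdx * dim + k];
                z1[row * dim_wells + j] += temp;
            }
        }
    }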

View File

@ -68,9 +68,6 @@ private:
std::vector<double> z2; // z2 = D^-1 * B * x std::vector<double> z2; // z2 = D^-1 * B * x
void *UMFPACK_Symbolic, *UMFPACK_Numeric; void *UMFPACK_Symbolic, *UMFPACK_Numeric;
int *toOrder = nullptr;
bool reorder = false;
/// Translate the columnIndex if needed /// Translate the columnIndex if needed
/// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well /// Some preconditioners reorder the rows of the matrix, this means the columnIndices of the wellcontributions need to be reordered as well
unsigned int getColIdx(unsigned int idx); unsigned int getColIdx(unsigned int idx);
@ -117,12 +114,6 @@ public:
/// \param[in] h_x vector x, must be on CPU /// \param[in] h_x vector x, must be on CPU
/// \param[inout] h_y vector y, must be on CPU /// \param[inout] h_y vector y, must be on CPU
void apply(double *h_x, double *h_y); void apply(double *h_x, double *h_y);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int *toOrder, bool reorder);
}; };
} //namespace Opm } //namespace Opm

View File

@ -336,6 +336,7 @@ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowInd
nextActiveRowIndex++; nextActiveRowIndex++;
} }
} }
rowsToStart.clear();
colorEnd = nextActiveRowIndex; colorEnd = nextActiveRowIndex;
rowsPerColor.emplace_back(nextActiveRowIndex - activeRowIndex); rowsPerColor.emplace_back(nextActiveRowIndex - activeRowIndex);
} }
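For reference, what findLevelScheduling computes can be summarized by this compact sketch (CSR pattern only; not the exact OPM implementation, which also uses the CSC pattern and emits toOrder/fromOrder).

    #include <algorithm>
    #include <vector>

    // Sketch: assign every row to the earliest level (color) such that all rows it
    // depends on (its lower-triangular neighbours) are in strictly earlier levels.
    // All rows of one level can then be processed in parallel.
    std::vector<int> levelScheduling(int Nb,
                                     const std::vector<int>& rowPointers,
                                     const std::vector<int>& colIndices)
    {
        std::vector<int> level(Nb, 0);
        for (int row = 0; row < Nb; ++row) {
            int lvl = 0;
            for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; ++idx) {
                const int col = colIndices[idx];
                if (col < row)                          // dependency on an earlier row
                    lvl = std::max(lvl, level[col] + 1);
            }
            level[row] = lvl;                           // level[row] is the color of this row
        }
        return level;
    }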

View File

@ -93,16 +93,16 @@ void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, doubl
bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndices, const std::vector<bool>& doneRows); bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndices, const std::vector<bool>& doneRows);
/// Find a level scheduling reordering for an input matrix /// Find a level scheduling reordering for an input matrix
/// The toOrder and fromOrder arrays must be allocated already /// The toOrder and fromOrder arrays must be allocated already
/// \param[in] CSRColIndices column indices array, obtained from storing the input matrix in the CSR format /// \param[in] CSRColIndices column indices array, obtained from storing the input matrix in the CSR format
/// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format /// \param[in] CSRRowPointers row pointers array, obtained from storing the input matrix in the CSR format
/// \param[in] CSCRowIndices row indices array, obtained from storing the input matrix in the CSC format /// \param[in] CSCRowIndices row indices array, obtained from storing the input matrix in the CSC format
/// \param[in] CSCColPointers column pointers array, obtained from storing the input matrix in the CSC format /// \param[in] CSCColPointers column pointers array, obtained from storing the input matrix in the CSC format
/// \param[in] Nb number of blockrows in the matrix /// \param[in] Nb number of blockrows in the matrix
/// \param[out] numColors a pointer to the number of colors needed for the level scheduling /// \param[out] numColors a pointer to the number of colors needed for the level scheduling
/// \param[inout] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved /// \param[out] toOrder the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
/// \param[inout] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved /// \param[out] fromOrder the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rowsPerColor for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill /// \param[out] rowsPerColor for each color, an array of all rowIndices in that color, this function uses emplace_back() to fill
void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor); void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);
/// Find a graph coloring reordering for an input matrix /// Find a graph coloring reordering for an input matrix
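A tiny worked example of the toOrder/fromOrder convention documented above (values purely illustrative):

    #include <cassert>
    #include <vector>

    int main()
    {
        // Suppose rows {0,1,2} are visited in the new order 2, 0, 1.
        std::vector<int> fromOrder = {2, 0, 1}; // new position -> original row
        std::vector<int> toOrder   = {1, 2, 0}; // original row -> new position
        for (int i = 0; i < 3; ++i)
            assert(fromOrder[toOrder[i]] == i); // the two mappings are inverses
        return 0;
    }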

View File

@ -24,7 +24,6 @@
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh> #include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp> #include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp> #include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
@ -40,8 +39,8 @@ using Opm::OpmLog;
using Dune::Timer; using Dune::Timer;
template <unsigned int block_size> template <unsigned int block_size>
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) : BILU0<block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_) Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
{ {
#if CHOW_PATEL #if CHOW_PATEL
chowPatelIlu.setVerbosity(verbosity); chowPatelIlu.setVerbosity(verbosity);
@ -71,21 +70,13 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat
if (opencl_ilu_reorder == ILUReorder::NONE) { if (opencl_ilu_parallel) {
LUmat = std::make_unique<BlockedMatrix>(*mat);
} else {
toOrder.resize(Nb); toOrder.resize(Nb);
fromOrder.resize(Nb); fromOrder.resize(Nb);
CSCRowIndices.resize(matToDecompose->nnzbs); CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1); CSCColPointers.resize(Nb + 1);
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
if (jacMat) { LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
} else {
LUmat = std::make_unique<BlockedMatrix>(*rmat);
}
Timer t_convert; Timer t_convert;
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb); csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
@ -94,36 +85,26 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s"; out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str()); OpmLog::info(out.str());
} }
} else {
LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);
} }
Timer t_analysis; Timer t_analysis;
std::ostringstream out; std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { if (opencl_ilu_parallel) {
out << "BILU0 reordering strategy: " << "level_scheduling\n"; out << "opencl_ilu_parallel: true (level_scheduling)\n";
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get()); } else {
if (jacMat) { out << "opencl_ilu_parallel: false\n";
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1; // numColors = 1;
// rowsPerColor.emplace_back(Nb); // rowsPerColor.emplace_back(Nb);
numColors = Nb; numColors = Nb;
for(int i = 0; i < Nb; ++i){ for(int i = 0; i < Nb; ++i){
rowsPerColor.emplace_back(1); rowsPerColor.emplace_back(1);
} }
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
} }
if(verbosity >= 1){
if (verbosity >= 1) {
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n"; out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
} }
@ -143,6 +124,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
#if CHOW_PATEL #if CHOW_PATEL
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
@ -156,15 +138,25 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
#endif #endif
events.resize(2); events.resize(3);
err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]); err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) { for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
} }
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]); err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
if (opencl_ilu_parallel) {
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);
} else {
// fromOrder is not initialized, so use something else to fill s.rowIndices
// s.rowIndices[i] == i must hold, since every rowidx is mapped to itself (i.e. no actual mapping)
// rowsPerColorPrefix is misused here, it contains an increasing sequence (0, 1, 2, ...)
err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), rowsPerColorPrefix.data(), nullptr, &events[2]);
}
cl::WaitForEvents(events); cl::WaitForEvents(events);
events.clear(); events.clear();
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
@ -189,30 +181,9 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
{ {
const unsigned int bs = block_size; const unsigned int bs = block_size;
auto *matToDecompose = jacMat; auto *matToDecompose = jacMat ? jacMat : mat;
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
matToDecompose = jacMat ? jacMat : mat;
} else {
Timer t_reorder;
if (jacMat) {
matToDecompose = rJacMat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
} else {
matToDecompose = rmat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
}
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
}
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy; Timer t_copy;
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs); memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
@ -237,7 +208,6 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
std::call_once(pattern_uploaded, [&](){ std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block // find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) { for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row]; int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1]; int rowEnd = LUmat->rowPointers[row+1];
@ -269,11 +239,11 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
std::ostringstream out; std::ostringstream out;
for (int color = 0; color < numColors; ++color) { for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color]; const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1]; const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 5) { if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
} }
OpenclKernels::ILU_decomp(firstRow, lastRow, OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size); s.invDiagVals, rowsPerColor[color], block_size);
} }
@ -301,11 +271,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = 0; color < numColors; ++color) { for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor, s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor, s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#endif #endif
@ -313,11 +283,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = numColors - 1; color >= 0; --color) { for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor, s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor, s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#endif #endif
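Regarding the comment in analyze_matrix() that rowsPerColorPrefix is reused to fill s.rowIndices when opencl_ilu_parallel is false: an equivalent, more explicit construction of that identity mapping would be the following sketch (function name hypothetical).

    #include <numeric>
    #include <vector>

    // When the ILU is not parallelized, every row maps to itself, so the
    // rowIndices buffer only needs the sequence 0, 1, ..., Nb-1.
    std::vector<unsigned> identityRowIndices(int Nb)
    {
        std::vector<unsigned> rowIndices(Nb);
        std::iota(rowIndices.begin(), rowIndices.end(), 0u);
        return rowIndices;
    }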

View File

@ -23,7 +23,6 @@
#include <mutex> #include <mutex>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp> #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl/opencl.hpp> #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp> #include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
@ -36,7 +35,8 @@ namespace Accelerator
{ {
/// This class implements a Blocked ILU0 preconditioner /// This class implements a Blocked ILU0 preconditioner
/// The decomposition is done on CPU, and reorders the rows of the matrix /// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
/// The preconditioner is applied via two exact triangular solves
template <unsigned int block_size> template <unsigned int block_size>
class BILU0 : public Preconditioner<block_size> class BILU0 : public Preconditioner<block_size>
{ {
@ -54,8 +54,6 @@ class BILU0 : public Preconditioner<block_size>
private: private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr; std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
#if CHOW_PATEL #if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr; std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
#endif #endif
@ -67,15 +65,15 @@ private:
int numColors; int numColors;
std::once_flag pattern_uploaded; std::once_flag pattern_uploaded;
ILUReorder opencl_ilu_reorder; bool opencl_ilu_parallel;
std::vector<int> reordermappingNonzeroes; // maps nonzero blocks to new location in reordered matrix
std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
typedef struct { typedef struct {
cl::Buffer invDiagVals; cl::Buffer invDiagVals; // nnz values of diagonal blocks of the matrix, inverted
cl::Buffer diagIndex; cl::Buffer diagIndex; // index of diagonal block of each row, used to differentiate between lower and upper triangular part
cl::Buffer rowsPerColor; cl::Buffer rowsPerColor; // number of rows for every color
cl::Buffer rowIndices; // mapping every row to another index
// after mapping, all rows that are processed in parallel are contiguous
// equal to the contents of fromOrder
#if CHOW_PATEL #if CHOW_PATEL
cl::Buffer Lvals, Lcols, Lrows; cl::Buffer Lvals, Lcols, Lrows;
cl::Buffer Uvals, Ucols, Urows; cl::Buffer Uvals, Ucols, Urows;
@ -92,9 +90,9 @@ private:
public: public:
BILU0(ILUReorder opencl_ilu_reorder, int verbosity); BILU0(bool opencl_ilu_parallel, int verbosity);
// analysis, find reordering if specified // analysis, extract parallelism if specified
bool analyze_matrix(BlockedMatrix *mat) override; bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
@ -103,28 +101,10 @@ public:
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override; bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// apply preconditioner, x = prec(y) // apply preconditioner, x = prec(y)
// via Lz = y
// and Ux = z
void apply(const cl::Buffer& y, cl::Buffer& x) override; void apply(const cl::Buffer& y, cl::Buffer& x) override;
int* getToOrder() override
{
return toOrder.data();
}
int* getFromOrder() override
{
return fromOrder.data();
}
BlockedMatrix* getRMat() override
{
return rmat.get();
}
BlockedMatrix* getRJacMat()
{
return rJacMat.get();
}
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure() std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
{ {
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex}; return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};

View File

@ -42,13 +42,13 @@ using Opm::OpmLog;
using Dune::Timer; using Dune::Timer;
template <unsigned int block_size> template <unsigned int block_size>
BISAI<block_size>::BISAI(ILUReorder opencl_ilu_reorder_, int verbosity_) : BISAI<block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_) :
Preconditioner<block_size>(verbosity_) Preconditioner<block_size>(verbosity_)
{ {
#if CHOW_PATEL #if CHOW_PATEL
OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n"); OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
#endif #endif
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder_, verbosity_); bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel_, verbosity_);
} }
template <unsigned int block_size> template <unsigned int block_size>

View File

@ -70,7 +70,7 @@ private:
cl::Buffer d_invUvals; cl::Buffer d_invUvals;
cl::Buffer d_invL_x; cl::Buffer d_invL_x;
ILUReorder opencl_ilu_reorder; bool opencl_ilu_parallel;
std::unique_ptr<BILU0<block_size> > bilu0; std::unique_ptr<BILU0<block_size> > bilu0;
/// Struct that holds the structure of the small subsystems for each column /// Struct that holds the structure of the small subsystems for each column
@ -110,12 +110,12 @@ private:
void buildUpperSubsystemsStructures(); void buildUpperSubsystemsStructures();
public: public:
BISAI(ILUReorder opencl_ilu_reorder, int verbosity); BISAI(bool opencl_ilu_parallel, int verbosity);
// set own Opencl variables, but also that of the bilu0 preconditioner // set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override; void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
// analysis, find reordering if specified // analysis, extract parallelism
bool analyze_matrix(BlockedMatrix *mat) override; bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
@ -125,21 +125,6 @@ public:
// apply preconditioner, x = prec(y) // apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override; void apply(const cl::Buffer& y, cl::Buffer& x) override;
int* getToOrder() override
{
return bilu0->getToOrder();
}
int* getFromOrder() override
{
return bilu0->getFromOrder();
}
BlockedMatrix* getRMat() override
{
return bilu0->getRMat();
}
}; };
/// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion. /// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.

View File

@ -44,10 +44,10 @@ using Opm::OpmLog;
using Dune::Timer; using Dune::Timer;
template <unsigned int block_size> template <unsigned int block_size>
CPR<block_size>::CPR(int verbosity_, ILUReorder opencl_ilu_reorder_) : CPR<block_size>::CPR(int verbosity_, bool opencl_ilu_parallel_) :
Preconditioner<block_size>(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_) Preconditioner<block_size>(verbosity_), opencl_ilu_parallel(opencl_ilu_parallel_)
{ {
bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_); bilu0 = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
diagIndices.resize(1); diagIndices.resize(1);
} }
@ -71,12 +71,7 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
this->nnz = nnzb * block_size * block_size; this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_); bool success = bilu0->analyze_matrix(mat_);
mat = mat_;
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success; return success;
} }
@ -88,12 +83,8 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat)
this->nnz = nnzb * block_size * block_size; this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_, jacMat); bool success = bilu0->analyze_matrix(mat_, jacMat);
mat = mat_;
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success; return success;
} }

View File

@ -34,7 +34,6 @@
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp> #include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
#include <opm/simulators/linalg/bda/Matrix.hpp> #include <opm/simulators/linalg/bda/Matrix.hpp>
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp> #include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp> #include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
#include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp> #include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>
@ -98,7 +97,7 @@ private:
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar std::unique_ptr<openclSolverBackend<1> > coarse_solver; // coarse solver is scalar
ILUReorder opencl_ilu_reorder; // reordering strategy for ILU0 in coarse solver bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
// Analyze the AMG hierarchy build by Dune // Analyze the AMG hierarchy build by Dune
void analyzeHierarchy(); void analyzeHierarchy();
@ -122,7 +121,7 @@ private:
public: public:
CPR(int verbosity, ILUReorder opencl_ilu_reorder); CPR(int verbosity, bool opencl_ilu_parallel);
bool analyze_matrix(BlockedMatrix *mat) override; bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
@ -136,21 +135,6 @@ public:
bool create_preconditioner(BlockedMatrix *mat) override; bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override; bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
int* getToOrder() override
{
return bilu0->getToOrder();
}
int* getFromOrder() override
{
return bilu0->getFromOrder();
}
BlockedMatrix* getRMat() override
{
return bilu0->getRMat();
}
}; };
// solve A^T * x = b // solve A^T * x = b

View File

@ -756,9 +756,7 @@ void ChowPatelIlu<block_size>::decomposition(
for(int i = 0; i < Nb; ++i) { for(int i = 0; i < Nb; ++i) {
for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) { for(int k = Ut->rowPointers[i]; k < Ut->rowPointers[i+1]; ++k) {
int j = Ut->colIndices[k]; int j = Ut->colIndices[k];
if (j == i) { if (j != i) {
inverter(Ut->nnzValues + k * bs * bs, invDiagVals + i * bs * bs);
} else {
++ptr[j+1]; ++ptr[j+1];
} }
} }
@ -799,7 +797,6 @@ void ChowPatelIlu<block_size>::decomposition(
std::call_once(pattern_uploaded, [&](){ std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block // find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) { for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row]; int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row+1]; int rowEnd = LUmat->rowPointers[row+1];
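The loop shown above counts the off-diagonal entries per column of Ut in order to size the rows of its transpose; a standalone scalar sketch of that counting pass (simplified, function name hypothetical):

    #include <vector>

    // Count the off-diagonal entries in every column of a CSR pattern; a prefix sum
    // over `count` then gives the row pointers of the transposed pattern.
    std::vector<int> countTransposeEntries(int Nb,
                                           const std::vector<int>& rowPointers,
                                           const std::vector<int>& colIndices)
    {
        std::vector<int> count(Nb + 1, 0);
        for (int i = 0; i < Nb; ++i) {
            for (int k = rowPointers[i]; k < rowPointers[i + 1]; ++k) {
                const int j = colIndices[k];
                if (j != i)
                    ++count[j + 1];  // diagonal blocks are handled separately
            }
        }
        return count;
    }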

View File

@ -40,13 +40,13 @@ void Preconditioner<block_size>::setOpencl(std::shared_ptr<cl::Context>& context
} }
template <unsigned int block_size> template <unsigned int block_size>
std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder) { std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(PreconditionerType type, int verbosity, bool opencl_ilu_parallel) {
if (type == PreconditionerType::BILU0) { if (type == PreconditionerType::BILU0) {
return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_reorder, verbosity); return std::make_unique<Opm::Accelerator::BILU0<block_size> >(opencl_ilu_parallel, verbosity);
} else if (type == PreconditionerType::CPR) { } else if (type == PreconditionerType::CPR) {
return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_reorder); return std::make_unique<Opm::Accelerator::CPR<block_size> >(verbosity, opencl_ilu_parallel);
} else if (type == PreconditionerType::BISAI) { } else if (type == PreconditionerType::BISAI) {
return std::make_unique<Opm::Accelerator::BISAI<block_size> >(opencl_ilu_reorder, verbosity); return std::make_unique<Opm::Accelerator::BISAI<block_size> >(opencl_ilu_parallel, verbosity);
} else { } else {
OPM_THROW(std::logic_error, "Invalid PreconditionerType"); OPM_THROW(std::logic_error, "Invalid PreconditionerType");
} }
@ -63,11 +63,12 @@ bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[may
} }
#define INSTANTIATE_BDA_FUNCTIONS(n) \ #define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \ template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, bool); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \ template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \ template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *); template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2); INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3); INSTANTIATE_BDA_FUNCTIONS(3);
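A hedged usage example of the updated factory signature; it assumes PreconditionerType is the nested enum shown in Preconditioner.hpp and that the OpenCL context and queue are set separately via setOpencl().

    // Blocked ILU0 for 3x3 blocks, with the level-scheduled (parallel) ILU enabled.
    auto prec = Opm::Accelerator::Preconditioner<3>::create(
        Opm::Accelerator::Preconditioner<3>::PreconditionerType::BILU0,
        /*verbosity=*/1,
        /*opencl_ilu_parallel=*/true);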

View File

@ -21,7 +21,6 @@
#define OPM_PRECONDITIONER_HEADER_INCLUDED #define OPM_PRECONDITIONER_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/opencl/opencl.hpp> #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
namespace Opm namespace Opm
{ {
@ -58,7 +57,7 @@ public:
BISAI BISAI
}; };
static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, ILUReorder opencl_ilu_reorder); static std::unique_ptr<Preconditioner> create(PreconditionerType type, int verbosity, bool opencl_ilu_parallel);
virtual ~Preconditioner() = default; virtual ~Preconditioner() = default;
@ -78,13 +77,6 @@ public:
// the version with two params can be overloaded, if not, it will default to using the one param version // the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix *mat) = 0; virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat); virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
// get reordering mappings
virtual int* getToOrder() = 0;
virtual int* getFromOrder() = 0;
// get reordered matrix
virtual BlockedMatrix* getRMat() = 0;
}; };
} //namespace Accelerator } //namespace Accelerator

View File

@ -3,6 +3,7 @@
/// Here, L is it's own BSR matrix. /// Here, L is it's own BSR matrix.
/// Only used with ChowPatelIlu. /// Only used with ChowPatelIlu.
__kernel void ILU_apply1( __kernel void ILU_apply1(
__global const unsigned *rowIndices,
__global const double *LUvals, __global const double *LUvals,
__global const unsigned int *LUcols, __global const unsigned int *LUcols,
__global const unsigned int *LUrows, __global const unsigned int *LUrows,
@ -30,14 +31,16 @@ __kernel void ILU_apply1(
target_block_row += nodesPerColorPrefix[color]; target_block_row += nodesPerColorPrefix[color];
while(target_block_row < nodesPerColorPrefix[color+1]){ while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row]; unsigned row_idx = rowIndices[target_block_row];
const unsigned int last_block = LUrows[target_block_row+1];
const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs); unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0; double local_out = 0.0;
if(lane < num_active_threads){ if(lane < num_active_threads){
if(lane < bs){ if(lane < bs){
local_out = y[target_block_row*bs+lane]; local_out = y[row_idx*bs+lane];
} }
for(; block < last_block; block += num_blocks_per_warp){ for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c]; const double x_elem = x[LUcols[block]*bs + c];
@ -60,7 +63,7 @@ __kernel void ILU_apply1(
} }
if(lane < bs){ if(lane < bs){
const unsigned int row = target_block_row*bs + lane; const unsigned int row = row_idx*bs + lane;
x[row] = tmp[lane]; x[row] = tmp[lane];
} }

View File

@ -3,6 +3,7 @@
/// Here, L is inside a normal, square matrix. /// Here, L is inside a normal, square matrix.
/// In this case, diagIndex indicates where the rows of L end. /// In this case, diagIndex indicates where the rows of L end.
__kernel void ILU_apply1( __kernel void ILU_apply1(
__global const unsigned *rowIndices,
__global const double *LUvals, __global const double *LUvals,
__global const unsigned int *LUcols, __global const unsigned int *LUcols,
__global const unsigned int *LUrows, __global const unsigned int *LUrows,
@ -29,14 +30,17 @@ __kernel void ILU_apply1(
const unsigned int r = lane % bs; const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){ while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row]; unsigned row_idx = rowIndices[target_block_row];
const unsigned int last_block = diagIndex[target_block_row];
const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = diagIndex[row_idx];
unsigned int block = first_block + lane / (bs*bs); unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0; double local_out = 0.0;
if(lane < num_active_threads){ if(lane < num_active_threads){
if(lane < bs){ if(lane < bs){
local_out = y[target_block_row*bs+lane]; local_out = y[row_idx*bs+lane];
} }
for(; block < last_block; block += num_blocks_per_warp){ for(; block < last_block; block += num_blocks_per_warp){
const double x_elem = x[LUcols[block]*bs + c]; const double x_elem = x[LUcols[block]*bs + c];
@ -59,7 +63,7 @@ __kernel void ILU_apply1(
} }
if(lane < bs){ if(lane < bs){
const unsigned int row = target_block_row*bs + lane; const unsigned int row = row_idx*bs + lane;
x[row] = tmp[lane]; x[row] = tmp[lane];
} }
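A sequential host-side sketch of what this kernel computes for block size 1: rows are visited color by color, and rowIndices maps a position in the color ordering back to the actual matrix row (the identity mapping when the ILU is not parallelized). This is an illustration, not the production code path.

    #include <vector>

    // Forward substitution x = L^{-1} y over the level-scheduling colors (block size 1).
    // rowsPerColorPrefix[c] .. rowsPerColorPrefix[c+1] enumerate the positions of color c,
    // and rowIndices maps a position back to the matrix row it represents.
    void iluApply1Host(int numColors,
                       const std::vector<int>& rowsPerColorPrefix,
                       const std::vector<unsigned>& rowIndices,
                       const std::vector<int>& rowPointers,   // CSR pattern of the LU factor
                       const std::vector<int>& colIndices,
                       const std::vector<int>& diagIndex,     // position of the diagonal in each row
                       const std::vector<double>& vals,
                       const std::vector<double>& y,
                       std::vector<double>& x)
    {
        for (int color = 0; color < numColors; ++color) {
            for (int pos = rowsPerColorPrefix[color]; pos < rowsPerColorPrefix[color + 1]; ++pos) {
                const unsigned row = rowIndices[pos];
                double sum = y[row];
                for (int idx = rowPointers[row]; idx < diagIndex[row]; ++idx) // strictly lower part
                    sum -= vals[idx] * x[colIndices[idx]];
                x[row] = sum;                                 // L has a unit diagonal in ILU0
            }
        }
    }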

View File

@ -3,6 +3,7 @@
/// Here, U is it's own BSR matrix. /// Here, U is it's own BSR matrix.
/// Only used with ChowPatelIlu. /// Only used with ChowPatelIlu.
__kernel void ILU_apply2( __kernel void ILU_apply2(
__global const unsigned *rowIndices,
__global const double *LUvals, __global const double *LUvals,
__global const int *LUcols, __global const int *LUcols,
__global const int *LUrows, __global const int *LUrows,
@ -29,14 +30,15 @@ __kernel void ILU_apply2(
const unsigned int r = lane % bs; const unsigned int r = lane % bs;
while(target_block_row < nodesPerColorPrefix[color+1]){ while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = LUrows[target_block_row]; unsigned row_idx = rowIndices[target_block_row];
const unsigned int last_block = LUrows[target_block_row+1]; const unsigned int first_block = LUrows[row_idx];
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs); unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0; double local_out = 0.0;
if(lane < num_active_threads){ if(lane < num_active_threads){
if(lane < bs){ if(lane < bs){
const unsigned int row = target_block_row*bs+lane; const unsigned int row = row_idx*bs+lane;
local_out = x[row]; local_out = x[row];
} }
for(; block < last_block; block += num_blocks_per_warp){ for(; block < last_block; block += num_blocks_per_warp){
@ -64,10 +66,10 @@ __kernel void ILU_apply2(
tmp[lane + bs*idx_t/warpsize] = local_out; tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0; double sum = 0.0;
for(int i = 0; i < bs; ++i){ for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
} }
const unsigned int row = target_block_row*bs + lane; const unsigned int row = row_idx*bs + lane;
x[row] = sum; x[row] = sum;
} }

View File

@ -3,6 +3,7 @@
/// Here, U is inside a normal, square matrix. /// Here, U is inside a normal, square matrix.
/// In this case diagIndex indicates where the rows of U start. /// In this case diagIndex indicates where the rows of U start.
__kernel void ILU_apply2( __kernel void ILU_apply2(
__global const unsigned *rowIndices,
__global const double *LUvals, __global const double *LUvals,
__global const int *LUcols, __global const int *LUcols,
__global const int *LUrows, __global const int *LUrows,
@ -30,14 +31,15 @@ __kernel void ILU_apply2(
target_block_row += nodesPerColorPrefix[color]; target_block_row += nodesPerColorPrefix[color];
while(target_block_row < nodesPerColorPrefix[color+1]){ while(target_block_row < nodesPerColorPrefix[color+1]){
const unsigned int first_block = diagIndex[target_block_row] + 1; unsigned row_idx = rowIndices[target_block_row];
const unsigned int last_block = LUrows[target_block_row+1]; const unsigned int first_block = diagIndex[row_idx] + 1;
const unsigned int last_block = LUrows[row_idx+1];
unsigned int block = first_block + lane / (bs*bs); unsigned int block = first_block + lane / (bs*bs);
double local_out = 0.0; double local_out = 0.0;
if(lane < num_active_threads){ if(lane < num_active_threads){
if(lane < bs){ if(lane < bs){
const unsigned int row = target_block_row*bs+lane; const unsigned int row = row_idx*bs+lane;
local_out = x[row]; local_out = x[row];
} }
for(; block < last_block; block += num_blocks_per_warp){ for(; block < last_block; block += num_blocks_per_warp){
@ -65,10 +67,10 @@ __kernel void ILU_apply2(
tmp[lane + bs*idx_t/warpsize] = local_out; tmp[lane + bs*idx_t/warpsize] = local_out;
double sum = 0.0; double sum = 0.0;
for(int i = 0; i < bs; ++i){ for(int i = 0; i < bs; ++i){
sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize]; sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
} }
const unsigned int row = target_block_row*bs + lane; const unsigned int row = row_idx*bs + lane;
x[row] = sum; x[row] = sum;
} }


@ -68,6 +68,7 @@ __kernel void inverter(__global double *matrix, __global double *inverse)
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition /// The kernel takes a full BSR matrix and performs inplace ILU decomposition
__kernel void ilu_decomp(const unsigned int firstRow, __kernel void ilu_decomp(const unsigned int firstRow,
const unsigned int lastRow, const unsigned int lastRow,
__global const unsigned *rowIndices,
__global double *LUvals, __global double *LUvals,
__global const int *LUcols, __global const int *LUcols,
__global const int *LUrows, __global const int *LUrows,
@ -91,14 +92,15 @@ __kernel void ilu_decomp(const unsigned int firstRow,
// go through all rows // go through all rows
for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group) for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
{ {
int iRowStart = LUrows[i]; const unsigned row = rowIndices[i];
int iRowEnd = LUrows[i + 1]; int iRowStart = LUrows[row];
int iRowEnd = LUrows[row + 1];
// go through all elements of the row // go through all elements of the row
for (int ij = iRowStart; ij < iRowEnd; ij++) { for (int ij = iRowStart; ij < iRowEnd; ij++) {
int j = LUcols[ij]; int j = LUcols[ij];
if (j < i) { if (j < row) {
// calculate the pivot of this row // calculate the pivot of this row
block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset); block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);
@ -127,11 +129,11 @@ __kernel void ilu_decomp(const unsigned int firstRow,
} }
// store the inverse in the diagonal // store the inverse in the diagonal
inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs); inverter(LUvals + diagIndex[row] * bs * bs, invDiagVals + row * bs * bs);
// copy inverse // copy inverse
if (thread_id_in_hwarp < bs * bs) { if (thread_id_in_hwarp < bs * bs) {
LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp]; LUvals[diagIndex[row] * bs * bs + thread_id_in_hwarp] = invDiagVals[row * bs * bs + thread_id_in_hwarp];
} }
} }
} }


@ -1,5 +1,4 @@
/// In this kernel there is reordering: the B/Ccols do not correspond with the x/y vector /// Applies stdwells
/// the x/y vector is reordered, using toOrder to address that
__kernel void stdwell_apply( __kernel void stdwell_apply(
__global const double *Cnnzs, __global const double *Cnnzs,
__global const double *Dnnzs, __global const double *Dnnzs,
@ -8,7 +7,6 @@ __kernel void stdwell_apply(
__global const int *Bcols, __global const int *Bcols,
__global const double *x, __global const double *x,
__global double *y, __global double *y,
__global const int *toOrder,
const unsigned int dim, const unsigned int dim,
const unsigned int dim_wells, const unsigned int dim_wells,
__global const unsigned int *val_pointers, __global const unsigned int *val_pointers,
@ -32,7 +30,7 @@ __kernel void stdwell_apply(
if(wiId < numActiveWorkItems){ if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId]; int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){ while(b < valSize + val_pointers[wgId]){
int colIdx = toOrder[Bcols[b]]; int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp; b += numBlocksPerWarp;
} }
@ -78,7 +76,7 @@ __kernel void stdwell_apply(
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
} }
int colIdx = toOrder[Ccols[bb]]; int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp; y[colIdx*dim + c] -= temp;
} }
} }


@ -1,82 +0,0 @@
/// Applies stdwells without reordering
__kernel void stdwell_apply_no_reorder(
__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2)
{
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
// merge all blocks in this workgroup into 1 block
// if numBlocksPerWarp >= 3, should use loop
// block 1: block 2:
// 0 1 2 12 13 14
// 3 4 5 15 16 17
// 6 7 8 18 19 20
// 9 10 11 21 22 23
// workitem i will hold the sum of workitems i and i + valsPerBlock
if(wiId < valsPerBlock){
for (int i = 1; i < numBlocksPerWarp; ++i) {
localSum[wiId] += localSum[wiId + i*valsPerBlock];
}
}
if(c == 0 && wiId < valsPerBlock){
for(unsigned int i = dim - 1; i > 0; --i){
localSum[wiId] += localSum[wiId + i];
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp;
}
}


@ -61,7 +61,6 @@ std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k; std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k; std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k; std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k; std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k; std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k; std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
@ -106,7 +105,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
sources.emplace_back(ILU_apply2_fm_str); sources.emplace_back(ILU_apply2_fm_str);
#endif #endif
sources.emplace_back(stdwell_apply_str); sources.emplace_back(stdwell_apply_str);
sources.emplace_back(stdwell_apply_no_reorder_str);
sources.emplace_back(ILU_decomp_str); sources.emplace_back(ILU_decomp_str);
sources.emplace_back(isaiL_str); sources.emplace_back(isaiL_str);
sources.emplace_back(isaiU_str); sources.emplace_back(isaiU_str);
@ -136,7 +134,6 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1"))); ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2"))); ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply"))); stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp"))); ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL"))); isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU"))); isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
@ -390,7 +387,7 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
} }
} }
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color, cl::Buffer& rowsPerColor, int color,
@ -403,8 +400,8 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
Timer t_ilu_apply1; Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, diagIndex, y, x, rowIndices, vals, cols, rows, diagIndex,
rowsPerColor, color, block_size, y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
if (verbosity >= 5) { if (verbosity >= 5) {
@ -415,7 +412,7 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
} }
} }
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color, cl::Buffer& rowsPerColor, int color,
@ -428,8 +425,8 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
Timer t_ilu_apply2; Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, diagIndex, invDiagVals, rowIndices, vals, cols, rows, diagIndex,
x, rowsPerColor, color, block_size, invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
if (verbosity >= 5) { if (verbosity >= 5) {
@ -440,7 +437,7 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
} }
} }
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size) int rowsThisColor, unsigned int block_size)
@ -453,7 +450,8 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
Timer t_ilu_decomp; Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, vals, cols, rows, firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor, invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
@ -465,29 +463,7 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
} }
} }
void OpenclKernels::apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels apply_stdwells() time: " << t_apply_stdwells.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y, cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells) int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{ {
@ -497,7 +473,7 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
const unsigned int lmem2 = sizeof(double) * dim_wells; const unsigned int lmem2 = sizeof(double) * dim_wells;
Timer t_apply_stdwells; Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_no_reorder_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl, d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));


@ -39,19 +39,15 @@ using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>; cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>; cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&, using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>; cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>; cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>; cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>; cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>; cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
@ -85,7 +81,6 @@ private:
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k; static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k; static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k; static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k; static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
static std::unique_ptr<isaiL_kernel_type> isaiL_k; static std::unique_ptr<isaiL_kernel_type> isaiL_k;
static std::unique_ptr<isaiU_kernel_type> isaiU_k; static std::unique_ptr<isaiU_kernel_type> isaiU_k;
@ -116,7 +111,6 @@ public:
static const std::string ILU_apply2_fm_str; static const std::string ILU_apply2_fm_str;
#endif #endif
static const std::string stdwell_apply_str; static const std::string stdwell_apply_str;
static const std::string stdwell_apply_no_reorder_str;
static const std::string ILU_decomp_str; static const std::string ILU_decomp_str;
static const std::string isaiL_str; static const std::string isaiL_str;
static const std::string isaiU_str; static const std::string isaiU_str;
@ -135,20 +129,16 @@ public:
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false); static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size); static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, static void ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, static void ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size); cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, static void apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
cl::Buffer &d_toOrder, int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
static void apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y, cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells); int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells);
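For orientation, a minimal host-side sketch of calling the updated wrappers (buffer and variable names are illustrative, not taken from this change; the parameters mirror the declarations above, with the row-mapping buffer now passed first):

    // hedged sketch: L-solve and U-solve for one color with the new rowIndices buffer
    OpenclKernels::ILU_apply1(d_rowIndices, d_LUvals, d_LUcols, d_LUrows, d_diagIndex,
                              d_y, d_x, d_rowsPerColor, color, Nb, block_size);
    OpenclKernels::ILU_apply2(d_rowIndices, d_LUvals, d_LUcols, d_LUrows, d_diagIndex,
                              d_invDiagVals, d_x, d_rowsPerColor, color, Nb, block_size);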


@ -31,7 +31,6 @@
#include <opm/simulators/linalg/bda/opencl/openclWellContributions.hpp> #include <opm/simulators/linalg/bda/opencl/openclWellContributions.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
// iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done // iff true, the nonzeroes of the matrix are copied row-by-row into a contiguous, pinned memory array, then a single GPU memcpy is done
@ -47,7 +46,7 @@ using Opm::OpmLog;
using Dune::Timer; using Dune::Timer;
template <unsigned int block_size> template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) { openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, bool opencl_ilu_parallel_, std::string linsolver) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_parallel(opencl_ilu_parallel_) {
bool use_cpr, use_isai; bool use_cpr, use_isai;
@ -68,11 +67,11 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType; using PreconditionerType = typename Preconditioner<block_size>::PreconditionerType;
if (use_cpr) { if (use_cpr) {
prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_reorder); prec = Preconditioner<block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_parallel);
} else if (use_isai) { } else if (use_isai) {
prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_reorder); prec = Preconditioner<block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_parallel);
} else { } else {
prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_reorder); prec = Preconditioner<block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_parallel);
} }
std::ostringstream out; std::ostringstream out;
@ -219,11 +218,11 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
} }
template <unsigned int block_size> template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder_) : openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, bool opencl_ilu_parallel_) :
BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_reorder(opencl_ilu_reorder_) BdaSolver<block_size>(verbosity_, maxit_, tolerance_), opencl_ilu_parallel(opencl_ilu_parallel_)
{ {
// prec = std::make_unique<BILU0<block_size> >(opencl_ilu_reorder, verbosity_); // prec = std::make_unique<BILU0<block_size> >(opencl_ilu_parallel, verbosity_);
// cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_reorder, /*use_amg=*/false); // cpr = std::make_unique<CPR<block_size> >(verbosity_, opencl_ilu_parallel, /*use_amg=*/false);
} }
template <unsigned int block_size> template <unsigned int block_size>
@ -232,12 +231,6 @@ void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& co
queue = queue_; queue = queue_;
} }
template <unsigned int block_size>
openclSolverBackend<block_size>::~openclSolverBackend() {
finalize();
}
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@ -314,7 +307,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions // apply wellContributions
if(wellContribs.getNumWells() > 0){ if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder); static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v);
} }
if(verbosity >= 3) { if(verbosity >= 3) {
queue->finish(); queue->finish();
@ -359,7 +352,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions // apply wellContributions
if(wellContribs.getNumWells() > 0){ if(wellContribs.getNumWells() > 0){
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder); static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t);
} }
if (verbosity >= 3) { if (verbosity >= 3) {
queue->finish(); queue->finish();
@ -439,10 +432,8 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
prec->setOpencl(context, queue); prec->setOpencl(context, queue);
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
vals_contiguous = new double[N]; vals_contiguous.resize(nnz);
#endif #endif
// mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
// jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
mat = matrix; mat = matrix;
jacMat = jacMatrix; jacMat = jacMatrix;
@ -462,12 +453,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
if (reorder) {
rb = new double[N];
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
} catch (const cl::Error& error) { } catch (const cl::Error& error) {
std::ostringstream oss; std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -482,13 +467,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
initialized = true; initialized = true;
} // end initialize() } // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
} // end finalize()
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() { void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t; Timer t;
@ -497,23 +475,20 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
int sum = 0; int sum = 0;
for (int i = 0; i < Nb; ++i) { for (int i = 0; i < Nb; ++i) {
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i]; int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size); memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size; sum += size_row * block_size * block_size;
} }
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]); err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
#else #else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]); err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif #endif
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]); err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, mat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]); err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), mat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]); err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]); err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
if (opencl_ilu_reorder != ILUReorder::NONE) {
events.resize(6);
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
}
cl::WaitForEvents(events); cl::WaitForEvents(events);
events.clear(); events.clear();
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
@ -537,17 +512,18 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
#if COPY_ROW_BY_ROW #if COPY_ROW_BY_ROW
int sum = 0; int sum = 0;
for (int i = 0; i < Nb; ++i) { for (int i = 0; i < Nb; ++i) {
int size_row = rmat->rowPointers[i + 1] - rmat->rowPointers[i]; int size_row = mat->rowPointers[i + 1] - mat->rowPointers[i];
memcpy(vals_contiguous.data() + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size); memcpy(vals_contiguous.data() + sum, mat->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size; sum += size_row * block_size * block_size;
} }
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]); err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous.data(), nullptr, &events[0]);
#else #else
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]); err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, mat->nnzValues, nullptr, &events[0]);
#endif #endif
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]); err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]); err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
cl::WaitForEvents(events); cl::WaitForEvents(events);
events.clear(); events.clear();
if (err != CL_SUCCESS) { if (err != CL_SUCCESS) {
@ -573,18 +549,6 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
else else
success = prec->analyze_matrix(mat.get()); success = prec->analyze_matrix(mat.get());
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
} else {
// toOrder = bilu0->getToOrder();
// fromOrder = bilu0->getFromOrder();
// rmat = bilu0->getRMat();
toOrder = prec->getToOrder();
fromOrder = prec->getFromOrder();
rmat = prec->getRMat();
}
if (verbosity > 2) { if (verbosity > 2) {
std::ostringstream out; std::ostringstream out;
out << "openclSolver::analyze_matrix(): " << t.stop() << " s"; out << "openclSolver::analyze_matrix(): " << t.stop() << " s";
@ -598,17 +562,11 @@ bool openclSolverBackend<block_size>::analyze_matrix() {
template <unsigned int block_size> template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) { void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
Timer t; Timer t;
mat->nnzValues = vals; mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) { h_b = b;
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
} else {
rb = b;
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
}
if (verbosity > 2) { if (verbosity > 2) {
std::ostringstream out; std::ostringstream out;
@ -670,12 +628,7 @@ template <unsigned int block_size>
void openclSolverBackend<block_size>::get_result(double *x) { void openclSolverBackend<block_size>::get_result(double *x) {
Timer t; Timer t;
if (opencl_ilu_reorder != ILUReorder::NONE) { queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
} else {
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
}
if (verbosity > 2) { if (verbosity > 2) {
std::ostringstream out; std::ostringstream out;
@ -699,13 +652,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
} }
} }
update_system(matrix->nnzValues, b, wellContribs); update_system(matrix->nnzValues, b);
if (!create_preconditioner()) { if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
} }
copy_system_to_gpu(); copy_system_to_gpu();
} else { } else {
update_system(matrix->nnzValues, b, wellContribs); update_system(matrix->nnzValues, b);
if (!create_preconditioner()) { if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
} }
@ -718,8 +671,8 @@ SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<Block
#define INSTANTIATE_BDA_FUNCTIONS(n) \ #define INSTANTIATE_BDA_FUNCTIONS(n) \
template openclSolverBackend<n>::openclSolverBackend( \ template openclSolverBackend<n>::openclSolverBackend( \
int, int, double, unsigned int, unsigned int, ILUReorder, std::string); \ int, int, double, unsigned int, unsigned int, bool, std::string); \
template openclSolverBackend<n>::openclSolverBackend(int, int, double, ILUReorder); \ template openclSolverBackend<n>::openclSolverBackend(int, int, double, bool); \
template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); template void openclSolverBackend<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(1);


@ -23,7 +23,6 @@
#include <opm/simulators/linalg/bda/opencl/opencl.hpp> #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp> #include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolver.hpp> #include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp> #include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp> #include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
@ -51,15 +50,14 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized; using Base::initialized;
private: private:
double *rb = nullptr; // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b double *h_b = nullptr; // b vector, on host
std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp std::vector<double> vals_contiguous; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
// OpenCL variables must be reusable, they are initialized in initialize() // OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU cl::Buffer d_Avals, d_Acols, d_Arows; // matrix in BSR format on GPU
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm() cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder; // only used when reordering is used
std::vector<cl::Device> devices; std::vector<cl::Device> devices;
@ -68,65 +66,13 @@ private:
std::unique_ptr<Preconditioner<block_size> > prec; std::unique_ptr<Preconditioner<block_size> > prec;
// can perform blocked ILU0 and AMG on pressure component // can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false; bool analysis_done = false;
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv bool opencl_ilu_parallel; // parallelize ILU operations (with level_scheduling)
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events; std::vector<cl::Event> events;
cl_int err; cl_int err;
/// Divide A by B, and round up: return (int)ceil(A/B)
/// \param[in] A dividend
/// \param[in] B divisor
/// \return rounded division result
unsigned int ceilDivision(const unsigned int A, const unsigned int B);
/// Calculate dot product between in1 and in2, partial sums are stored in out, which are summed on CPU
/// \param[in] in1 input vector 1
/// \param[in] in2 input vector 2
/// \param[out] out output vector containing partial sums
/// \return dot product
double dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out);
/// Calculate the norm of in, partial sums are stored in out, which are summed on the CPU
/// Equal to Dune::DenseVector::two_norm()
/// \param[in] in input vector
/// \param[out] out output vector containing partial sums
/// \return norm
double norm_w(cl::Buffer in, cl::Buffer out);
/// Perform axpy: out += a * in
/// \param[in] in input vector
/// \param[in] a scalar value to multiply input vector
/// \param[inout] out output vector
void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
/// Perform scale: vec *= a
/// \param[inout] vec vector to scale
/// \param[in] a scalar value to multiply vector
void scale_w(cl::Buffer vec, const double a);
/// Custom function that combines scale, axpy and add functions in bicgstab
/// p = (p - omega * v) * beta + r
/// \param[inout] p output vector
/// \param[in] v input vector
/// \param[in] r input vector
/// \param[in] omega scalar value
/// \param[in] beta scalar value
void custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta);
/// Sparse matrix-vector multiply, spmv
/// b = A * x
/// Matrix A, must be in BCRS format
/// \param[in] vals nnzs of matrix A
/// \param[in] cols columnindices of matrix A
/// \param[in] rows rowpointers of matrix A
/// \param[in] x input vector
/// \param[out] b output vector
void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
/// Solve linear system using ilu0-bicgstab /// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result /// \param[inout] res summary of solver result
@ -137,17 +83,13 @@ private:
/// \param[in] jacMatrix matrix for preconditioner /// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix); void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Clean memory
void finalize();
/// Copy linear system to GPU /// Copy linear system to GPU
void copy_system_to_gpu(); void copy_system_to_gpu();
/// Reorder the linear system so it corresponds with the coloring /// Reassign pointers, in case the addresses of the Dune variables have changed
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vectors, contains N values /// \param[in] b input vector b, contains N values
/// \param[out] wellContribs WellContributions, to set reordering void update_system(double *vals, double *b);
void update_system(double *vals, double *b, WellContributions &wellContribs);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu(); void update_system_on_gpu();
@ -156,12 +98,13 @@ private:
/// \return true iff analysis was successful /// \return true iff analysis was successful
bool analyze_matrix(); bool analyze_matrix();
/// Perform ilu0-decomposition /// Create the preconditioner, only done once per linear solve
/// \return true iff decomposition was successful /// \return true iff decomposition was successful
bool create_preconditioner(); bool create_preconditioner();
/// Solve linear system /// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// could be empty
/// \param[inout] res summary of solver result /// \param[inout] res summary of solver result
void solve_system(WellContributions &wellContribs, BdaResult &res); void solve_system(WellContributions &wellContribs, BdaResult &res);
@ -175,17 +118,14 @@ public:
/// \param[in] tolerance required relative tolerance for openclSolver /// \param[in] tolerance required relative tolerance for openclSolver
/// \param[in] platformID the OpenCL platform to be used /// \param[in] platformID the OpenCL platform to be used
/// \param[in] deviceID the device to be used /// \param[in] deviceID the device to be used
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see Reorder.hpp for explanation /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument /// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
/// only ilu0, cpr_quasiimpes and isai are supported /// only ilu0, cpr_quasiimpes and isai are supported
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID,
ILUReorder opencl_ilu_reorder, std::string linsolver); bool opencl_ilu_parallel, std::string linsolver);
/// For the CPR coarse solver /// For the CPR coarse solver
openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, bool opencl_ilu_parallel);
/// Destroy an openclSolver, and free memory
~openclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] matrix matrix A /// \param[in] matrix matrix A
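As a usage sketch only (argument values, the block size of 3, and any required includes or namespace qualification are assumptions, not part of this change), constructing the backend with the new flag could look like:

    // hypothetical construction; the bool replaces the former ILUReorder argument
    auto solver = std::make_unique<openclSolverBackend<3> >(
        /*linear_solver_verbosity=*/1, /*maxit=*/200, /*tolerance=*/1e-2,
        /*platformID=*/0, /*deviceID=*/0,
        /*opencl_ilu_parallel=*/true, /*linsolver=*/"ilu0");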


@ -37,20 +37,10 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue*
this->queue = queue_; this->queue = queue_;
} }
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
if (reorder) { OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
} }
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
@ -67,7 +57,6 @@ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
// actually apply MultisegmentWells // actually apply MultisegmentWells
for (auto& well : multisegments) { for (auto& well : multisegments) {
well->setReordering(h_toOrder, reorder);
well->apply(h_x.data(), h_y.data()); well->apply(h_x.data(), h_y.data());
} }
@ -78,9 +67,9 @@ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
events.clear(); events.clear();
} }
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y){
if(num_std_wells > 0){ if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder); apply_stdwells(d_x, d_y);
} }
if(num_ms_wells > 0){ if(num_ms_wells > 0){


@ -37,14 +37,9 @@ class WellContributionsOCL : public WellContributions
public: public:
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata are incorrect void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y);
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int* toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y); void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); void apply(cl::Buffer d_x, cl::Buffer d_y);
protected: protected:
/// Allocate memory for the StandardWells /// Allocate memory for the StandardWells
@ -60,8 +55,6 @@ protected:
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl; std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl; std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool reorder = false;
int *h_toOrder = nullptr;
std::vector<double> h_x; std::vector<double> h_x;
std::vector<double> h_y; std::vector<double> h_y;
}; };
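A short host-side sketch of the simplified apply path (the context, queue and device buffers are illustrative; the create() call matches the unit tests further below):

    // d_toOrder is gone: Bcols/Ccols now index x and y directly
    auto wellContribs = Opm::WellContributions::create("opencl", false);
    auto& wcOCL = static_cast<WellContributionsOCL&>(*wellContribs);
    wcOCL.setOpenCLEnv(context.get(), queue.get()); // cl::Context*, cl::CommandQueue*
    wcOCL.apply(d_x, d_y);                          // previously apply(d_x, d_y, d_toOrder)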


@ -100,7 +100,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
const int linear_solver_verbosity = prm.get<int>("verbosity"); const int linear_solver_verbosity = prm.get<int>("verbosity");
const int maxit = prm.get<int>("maxiter"); const int maxit = prm.get<int>("maxiter");
const double tolerance = prm.get<double>("tol"); const double tolerance = prm.get<double>("tol");
const std::string opencl_ilu_reorder("none"); // unused const bool opencl_ilu_parallel(true); // unused
const int platformID = 0; // unused const int platformID = 0; // unused
const int deviceID = 0; const int deviceID = 0;
const std::string accelerator_mode("cusparse"); const std::string accelerator_mode("cusparse");
@ -113,7 +113,15 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
auto wellContribs = Opm::WellContributions::create("cusparse", false); auto wellContribs = Opm::WellContributions::create("cusparse", false);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge; std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try { try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver); bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,
fpga_bitstream,
linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory // matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result); bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);


@ -93,29 +93,42 @@ getDuneSolution(Matrix<bz>& matrix, Vector<bz>& rhs)
} }
template <int bz> template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>> void
testOpenclSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vector<bz>& rhs) createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge)
{ {
const int linear_solver_verbosity = prm.get<int>("verbosity"); const int linear_solver_verbosity = prm.get<int>("verbosity");
const int maxit = prm.get<int>("maxiter"); const int maxit = prm.get<int>("maxiter");
const double tolerance = prm.get<double>("tol"); const double tolerance = prm.get<double>("tol");
const std::string opencl_ilu_reorder("none"); const bool opencl_ilu_parallel(true);
const int platformID = 0; const int platformID = 0;
const int deviceID = 0; const int deviceID = 0;
const std::string accelerator_mode("opencl"); const std::string accelerator_mode("opencl");
const std::string fpga_bitstream("empty"); // unused const std::string fpga_bitstream("empty"); // unused
const std::string linsolver("ilu0"); const std::string linsolver("ilu0");
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try { try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver); bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,
fpga_bitstream,
linear_solver_verbosity,
maxit,
tolerance,
platformID,
deviceID,
opencl_ilu_parallel,
linsolver);
} catch (const std::logic_error& error) { } catch (const std::logic_error& error) {
BOOST_WARN_MESSAGE(true, error.what()); BOOST_WARN_MESSAGE(true, error.what());
throw PlatformInitException(error.what()); throw PlatformInitException(error.what());
} }
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testOpenclSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory // matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result); bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
@ -124,6 +137,23 @@ testOpenclSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vec
return x; return x;
} }
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testOpenclSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
// the sparsity pattern and values are actually the same
bridge->solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
bridge->get_result(x);
return x;
}
namespace pt = boost::property_tree; namespace pt = boost::property_tree;
void test3(const pt::ptree& prm) void test3(const pt::ptree& prm)
@ -131,17 +161,31 @@ void test3(const pt::ptree& prm)
const int bz = 3; const int bz = 3;
Matrix<bz> matrix; Matrix<bz> matrix;
Vector<bz> rhs; Vector<bz> rhs;
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
readLinearSystem("matr33.txt", "rhs3.txt", matrix, rhs); readLinearSystem("matr33.txt", "rhs3.txt", matrix, rhs);
Vector<bz> rhs2 = rhs; // deep copy, getDuneSolution() changes values in rhs vector Vector<bz> rhs2 = rhs; // deep copy, getDuneSolution() changes values in rhs vector
auto duneSolution = getDuneSolution<bz>(matrix, rhs); auto duneSolution = getDuneSolution<bz>(matrix, rhs);
auto sol = testOpenclSolver<bz>(prm, matrix, rhs2);
createBridge(prm, bridge); // create bridge with openclSolver
// should create bridge only once
// test openclSolver without Jacobi matrix
auto sol = testOpenclSolver<bz>(bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size()); BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
for (size_t i = 0; i < sol.size(); ++i) { for (size_t i = 0; i < sol.size(); ++i) {
for (int row = 0; row < bz; ++row) { for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(sol[i][row], duneSolution[i][row], 1e-3); BOOST_CHECK_CLOSE(sol[i][row], duneSolution[i][row], 1e-3);
} }
} }
// test openclSolver with Jacobi matrix
auto solJacobi = testOpenclSolverJacobi<bz>(bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
for (size_t i = 0; i < solJacobi.size(); ++i) {
for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(solJacobi[i][row], duneSolution[i][row], 1e-3);
}
}
} }