mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #3815 from Tongdongq/subdomain-matrix-for-opencl
Subdomain matrix for opencl ILU preconditioner
This commit is contained in:
@@ -161,7 +161,6 @@ namespace Opm
|
||||
// Set it up manually
|
||||
ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout());
|
||||
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
|
||||
|
||||
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
||||
#if HAVE_FPGA
|
||||
// check usage of MatrixAddWellContributions: for FPGA they must be included
|
||||
@@ -212,6 +211,17 @@ namespace Opm
|
||||
// to the original one with a deleter that does nothing.
|
||||
// Outch! We need to be able to scale the linear system! Hence const_cast
|
||||
matrix_ = const_cast<Matrix*>(&M.istlMatrix());
|
||||
|
||||
// setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
|
||||
numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
|
||||
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
||||
if (numJacobiBlocks_ > 1) {
|
||||
const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
|
||||
detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_,
|
||||
wellConnectionsGraph_, numJacobiBlocks_);
|
||||
std::cout << "Create block-Jacobi pattern" << std::endl;
|
||||
blockJacobiAdjacency();
|
||||
}
|
||||
} else {
|
||||
// Pointers should not change
|
||||
if ( &(M.istlMatrix()) != matrix_ ) {
|
||||
@@ -264,7 +274,7 @@ namespace Opm
|
||||
if (use_gpu || use_fpga) {
|
||||
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||
auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
|
||||
bdaBridge->initWellContributions(*wellContribs);
|
||||
bdaBridge->initWellContributions(*wellContribs, x.N() * x[0].N());
|
||||
|
||||
// the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
|
||||
#if HAVE_CUDA || HAVE_OPENCL
|
||||
@@ -273,8 +283,15 @@ namespace Opm
|
||||
}
|
||||
#endif
|
||||
|
||||
if (numJacobiBlocks_ > 1) {
|
||||
copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_);
|
||||
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
|
||||
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
|
||||
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), &*blockJacobiForGPUILU0_,
|
||||
numJacobiBlocks_, *rhs_, *wellContribs, result);
|
||||
}
|
||||
else
|
||||
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), const_cast<Matrix*>(&getMatrix()),
|
||||
numJacobiBlocks_, *rhs_, *wellContribs, result);
|
||||
if (result.converged) {
|
||||
// get result vector x from non-Dune backend, iff solve was successful
|
||||
bdaBridge->get_result(x);
|
||||
@@ -497,6 +514,68 @@ namespace Opm
|
||||
}
|
||||
}
|
||||
|
||||
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
|
||||
/// Do not initialize the values, that is done in copyMatToBlockJac()
|
||||
void blockJacobiAdjacency()
|
||||
{
|
||||
const auto& grid = simulator_.vanguard().grid();
|
||||
std::vector<int> cell_part = simulator_.vanguard().cellPartition();
|
||||
|
||||
typedef typename Matrix::size_type size_type;
|
||||
typedef typename Matrix::CreateIterator Iter;
|
||||
size_type numCells = grid.size( 0 );
|
||||
blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, getMatrix().nonzeroes(), Matrix::row_wise));
|
||||
|
||||
const auto& lid = grid.localIdSet();
|
||||
const auto& gridView = grid.leafGridView();
|
||||
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
|
||||
|
||||
//Loop over cells
|
||||
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
|
||||
{
|
||||
const auto& elem = *elemIt;
|
||||
size_type idx = lid.id(elem);
|
||||
row.insert(idx);
|
||||
|
||||
// Add well non-zero connections
|
||||
for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) {
|
||||
row.insert(*wc);
|
||||
}
|
||||
|
||||
int locPart = cell_part[idx];
|
||||
|
||||
//Add neighbor if it is on the same part
|
||||
auto isend = gridView.iend(elem);
|
||||
for (auto is = gridView.ibegin(elem); is!=isend; ++is)
|
||||
{
|
||||
//check if face has neighbor
|
||||
if (is->neighbor())
|
||||
{
|
||||
size_type nid = lid.id(is->outside());
|
||||
int nabPart = cell_part[nid];
|
||||
if (locPart == nabPart) {
|
||||
row.insert(nid);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
|
||||
{
|
||||
auto rbegin = blockJac.begin();
|
||||
auto rend = blockJac.end();
|
||||
auto outerRow = mat.begin();
|
||||
for (auto row = rbegin; row != rend; ++row, ++outerRow) {
|
||||
auto outerCol = (*outerRow).begin();
|
||||
for (auto col = (*row).begin(); col != (*row).end(); ++col) {
|
||||
// outerRow is guaranteed to have all column entries that row has!
|
||||
while(outerCol.index() < col.index()) ++outerCol;
|
||||
assert(outerCol.index() == col.index());
|
||||
*col = *outerCol; // copy nonzero block
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Matrix& getMatrix()
|
||||
{
|
||||
@@ -517,6 +596,8 @@ namespace Opm
|
||||
Matrix* matrix_;
|
||||
Vector *rhs_;
|
||||
|
||||
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
|
||||
|
||||
std::unique_ptr<FlexibleSolverType> flexibleSolver_;
|
||||
std::unique_ptr<AbstractOperatorType> linearOperatorForFlexibleSolver_;
|
||||
std::unique_ptr<WellModelAsLinearOperator<WellModel, Vector, Vector>> wellOperator_;
|
||||
@@ -530,6 +611,7 @@ namespace Opm
|
||||
FlowLinearSolverParameters parameters_;
|
||||
PropertyTree prm_;
|
||||
bool scale_variables_;
|
||||
int numJacobiBlocks_;
|
||||
|
||||
std::shared_ptr< CommunicationType > comm_;
|
||||
}; // end ISTLSolver
|
||||
|
||||
@@ -191,23 +191,43 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unused]] BridgeMatrix* mat,
|
||||
[[maybe_unused]] BridgeVector& b,
|
||||
[[maybe_unused]] WellContributions& wellContribs,
|
||||
[[maybe_unused]] InverseOperatorResult& res)
|
||||
{
|
||||
// check if the nnz values of the matrix are in contiguous memory
|
||||
// this is done by checking if the distance between the last value of the last block of row 0 and
|
||||
// the first value of the first row of row 1 is equal to 1
|
||||
// if the matrix only has 1 row, it is always contiguous
|
||||
template <class BridgeMatrix>
|
||||
void checkMemoryContiguous(const BridgeMatrix& mat) {
|
||||
auto block_size = mat[0][0].N();
|
||||
auto row = mat.begin();
|
||||
auto last_of_row0 = row->begin();
|
||||
|
||||
// last_of_row0 points to last block, not to row->end()
|
||||
for(auto tmp = row->begin(); tmp != row->end(); ++tmp) {
|
||||
last_of_row0 = tmp;
|
||||
}
|
||||
|
||||
bool isContiguous = mat.N() < 2 || std::distance(&((*last_of_row0)[block_size-1][block_size-1]), &(*mat[1].begin())[0][0]) == 1;
|
||||
|
||||
if (!isContiguous) {
|
||||
OPM_THROW(std::logic_error, "Error memory of Matrix looks not contiguous");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
|
||||
BridgeMatrix* jacMat,
|
||||
int numJacobiBlocks,
|
||||
BridgeVector& b,
|
||||
WellContributions& wellContribs,
|
||||
InverseOperatorResult& res)
|
||||
{
|
||||
if (use_gpu || use_fpga) {
|
||||
BdaResult result;
|
||||
result.converged = false;
|
||||
static std::vector<int> h_rows;
|
||||
static std::vector<int> h_cols;
|
||||
const int dim = (*mat)[0][0].N();
|
||||
const int Nb = mat->N();
|
||||
const int N = Nb * dim;
|
||||
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
|
||||
const int nnz = nnzb * dim * dim;
|
||||
const int dim = (*bridgeMat)[0][0].N();
|
||||
const int Nb = bridgeMat->N();
|
||||
const int nnzb = bridgeMat->nonzeroes();
|
||||
|
||||
if (dim != 3) {
|
||||
OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
|
||||
@@ -215,26 +235,47 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
|
||||
return;
|
||||
}
|
||||
|
||||
if (h_rows.capacity() == 0) {
|
||||
if (!matrix) {
|
||||
h_rows.reserve(Nb+1);
|
||||
h_cols.reserve(nnzb);
|
||||
copySparsityPatternFromISTL(*mat, h_rows, h_cols);
|
||||
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
|
||||
checkMemoryContiguous(*bridgeMat);
|
||||
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
|
||||
}
|
||||
|
||||
Dune::Timer t_zeros;
|
||||
int numZeros = checkZeroDiagonal(*mat);
|
||||
int numZeros = checkZeroDiagonal(*bridgeMat);
|
||||
if (verbosity >= 2) {
|
||||
std::ostringstream out;
|
||||
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
if (numJacobiBlocks >= 2) {
|
||||
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
|
||||
|
||||
if (!jacMatrix) {
|
||||
h_jacRows.reserve(Nb+1);
|
||||
h_jacCols.reserve(jacNnzb);
|
||||
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
|
||||
checkMemoryContiguous(*jacMat);
|
||||
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
|
||||
}
|
||||
|
||||
Dune::Timer t_zeros2;
|
||||
int jacNumZeros = checkZeroDiagonal(*jacMat);
|
||||
if (verbosity >= 2) {
|
||||
std::ostringstream out;
|
||||
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
/////////////////////////
|
||||
// actually solve
|
||||
// assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
|
||||
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
|
||||
|
||||
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
|
||||
SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
|
||||
switch(status) {
|
||||
case SolverStatus::BDA_SOLVER_SUCCESS:
|
||||
//OpmLog::info("BdaSolver converged");
|
||||
@@ -268,7 +309,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unuse
|
||||
}
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
|
||||
[[maybe_unused]] unsigned N) {
|
||||
if(accelerator_mode.compare("opencl") == 0){
|
||||
#if HAVE_OPENCL
|
||||
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
|
||||
@@ -277,6 +319,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
}
|
||||
wellContribs.setVectorSize(N);
|
||||
}
|
||||
|
||||
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
|
||||
namespace Opm
|
||||
@@ -43,6 +44,10 @@ private:
|
||||
bool use_fpga = false;
|
||||
std::string accelerator_mode;
|
||||
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
|
||||
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
|
||||
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
|
||||
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
|
||||
|
||||
public:
|
||||
/// Construct a BdaBridge
|
||||
@@ -61,11 +66,13 @@ public:
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
/// \warning Values of A might get overwritten!
|
||||
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
|
||||
/// \param[in] b vector b, should be of type Dune::BlockVector
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] result summary of solver result
|
||||
void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
|
||||
/// \param[in] bridgeMat matrix A, should be of type Dune::BCRSMatrix
|
||||
/// \param[in] jacMat matrix A, but modified for the preconditioner, should be of type Dune::BCRSMatrix
|
||||
/// \param[in] numJacobiBlocks number of jacobi blocks in jacMat
|
||||
/// \param[in] b vector b, should be of type Dune::BlockVector
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] result summary of solver result
|
||||
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
|
||||
|
||||
/// Get the resulting x vector
|
||||
/// \param[inout] x vector x, should be of type Dune::BlockVector
|
||||
@@ -86,7 +93,8 @@ public:
|
||||
/// Initialize the WellContributions object with opencl context and queue
|
||||
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
|
||||
/// \param[in] wellContribs container to hold all WellContributions
|
||||
void initWellContributions(WellContributions& wellContribs);
|
||||
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
|
||||
void initWellContributions(WellContributions& wellContribs, unsigned N);
|
||||
|
||||
/// Return whether the BdaBridge will use the FPGA or not
|
||||
/// return whether the BdaBridge will use the FPGA or not
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
#include <string>
|
||||
|
||||
@@ -85,9 +86,8 @@ namespace Accelerator {
|
||||
virtual ~BdaSolver() {};
|
||||
|
||||
/// Define as pure virtual functions, so derivedclass must implement them
|
||||
virtual SolverStatus solve_system(int N, int nnz, int dim,
|
||||
double *vals, int *rows, int *cols,
|
||||
double *b, WellContributions& wellContribs, BdaResult &res) = 0;
|
||||
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
|
||||
|
||||
virtual void get_result(double *x) = 0;
|
||||
|
||||
|
||||
@@ -37,14 +37,12 @@ namespace Accelerator
|
||||
|
||||
using Opm::OpmLog;
|
||||
|
||||
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
||||
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
void sortRow(int *colIndices, int *data, int left, int right) {
|
||||
int l = left;
|
||||
int r = right;
|
||||
int middle = colIndices[(l + r) >> 1];
|
||||
double lDatum[bs * bs];
|
||||
do {
|
||||
while (colIndices[l] < middle)
|
||||
l++;
|
||||
@@ -54,9 +52,9 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
|
||||
int lColIndex = colIndices[l];
|
||||
colIndices[l] = colIndices[r];
|
||||
colIndices[r] = lColIndex;
|
||||
memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs);
|
||||
memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs);
|
||||
memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs);
|
||||
int tmp = data[l];
|
||||
data[l] = data[r];
|
||||
data[r] = tmp;
|
||||
|
||||
l++;
|
||||
r--;
|
||||
@@ -64,10 +62,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
|
||||
} while (l < r);
|
||||
|
||||
if (left < r)
|
||||
sortBlockedRow(colIndices, data, left, r, bs);
|
||||
sortRow(colIndices, data, left, r);
|
||||
|
||||
if (right > l)
|
||||
sortBlockedRow(colIndices, data, l, right, bs);
|
||||
sortRow(colIndices, data, l, right);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -127,13 +127,13 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/// Sort a row of matrix elements from a blocked CSR-format
|
||||
/// \param[inout] colIndices
|
||||
/// \param[inout] data
|
||||
/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
|
||||
/// These ints aren't actually nonzeroes, but represent a mapping used later
|
||||
/// \param[inout] colIndices represent keys in sorting
|
||||
/// \param[inout] data sorted according to the colIndices
|
||||
/// \param[in] left lower index of data of row
|
||||
/// \param[in] right upper index of data of row
|
||||
/// \param[in] block_size size of blocks in the row
|
||||
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
|
||||
void sortRow(int *colIndices, int *data, int left, int right);
|
||||
|
||||
/// Multiply and subtract blocks
|
||||
/// a = a - (b * c)
|
||||
|
||||
@@ -225,16 +225,20 @@ void FpgaSolverBackend<block_size>::get_result(double *x_)
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res)
|
||||
SolverStatus FpgaSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
WellContributions&,
|
||||
BdaResult &res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||
initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices);
|
||||
if (!analyse_matrix()) {
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
perf_call.emplace_back();
|
||||
update_system(vals, b);
|
||||
update_system(matrix->nnzValues, b);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
@@ -252,21 +256,21 @@ SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int d
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
|
||||
void FpgaSolverBackend<block_size>::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols)
|
||||
{
|
||||
double start = second();
|
||||
this->N = N_;
|
||||
this->nnz = nnz_;
|
||||
this->nnzb = nnz_ / block_size / block_size;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
this->Nb = Nb_;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = nnzbs;
|
||||
this->nnz = nnzbs * block_size * block_size;
|
||||
|
||||
// allocate host memory for matrices and vectors
|
||||
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
|
||||
mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
|
||||
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
|
||||
oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \
|
||||
"block size: " << block_size << ", total nnz: " << this->nnz << "\n";
|
||||
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
OpmLog::info(oss.str());
|
||||
|
||||
|
||||
@@ -201,13 +201,12 @@ private:
|
||||
unsigned short rst_settle_cycles = 0;
|
||||
|
||||
/// Allocate host memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of blocks
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
|
||||
void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols);
|
||||
|
||||
/// Reorder the linear system so it corresponds with the coloring
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
@@ -243,18 +242,15 @@ public:
|
||||
~FpgaSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -176,39 +177,58 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
|
||||
|
||||
/* Reorder a matrix by a specified input order.
|
||||
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
|
||||
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
|
||||
assert(mat->block_size == rmat->block_size);
|
||||
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.
|
||||
* reordermapping_nonzeroes is filled with increasing indices, and reordered using the translated colIndices as keys,
|
||||
* this means the resulting reordermapping_nonzeroes array contains the mapping
|
||||
*/
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat){
|
||||
|
||||
const unsigned int bs = mat->block_size;
|
||||
int rIndex = 0;
|
||||
int i, k;
|
||||
unsigned int j;
|
||||
std::vector<int> tmp(mat->nnzbs);
|
||||
|
||||
reordermapping_nonzeroes.resize(mat->nnzbs);
|
||||
for(int i = 0; i < mat->nnzbs; ++i){
|
||||
reordermapping_nonzeroes[i] = i;
|
||||
}
|
||||
|
||||
rmat->rowPointers[0] = 0;
|
||||
for (i = 0; i < mat->Nb; i++) {
|
||||
for(int i = 0; i < mat->Nb; i++){
|
||||
int thisRow = fromOrder[i];
|
||||
// put thisRow from the old matrix into row i of the new matrix
|
||||
rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
|
||||
for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
|
||||
for (j = 0; j < bs * bs; j++) {
|
||||
rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
|
||||
}
|
||||
rmat->rowPointers[i+1] = rmat->rowPointers[i] + mat->rowPointers[thisRow+1] - mat->rowPointers[thisRow];
|
||||
for(int k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow+1]; k++){
|
||||
tmp[rIndex] = reordermapping_nonzeroes[k]; // only get 1 entry per block
|
||||
rmat->colIndices[rIndex] = mat->colIndices[k];
|
||||
rIndex++;
|
||||
}
|
||||
}
|
||||
// re-assign column indices according to the new positions of the nodes referenced by the column indices
|
||||
for (i = 0; i < mat->nnzbs; i++) {
|
||||
for(int i = 0; i < mat->nnzbs; i++){
|
||||
rmat->colIndices[i] = toOrder[rmat->colIndices[i]];
|
||||
}
|
||||
// re-sort the column indices of every row.
|
||||
for (i = 0; i < mat->Nb; i++) {
|
||||
sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
|
||||
for(int i = 0; i < mat->Nb; i++){
|
||||
sortRow(rmat->colIndices, tmp.data(), rmat->rowPointers[i], rmat->rowPointers[i+1]-1);
|
||||
}
|
||||
for(int i = 0; i < mat->nnzbs; i++){
|
||||
reordermapping_nonzeroes[i] = tmp[i];
|
||||
}
|
||||
// std::copy();
|
||||
}
|
||||
|
||||
/* Reorder an array of nonzero blocks into another array, using a mapping */
|
||||
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat){
|
||||
assert(mat->block_size == rmat->block_size);
|
||||
|
||||
const unsigned int bs = mat->block_size;
|
||||
|
||||
for(int i = 0; i < mat->nnzbs; i++){
|
||||
int old_idx = reordermapping_nonzeroes[i];
|
||||
memcpy(rmat->nnzValues+i*bs*bs, mat->nnzValues+old_idx*bs*bs, sizeof(double)*bs*bs); // copy nnz block
|
||||
}
|
||||
}
|
||||
|
||||
/* Reorder a matrix according to the colors that every node of the matrix has received*/
|
||||
/* Find a reorder mapping according to the colors that every node of the matrix has received */
|
||||
|
||||
void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
|
||||
int reordered = 0;
|
||||
|
||||
@@ -47,13 +47,22 @@ namespace Accelerator
|
||||
template <unsigned int block_size>
|
||||
int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
|
||||
|
||||
/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
|
||||
/// rMat must be allocated already
|
||||
/// \param[in] mat matrix to be reordered
|
||||
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[inout] rMat reordered Matrix
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
|
||||
/// Reorder the sparsity pattern of the matrix according to the mapping in toOrder and fromOrder
|
||||
/// Also find mapping for nnz blocks
|
||||
/// rmat must be allocated already
|
||||
/// \param[in] mat matrix to be reordered
|
||||
/// \param[out] reordermapping_nonzeroes contains new index for every nnz block
|
||||
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
|
||||
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
|
||||
/// \param[out] rmat reordered Matrix
|
||||
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
|
||||
|
||||
/// Write nnz blocks from mat to rmat, according to the mapping in reordermapping_nonzeroes
|
||||
/// rmat must be allocated already
|
||||
/// \param[in] mat matrix to be reordered
|
||||
/// \param[in] reordermapping_nonzeroes contains old index for every nnz block, so rmat_nnz[i] == mat_nnz[mapping[i]]
|
||||
/// \param[inout] rmat reordered Matrix
|
||||
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat);
|
||||
|
||||
/// Compute reorder mapping from the color that each node has received
|
||||
/// The toOrder, fromOrder and iters arrays must be allocated already
|
||||
|
||||
@@ -101,6 +101,10 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
|
||||
}
|
||||
}
|
||||
|
||||
void WellContributions::setVectorSize(unsigned N_) {
|
||||
N = N_;
|
||||
}
|
||||
|
||||
void WellContributions::addNumBlocks(unsigned int numBlocks)
|
||||
{
|
||||
if (allocated) {
|
||||
|
||||
@@ -101,6 +101,10 @@ public:
|
||||
/// \param[in] dim_wells number of rows
|
||||
void setBlockSize(unsigned int dim, unsigned int dim_wells);
|
||||
|
||||
/// Set size of vector that the wells are applied to
|
||||
/// \param[in] N size of vector
|
||||
void setVectorSize(unsigned N);
|
||||
|
||||
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
|
||||
/// \param[in] type indicate if C, D or B is sent
|
||||
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
|
||||
|
||||
@@ -52,14 +52,14 @@ amgclSolverBackend<block_size>::~amgclSolverBackend() {}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
|
||||
this->N = N_;
|
||||
this->nnz = nnz_;
|
||||
this->nnzb = nnz_ / block_size / block_size;
|
||||
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
|
||||
this->Nb = Nb_;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = nnzbs;
|
||||
this->nnz = nnzbs * block_size * block_size;
|
||||
|
||||
Nb = (N + dim - 1) / dim;
|
||||
std::ostringstream out;
|
||||
out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
|
||||
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
out << "DeviceID: " << deviceID << "\n";
|
||||
OpmLog::info(out.str());
|
||||
@@ -360,12 +360,17 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) {
|
||||
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
[[maybe_unused]] WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim);
|
||||
convert_sparsity_pattern(rows, cols);
|
||||
initialize(matrix->Nb, matrix->nnzbs);
|
||||
convert_sparsity_pattern(matrix->rowPointers, matrix->colIndices);
|
||||
}
|
||||
convert_data(vals, rows);
|
||||
convert_data(matrix->nnzValues, matrix->rowPointers);
|
||||
solve_system(b, res);
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -96,10 +96,9 @@ private:
|
||||
std::once_flag vexcl_initialize;
|
||||
#endif
|
||||
/// Initialize host memory and determine amgcl parameters
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
void initialize(int N, int nnz, int dim);
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of blocks
|
||||
void initialize(int Nb, int nnzbs);
|
||||
|
||||
/// Convert the BCSR sparsity pattern to a CSR one
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
@@ -129,22 +128,15 @@ public:
|
||||
~amgclSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] vals_prec array of nonzeroes for preconditioner
|
||||
/// \param[in] rows_prec array of rowPointers for preconditioner
|
||||
/// \param[in] cols_prec array of columnIndices for preconditioner
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
@@ -104,10 +104,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
@@ -135,10 +135,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
@@ -188,17 +188,25 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
|
||||
this->N = N;
|
||||
this->nnz = nnz;
|
||||
this->nnzb = nnz / block_size / block_size;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
if (jacMatrix) {
|
||||
useJacMatrix = true;
|
||||
nnzbs_prec = jacMatrix->nnzbs;
|
||||
} else {
|
||||
nnzbs_prec = nnzb;
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
|
||||
if (useJacMatrix) {
|
||||
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
|
||||
}
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cudaSetDevice(deviceID);
|
||||
@@ -232,7 +240,15 @@ void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
|
||||
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
|
||||
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
||||
if (useJacMatrix) {
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
|
||||
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
|
||||
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
|
||||
} else {
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
||||
d_mCols = d_bCols;
|
||||
d_mRows = d_bRows;
|
||||
}
|
||||
cudaCheckLastError("Could not allocate enough memory on GPU");
|
||||
|
||||
cublasSetStream(cublasHandle, stream);
|
||||
@@ -261,6 +277,10 @@ void cusparseSolverBackend<block_size>::finalize() {
|
||||
cudaFree(d_t);
|
||||
cudaFree(d_v);
|
||||
cudaFree(d_mVals);
|
||||
if (useJacMatrix) {
|
||||
cudaFree(d_mCols);
|
||||
cudaFree(d_mRows);
|
||||
}
|
||||
cudaFree(d_bVals);
|
||||
cudaFree(d_bCols);
|
||||
cudaFree(d_bRows);
|
||||
@@ -283,23 +303,32 @@ void cusparseSolverBackend<block_size>::finalize() {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
|
||||
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
Timer t;
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rows[i + 1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
if (useJacMatrix) {
|
||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
} else {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
if (useJacMatrix) {
|
||||
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
}
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
@@ -314,19 +343,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
template <unsigned int block_size>
|
||||
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
|
||||
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
Timer t;
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
int sum = 0;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
int size_row = rows[i + 1] - rows[i];
|
||||
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
|
||||
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
|
||||
sum += size_row * block_size * block_size;
|
||||
}
|
||||
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
if (useJacMatrix) {
|
||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
} else {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
@@ -341,12 +375,6 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
||||
|
||||
@@ -380,12 +408,12 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
||||
cusparseCreateBsrsv2Info(&info_U);
|
||||
cudaCheckLastError("Could not create analysis info");
|
||||
|
||||
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
|
||||
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
|
||||
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
|
||||
descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
|
||||
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
|
||||
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U);
|
||||
cudaCheckLastError();
|
||||
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
@@ -393,7 +421,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
cusparseDbsrilu02_analysis(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
|
||||
Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
|
||||
int structural_zero;
|
||||
@@ -404,11 +432,11 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
|
||||
|
||||
// analysis of ilu apply
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
|
||||
Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_L, policy, d_buffer);
|
||||
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
|
||||
Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_U, policy, d_buffer);
|
||||
cudaCheckLastError("Could not analyse level information");
|
||||
|
||||
@@ -428,10 +456,8 @@ template <unsigned int block_size>
|
||||
bool cusparseSolverBackend<block_size>::create_preconditioner() {
|
||||
Timer t;
|
||||
|
||||
d_mCols = d_bCols;
|
||||
d_mRows = d_bRows;
|
||||
cusparseDbsrilu02(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
|
||||
Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \
|
||||
block_size, info_M, policy, d_buffer);
|
||||
cudaCheckLastError("Could not perform ilu decomposition");
|
||||
|
||||
@@ -480,19 +506,23 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
|
||||
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(N, nnz, dim);
|
||||
copy_system_to_gpu(vals, rows, cols, b);
|
||||
initialize(matrix, jacMatrix);
|
||||
copy_system_to_gpu(matrix, b, jacMatrix);
|
||||
} else {
|
||||
update_system_on_gpu(vals, rows, b);
|
||||
update_system_on_gpu(matrix, b, jacMatrix);
|
||||
}
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
reset_prec_on_gpu();
|
||||
if (create_preconditioner()) {
|
||||
solve_system(wellContribs, res);
|
||||
} else {
|
||||
|
||||
@@ -68,6 +68,9 @@ private:
|
||||
|
||||
bool analysis_done = false;
|
||||
|
||||
bool useJacMatrix = false;
|
||||
int nnzbs_prec; // number of nonzero blocks in the matrix for preconditioner
|
||||
// could be jacMatrix or matrix
|
||||
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
@@ -75,29 +78,26 @@ private:
|
||||
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
void initialize(int N, int nnz, int dim);
|
||||
/// \param[in] matrix matrix for spmv
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
|
||||
|
||||
/// Clean memory
|
||||
void finalize();
|
||||
|
||||
/// Copy linear system to GPU
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnzb values
|
||||
/// \param[in] b input vector, contains N values
|
||||
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
|
||||
/// also copy matrix for preconditioner if needed
|
||||
/// \param[in] matrix matrix for spmv
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
|
||||
|
||||
// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
|
||||
/// \param[in] b input vector, contains N values
|
||||
void update_system_on_gpu(double *vals, int *rows, double *b);
|
||||
|
||||
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
|
||||
void reset_prec_on_gpu();
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
/// also copy matrix for preconditioner if needed
|
||||
/// \param[in] matrix matrix for spmv
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
|
||||
|
||||
/// Analyse sparsity pattern to extract parallelism
|
||||
/// \return true iff analysis was successful
|
||||
@@ -126,18 +126,15 @@ public:
|
||||
~cusparseSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Get resulting vector x after linear solve, also includes post processing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
@@ -51,6 +51,13 @@ BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
{
|
||||
return analyze_matrix(mat, nullptr);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
@@ -62,19 +69,27 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
std::vector<int> CSCRowIndices;
|
||||
std::vector<int> CSCColPointers;
|
||||
|
||||
auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
LUmat = std::make_unique<BlockedMatrix>(*mat);
|
||||
} else {
|
||||
toOrder.resize(Nb);
|
||||
fromOrder.resize(Nb);
|
||||
CSCRowIndices.resize(nnzb);
|
||||
CSCRowIndices.resize(matToDecompose->nnzbs);
|
||||
CSCColPointers.resize(Nb + 1);
|
||||
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rmat);
|
||||
|
||||
if (jacMat) {
|
||||
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
|
||||
} else {
|
||||
LUmat = std::make_unique<BlockedMatrix>(*rmat);
|
||||
}
|
||||
|
||||
Timer t_convert;
|
||||
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
|
||||
if (verbosity >= 3) {
|
||||
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
|
||||
if(verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
@@ -85,24 +100,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
std::ostringstream out;
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
if (jacMat) {
|
||||
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
|
||||
}
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
if (jacMat) {
|
||||
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
|
||||
}
|
||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
out << "BILU0 reordering strategy: none\n";
|
||||
// numColors = 1;
|
||||
// rowsPerColor.emplace_back(Nb);
|
||||
numColors = Nb;
|
||||
for (int i = 0; i < Nb; ++i) {
|
||||
for(int i = 0; i < Nb; ++i){
|
||||
rowsPerColor.emplace_back(1);
|
||||
}
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||
}
|
||||
if (verbosity >= 1) {
|
||||
if(verbosity >= 1){
|
||||
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||
}
|
||||
|
||||
#if CHOW_PATEL
|
||||
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||
#endif
|
||||
@@ -112,8 +136,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
invDiagVals.resize(mat->Nb * bs * bs);
|
||||
|
||||
#if CHOW_PATEL
|
||||
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
|
||||
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
|
||||
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
#endif
|
||||
|
||||
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
|
||||
@@ -137,7 +161,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
|
||||
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
|
||||
for (int i = 0; i < numColors; ++i) {
|
||||
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
|
||||
}
|
||||
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
|
||||
|
||||
@@ -149,21 +173,38 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end init()
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
return create_preconditioner(mat, nullptr);
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
auto *m = mat;
|
||||
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
m = rmat.get();
|
||||
auto *matToDecompose = jacMat;
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
|
||||
matToDecompose = jacMat ? jacMat : mat;
|
||||
} else {
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
if (jacMat) {
|
||||
matToDecompose = rJacMat.get();
|
||||
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
|
||||
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
|
||||
} else {
|
||||
matToDecompose = rmat.get();
|
||||
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
@@ -173,18 +214,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
|
||||
Timer t_copy;
|
||||
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
|
||||
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 memcpy: " << t_copy.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
#if CHOW_PATEL
|
||||
chowPatelIlu.decomposition(queue.get(), context.get(),
|
||||
chowPatelIlu.decomposition(queue, context,
|
||||
LUmat.get(), Lmat.get(), Umat.get(),
|
||||
invDiagVals.data(), diagIndex,
|
||||
invDiagVals, diagIndex,
|
||||
s.diagIndex, s.invDiagVals,
|
||||
s.Lvals, s.Lcols, s.Lrows,
|
||||
s.Uvals, s.Ucols, s.Urows);
|
||||
@@ -192,23 +233,23 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
Timer t_copyToGpu;
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
|
||||
|
||||
std::call_once(pattern_uploaded, [&]() {
|
||||
std::call_once(pattern_uploaded, [&](){
|
||||
// find the positions of each diagonal block
|
||||
// must be done after reordering
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int rowStart = LUmat->rowPointers[row];
|
||||
int rowEnd = LUmat->rowPointers[row + 1];
|
||||
int rowEnd = LUmat->rowPointers[row+1];
|
||||
|
||||
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
|
||||
assert(candidate != LUmat->colIndices + rowEnd);
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(4);
|
||||
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
});
|
||||
|
||||
cl::WaitForEvents(events);
|
||||
@@ -229,8 +270,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
cl::Event event;
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
const unsigned int firstRow = rowsPerColorPrefix[color];
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
|
||||
if (verbosity >= 4) {
|
||||
const unsigned int lastRow = rowsPerColorPrefix[color+1];
|
||||
if (verbosity >= 5) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
|
||||
@@ -245,6 +286,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
|
||||
@@ -55,6 +55,7 @@ class BILU0 : public Preconditioner<block_size>
|
||||
private:
|
||||
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
|
||||
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
|
||||
#if CHOW_PATEL
|
||||
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
|
||||
#endif
|
||||
@@ -68,6 +69,9 @@ private:
|
||||
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
|
||||
std::vector<int> reordermappingNonzeroes; // maps nonzero blocks to new location in reordered matrix
|
||||
std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
|
||||
|
||||
typedef struct {
|
||||
cl::Buffer invDiagVals;
|
||||
cl::Buffer diagIndex;
|
||||
@@ -92,9 +96,11 @@ public:
|
||||
|
||||
// analysis, find reordering if specified
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
@@ -114,6 +120,11 @@ public:
|
||||
return rmat.get();
|
||||
}
|
||||
|
||||
BlockedMatrix* getRJacMat()
|
||||
{
|
||||
return rJacMat.get();
|
||||
}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
|
||||
{
|
||||
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
|
||||
@@ -127,7 +138,6 @@ public:
|
||||
return std::make_pair(s.LUvals, s.invDiagVals);
|
||||
#endif
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
|
||||
@@ -78,17 +78,30 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
|
||||
{
|
||||
return analyze_matrix(mat, nullptr);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
auto *m = mat;
|
||||
|
||||
this->N = mat->Nb * bs;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * bs * bs;
|
||||
this->nnzb = mat->nnzbs;
|
||||
if (jacMat) {
|
||||
m = jacMat;
|
||||
}
|
||||
|
||||
bilu0->analyze_matrix(mat);
|
||||
this->N = m->Nb * bs;
|
||||
this->Nb = m->Nb;
|
||||
this->nnz = m->nnzbs * bs * bs;
|
||||
this->nnzb = m->nnzbs;
|
||||
|
||||
return true;
|
||||
if (jacMat) {
|
||||
return bilu0->analyze_matrix(mat, jacMat);
|
||||
} else {
|
||||
return bilu0->analyze_matrix(mat);
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
@@ -162,7 +175,7 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
@@ -172,7 +185,11 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
|
||||
Dune::Timer t_preconditioner;
|
||||
|
||||
bilu0->create_preconditioner(mat);
|
||||
if (jacMat) {
|
||||
bilu0->create_preconditioner(mat, jacMat);
|
||||
} else {
|
||||
bilu0->create_preconditioner(mat);
|
||||
}
|
||||
|
||||
std::call_once(initialize, [&]() {
|
||||
std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
|
||||
@@ -255,6 +272,12 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
return true;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
|
||||
{
|
||||
return create_preconditioner(mat, nullptr);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
@@ -117,9 +117,11 @@ public:
|
||||
|
||||
// analysis, find reordering if specified
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
@@ -80,6 +80,42 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
|
||||
return success;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_, jacMat);
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
mat = mat_;
|
||||
} else {
|
||||
mat = bilu0->getRMat();
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
|
||||
@@ -101,6 +137,7 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// return the absolute value of the N elements for which the absolute value is highest
|
||||
double get_absmax(const double *data, const int N) {
|
||||
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));
|
||||
|
||||
@@ -125,6 +125,7 @@ public:
|
||||
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
bool analyze_matrix(BlockedMatrix *mat) override;
|
||||
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
|
||||
@@ -134,7 +135,8 @@ public:
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
|
||||
bool create_preconditioner(BlockedMatrix *mat) override;
|
||||
|
||||
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
|
||||
|
||||
int* getToOrder() override
|
||||
{
|
||||
return bilu0->getToOrder();
|
||||
|
||||
@@ -52,10 +52,21 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
|
||||
return analyze_matrix(mat);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
|
||||
return create_preconditioner(mat);
|
||||
}
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
|
||||
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
|
||||
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
|
||||
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
|
||||
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
|
||||
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
|
||||
@@ -68,10 +68,14 @@ public:
|
||||
|
||||
// analyze matrix, e.g. the sparsity pattern
|
||||
// probably only called once
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
|
||||
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
|
||||
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
|
||||
|
||||
// get reordering mappings
|
||||
virtual int* getToOrder() = 0;
|
||||
|
||||
@@ -370,7 +370,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
if (verbosity >= 4) {
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n";
|
||||
out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
|
||||
@@ -383,14 +383,21 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
|
||||
this->N = N_;
|
||||
this->nnz = nnz_;
|
||||
this->nnzb = nnz_ / block_size / block_size;
|
||||
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
if (jacMatrix) {
|
||||
useJacMatrix = true;
|
||||
}
|
||||
|
||||
Nb = (N + dim - 1) / dim;
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
|
||||
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
|
||||
if (useJacMatrix) {
|
||||
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
|
||||
}
|
||||
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
|
||||
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
|
||||
OpmLog::info(out.str());
|
||||
@@ -401,9 +408,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
prec->setOpencl(context, queue);
|
||||
|
||||
#if COPY_ROW_BY_ROW
|
||||
vals_contiguous.resize(nnz);
|
||||
vals_contiguous = new double[N];
|
||||
#endif
|
||||
mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
|
||||
// mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
|
||||
// jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
|
||||
mat = matrix;
|
||||
jacMat = jacMatrix;
|
||||
|
||||
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
|
||||
@@ -441,7 +451,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::finalize() {
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
@@ -527,8 +536,11 @@ template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::analyze_matrix() {
|
||||
Timer t;
|
||||
|
||||
// bool success = bilu0->init(mat.get());
|
||||
bool success = prec->analyze_matrix(mat.get());
|
||||
bool success;
|
||||
if (useJacMatrix)
|
||||
success = prec->analyze_matrix(mat.get(), jacMat.get());
|
||||
else
|
||||
success = prec->analyze_matrix(mat.get());
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
rmat = mat.get();
|
||||
@@ -579,7 +591,11 @@ template <unsigned int block_size>
|
||||
bool openclSolverBackend<block_size>::create_preconditioner() {
|
||||
Timer t;
|
||||
|
||||
bool result = prec->create_preconditioner(mat.get());
|
||||
bool result;
|
||||
if (useJacMatrix)
|
||||
result = prec->create_preconditioner(mat.get(), jacMat.get());
|
||||
else
|
||||
result = prec->create_preconditioner(mat.get());
|
||||
|
||||
if (verbosity > 2) {
|
||||
std::ostringstream out;
|
||||
@@ -639,21 +655,26 @@ void openclSolverBackend<block_size>::get_result(double *x) {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
|
||||
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
|
||||
double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix,
|
||||
WellContributions& wellContribs,
|
||||
BdaResult &res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||
initialize(matrix, jacMatrix);
|
||||
if (analysis_done == false) {
|
||||
if (!analyze_matrix()) {
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
update_system(vals, b, wellContribs);
|
||||
update_system(matrix->nnzValues, b, wellContribs);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
copy_system_to_gpu();
|
||||
} else {
|
||||
update_system(vals, b, wellContribs);
|
||||
update_system(matrix->nnzValues, b, wellContribs);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
|
||||
@@ -63,12 +63,15 @@ private:
|
||||
|
||||
std::vector<cl::Device> devices;
|
||||
|
||||
bool useJacMatrix = false;
|
||||
|
||||
std::unique_ptr<Preconditioner<block_size> > prec;
|
||||
// can perform blocked ILU0 and AMG on pressure component
|
||||
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
|
||||
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
||||
bool analysis_done = false;
|
||||
std::unique_ptr<BlockedMatrix> mat = nullptr; // original matrix
|
||||
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
|
||||
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
|
||||
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
|
||||
ILUReorder opencl_ilu_reorder; // reordering strategy
|
||||
std::vector<cl::Event> events;
|
||||
@@ -130,13 +133,9 @@ private:
|
||||
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
|
||||
|
||||
/// Clean memory
|
||||
void finalize();
|
||||
@@ -189,21 +188,14 @@ public:
|
||||
~openclSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] vals_prec array of nonzeroes for preconditioner
|
||||
/// \param[in] rows_prec array of rowPointers for preconditioner
|
||||
/// \param[in] cols_prec array of columnIndices for preconditioner
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
|
||||
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
|
||||
/// Data is already on the GPU
|
||||
@@ -218,7 +210,7 @@ public:
|
||||
/// \param[in] context the opencl context to be used
|
||||
/// \param[in] queue the opencl queue to be used
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
|
||||
}; // end class openclSolverBackend
|
||||
|
||||
} // namespace Accelerator
|
||||
|
||||
@@ -35,12 +35,6 @@ using Accelerator::OpenclKernels;
|
||||
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
|
||||
this->context = context_;
|
||||
this->queue = queue_;
|
||||
}
|
||||
|
||||
void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
|
||||
Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
|
||||
this->kernel = kernel_;
|
||||
this->kernel_no_reorder = kernel_no_reorder_;
|
||||
}
|
||||
|
||||
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
|
||||
|
||||
@@ -35,8 +35,6 @@ namespace Opm
|
||||
class WellContributionsOCL : public WellContributions
|
||||
{
|
||||
public:
|
||||
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
|
||||
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
|
||||
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
|
||||
|
||||
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
||||
@@ -56,8 +54,6 @@ protected:
|
||||
|
||||
cl::Context* context;
|
||||
cl::CommandQueue* queue;
|
||||
Opm::Accelerator::stdwell_apply_kernel_type* kernel;
|
||||
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
|
||||
std::vector<cl::Event> events;
|
||||
|
||||
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
|
||||
|
||||
@@ -39,9 +39,9 @@ namespace detail
|
||||
/// \param useWellConn Boolean that is true when UseWellContribusion is true
|
||||
/// \param wellGraph Cell IDs of well cells stored in a graph.
|
||||
template<class Grid, class W>
|
||||
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph)
|
||||
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph, int numJacobiBlocks)
|
||||
{
|
||||
if ( grid.comm().size() > 1)
|
||||
if ( grid.comm().size() > 1 || numJacobiBlocks > 1)
|
||||
{
|
||||
Dune::CartesianIndexMapper< Grid > cartMapper( grid );
|
||||
const int numCells = cartMapper.compressedSize(); // grid.numCells()
|
||||
|
||||
Reference in New Issue
Block a user