Merge pull request #3815 from Tongdongq/subdomain-matrix-for-opencl

Subdomain matrix for opencl ILU preconditioner
This commit is contained in:
Markus Blatt
2022-05-20 16:05:14 +02:00
committed by GitHub
36 changed files with 679 additions and 320 deletions

View File

@@ -161,7 +161,6 @@ namespace Opm
// Set it up manually
ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout());
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
#if HAVE_FPGA
// check usage of MatrixAddWellContributions: for FPGA they must be included
@@ -212,6 +211,17 @@ namespace Opm
// to the original one with a deleter that does nothing.
// Outch! We need to be able to scale the linear system! Hence const_cast
matrix_ = const_cast<Matrix*>(&M.istlMatrix());
// setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
if (numJacobiBlocks_ > 1) {
const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_,
wellConnectionsGraph_, numJacobiBlocks_);
std::cout << "Create block-Jacobi pattern" << std::endl;
blockJacobiAdjacency();
}
} else {
// Pointers should not change
if ( &(M.istlMatrix()) != matrix_ ) {
@@ -264,7 +274,7 @@ namespace Opm
if (use_gpu || use_fpga) {
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(*wellContribs);
bdaBridge->initWellContributions(*wellContribs, x.N() * x[0].N());
// the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
#if HAVE_CUDA || HAVE_OPENCL
@@ -273,8 +283,15 @@ namespace Opm
}
#endif
if (numJacobiBlocks_ > 1) {
copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_);
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), &*blockJacobiForGPUILU0_,
numJacobiBlocks_, *rhs_, *wellContribs, result);
}
else
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), const_cast<Matrix*>(&getMatrix()),
numJacobiBlocks_, *rhs_, *wellContribs, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);
@@ -497,6 +514,68 @@ namespace Opm
}
}
/// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
/// Do not initialize the values, that is done in copyMatToBlockJac()
void blockJacobiAdjacency()
{
const auto& grid = simulator_.vanguard().grid();
std::vector<int> cell_part = simulator_.vanguard().cellPartition();
typedef typename Matrix::size_type size_type;
typedef typename Matrix::CreateIterator Iter;
size_type numCells = grid.size( 0 );
blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, getMatrix().nonzeroes(), Matrix::row_wise));
const auto& lid = grid.localIdSet();
const auto& gridView = grid.leafGridView();
auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
//Loop over cells
for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
{
const auto& elem = *elemIt;
size_type idx = lid.id(elem);
row.insert(idx);
// Add well non-zero connections
for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) {
row.insert(*wc);
}
int locPart = cell_part[idx];
//Add neighbor if it is on the same part
auto isend = gridView.iend(elem);
for (auto is = gridView.ibegin(elem); is!=isend; ++is)
{
//check if face has neighbor
if (is->neighbor())
{
size_type nid = lid.id(is->outside());
int nabPart = cell_part[nid];
if (locPart == nabPart) {
row.insert(nid);
}
}
}
}
}
/// Copy the nonzero blocks of mat into blockJac, matching rows and column
/// indices. blockJac's sparsity pattern must be a subset of mat's pattern
/// (row by row), which blockJacobiAdjacency() guarantees.
void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
{
    auto srcRow = mat.begin();
    for (auto dstRow = blockJac.begin(); dstRow != blockJac.end(); ++dstRow, ++srcRow) {
        auto srcCol = srcRow->begin();
        for (auto dstCol = dstRow->begin(); dstCol != dstRow->end(); ++dstCol) {
            // srcRow is guaranteed to contain every column entry dstRow has,
            // so scanning forward must land exactly on the matching index.
            while (srcCol.index() < dstCol.index()) {
                ++srcCol;
            }
            assert(srcCol.index() == dstCol.index());
            *dstCol = *srcCol; // copy nonzero block
        }
    }
}
Matrix& getMatrix()
{
@@ -517,6 +596,8 @@ namespace Opm
Matrix* matrix_;
Vector *rhs_;
std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
std::unique_ptr<FlexibleSolverType> flexibleSolver_;
std::unique_ptr<AbstractOperatorType> linearOperatorForFlexibleSolver_;
std::unique_ptr<WellModelAsLinearOperator<WellModel, Vector, Vector>> wellOperator_;
@@ -530,6 +611,7 @@ namespace Opm
FlowLinearSolverParameters parameters_;
PropertyTree prm_;
bool scale_variables_;
int numJacobiBlocks_;
std::shared_ptr< CommunicationType > comm_;
}; // end ISTLSolver

View File

@@ -191,23 +191,43 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unused]] BridgeMatrix* mat,
[[maybe_unused]] BridgeVector& b,
[[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] InverseOperatorResult& res)
{
// Check whether the nnz values of the matrix are stored in contiguous memory.
// This is done by checking that the distance between the last value of the
// last block of row 0 and the first value of the first block of row 1 equals 1.
// If the matrix has only 1 row, it is trivially contiguous.
// Throws std::logic_error if the layout is not contiguous; the BdaSolvers
// pass the raw value pointer to the accelerator, so contiguity is required.
template <class BridgeMatrix>
void checkMemoryContiguous(const BridgeMatrix& mat) {
auto block_size = mat[0][0].N();
auto row = mat.begin();
auto last_of_row0 = row->begin();
// last_of_row0 points to the last block of row 0, not to row->end();
// found by a linear scan because the column iterator is only advanced forward here
for(auto tmp = row->begin(); tmp != row->end(); ++tmp) {
last_of_row0 = tmp;
}
// compare the address one-past the last scalar of row 0 with the address of
// the first scalar of row 1; a distance of 1 between the last scalar and the
// first scalar means the two rows are adjacent in memory
bool isContiguous = mat.N() < 2 || std::distance(&((*last_of_row0)[block_size-1][block_size-1]), &(*mat[1].begin())[0][0]) == 1;
if (!isContiguous) {
OPM_THROW(std::logic_error, "Error memory of Matrix looks not contiguous");
}
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix* bridgeMat,
BridgeMatrix* jacMat,
int numJacobiBlocks,
BridgeVector& b,
WellContributions& wellContribs,
InverseOperatorResult& res)
{
if (use_gpu || use_fpga) {
BdaResult result;
result.converged = false;
static std::vector<int> h_rows;
static std::vector<int> h_cols;
const int dim = (*mat)[0][0].N();
const int Nb = mat->N();
const int N = Nb * dim;
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
const int nnz = nnzb * dim * dim;
const int dim = (*bridgeMat)[0][0].N();
const int Nb = bridgeMat->N();
const int nnzb = bridgeMat->nonzeroes();
if (dim != 3) {
OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@@ -215,26 +235,47 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system([[maybe_unu
return;
}
if (h_rows.capacity() == 0) {
if (!matrix) {
h_rows.reserve(Nb+1);
h_cols.reserve(nnzb);
copySparsityPatternFromISTL(*mat, h_rows, h_cols);
copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
checkMemoryContiguous(*bridgeMat);
matrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, nnzb, block_size, static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data());
}
Dune::Timer t_zeros;
int numZeros = checkZeroDiagonal(*mat);
int numZeros = checkZeroDiagonal(*bridgeMat);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
OpmLog::info(out.str());
}
if (numJacobiBlocks >= 2) {
const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
if (!jacMatrix) {
h_jacRows.reserve(Nb+1);
h_jacCols.reserve(jacNnzb);
copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
checkMemoryContiguous(*jacMat);
jacMatrix = std::make_unique<Opm::Accelerator::BlockedMatrix>(Nb, jacNnzb, block_size, static_cast<double*>(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data());
}
Dune::Timer t_zeros2;
int jacNumZeros = checkZeroDiagonal(*jacMat);
if (verbosity >= 2) {
std::ostringstream out;
out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
OpmLog::info(out.str());
}
}
/////////////////////////
// actually solve
// assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
switch(status) {
case SolverStatus::BDA_SOLVER_SUCCESS:
//OpmLog::info("BdaSolver converged");
@@ -268,7 +309,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unuse
}
template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs) {
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[maybe_unused]] WellContributions& wellContribs,
[[maybe_unused]] unsigned N) {
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
@@ -277,6 +319,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif
}
wellContribs.setVectorSize(N);
}
// the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock

View File

@@ -23,6 +23,7 @@
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
namespace Opm
@@ -43,6 +44,10 @@ private:
bool use_fpga = false;
std::string accelerator_mode;
std::unique_ptr<Opm::Accelerator::BdaSolver<block_size> > backend;
std::shared_ptr<Opm::Accelerator::BlockedMatrix> matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::shared_ptr<Opm::Accelerator::BlockedMatrix> jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
std::vector<int> h_rows, h_cols; // store the sparsity pattern of the matrix
std::vector<int> h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix
public:
/// Construct a BdaBridge
@@ -61,11 +66,13 @@ public:
/// Solve linear system, A*x = b
/// \warning Values of A might get overwritten!
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
/// \param[in] bridgeMat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] jacMat matrix A, but modified for the preconditioner, should be of type Dune::BCRSMatrix
/// \param[in] numJacobiBlocks number of jacobi blocks in jacMat
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] result summary of solver result
void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector
@@ -86,7 +93,8 @@ public:
/// Initialize the WellContributions object with opencl context and queue
/// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
/// \param[in] wellContribs container to hold all WellContributions
void initWellContributions(WellContributions& wellContribs);
/// \param[in] N number of rows in scalar vector that wellContribs will be applied on
void initWellContributions(WellContributions& wellContribs, unsigned N);
/// Return whether the BdaBridge will use the FPGA or not
/// return whether the BdaBridge will use the FPGA or not

View File

@@ -22,6 +22,7 @@
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <string>
@@ -85,9 +86,8 @@ namespace Accelerator {
virtual ~BdaSolver() {};
/// Defined as pure virtual functions, so every derived class must implement them
virtual SolverStatus solve_system(int N, int nnz, int dim,
double *vals, int *rows, int *cols,
double *b, WellContributions& wellContribs, BdaResult &res) = 0;
virtual SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0;
virtual void get_result(double *x) = 0;

View File

@@ -37,14 +37,12 @@ namespace Accelerator
using Opm::OpmLog;
/*Sort a row of matrix elements from a blocked CSR-format.*/
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) {
const unsigned int bs = block_size;
void sortRow(int *colIndices, int *data, int left, int right) {
int l = left;
int r = right;
int middle = colIndices[(l + r) >> 1];
double lDatum[bs * bs];
do {
while (colIndices[l] < middle)
l++;
@@ -54,9 +52,9 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
int lColIndex = colIndices[l];
colIndices[l] = colIndices[r];
colIndices[r] = lColIndex;
memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs);
memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs);
memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs);
int tmp = data[l];
data[l] = data[r];
data[r] = tmp;
l++;
r--;
@@ -64,10 +62,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned
} while (l < r);
if (left < r)
sortBlockedRow(colIndices, data, left, r, bs);
sortRow(colIndices, data, left, r);
if (right > l)
sortBlockedRow(colIndices, data, l, right, bs);
sortRow(colIndices, data, l, right);
}

View File

@@ -127,13 +127,13 @@ public:
};
/// Sort a row of matrix elements from a blocked CSR-format
/// \param[inout] colIndices
/// \param[inout] data
/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints
/// These ints aren't actually nonzeroes, but represent a mapping used later
/// \param[inout] colIndices represent keys in sorting
/// \param[inout] data sorted according to the colIndices
/// \param[in] left lower index of data of row
/// \param[in] right upper index of data of row
/// \param[in] block_size size of blocks in the row
void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size);
void sortRow(int *colIndices, int *data, int left, int right);
/// Multiply and subtract blocks
/// a = a - (b * c)

View File

@@ -225,16 +225,20 @@ void FpgaSolverBackend<block_size>::get_result(double *x_)
template <unsigned int block_size>
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res)
SolverStatus FpgaSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions&,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices);
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
perf_call.emplace_back();
update_system(vals, b);
update_system(matrix->nnzValues, b);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
@@ -252,21 +256,21 @@ SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int d
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
void FpgaSolverBackend<block_size>::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols)
{
double start = second();
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
Nb = (N + dim - 1) / dim;
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
// allocate host memory for matrices and vectors
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows));
std::ostringstream oss;
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \
"block size: " << block_size << ", total nnz: " << this->nnz << "\n";
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(oss.str());

View File

@@ -201,13 +201,12 @@ private:
unsigned short rst_settle_cycles = 0;
/// Allocate host memory
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of blocks
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols);
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@@ -243,18 +242,15 @@ public:
~FpgaSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -29,6 +29,7 @@
#include <random>
#include <sstream>
#include <vector>
#include <cstring>
namespace {
@@ -176,39 +177,58 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
/* Reorder a matrix by a specified input order.
* Both a to order array, which contains for every node from the old matrix where it will move in the new matrix,
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) {
assert(mat->block_size == rmat->block_size);
* and the from order, which contains for every node in the new matrix where it came from in the old matrix.
* reordermapping_nonzeroes is filled with increasing indices, and reordered using the translated colIndices as keys,
* this means the resulting reordermapping_nonzeroes array contains the mapping
*/
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat){
const unsigned int bs = mat->block_size;
int rIndex = 0;
int i, k;
unsigned int j;
std::vector<int> tmp(mat->nnzbs);
reordermapping_nonzeroes.resize(mat->nnzbs);
for(int i = 0; i < mat->nnzbs; ++i){
reordermapping_nonzeroes[i] = i;
}
rmat->rowPointers[0] = 0;
for (i = 0; i < mat->Nb; i++) {
for(int i = 0; i < mat->Nb; i++){
int thisRow = fromOrder[i];
// put thisRow from the old matrix into row i of the new matrix
rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
for (j = 0; j < bs * bs; j++) {
rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
}
rmat->rowPointers[i+1] = rmat->rowPointers[i] + mat->rowPointers[thisRow+1] - mat->rowPointers[thisRow];
for(int k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow+1]; k++){
tmp[rIndex] = reordermapping_nonzeroes[k]; // only get 1 entry per block
rmat->colIndices[rIndex] = mat->colIndices[k];
rIndex++;
}
}
// re-assign column indices according to the new positions of the nodes referenced by the column indices
for (i = 0; i < mat->nnzbs; i++) {
for(int i = 0; i < mat->nnzbs; i++){
rmat->colIndices[i] = toOrder[rmat->colIndices[i]];
}
// re-sort the column indices of every row.
for (i = 0; i < mat->Nb; i++) {
sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs);
for(int i = 0; i < mat->Nb; i++){
sortRow(rmat->colIndices, tmp.data(), rmat->rowPointers[i], rmat->rowPointers[i+1]-1);
}
for(int i = 0; i < mat->nnzbs; i++){
reordermapping_nonzeroes[i] = tmp[i];
}
// std::copy();
}
/* Copy the nonzero blocks of mat into rmat according to a mapping:
 * block i of rmat is taken from block reordermapping_nonzeroes[i] of mat.
 * rmat must already be allocated and have the same block size as mat. */
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat){
    assert(mat->block_size == rmat->block_size);
    const unsigned int bs = mat->block_size;
    const size_t blockBytes = sizeof(double) * bs * bs; // bytes per nnz block
    for (int dst = 0; dst < mat->nnzbs; ++dst) {
        const int src = reordermapping_nonzeroes[dst];
        memcpy(rmat->nnzValues + dst * bs * bs, mat->nnzValues + src * bs * bs, blockBytes);
    }
}
/* Reorder a matrix according to the colors that every node of the matrix has received*/
/* Find a reorder mapping according to the colors that every node of the matrix has received */
void colorsToReordering(int Nb, std::vector<int>& colors, int numColors, int *toOrder, int *fromOrder, std::vector<int>& rowsPerColor) {
int reordered = 0;

View File

@@ -47,13 +47,22 @@ namespace Accelerator
template <unsigned int block_size>
int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector<int>& colors, int maxRowsPerColor, int maxColsPerColor);
/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder
/// rMat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
/// \param[inout] rMat reordered Matrix
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
/// Reorder the sparsity pattern of the matrix according to the mapping in toOrder and fromOrder
/// Also find mapping for nnz blocks
/// rmat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[out] reordermapping_nonzeroes contains new index for every nnz block
/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved
/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved
/// \param[out] rmat reordered Matrix
void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat);
/// Write nnz blocks from mat to rmat, according to the mapping in reordermapping_nonzeroes
/// rmat must be allocated already
/// \param[in] mat matrix to be reordered
/// \param[in] reordermapping_nonzeroes contains old index for every nnz block, so rmat_nnz[i] == mat_nnz[mapping[i]]
/// \param[inout] rmat reordered Matrix
void reorderNonzeroes(BlockedMatrix *mat, std::vector<int>& reordermapping_nonzeroes, BlockedMatrix *rmat);
/// Compute reorder mapping from the color that each node has received
/// The toOrder, fromOrder and iters arrays must be allocated already

View File

@@ -101,6 +101,10 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
}
}
// Store the size N (number of scalar rows) of the vector that the well
// contributions will be applied to; set once via BdaBridge::initWellContributions.
void WellContributions::setVectorSize(unsigned N_) {
N = N_;
}
void WellContributions::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {

View File

@@ -101,6 +101,10 @@ public:
/// \param[in] dim_wells number of rows
void setBlockSize(unsigned int dim, unsigned int dim_wells);
/// Set size of vector that the wells are applied to
/// \param[in] N size of vector
void setVectorSize(unsigned N);
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D

View File

@@ -52,14 +52,14 @@ amgclSolverBackend<block_size>::~amgclSolverBackend() {}
template <unsigned int block_size>
void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
void amgclSolverBackend<block_size>::initialize(int Nb_, int nnzbs) {
this->Nb = Nb_;
this->N = Nb * block_size;
this->nnzb = nnzbs;
this->nnz = nnzbs * block_size * block_size;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n";
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "DeviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@@ -360,12 +360,17 @@ void amgclSolverBackend<block_size>::get_result(double *x_) {
template <unsigned int block_size>
SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) {
SolverStatus amgclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
[[maybe_unused]] std::shared_ptr<BlockedMatrix> jacMatrix,
[[maybe_unused]] WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim);
convert_sparsity_pattern(rows, cols);
initialize(matrix->Nb, matrix->nnzbs);
convert_sparsity_pattern(matrix->rowPointers, matrix->colIndices);
}
convert_data(vals, rows);
convert_data(matrix->nnzValues, matrix->rowPointers);
solve_system(b, res);
return SolverStatus::BDA_SOLVER_SUCCESS;
}

View File

@@ -96,10 +96,9 @@ private:
std::once_flag vexcl_initialize;
#endif
/// Initialize host memory and determine amgcl parameters
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
void initialize(int N, int nnz, int dim);
/// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of blocks
void initialize(int Nb, int nnzbs);
/// Convert the BCSR sparsity pattern to a CSR one
/// \param[in] rows array of rowPointers, contains N/dim+1 values
@@ -129,22 +128,15 @@ public:
~amgclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] vals_prec array of nonzeroes for preconditioner
/// \param[in] rows_prec array of rowPointers for preconditioner
/// \param[in] cols_prec array of columnIndices for preconditioner
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and perform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -104,10 +104,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer);
// spmv
@@ -135,10 +135,10 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
operation, Nb, nnzbs_prec, &one, \
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer);
// spmv
@@ -188,17 +188,25 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
this->N = N;
this->nnz = nnz;
this->nnzb = nnz / block_size / block_size;
Nb = (N + dim - 1) / dim;
void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
if (jacMatrix) {
useJacMatrix = true;
nnzbs_prec = jacMatrix->nnzbs;
} else {
nnzbs_prec = nnzb;
}
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
OpmLog::info(out.str());
out.str("");
out.clear();
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << nnzbs_prec << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaSetDevice(deviceID);
@@ -232,7 +240,15 @@ void cusparseSolverBackend<block_size>::initialize(int N, int nnz, int dim) {
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1));
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
if (useJacMatrix) {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size);
cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec);
cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1));
} else {
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
d_mCols = d_bCols;
d_mRows = d_bRows;
}
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
@@ -261,6 +277,10 @@ void cusparseSolverBackend<block_size>::finalize() {
cudaFree(d_t);
cudaFree(d_v);
cudaFree(d_mVals);
if (useJacMatrix) {
cudaFree(d_mCols);
cudaFree(d_mRows);
}
cudaFree(d_bVals);
cudaFree(d_bCols);
cudaFree(d_bRows);
@@ -283,23 +303,32 @@ void cusparseSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
void cusparseSolverBackend<block_size>::copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
Timer t;
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rows[i + 1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
#endif
cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream);
}
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
@@ -314,19 +343,24 @@ void cusparseSolverBackend<block_size>::copy_system_to_gpu(double *vals, int *ro
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *rows, double *b) {
void cusparseSolverBackend<block_size>::update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix) {
Timer t;
#if COPY_ROW_BY_ROW
int sum = 0;
for (int i = 0; i < Nb; ++i) {
int size_row = rows[i + 1] - rows[i];
memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size);
int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i];
memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
#else
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
if (useJacMatrix) {
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream);
} else {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
#endif
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
@@ -341,12 +375,6 @@ void cusparseSolverBackend<block_size>::update_system_on_gpu(double *vals, int *
} // end update_system_on_gpu()
template <unsigned int block_size>
void cusparseSolverBackend<block_size>::reset_prec_on_gpu() {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::analyse_matrix() {
@@ -380,12 +408,12 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U);
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec,
descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec,
descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
@@ -393,7 +421,7 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \
block_size, info_M, policy, d_buffer);
int structural_zero;
@@ -404,11 +432,11 @@ bool cusparseSolverBackend<block_size>::analyse_matrix() {
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \
block_size, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \
block_size, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
@@ -428,10 +456,8 @@ template <unsigned int block_size>
bool cusparseSolverBackend<block_size>::create_preconditioner() {
Timer t;
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \
block_size, info_M, policy, d_buffer);
cudaCheckLastError("Could not perform ilu decomposition");
@@ -480,19 +506,23 @@ void cusparseSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus cusparseSolverBackend<block_size>::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
SolverStatus cusparseSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
initialize(matrix, jacMatrix);
copy_system_to_gpu(matrix, b, jacMatrix);
} else {
update_system_on_gpu(vals, rows, b);
update_system_on_gpu(matrix, b, jacMatrix);
}
if (analysis_done == false) {
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
reset_prec_on_gpu();
if (create_preconditioner()) {
solve_system(wellContribs, res);
} else {

View File

@@ -68,6 +68,9 @@ private:
bool analysis_done = false;
bool useJacMatrix = false;
int nnzbs_prec; // number of nonzero blocks in the matrix for preconditioner
// could be jacMatrix or matrix
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
@@ -75,29 +78,26 @@ private:
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
void initialize(int N, int nnz, int dim);
/// \param[in] matrix matrix for spmv
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Clean memory
void finalize();
/// Copy linear system to GPU
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnzb values
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void copy_system_to_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *vals, int *rows, double *b);
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
void reset_prec_on_gpu();
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// also copy matrix for preconditioner if needed
/// \param[in] matrix matrix for spmv
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
void update_system_on_gpu(std::shared_ptr<BlockedMatrix> matrix, double *b, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@@ -126,18 +126,15 @@ public:
~cusparseSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Get resulting vector x after linear solve, also includes post processing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;

View File

@@ -51,6 +51,13 @@ BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
@@ -62,19 +69,27 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
std::vector<int> CSCRowIndices;
std::vector<int> CSCColPointers;
auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat
if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix>(*mat);
} else {
toOrder.resize(Nb);
fromOrder.resize(Nb);
CSCRowIndices.resize(nnzb);
CSCRowIndices.resize(matToDecompose->nnzbs);
CSCColPointers.resize(Nb + 1);
rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rmat);
if (jacMat) {
rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
} else {
LUmat = std::make_unique<BlockedMatrix>(*rmat);
}
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
if (verbosity >= 3) {
csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
if(verbosity >= 3){
std::ostringstream out;
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
@@ -85,24 +100,33 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
}
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n";
// numColors = 1;
// rowsPerColor.emplace_back(Nb);
numColors = Nb;
for (int i = 0; i < Nb; ++i) {
for(int i = 0; i < Nb; ++i){
rowsPerColor.emplace_back(1);
}
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
if (verbosity >= 1) {
if(verbosity >= 1){
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
}
#if CHOW_PATEL
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
#endif
@@ -112,8 +136,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
invDiagVals.resize(mat->Nb * bs * bs);
#if CHOW_PATEL
Lmat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
Umat = std::make_unique<BlockedMatrix>(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs);
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
#endif
s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
@@ -137,7 +161,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
for (int i = 0; i < numColors; ++i) {
rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
}
err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
@@ -149,21 +173,38 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat)
}
return true;
} // end init()
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get();
auto *matToDecompose = jacMat;
if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
matToDecompose = jacMat ? jacMat : mat;
} else {
Timer t_reorder;
reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get());
if (jacMat) {
matToDecompose = rJacMat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
} else {
matToDecompose = rmat.get();
reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
}
if (verbosity >= 3) {
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
@@ -173,18 +214,18 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
// this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
Timer t_copy;
memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);
memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs);
if (verbosity >= 3) {
if (verbosity >= 3){
std::ostringstream out;
out << "BILU0 memcpy: " << t_copy.stop() << " s";
OpmLog::info(out.str());
}
#if CHOW_PATEL
chowPatelIlu.decomposition(queue.get(), context.get(),
chowPatelIlu.decomposition(queue, context,
LUmat.get(), Lmat.get(), Umat.get(),
invDiagVals.data(), diagIndex,
invDiagVals, diagIndex,
s.diagIndex, s.invDiagVals,
s.Lvals, s.Lcols, s.Lrows,
s.Uvals, s.Ucols, s.Urows);
@@ -192,23 +233,23 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
Timer t_copyToGpu;
events.resize(1);
err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]);
std::call_once(pattern_uploaded, [&]() {
std::call_once(pattern_uploaded, [&](){
// find the positions of each diagonal block
// must be done after reordering
for (int row = 0; row < Nb; ++row) {
int rowStart = LUmat->rowPointers[row];
int rowEnd = LUmat->rowPointers[row + 1];
int rowEnd = LUmat->rowPointers[row+1];
auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row);
assert(candidate != LUmat->colIndices + rowEnd);
diagIndex[row] = candidate - LUmat->colIndices;
}
events.resize(4);
err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
});
cl::WaitForEvents(events);
@@ -229,8 +270,8 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
cl::Event event;
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color + 1];
if (verbosity >= 4) {
const unsigned int lastRow = rowsPerColorPrefix[color+1];
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
@@ -245,6 +286,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat)
return true;
} // end create_preconditioner()
// kernels are blocking on an NVIDIA GPU, so waiting for events is not needed
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested

View File

@@ -55,6 +55,7 @@ class BILU0 : public Preconditioner<block_size>
private:
std::unique_ptr<BlockedMatrix> LUmat = nullptr;
std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
std::shared_ptr<BlockedMatrix> rJacMat = nullptr;
#if CHOW_PATEL
std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
#endif
@@ -68,6 +69,9 @@ private:
ILUReorder opencl_ilu_reorder;
std::vector<int> reordermappingNonzeroes; // maps nonzero blocks to new location in reordered matrix
std::vector<int> jacReordermappingNonzeroes; // same but for jacMatrix
typedef struct {
cl::Buffer invDiagVals;
cl::Buffer diagIndex;
@@ -92,9 +96,11 @@ public:
// analysis, find reordering if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;
@@ -114,6 +120,11 @@ public:
return rmat.get();
}
BlockedMatrix* getRJacMat()
{
return rJacMat.get();
}
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
{
return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex};
@@ -127,7 +138,6 @@ public:
return std::make_pair(s.LUvals, s.invDiagVals);
#endif
}
};
} // namespace Accelerator

View File

@@ -78,17 +78,30 @@ std::vector<int> buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vecto
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat)
{
return analyze_matrix(mat, nullptr);
}
template <unsigned int block_size>
bool BISAI<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
auto *m = mat;
this->N = mat->Nb * bs;
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * bs * bs;
this->nnzb = mat->nnzbs;
if (jacMat) {
m = jacMat;
}
bilu0->analyze_matrix(mat);
this->N = m->Nb * bs;
this->Nb = m->Nb;
this->nnz = m->nnzbs * bs * bs;
this->nnzb = m->nnzbs;
return true;
if (jacMat) {
return bilu0->analyze_matrix(mat, jacMat);
} else {
return bilu0->analyze_matrix(mat);
}
}
template <unsigned int block_size>
@@ -162,7 +175,7 @@ void BISAI<block_size>::buildUpperSubsystemsStructures(){
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat)
{
const unsigned int bs = block_size;
@@ -172,7 +185,11 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
Dune::Timer t_preconditioner;
bilu0->create_preconditioner(mat);
if (jacMat) {
bilu0->create_preconditioner(mat, jacMat);
} else {
bilu0->create_preconditioner(mat);
}
std::call_once(initialize, [&]() {
std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure();
@@ -255,6 +272,12 @@ bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
return true;
}
template <unsigned int block_size>
bool BISAI<block_size>::create_preconditioner(BlockedMatrix *mat)
{
return create_preconditioner(mat, nullptr);
}
template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
const unsigned int bs = block_size;

View File

@@ -117,9 +117,11 @@ public:
// analysis, find reordering if specified
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// ilu_decomposition
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// apply preconditioner, x = prec(y)
void apply(const cl::Buffer& y, cl::Buffer& x) override;

View File

@@ -80,6 +80,42 @@ bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_) {
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
this->Nb = mat_->Nb;
this->nnzb = mat_->nnzbs;
this->N = Nb * block_size;
this->nnz = nnzb * block_size * block_size;
bool success = bilu0->analyze_matrix(mat_, jacMat);
if (opencl_ilu_reorder == ILUReorder::NONE) {
mat = mat_;
} else {
mat = bilu0->getRMat();
}
return success;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) {
Dune::Timer t_bilu0;
bool result = bilu0->create_preconditioner(mat_, jacMat);
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
OpmLog::info(out.str());
}
Dune::Timer t_amg;
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
if (verbosity >= 3) {
std::ostringstream out;
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
OpmLog::info(out.str());
}
return result;
}
template <unsigned int block_size>
bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
@@ -101,6 +137,7 @@ bool CPR<block_size>::create_preconditioner(BlockedMatrix *mat_) {
return result;
}
// return the absolute value of the N elements for which the absolute value is highest
double get_absmax(const double *data, const int N) {
return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);}));

View File

@@ -125,6 +125,7 @@ public:
CPR(int verbosity, ILUReorder opencl_ilu_reorder);
bool analyze_matrix(BlockedMatrix *mat) override;
bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
// set own Opencl variables, but also that of the bilu0 preconditioner
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue) override;
@@ -134,7 +135,8 @@ public:
void apply(const cl::Buffer& y, cl::Buffer& x) override;
bool create_preconditioner(BlockedMatrix *mat) override;
bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override;
int* getToOrder() override
{
return bilu0->getToOrder();

View File

@@ -52,10 +52,21 @@ std::unique_ptr<Preconditioner<block_size> > Preconditioner<block_size>::create(
}
}
template <unsigned int block_size>
bool Preconditioner<block_size>::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
return analyze_matrix(mat);
}
template <unsigned int block_size>
bool Preconditioner<block_size>::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) {
return create_preconditioner(mat);
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&);
template std::unique_ptr<Preconditioner<n> > Preconditioner<n>::create(PreconditionerType, int, ILUReorder); \
template void Preconditioner<n>::setOpencl(std::shared_ptr<cl::Context>&, std::shared_ptr<cl::CommandQueue>&); \
template bool Preconditioner<n>::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \
template bool Preconditioner<n>::create_preconditioner(BlockedMatrix *, BlockedMatrix *);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@@ -68,10 +68,14 @@ public:
// analyze matrix, e.g. the sparsity pattern
// probably only called once
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool analyze_matrix(BlockedMatrix *mat) = 0;
virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat);
// create/update preconditioner, probably used every linear solve
// the version with two params can be overloaded, if not, it will default to using the one param version
virtual bool create_preconditioner(BlockedMatrix *mat) = 0;
virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat);
// get reordering mappings
virtual int* getToOrder() = 0;

View File

@@ -370,7 +370,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
", time per iteration: " << res.elapsed / it << ", iterations: " << it;
OpmLog::info(out.str());
}
if (verbosity >= 4) {
if (verbosity >= 3) {
std::ostringstream out;
out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n";
out << "wellContributions::apply: " << t_well.elapsed() << " s\n";
@@ -383,14 +383,21 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix) {
this->Nb = matrix->Nb;
this->N = Nb * block_size;
this->nnzb = matrix->nnzbs;
this->nnz = nnzb * block_size * block_size;
if (jacMatrix) {
useJacMatrix = true;
}
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n";
if (useJacMatrix) {
out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n";
}
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n";
OpmLog::info(out.str());
@@ -401,9 +408,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpencl(context, queue);
#if COPY_ROW_BY_ROW
vals_contiguous.resize(nnz);
vals_contiguous = new double[N];
#endif
mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
// mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows));
// jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2));
mat = matrix;
jacMat = jacMatrix;
d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N);
@@ -441,7 +451,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
@@ -527,8 +536,11 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::analyze_matrix() {
Timer t;
// bool success = bilu0->init(mat.get());
bool success = prec->analyze_matrix(mat.get());
bool success;
if (useJacMatrix)
success = prec->analyze_matrix(mat.get(), jacMat.get());
else
success = prec->analyze_matrix(mat.get());
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();
@@ -579,7 +591,11 @@ template <unsigned int block_size>
bool openclSolverBackend<block_size>::create_preconditioner() {
Timer t;
bool result = prec->create_preconditioner(mat.get());
bool result;
if (useJacMatrix)
result = prec->create_preconditioner(mat.get(), jacMat.get());
else
result = prec->create_preconditioner(mat.get());
if (verbosity > 2) {
std::ostringstream out;
@@ -639,21 +655,26 @@ void openclSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
SolverStatus openclSolverBackend<block_size>::solve_system(std::shared_ptr<BlockedMatrix> matrix,
double *b,
std::shared_ptr<BlockedMatrix> jacMatrix,
WellContributions& wellContribs,
BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
initialize(matrix, jacMatrix);
if (analysis_done == false) {
if (!analyze_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
update_system(vals, b, wellContribs);
update_system(matrix->nnzValues, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu();
} else {
update_system(vals, b, wellContribs);
update_system(matrix->nnzValues, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}

View File

@@ -63,12 +63,15 @@ private:
std::vector<cl::Device> devices;
bool useJacMatrix = false;
std::unique_ptr<Preconditioner<block_size> > prec;
// can perform blocked ILU0 and AMG on pressure component
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> mat = nullptr; // original matrix
std::shared_ptr<BlockedMatrix> jacMat = nullptr; // matrix for preconditioner
BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events;
@@ -130,13 +133,9 @@ private:
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// \param[in] matrix matrix A
/// \param[in] jacMatrix matrix for preconditioner
void initialize(std::shared_ptr<BlockedMatrix> matrix, std::shared_ptr<BlockedMatrix> jacMatrix);
/// Clean memory
void finalize();
@@ -189,21 +188,14 @@ public:
~openclSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] vals_prec array of nonzeroes for preconditioner
/// \param[in] rows_prec array of rowPointers for preconditioner
/// \param[in] cols_prec array of columnIndices for preconditioner
/// \param[in] matrix matrix A
/// \param[in] b input vector, contains N values
/// \param[in] jacMatrix matrix for preconditioner
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
/// Data is already on the GPU
@@ -218,7 +210,7 @@ public:
/// \param[in] context the opencl context to be used
/// \param[in] queue the opencl queue to be used
void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
}; // end class openclSolverBackend
} // namespace Accelerator

View File

@@ -35,12 +35,6 @@ using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)

View File

@@ -35,8 +35,6 @@ namespace Opm
class WellContributionsOCL : public WellContributions
{
public:
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
@@ -56,8 +54,6 @@ protected:
cl::Context* context;
cl::CommandQueue* queue;
Opm::Accelerator::stdwell_apply_kernel_type* kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;

View File

@@ -39,9 +39,9 @@ namespace detail
/// \param useWellConn Boolean that is true when UseWellContribusion is true
/// \param wellGraph Cell IDs of well cells stored in a graph.
template<class Grid, class W>
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph)
void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector<std::set<int>>& wellGraph, int numJacobiBlocks)
{
if ( grid.comm().size() > 1)
if ( grid.comm().size() > 1 || numJacobiBlocks > 1)
{
Dune::CartesianIndexMapper< Grid > cartMapper( grid );
const int numCells = cartMapper.compressedSize(); // grid.numCells()