diff --git a/ebos/eclbasevanguard.hh b/ebos/eclbasevanguard.hh index 2b29f9282..30a3c1aa8 100644 --- a/ebos/eclbasevanguard.hh +++ b/ebos/eclbasevanguard.hh @@ -85,6 +85,10 @@ struct EdgeWeightsMethod { using type = UndefinedProperty; }; template +struct NumJacobiBlocks { + using type = UndefinedProperty; +}; +template struct OwnerCellsFirst { using type = UndefinedProperty; }; @@ -133,6 +137,10 @@ struct EdgeWeightsMethod { static constexpr int value = 1; }; template +struct NumJacobiBlocks { + static constexpr int value = 0; +}; +template struct OwnerCellsFirst { static constexpr bool value = true; }; @@ -211,6 +219,8 @@ public: "When restarting: should we try to initialize wells and groups from historical SCHEDULE section."); EWOMS_REGISTER_PARAM(TypeTag, int, EdgeWeightsMethod, "Choose edge-weighing strategy: 0=uniform, 1=trans, 2=log(trans)."); + EWOMS_REGISTER_PARAM(TypeTag, int, NumJacobiBlocks, + "Number of blocks to be created for the Block-Jacobi preconditioner."); EWOMS_REGISTER_PARAM(TypeTag, bool, OwnerCellsFirst, "Order cells owned by rank before ghost/overlap cells."); EWOMS_REGISTER_PARAM(TypeTag, bool, SerialPartitioning, @@ -235,6 +245,7 @@ public: { fileName_ = EWOMS_GET_PARAM(TypeTag, std::string, EclDeckFileName); edgeWeightsMethod_ = Dune::EdgeWeightMethod(EWOMS_GET_PARAM(TypeTag, int, EdgeWeightsMethod)); + numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks); ownersFirst_ = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst); serialPartitioning_ = EWOMS_GET_PARAM(TypeTag, bool, SerialPartitioning); zoltanImbalanceTol_ = EWOMS_GET_PARAM(TypeTag, double, ZoltanImbalanceTol); diff --git a/ebos/eclcpgridvanguard.hh b/ebos/eclcpgridvanguard.hh index c35fed0bd..34b0439c8 100644 --- a/ebos/eclcpgridvanguard.hh +++ b/ebos/eclcpgridvanguard.hh @@ -136,7 +136,7 @@ public: this->serialPartitioning(), this->enableDistributedWells(), this->zoltanImbalanceTol(), this->gridView(), this->schedule(), this->centroids_, - this->eclState(), this->parallelWells_); + this->eclState(), this->parallelWells_, this->numJacobiBlocks()); #endif this->updateGridView_(); diff --git a/ebos/eclgenericcpgridvanguard.cc b/ebos/eclgenericcpgridvanguard.cc index 27025a882..2692c3cb4 100644 --- a/ebos/eclgenericcpgridvanguard.cc +++ b/ebos/eclgenericcpgridvanguard.cc @@ -82,12 +82,13 @@ void EclGenericCpGridVanguard::doLoadBalance_(Dun const Schedule& schedule, std::vector& centroids, EclipseState& eclState1, - EclGenericVanguard::ParallelWellStruct& parallelWells) + EclGenericVanguard::ParallelWellStruct& parallelWells, + int numJacobiBlocks) { int mpiSize = 1; MPI_Comm_size(grid_->comm(), &mpiSize); - if (mpiSize > 1) { + if (mpiSize > 1 || numJacobiBlocks > 1) { // the CpGrid's loadBalance() method likes to have the transmissibilities as // its edge weights. since this is (kind of) a layering violation and // transmissibilities are relatively expensive to compute, we only do it if @@ -131,53 +132,61 @@ void EclGenericCpGridVanguard::doLoadBalance_(Dun } //distribute the grid and switch to the distributed view. - { - const auto wells = schedule.getWellsatEnd(); - - try + if (mpiSize > 1) { { - auto& eclState = dynamic_cast(eclState1); - const EclipseGrid* eclGrid = nullptr; + const auto wells = schedule.getWellsatEnd(); - if (grid_->comm().rank() == 0) + try { - eclGrid = &eclState.getInputGrid(); - } + auto& eclState = dynamic_cast(eclState1); + const EclipseGrid* eclGrid = nullptr; - PropsCentroidsDataHandle handle(*grid_, eclState, eclGrid, centroids, - cartesianIndexMapper()); - if (loadBalancerSet) - { - std::vector parts; if (grid_->comm().rank() == 0) { - parts = (*externalLoadBalancer)(*grid_); + eclGrid = &eclState.getInputGrid(); } - parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1)); - } - else - { - parallelWells = - std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning, - faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol, - enableDistributedWells)); - } - } - catch(const std::bad_cast& e) - { - std::ostringstream message; - message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState (" - << e.what() <<")"<switchToDistributedView(); + PropsCentroidsDataHandle handle(*grid_, eclState, eclGrid, centroids, + cartesianIndexMapper()); + if (loadBalancerSet) + { + std::vector parts; + if (grid_->comm().rank() == 0) + { + parts = (*externalLoadBalancer)(*grid_); + } + parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1)); + } + else + { + parallelWells = + std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning, + faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol, + enableDistributedWells)); + } + } + catch(const std::bad_cast& e) + { + std::ostringstream message; + message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState (" + << e.what() <<")"<switchToDistributedView(); + } + // Calling Schedule::filterConnections would remove any perforated // cells that exist only on other ranks even in the case of distributed wells // But we need all connections to figure out the first cell of a well (e.g. for // pressure). Hence this is now skipped. Rank 0 had everything even before. + + if (numJacobiBlocks > 1 && mpiSize == 1) { + const auto wells = schedule.getWellsatEnd(); + cell_part_.resize(grid_->numCells()); + cell_part_ = grid_->zoltanPartitionWithoutScatter(&wells, faceTrans.data(), numJacobiBlocks, zoltanImbalanceTol); + } } } diff --git a/ebos/eclgenericcpgridvanguard.hh b/ebos/eclgenericcpgridvanguard.hh index 310c2a14c..13a7172e0 100644 --- a/ebos/eclgenericcpgridvanguard.hh +++ b/ebos/eclgenericcpgridvanguard.hh @@ -101,6 +101,10 @@ public: */ const CartesianIndexMapper& equilCartesianIndexMapper() const; + std::vector cellPartition() const + { + return cell_part_; + } protected: /*! * \brief Distribute the simulation grid over multiple processes @@ -114,7 +118,8 @@ protected: const GridView& gridv, const Schedule& schedule, std::vector& centroids, EclipseState& eclState, - EclGenericVanguard::ParallelWellStruct& parallelWells); + EclGenericVanguard::ParallelWellStruct& parallelWells, + int numJacobiBlocks); void distributeFieldProps_(EclipseState& eclState); #endif @@ -137,6 +142,7 @@ protected: std::unique_ptr equilCartesianIndexMapper_; int mpiRank; + std::vector cell_part_; }; } // namespace Opm diff --git a/ebos/eclgenericvanguard.hh b/ebos/eclgenericvanguard.hh index 05b441584..200ecb508 100644 --- a/ebos/eclgenericvanguard.hh +++ b/ebos/eclgenericvanguard.hh @@ -249,6 +249,12 @@ public: Dune::EdgeWeightMethod edgeWeightsMethod() const { return edgeWeightsMethod_; } + /*! + * \brief Number of blocks in the Block-Jacobi preconditioner. + */ + int numJacobiBlocks() const + { return numJacobiBlocks_; } + /*! * \brief Parameter that decide if cells owned by rank are ordered before ghost cells. */ @@ -323,6 +329,7 @@ protected: std::string caseName_; std::string fileName_; Dune::EdgeWeightMethod edgeWeightsMethod_; + int numJacobiBlocks_; bool ownersFirst_; bool serialPartitioning_; double zoltanImbalanceTol_; diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index a77737b87..890fcc4ae 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -161,7 +161,6 @@ namespace Opm // Set it up manually ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout()); detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_); - useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); #if HAVE_FPGA // check usage of MatrixAddWellContributions: for FPGA they must be included @@ -212,6 +211,17 @@ namespace Opm // to the original one with a deleter that does nothing. // Outch! We need to be able to scale the linear system! Hence const_cast matrix_ = const_cast(&M.istlMatrix()); + + // setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver) + numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks); + useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); + if (numJacobiBlocks_ > 1) { + const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd(); + detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_, + wellConnectionsGraph_, numJacobiBlocks_); + std::cout << "Create block-Jacobi pattern" << std::endl; + blockJacobiAdjacency(); + } } else { // Pointers should not change if ( &(M.istlMatrix()) != matrix_ ) { @@ -264,7 +274,7 @@ namespace Opm if (use_gpu || use_fpga) { const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_); - bdaBridge->initWellContributions(*wellContribs); + bdaBridge->initWellContributions(*wellContribs, x.N() * x[0].N()); // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl #if HAVE_CUDA || HAVE_OPENCL @@ -273,8 +283,15 @@ namespace Opm } #endif + if (numJacobiBlocks_ > 1) { + copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_); // Const_cast needed since the CUDA stuff overwrites values for better matrix condition.. - bdaBridge->solve_system(const_cast(&getMatrix()), *rhs_, *wellContribs, result); + bdaBridge->solve_system(const_cast(&getMatrix()), &*blockJacobiForGPUILU0_, + numJacobiBlocks_, *rhs_, *wellContribs, result); + } + else + bdaBridge->solve_system(const_cast(&getMatrix()), const_cast(&getMatrix()), + numJacobiBlocks_, *rhs_, *wellContribs, result); if (result.converged) { // get result vector x from non-Dune backend, iff solve was successful bdaBridge->get_result(x); @@ -497,6 +514,68 @@ namespace Opm } } + /// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid. + /// Do not initialize the values, that is done in copyMatToBlockJac() + void blockJacobiAdjacency() + { + const auto& grid = simulator_.vanguard().grid(); + std::vector cell_part = simulator_.vanguard().cellPartition(); + + typedef typename Matrix::size_type size_type; + typedef typename Matrix::CreateIterator Iter; + size_type numCells = grid.size( 0 ); + blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, getMatrix().nonzeroes(), Matrix::row_wise)); + + const auto& lid = grid.localIdSet(); + const auto& gridView = grid.leafGridView(); + auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows + + //Loop over cells + for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row) + { + const auto& elem = *elemIt; + size_type idx = lid.id(elem); + row.insert(idx); + + // Add well non-zero connections + for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) { + row.insert(*wc); + } + + int locPart = cell_part[idx]; + + //Add neighbor if it is on the same part + auto isend = gridView.iend(elem); + for (auto is = gridView.ibegin(elem); is!=isend; ++is) + { + //check if face has neighbor + if (is->neighbor()) + { + size_type nid = lid.id(is->outside()); + int nabPart = cell_part[nid]; + if (locPart == nabPart) { + row.insert(nid); + } + } + } + } + } + + void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac) + { + auto rbegin = blockJac.begin(); + auto rend = blockJac.end(); + auto outerRow = mat.begin(); + for (auto row = rbegin; row != rend; ++row, ++outerRow) { + auto outerCol = (*outerRow).begin(); + for (auto col = (*row).begin(); col != (*row).end(); ++col) { + // outerRow is guaranteed to have all column entries that row has! + while(outerCol.index() < col.index()) ++outerCol; + assert(outerCol.index() == col.index()); + *col = *outerCol; // copy nonzero block + } + } + } Matrix& getMatrix() { @@ -517,6 +596,8 @@ namespace Opm Matrix* matrix_; Vector *rhs_; + std::unique_ptr blockJacobiForGPUILU0_; + std::unique_ptr flexibleSolver_; std::unique_ptr linearOperatorForFlexibleSolver_; std::unique_ptr> wellOperator_; @@ -530,6 +611,7 @@ namespace Opm FlowLinearSolverParameters parameters_; PropertyTree prm_; bool scale_variables_; + int numJacobiBlocks_; std::shared_ptr< CommunicationType > comm_; }; // end ISTLSolver diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index ee7c6e7c8..822561773 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -191,23 +191,43 @@ void BdaBridge::copySparsityPatternFromI } -template -void BdaBridge::solve_system([[maybe_unused]] BridgeMatrix* mat, - [[maybe_unused]] BridgeVector& b, - [[maybe_unused]] WellContributions& wellContribs, - [[maybe_unused]] InverseOperatorResult& res) -{ +// check if the nnz values of the matrix are in contiguous memory +// this is done by checking if the distance between the last value of the last block of row 0 and +// the first value of the first row of row 1 is equal to 1 +// if the matrix only has 1 row, it is always contiguous +template +void checkMemoryContiguous(const BridgeMatrix& mat) { + auto block_size = mat[0][0].N(); + auto row = mat.begin(); + auto last_of_row0 = row->begin(); + // last_of_row0 points to last block, not to row->end() + for(auto tmp = row->begin(); tmp != row->end(); ++tmp) { + last_of_row0 = tmp; + } + + bool isContiguous = mat.N() < 2 || std::distance(&((*last_of_row0)[block_size-1][block_size-1]), &(*mat[1].begin())[0][0]) == 1; + + if (!isContiguous) { + OPM_THROW(std::logic_error, "Error memory of Matrix looks not contiguous"); + } +} + + +template +void BdaBridge::solve_system(BridgeMatrix* bridgeMat, + BridgeMatrix* jacMat, + int numJacobiBlocks, + BridgeVector& b, + WellContributions& wellContribs, + InverseOperatorResult& res) +{ if (use_gpu || use_fpga) { BdaResult result; result.converged = false; - static std::vector h_rows; - static std::vector h_cols; - const int dim = (*mat)[0][0].N(); - const int Nb = mat->N(); - const int N = Nb * dim; - const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back(); - const int nnz = nnzb * dim * dim; + const int dim = (*bridgeMat)[0][0].N(); + const int Nb = bridgeMat->N(); + const int nnzb = bridgeMat->nonzeroes(); if (dim != 3) { OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); @@ -215,26 +235,47 @@ void BdaBridge::solve_system([[maybe_unu return; } - if (h_rows.capacity() == 0) { + if (!matrix) { h_rows.reserve(Nb+1); h_cols.reserve(nnzb); - copySparsityPatternFromISTL(*mat, h_rows, h_cols); + copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols); + checkMemoryContiguous(*bridgeMat); + matrix = std::make_unique(Nb, nnzb, block_size, static_cast(&(((*bridgeMat)[0][0][0][0]))), h_cols.data(), h_rows.data()); } Dune::Timer t_zeros; - int numZeros = checkZeroDiagonal(*mat); + int numZeros = checkZeroDiagonal(*bridgeMat); if (verbosity >= 2) { std::ostringstream out; out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros"; OpmLog::info(out.str()); } + if (numJacobiBlocks >= 2) { + const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back(); + + if (!jacMatrix) { + h_jacRows.reserve(Nb+1); + h_jacCols.reserve(jacNnzb); + copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols); + checkMemoryContiguous(*jacMat); + jacMatrix = std::make_unique(Nb, jacNnzb, block_size, static_cast(&(((*jacMat)[0][0][0][0]))), h_jacCols.data(), h_jacRows.data()); + } + + Dune::Timer t_zeros2; + int jacNumZeros = checkZeroDiagonal(*jacMat); + if (verbosity >= 2) { + std::ostringstream out; + out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros"; + OpmLog::info(out.str()); + } + } ///////////////////////// // actually solve + // assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour + SolverStatus status = backend->solve_system(matrix, static_cast(&(b[0][0])), jacMatrix, wellContribs, result); - // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour - SolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { case SolverStatus::BDA_SOLVER_SUCCESS: //OpmLog::info("BdaSolver converged"); @@ -268,7 +309,8 @@ void BdaBridge::get_result([[maybe_unuse } template -void BdaBridge::initWellContributions([[maybe_unused]] WellContributions& wellContribs) { +void BdaBridge::initWellContributions([[maybe_unused]] WellContributions& wellContribs, + [[maybe_unused]] unsigned N) { if(accelerator_mode.compare("opencl") == 0){ #if HAVE_OPENCL const auto openclBackend = static_cast*>(backend.get()); @@ -277,6 +319,7 @@ void BdaBridge::initWellContributions([[ OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); #endif } + wellContribs.setVectorSize(N); } // the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 3d24051bb..4e79b9eae 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -23,6 +23,7 @@ #include "dune/istl/solver.hh" // for struct InverseOperatorResult #include +#include #include namespace Opm @@ -43,6 +44,10 @@ private: bool use_fpga = false; std::string accelerator_mode; std::unique_ptr > backend; + std::shared_ptr matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes + std::shared_ptr jacMatrix; // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes + std::vector h_rows, h_cols; // store the sparsity pattern of the matrix + std::vector h_jacRows, h_jacCols; // store the sparsity pattern of the jacMatrix public: /// Construct a BdaBridge @@ -61,11 +66,13 @@ public: /// Solve linear system, A*x = b /// \warning Values of A might get overwritten! - /// \param[in] mat matrix A, should be of type Dune::BCRSMatrix - /// \param[in] b vector b, should be of type Dune::BlockVector - /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A - /// \param[inout] result summary of solver result - void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); + /// \param[in] bridgeMat matrix A, should be of type Dune::BCRSMatrix + /// \param[in] jacMat matrix A, but modified for the preconditioner, should be of type Dune::BCRSMatrix + /// \param[in] numJacobiBlocks number of jacobi blocks in jacMat + /// \param[in] b vector b, should be of type Dune::BlockVector + /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] result summary of solver result + void solve_system(BridgeMatrix *bridgeMat, BridgeMatrix *jacMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); /// Get the resulting x vector /// \param[inout] x vector x, should be of type Dune::BlockVector @@ -86,7 +93,8 @@ public: /// Initialize the WellContributions object with opencl context and queue /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL /// \param[in] wellContribs container to hold all WellContributions - void initWellContributions(WellContributions& wellContribs); + /// \param[in] N number of rows in scalar vector that wellContribs will be applied on + void initWellContributions(WellContributions& wellContribs, unsigned N); /// Return whether the BdaBridge will use the FPGA or not /// return whether the BdaBridge will use the FPGA or not diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index d84652d28..dd5b97094 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -22,6 +22,7 @@ #include +#include #include @@ -85,9 +86,8 @@ namespace Accelerator { virtual ~BdaSolver() {}; /// Define as pure virtual functions, so derivedclass must implement them - virtual SolverStatus solve_system(int N, int nnz, int dim, - double *vals, int *rows, int *cols, - double *b, WellContributions& wellContribs, BdaResult &res) = 0; + virtual SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) = 0; virtual void get_result(double *x) = 0; diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp index 5fbb73e19..c5b2074e7 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.cpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp @@ -37,14 +37,12 @@ namespace Accelerator using Opm::OpmLog; -/*Sort a row of matrix elements from a blocked CSR-format.*/ -void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size) { - const unsigned int bs = block_size; + +void sortRow(int *colIndices, int *data, int left, int right) { int l = left; int r = right; int middle = colIndices[(l + r) >> 1]; - double lDatum[bs * bs]; do { while (colIndices[l] < middle) l++; @@ -54,9 +52,9 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned int lColIndex = colIndices[l]; colIndices[l] = colIndices[r]; colIndices[r] = lColIndex; - memcpy(lDatum, data + l * bs * bs, sizeof(double) * bs * bs); - memcpy(data + l * bs * bs, data + r * bs * bs, sizeof(double) * bs * bs); - memcpy(data + r * bs * bs, lDatum, sizeof(double) * bs * bs); + int tmp = data[l]; + data[l] = data[r]; + data[r] = tmp; l++; r--; @@ -64,10 +62,10 @@ void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned } while (l < r); if (left < r) - sortBlockedRow(colIndices, data, left, r, bs); + sortRow(colIndices, data, left, r); if (right > l) - sortBlockedRow(colIndices, data, l, right, bs); + sortRow(colIndices, data, l, right); } diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp index eab5c06ee..ba120e523 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.hpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp @@ -127,13 +127,13 @@ public: }; -/// Sort a row of matrix elements from a blocked CSR-format -/// \param[inout] colIndices -/// \param[inout] data +/// Sort a row of matrix elements from a CSR-format, where the nonzeroes are ints +/// These ints aren't actually nonzeroes, but represent a mapping used later +/// \param[inout] colIndices represent keys in sorting +/// \param[inout] data sorted according to the colIndices /// \param[in] left lower index of data of row /// \param[in] right upper index of data of row -/// \param[in] block_size size of blocks in the row -void sortBlockedRow(int *colIndices, double *data, int left, int right, unsigned block_size); +void sortRow(int *colIndices, int *data, int left, int right); /// Multiply and subtract blocks /// a = a - (b * c) diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp index 9087d4074..875b96676 100644 --- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp +++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp @@ -225,16 +225,20 @@ void FpgaSolverBackend::get_result(double *x_) template -SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) +SolverStatus FpgaSolverBackend::solve_system(std::shared_ptr matrix, + double *b, + [[maybe_unused]] std::shared_ptr jacMatrix, + WellContributions&, + BdaResult &res) { if (initialized == false) { - initialize(N_, nnz_, dim, vals, rows, cols); + initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices); if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } perf_call.emplace_back(); - update_system(vals, b); + update_system(matrix->nnzValues, b); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } @@ -252,21 +256,21 @@ SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int d template -void FpgaSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) +void FpgaSolverBackend::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols) { double start = second(); - this->N = N_; - this->nnz = nnz_; - this->nnzb = nnz_ / block_size / block_size; - Nb = (N + dim - 1) / dim; + this->Nb = Nb_; + this->N = Nb * block_size; + this->nnzb = nnzbs; + this->nnz = nnzbs * block_size * block_size; // allocate host memory for matrices and vectors // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows)); std::ostringstream oss; - oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \ - "block size: " << dim << ", total nnz: " << this->nnz << "\n"; + oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \ + "block size: " << block_size << ", total nnz: " << this->nnz << "\n"; oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; OpmLog::info(oss.str()); diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp index 4d1e766f3..05e7c6d33 100644 --- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp +++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp @@ -201,13 +201,12 @@ private: unsigned short rst_settle_cycles = 0; /// Allocate host memory - /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block + /// \param[in] Nb number of blockrows + /// \param[in] nnzbs number of blocks /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] rows array of rowPointers, contains N/dim+1 values /// \param[in] cols array of columnIndices, contains nnz values - void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); + void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols); /// Reorder the linear system so it corresponds with the coloring /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values @@ -243,18 +242,15 @@ public: ~FpgaSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values + /// \param[in] matrix matrix A /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner /// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A /// \param[inout] res summary of solver result /// \return status code - SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; - + SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; + /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp index 3711665b3..d46d2f303 100644 --- a/opm/simulators/linalg/bda/Reorder.cpp +++ b/opm/simulators/linalg/bda/Reorder.cpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace { @@ -176,39 +177,58 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi /* Reorder a matrix by a specified input order. * Both a to order array, which contains for every node from the old matrix where it will move in the new matrix, - * and the from order, which contains for every node in the new matrix where it came from in the old matrix.*/ -void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat) { - assert(mat->block_size == rmat->block_size); + * and the from order, which contains for every node in the new matrix where it came from in the old matrix. + * reordermapping_nonzeroes is filled with increasing indices, and reordered using the translated colIndices as keys, + * this means the resulting reordermapping_nonzeroes array contains the mapping + */ +void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat){ - const unsigned int bs = mat->block_size; int rIndex = 0; - int i, k; - unsigned int j; + std::vector tmp(mat->nnzbs); + + reordermapping_nonzeroes.resize(mat->nnzbs); + for(int i = 0; i < mat->nnzbs; ++i){ + reordermapping_nonzeroes[i] = i; + } rmat->rowPointers[0] = 0; - for (i = 0; i < mat->Nb; i++) { + for(int i = 0; i < mat->Nb; i++){ int thisRow = fromOrder[i]; // put thisRow from the old matrix into row i of the new matrix - rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow]; - for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) { - for (j = 0; j < bs * bs; j++) { - rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j]; - } + rmat->rowPointers[i+1] = rmat->rowPointers[i] + mat->rowPointers[thisRow+1] - mat->rowPointers[thisRow]; + for(int k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow+1]; k++){ + tmp[rIndex] = reordermapping_nonzeroes[k]; // only get 1 entry per block rmat->colIndices[rIndex] = mat->colIndices[k]; rIndex++; } } // re-assign column indices according to the new positions of the nodes referenced by the column indices - for (i = 0; i < mat->nnzbs; i++) { + for(int i = 0; i < mat->nnzbs; i++){ rmat->colIndices[i] = toOrder[rmat->colIndices[i]]; } // re-sort the column indices of every row. - for (i = 0; i < mat->Nb; i++) { - sortBlockedRow(rmat->colIndices, rmat->nnzValues, rmat->rowPointers[i], rmat->rowPointers[i + 1] - 1, bs); + for(int i = 0; i < mat->Nb; i++){ + sortRow(rmat->colIndices, tmp.data(), rmat->rowPointers[i], rmat->rowPointers[i+1]-1); + } + for(int i = 0; i < mat->nnzbs; i++){ + reordermapping_nonzeroes[i] = tmp[i]; + } + // std::copy(); +} + +/* Reorder an array of nonzero blocks into another array, using a mapping */ +void reorderNonzeroes(BlockedMatrix *mat, std::vector& reordermapping_nonzeroes, BlockedMatrix *rmat){ + assert(mat->block_size == rmat->block_size); + + const unsigned int bs = mat->block_size; + + for(int i = 0; i < mat->nnzbs; i++){ + int old_idx = reordermapping_nonzeroes[i]; + memcpy(rmat->nnzValues+i*bs*bs, mat->nnzValues+old_idx*bs*bs, sizeof(double)*bs*bs); // copy nnz block } } -/* Reorder a matrix according to the colors that every node of the matrix has received*/ +/* Find a reorder mapping according to the colors that every node of the matrix has received */ void colorsToReordering(int Nb, std::vector& colors, int numColors, int *toOrder, int *fromOrder, std::vector& rowsPerColor) { int reordered = 0; diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp index 7d90a67d0..fd02e8b4f 100644 --- a/opm/simulators/linalg/bda/Reorder.hpp +++ b/opm/simulators/linalg/bda/Reorder.hpp @@ -47,13 +47,22 @@ namespace Accelerator template int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndices, const int *CSCColPointers, const int *CSCRowIndices, std::vector& colors, int maxRowsPerColor, int maxColsPerColor); -/// Reorder the rows of the matrix according to the mapping in toOrder and fromOrder -/// rMat must be allocated already -/// \param[in] mat matrix to be reordered -/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved -/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved -/// \param[inout] rMat reordered Matrix -void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, int *fromOrder, BlockedMatrix *rmat); +/// Reorder the sparsity pattern of the matrix according to the mapping in toOrder and fromOrder +/// Also find mapping for nnz blocks +/// rmat must be allocated already +/// \param[in] mat matrix to be reordered +/// \param[out] reordermapping_nonzeroes contains new index for every nnz block +/// \param[in] toOrder reorder pattern that lists for each index in the original order, to which index in the new order it should be moved +/// \param[in] fromOrder reorder pattern that lists for each index in the new order, from which index in the original order it was moved +/// \param[out] rmat reordered Matrix +void reorderBlockedMatrixByPattern(BlockedMatrix *mat, std::vector& reordermapping_nonzeroes, int *toOrder, int *fromOrder, BlockedMatrix *rmat); + +/// Write nnz blocks from mat to rmat, according to the mapping in reordermapping_nonzeroes +/// rmat must be allocated already +/// \param[in] mat matrix to be reordered +/// \param[in] reordermapping_nonzeroes contains old index for every nnz block, so rmat_nnz[i] == mat_nnz[mapping[i]] +/// \param[inout] rmat reordered Matrix +void reorderNonzeroes(BlockedMatrix *mat, std::vector& reordermapping_nonzeroes, BlockedMatrix *rmat); /// Compute reorder mapping from the color that each node has received /// The toOrder, fromOrder and iters arrays must be allocated already diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index b4b170bd0..c92067775 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -101,6 +101,10 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_) } } +void WellContributions::setVectorSize(unsigned N_) { + N = N_; +} + void WellContributions::addNumBlocks(unsigned int numBlocks) { if (allocated) { diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index d78c02111..619ce2466 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -101,6 +101,10 @@ public: /// \param[in] dim_wells number of rows void setBlockSize(unsigned int dim, unsigned int dim_wells); + /// Set size of vector that the wells are applied to + /// \param[in] N size of vector + void setVectorSize(unsigned N); + /// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called /// \param[in] type indicate if C, D or B is sent /// \param[in] colIndices columnindices of blocks in C or B, ignored for D diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp index 957575fbf..58046abb3 100644 --- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp @@ -52,14 +52,14 @@ amgclSolverBackend::~amgclSolverBackend() {} template -void amgclSolverBackend::initialize(int N_, int nnz_, int dim) { - this->N = N_; - this->nnz = nnz_; - this->nnzb = nnz_ / block_size / block_size; +void amgclSolverBackend::initialize(int Nb_, int nnzbs) { + this->Nb = Nb_; + this->N = Nb * block_size; + this->nnzb = nnzbs; + this->nnz = nnzbs * block_size * block_size; - Nb = (N + dim - 1) / dim; std::ostringstream out; - out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n"; + out << "Initializing amgclSolverBackend, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << " blocks\n"; out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; out << "DeviceID: " << deviceID << "\n"; OpmLog::info(out.str()); @@ -360,12 +360,17 @@ void amgclSolverBackend::get_result(double *x_) { template -SolverStatus amgclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) { +SolverStatus amgclSolverBackend::solve_system(std::shared_ptr matrix, + double *b, + [[maybe_unused]] std::shared_ptr jacMatrix, + [[maybe_unused]] WellContributions& wellContribs, + BdaResult &res) +{ if (initialized == false) { - initialize(N_, nnz_, dim); - convert_sparsity_pattern(rows, cols); + initialize(matrix->Nb, matrix->nnzbs); + convert_sparsity_pattern(matrix->rowPointers, matrix->colIndices); } - convert_data(vals, rows); + convert_data(matrix->nnzValues, matrix->rowPointers); solve_system(b, res); return SolverStatus::BDA_SOLVER_SUCCESS; } diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp index 930f81ff9..8896ea65e 100644 --- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp @@ -96,10 +96,9 @@ private: std::once_flag vexcl_initialize; #endif /// Initialize host memory and determine amgcl parameters - /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - void initialize(int N, int nnz, int dim); + /// \param[in] Nb number of blockrows + /// \param[in] nnzbs number of blocks + void initialize(int Nb, int nnzbs); /// Convert the BCSR sparsity pattern to a CSR one /// \param[in] rows array of rowPointers, contains N/dim+1 values @@ -129,22 +128,15 @@ public: ~amgclSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - /// \param[in] vals_prec array of nonzeroes for preconditioner - /// \param[in] rows_prec array of rowPointers for preconditioner - /// \param[in] cols_prec array of columnIndices for preconditioner + /// \param[in] matrix matrix A /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; - + SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; + /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu index 46f345784..f6728cebb 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu @@ -104,10 +104,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellCon // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ - operation, Nb, nnzb, &one, \ + operation, Nb, nnzbs_prec, &one, \ descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_p, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ - operation, Nb, nnzb, &one, \ + operation, Nb, nnzbs_prec, &one, \ descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_pw, policy, d_buffer); // spmv @@ -135,10 +135,10 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellCon // apply ilu0 cusparseDbsrsv2_solve(cusparseHandle, order, \ - operation, Nb, nnzb, &one, \ + operation, Nb, nnzbs_prec, &one, \ descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, d_r, d_t, policy, d_buffer); cusparseDbsrsv2_solve(cusparseHandle, order, \ - operation, Nb, nnzb, &one, \ + operation, Nb, nnzbs_prec, &one, \ descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, d_t, d_s, policy, d_buffer); // spmv @@ -188,17 +188,25 @@ void cusparseSolverBackend::gpu_pbicgstab(WellContributions& wellCon template -void cusparseSolverBackend::initialize(int N, int nnz, int dim) { - this->N = N; - this->nnz = nnz; - this->nnzb = nnz / block_size / block_size; - Nb = (N + dim - 1) / dim; +void cusparseSolverBackend::initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix) { + this->Nb = matrix->Nb; + this->N = Nb * block_size; + this->nnzb = matrix->nnzbs; + this->nnz = nnzb * block_size * block_size; + + if (jacMatrix) { + useJacMatrix = true; + nnzbs_prec = jacMatrix->nnzbs; + } else { + nnzbs_prec = nnzb; + } + std::ostringstream out; - out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks"; - OpmLog::info(out.str()); - out.str(""); - out.clear(); - out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; + out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnz: " << nnzb << " blocks\n"; + if (useJacMatrix) { + out << "Blocks in ILU matrix: " << nnzbs_prec << "\n"; + } + out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; OpmLog::info(out.str()); cudaSetDevice(deviceID); @@ -232,7 +240,15 @@ void cusparseSolverBackend::initialize(int N, int nnz, int dim) { cudaMalloc((void**)&d_bVals, sizeof(double) * nnz); cudaMalloc((void**)&d_bCols, sizeof(int) * nnzb); cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb + 1)); - cudaMalloc((void**)&d_mVals, sizeof(double) * nnz); + if (useJacMatrix) { + cudaMalloc((void**)&d_mVals, sizeof(double) * nnzbs_prec * block_size * block_size); + cudaMalloc((void**)&d_mCols, sizeof(int) * nnzbs_prec); + cudaMalloc((void**)&d_mRows, sizeof(int) * (Nb + 1)); + } else { + cudaMalloc((void**)&d_mVals, sizeof(double) * nnz); + d_mCols = d_bCols; + d_mRows = d_bRows; + } cudaCheckLastError("Could not allocate enough memory on GPU"); cublasSetStream(cublasHandle, stream); @@ -261,6 +277,10 @@ void cusparseSolverBackend::finalize() { cudaFree(d_t); cudaFree(d_v); cudaFree(d_mVals); + if (useJacMatrix) { + cudaFree(d_mCols); + cudaFree(d_mRows); + } cudaFree(d_bVals); cudaFree(d_bCols); cudaFree(d_bRows); @@ -283,23 +303,32 @@ void cusparseSolverBackend::finalize() { template -void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) { +void cusparseSolverBackend::copy_system_to_gpu(std::shared_ptr matrix, double *b, std::shared_ptr jacMatrix) { Timer t; #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { - int size_row = rows[i + 1] - rows[i]; - memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i]; + memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size); sum += size_row * block_size * block_size; } cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); #else - cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + if (useJacMatrix) { + cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream); + } else { + cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); + } #endif - cudaMemcpyAsync(d_bCols, cols, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(d_bRows, rows, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bCols, matrix->colIndices, nnzb * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bRows, matrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream); + if (useJacMatrix) { + cudaMemcpyAsync(d_mCols, jacMatrix->colIndices, nnzbs_prec * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_mRows, jacMatrix->rowPointers, (Nb + 1) * sizeof(int), cudaMemcpyHostToDevice, stream); + } cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream); @@ -314,19 +343,24 @@ void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *ro // don't copy rowpointers and colindices, they stay the same template -void cusparseSolverBackend::update_system_on_gpu(double *vals, int *rows, double *b) { +void cusparseSolverBackend::update_system_on_gpu(std::shared_ptr matrix, double *b, std::shared_ptr jacMatrix) { Timer t; #if COPY_ROW_BY_ROW int sum = 0; for (int i = 0; i < Nb; ++i) { - int size_row = rows[i + 1] - rows[i]; - memcpy(vals_contiguous + sum, vals + sum, size_row * sizeof(double) * block_size * block_size); + int size_row = matrix->rowPointers[i + 1] - matrix->rowPointers[i]; + memcpy(vals_contiguous + sum, matrix->nnzValues + sum, size_row * sizeof(double) * block_size * block_size); sum += size_row * block_size * block_size; } cudaMemcpyAsync(d_bVals, vals_contiguous, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); #else - cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(d_bVals, matrix->nnzValues, nnz * sizeof(double), cudaMemcpyHostToDevice, stream); + if (useJacMatrix) { + cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues, nnzbs_prec * block_size * block_size * sizeof(double), cudaMemcpyHostToDevice, stream); + } else { + cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); + } #endif cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream); @@ -341,12 +375,6 @@ void cusparseSolverBackend::update_system_on_gpu(double *vals, int * } // end update_system_on_gpu() -template -void cusparseSolverBackend::reset_prec_on_gpu() { - cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream); -} - - template bool cusparseSolverBackend::analyse_matrix() { @@ -380,12 +408,12 @@ bool cusparseSolverBackend::analyse_matrix() { cusparseCreateBsrsv2Info(&info_U); cudaCheckLastError("Could not create analysis info"); - cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb, - descr_M, d_bVals, d_bRows, d_bCols, block_size, info_M, &d_bufferSize_M); - cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_L, d_bVals, d_bRows, d_bCols, block_size, info_L, &d_bufferSize_L); - cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb, - descr_U, d_bVals, d_bRows, d_bCols, block_size, info_U, &d_bufferSize_U); + cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzbs_prec, + descr_M, d_mVals, d_mRows, d_mCols, block_size, info_M, &d_bufferSize_M); + cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec, + descr_L, d_mVals, d_mRows, d_mCols, block_size, info_L, &d_bufferSize_L); + cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzbs_prec, + descr_U, d_mVals, d_mRows, d_mCols, block_size, info_U, &d_bufferSize_U); cudaCheckLastError(); d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U)); @@ -393,7 +421,7 @@ bool cusparseSolverBackend::analyse_matrix() { // analysis of ilu LU decomposition cusparseDbsrilu02_analysis(cusparseHandle, order, \ - Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \ + Nb, nnzbs_prec, descr_B, d_mVals, d_mRows, d_mCols, \ block_size, info_M, policy, d_buffer); int structural_zero; @@ -404,11 +432,11 @@ bool cusparseSolverBackend::analyse_matrix() { // analysis of ilu apply cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ - Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \ + Nb, nnzbs_prec, descr_L, d_mVals, d_mRows, d_mCols, \ block_size, info_L, policy, d_buffer); cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \ - Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \ + Nb, nnzbs_prec, descr_U, d_mVals, d_mRows, d_mCols, \ block_size, info_U, policy, d_buffer); cudaCheckLastError("Could not analyse level information"); @@ -428,10 +456,8 @@ template bool cusparseSolverBackend::create_preconditioner() { Timer t; - d_mCols = d_bCols; - d_mRows = d_bRows; cusparseDbsrilu02(cusparseHandle, order, \ - Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \ + Nb, nnzbs_prec, descr_M, d_mVals, d_mRows, d_mCols, \ block_size, info_M, policy, d_buffer); cudaCheckLastError("Could not perform ilu decomposition"); @@ -480,19 +506,23 @@ void cusparseSolverBackend::get_result(double *x) { template -SolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { +SolverStatus cusparseSolverBackend::solve_system(std::shared_ptr matrix, + double *b, + std::shared_ptr jacMatrix, + WellContributions& wellContribs, + BdaResult &res) +{ if (initialized == false) { - initialize(N, nnz, dim); - copy_system_to_gpu(vals, rows, cols, b); + initialize(matrix, jacMatrix); + copy_system_to_gpu(matrix, b, jacMatrix); } else { - update_system_on_gpu(vals, rows, b); + update_system_on_gpu(matrix, b, jacMatrix); } if (analysis_done == false) { if (!analyse_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } - reset_prec_on_gpu(); if (create_preconditioner()) { solve_system(wellContribs, res); } else { diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp index 791bf32fc..54a99f608 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp @@ -68,6 +68,9 @@ private: bool analysis_done = false; + bool useJacMatrix = false; + int nnzbs_prec; // number of nonzero blocks in the matrix for preconditioner + // could be jacMatrix or matrix /// Solve linear system using ilu0-bicgstab /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A @@ -75,29 +78,26 @@ private: void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory - /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - void initialize(int N, int nnz, int dim); + /// \param[in] matrix matrix for spmv + /// \param[in] jacMatrix matrix for preconditioner + void initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix); /// Clean memory void finalize(); /// Copy linear system to GPU - /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnzb values - /// \param[in] b input vector, contains N values - void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b); + /// also copy matrix for preconditioner if needed + /// \param[in] matrix matrix for spmv + /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner + void copy_system_to_gpu(std::shared_ptr matrix, double *b, std::shared_ptr jacMatrix); - // Update linear system on GPU, don't copy rowpointers and colindices, they stay the same - /// \param[in] vals array of nonzeroes, each block is stored row-wise, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values, only used if COPY_ROW_BY_ROW is true - /// \param[in] b input vector, contains N values - void update_system_on_gpu(double *vals, int *rows, double *b); - - /// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse - void reset_prec_on_gpu(); + /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same + /// also copy matrix for preconditioner if needed + /// \param[in] matrix matrix for spmv + /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner + void update_system_on_gpu(std::shared_ptr matrix, double *b, std::shared_ptr jacMatrix); /// Analyse sparsity pattern to extract parallelism /// \return true iff analysis was successful @@ -126,18 +126,15 @@ public: ~cusparseSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values + /// \param[in] matrix matrix A /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; - + SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; + /// Get resulting vector x after linear solve, also includes post processing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index 0c2bc6364..b63f5959b 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -51,6 +51,13 @@ BILU0::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) : template bool BILU0::analyze_matrix(BlockedMatrix *mat) +{ + return analyze_matrix(mat, nullptr); +} + + +template +bool BILU0::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) { const unsigned int bs = block_size; @@ -62,19 +69,27 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) std::vector CSCRowIndices; std::vector CSCColPointers; + auto *matToDecompose = jacMat ? jacMat : mat; // decompose jacMat if valid, otherwise decompose mat + if (opencl_ilu_reorder == ILUReorder::NONE) { LUmat = std::make_unique(*mat); } else { toOrder.resize(Nb); fromOrder.resize(Nb); - CSCRowIndices.resize(nnzb); + CSCRowIndices.resize(matToDecompose->nnzbs); CSCColPointers.resize(Nb + 1); rmat = std::make_shared(mat->Nb, mat->nnzbs, block_size); - LUmat = std::make_unique(*rmat); + + if (jacMat) { + rJacMat = std::make_shared(jacMat->Nb, jacMat->nnzbs, block_size); + LUmat = std::make_unique(*rJacMat); + } else { + LUmat = std::make_unique(*rmat); + } Timer t_convert; - csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb); - if (verbosity >= 3) { + csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb); + if(verbosity >= 3){ std::ostringstream out; out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s"; OpmLog::info(out.str()); @@ -85,24 +100,33 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) std::ostringstream out; if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { out << "BILU0 reordering strategy: " << "level_scheduling\n"; - findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get()); + if (jacMat) { + reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get()); + } } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { out << "BILU0 reordering strategy: " << "graph_coloring\n"; - findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + findGraphColoring(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get()); + if (jacMat) { + reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get()); + } } else if (opencl_ilu_reorder == ILUReorder::NONE) { out << "BILU0 reordering strategy: none\n"; // numColors = 1; // rowsPerColor.emplace_back(Nb); numColors = Nb; - for (int i = 0; i < Nb; ++i) { + for(int i = 0; i < Nb; ++i){ rowsPerColor.emplace_back(1); } } else { OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); } - if (verbosity >= 1) { + if(verbosity >= 1){ out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n"; } + #if CHOW_PATEL out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU; #endif @@ -112,8 +136,8 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) invDiagVals.resize(mat->Nb * bs * bs); #if CHOW_PATEL - Lmat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs); - Umat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, bs); + Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); + Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); #endif s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); @@ -137,7 +161,7 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 for (int i = 0; i < numColors; ++i) { - rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i]; + rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; } err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]); @@ -149,21 +173,38 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) } return true; -} // end init() +} + template bool BILU0::create_preconditioner(BlockedMatrix *mat) +{ + return create_preconditioner(mat, nullptr); +} + + +template +bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) { const unsigned int bs = block_size; - auto *m = mat; - if (opencl_ilu_reorder != ILUReorder::NONE) { - m = rmat.get(); + auto *matToDecompose = jacMat; + + if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug + matToDecompose = jacMat ? jacMat : mat; + } else { Timer t_reorder; - reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get()); + if (jacMat) { + matToDecompose = rJacMat.get(); + reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get()); + reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get()); + } else { + matToDecompose = rmat.get(); + reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get()); + } - if (verbosity >= 3) { + if (verbosity >= 3){ std::ostringstream out; out << "BILU0 reorder matrix: " << t_reorder.stop() << " s"; OpmLog::info(out.str()); @@ -173,18 +214,18 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat) // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy Timer t_copy; - memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs); + memcpy(LUmat->nnzValues, matToDecompose->nnzValues, sizeof(double) * bs * bs * matToDecompose->nnzbs); - if (verbosity >= 3) { + if (verbosity >= 3){ std::ostringstream out; out << "BILU0 memcpy: " << t_copy.stop() << " s"; OpmLog::info(out.str()); } #if CHOW_PATEL - chowPatelIlu.decomposition(queue.get(), context.get(), + chowPatelIlu.decomposition(queue, context, LUmat.get(), Lmat.get(), Umat.get(), - invDiagVals.data(), diagIndex, + invDiagVals, diagIndex, s.diagIndex, s.invDiagVals, s.Lvals, s.Lcols, s.Lrows, s.Uvals, s.Ucols, s.Urows); @@ -192,23 +233,23 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat) Timer t_copyToGpu; events.resize(1); - err = queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); + queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); - std::call_once(pattern_uploaded, [&]() { + std::call_once(pattern_uploaded, [&](){ // find the positions of each diagonal block // must be done after reordering for (int row = 0; row < Nb; ++row) { int rowStart = LUmat->rowPointers[row]; - int rowEnd = LUmat->rowPointers[row + 1]; + int rowEnd = LUmat->rowPointers[row+1]; auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); assert(candidate != LUmat->colIndices + rowEnd); diagIndex[row] = candidate - LUmat->colIndices; } events.resize(4); - err |= queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); - err |= queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); - err |= queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); + queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); + queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); }); cl::WaitForEvents(events); @@ -229,8 +270,8 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat) cl::Event event; for (int color = 0; color < numColors; ++color) { const unsigned int firstRow = rowsPerColorPrefix[color]; - const unsigned int lastRow = rowsPerColorPrefix[color + 1]; - if (verbosity >= 4) { + const unsigned int lastRow = rowsPerColorPrefix[color+1]; + if (verbosity >= 5) { out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; } OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size); @@ -245,6 +286,7 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat) return true; } // end create_preconditioner() + // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed // however, if individual kernel calls are timed, waiting for events is needed // behavior on other GPUs is untested diff --git a/opm/simulators/linalg/bda/opencl/BILU0.hpp b/opm/simulators/linalg/bda/opencl/BILU0.hpp index b3b9ce125..c7d217eb0 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.hpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp @@ -55,6 +55,7 @@ class BILU0 : public Preconditioner private: std::unique_ptr LUmat = nullptr; std::shared_ptr rmat = nullptr; // only used with PAR_SIM + std::shared_ptr rJacMat = nullptr; #if CHOW_PATEL std::unique_ptr Lmat = nullptr, Umat = nullptr; #endif @@ -68,6 +69,9 @@ private: ILUReorder opencl_ilu_reorder; + std::vector reordermappingNonzeroes; // maps nonzero blocks to new location in reordered matrix + std::vector jacReordermappingNonzeroes; // same but for jacMatrix + typedef struct { cl::Buffer invDiagVals; cl::Buffer diagIndex; @@ -92,9 +96,11 @@ public: // analysis, find reordering if specified bool analyze_matrix(BlockedMatrix *mat) override; + bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; // ilu_decomposition bool create_preconditioner(BlockedMatrix *mat) override; + bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override; // apply preconditioner, x = prec(y) void apply(const cl::Buffer& y, cl::Buffer& x) override; @@ -114,6 +120,11 @@ public: return rmat.get(); } + BlockedMatrix* getRJacMat() + { + return rJacMat.get(); + } + std::tuple, std::vector, std::vector> get_preconditioner_structure() { return {{LUmat->rowPointers, LUmat->rowPointers + (Nb + 1)}, {LUmat->colIndices, LUmat->colIndices + nnzb}, diagIndex}; @@ -127,7 +138,6 @@ public: return std::make_pair(s.LUvals, s.invDiagVals); #endif } - }; } // namespace Accelerator diff --git a/opm/simulators/linalg/bda/opencl/BISAI.cpp b/opm/simulators/linalg/bda/opencl/BISAI.cpp index 0febeebd4..695045ca5 100644 --- a/opm/simulators/linalg/bda/opencl/BISAI.cpp +++ b/opm/simulators/linalg/bda/opencl/BISAI.cpp @@ -78,17 +78,30 @@ std::vector buildCsrToCscOffsetMap(std::vector colPointers, std::vecto template bool BISAI::analyze_matrix(BlockedMatrix *mat) +{ + return analyze_matrix(mat, nullptr); +} + +template +bool BISAI::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) { const unsigned int bs = block_size; + auto *m = mat; - this->N = mat->Nb * bs; - this->Nb = mat->Nb; - this->nnz = mat->nnzbs * bs * bs; - this->nnzb = mat->nnzbs; + if (jacMat) { + m = jacMat; + } - bilu0->analyze_matrix(mat); + this->N = m->Nb * bs; + this->Nb = m->Nb; + this->nnz = m->nnzbs * bs * bs; + this->nnzb = m->nnzbs; - return true; + if (jacMat) { + return bilu0->analyze_matrix(mat, jacMat); + } else { + return bilu0->analyze_matrix(mat); + } } template @@ -162,7 +175,7 @@ void BISAI::buildUpperSubsystemsStructures(){ } template -bool BISAI::create_preconditioner(BlockedMatrix *mat) +bool BISAI::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) { const unsigned int bs = block_size; @@ -172,7 +185,11 @@ bool BISAI::create_preconditioner(BlockedMatrix *mat) Dune::Timer t_preconditioner; - bilu0->create_preconditioner(mat); + if (jacMat) { + bilu0->create_preconditioner(mat, jacMat); + } else { + bilu0->create_preconditioner(mat); + } std::call_once(initialize, [&]() { std::tie(colPointers, rowIndices, diagIndex) = bilu0->get_preconditioner_structure(); @@ -255,6 +272,12 @@ bool BISAI::create_preconditioner(BlockedMatrix *mat) return true; } +template +bool BISAI::create_preconditioner(BlockedMatrix *mat) +{ + return create_preconditioner(mat, nullptr); +} + template void BISAI::apply(const cl::Buffer& x, cl::Buffer& y){ const unsigned int bs = block_size; diff --git a/opm/simulators/linalg/bda/opencl/BISAI.hpp b/opm/simulators/linalg/bda/opencl/BISAI.hpp index c73b1fd9d..f0fe85242 100644 --- a/opm/simulators/linalg/bda/opencl/BISAI.hpp +++ b/opm/simulators/linalg/bda/opencl/BISAI.hpp @@ -117,9 +117,11 @@ public: // analysis, find reordering if specified bool analyze_matrix(BlockedMatrix *mat) override; + bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; // ilu_decomposition bool create_preconditioner(BlockedMatrix *mat) override; + bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override; // apply preconditioner, x = prec(y) void apply(const cl::Buffer& y, cl::Buffer& x) override; diff --git a/opm/simulators/linalg/bda/opencl/CPR.cpp b/opm/simulators/linalg/bda/opencl/CPR.cpp index 8de00dab5..9ae847c99 100644 --- a/opm/simulators/linalg/bda/opencl/CPR.cpp +++ b/opm/simulators/linalg/bda/opencl/CPR.cpp @@ -80,6 +80,42 @@ bool CPR::analyze_matrix(BlockedMatrix *mat_) { return success; } +template +bool CPR::analyze_matrix(BlockedMatrix *mat_, BlockedMatrix *jacMat) { + this->Nb = mat_->Nb; + this->nnzb = mat_->nnzbs; + this->N = Nb * block_size; + this->nnz = nnzb * block_size * block_size; + + bool success = bilu0->analyze_matrix(mat_, jacMat); + + if (opencl_ilu_reorder == ILUReorder::NONE) { + mat = mat_; + } else { + mat = bilu0->getRMat(); + } + return success; +} + +template +bool CPR::create_preconditioner(BlockedMatrix *mat_, BlockedMatrix *jacMat) { + Dune::Timer t_bilu0; + bool result = bilu0->create_preconditioner(mat_, jacMat); + if (verbosity >= 3) { + std::ostringstream out; + out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s"; + OpmLog::info(out.str()); + } + + Dune::Timer t_amg; + create_preconditioner_amg(mat); // already points to bilu0::rmat if needed + if (verbosity >= 3) { + std::ostringstream out; + out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s"; + OpmLog::info(out.str()); + } + return result; +} template bool CPR::create_preconditioner(BlockedMatrix *mat_) { @@ -101,6 +137,7 @@ bool CPR::create_preconditioner(BlockedMatrix *mat_) { return result; } + // return the absolute value of the N elements for which the absolute value is highest double get_absmax(const double *data, const int N) { return std::abs(*std::max_element(data, data + N, [](double a, double b){return std::fabs(a) < std::fabs(b);})); diff --git a/opm/simulators/linalg/bda/opencl/CPR.hpp b/opm/simulators/linalg/bda/opencl/CPR.hpp index 949148dd0..c81cee725 100644 --- a/opm/simulators/linalg/bda/opencl/CPR.hpp +++ b/opm/simulators/linalg/bda/opencl/CPR.hpp @@ -125,6 +125,7 @@ public: CPR(int verbosity, ILUReorder opencl_ilu_reorder); bool analyze_matrix(BlockedMatrix *mat) override; + bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) override; // set own Opencl variables, but also that of the bilu0 preconditioner void setOpencl(std::shared_ptr& context, std::shared_ptr& queue) override; @@ -134,7 +135,8 @@ public: void apply(const cl::Buffer& y, cl::Buffer& x) override; bool create_preconditioner(BlockedMatrix *mat) override; - + bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) override; + int* getToOrder() override { return bilu0->getToOrder(); diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp index f7256039b..8bf670b8c 100644 --- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp +++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp @@ -52,10 +52,21 @@ std::unique_ptr > Preconditioner::create( } } +template +bool Preconditioner::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) { + return analyze_matrix(mat); +} + +template +bool Preconditioner::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) { + return create_preconditioner(mat); +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ -template std::unique_ptr > Preconditioner::create(PreconditionerType, int, ILUReorder); \ -template void Preconditioner::setOpencl(std::shared_ptr&, std::shared_ptr&); +template std::unique_ptr > Preconditioner::create(PreconditionerType, int, ILUReorder); \ +template void Preconditioner::setOpencl(std::shared_ptr&, std::shared_ptr&); \ +template bool Preconditioner::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \ +template bool Preconditioner::create_preconditioner(BlockedMatrix *, BlockedMatrix *); INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp index a288f952d..f0af75d25 100644 --- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp +++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp @@ -68,10 +68,14 @@ public: // analyze matrix, e.g. the sparsity pattern // probably only called once + // the version with two params can be overloaded, if not, it will default to using the one param version virtual bool analyze_matrix(BlockedMatrix *mat) = 0; + virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat); // create/update preconditioner, probably used every linear solve + // the version with two params can be overloaded, if not, it will default to using the one param version virtual bool create_preconditioner(BlockedMatrix *mat) = 0; + virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat); // get reordering mappings virtual int* getToOrder() = 0; diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp index 855631887..dec83ce4c 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp @@ -370,7 +370,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr ", time per iteration: " << res.elapsed / it << ", iterations: " << it; OpmLog::info(out.str()); } - if (verbosity >= 4) { + if (verbosity >= 3) { std::ostringstream out; out << "openclSolver::prec_apply: " << t_prec.elapsed() << " s\n"; out << "wellContributions::apply: " << t_well.elapsed() << " s\n"; @@ -383,14 +383,21 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr template -void openclSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) { - this->N = N_; - this->nnz = nnz_; - this->nnzb = nnz_ / block_size / block_size; +void openclSolverBackend::initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix) { + this->Nb = matrix->Nb; + this->N = Nb * block_size; + this->nnzb = matrix->nnzbs; + this->nnz = nnzb * block_size * block_size; + + if (jacMatrix) { + useJacMatrix = true; + } - Nb = (N + dim - 1) / dim; std::ostringstream out; - out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n"; + out << "Initializing GPU, matrix size: " << Nb << " blockrows, nnzb: " << nnzb << "\n"; + if (useJacMatrix) { + out << "Blocks in ILU matrix: " << jacMatrix->nnzbs << "\n"; + } out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n"; OpmLog::info(out.str()); @@ -401,9 +408,12 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub prec->setOpencl(context, queue); #if COPY_ROW_BY_ROW - vals_contiguous.resize(nnz); + vals_contiguous = new double[N]; #endif - mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows)); + // mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows)); + // jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2)); + mat = matrix; + jacMat = jacMatrix; d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); @@ -441,7 +451,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub initialized = true; } // end initialize() - template void openclSolverBackend::finalize() { if (opencl_ilu_reorder != ILUReorder::NONE) { @@ -527,8 +536,11 @@ template bool openclSolverBackend::analyze_matrix() { Timer t; - // bool success = bilu0->init(mat.get()); - bool success = prec->analyze_matrix(mat.get()); + bool success; + if (useJacMatrix) + success = prec->analyze_matrix(mat.get(), jacMat.get()); + else + success = prec->analyze_matrix(mat.get()); if (opencl_ilu_reorder == ILUReorder::NONE) { rmat = mat.get(); @@ -579,7 +591,11 @@ template bool openclSolverBackend::create_preconditioner() { Timer t; - bool result = prec->create_preconditioner(mat.get()); + bool result; + if (useJacMatrix) + result = prec->create_preconditioner(mat.get(), jacMat.get()); + else + result = prec->create_preconditioner(mat.get()); if (verbosity > 2) { std::ostringstream out; @@ -639,21 +655,26 @@ void openclSolverBackend::get_result(double *x) { template -SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) { +SolverStatus openclSolverBackend::solve_system(std::shared_ptr matrix, + double *b, + std::shared_ptr jacMatrix, + WellContributions& wellContribs, + BdaResult &res) +{ if (initialized == false) { - initialize(N_, nnz_, dim, vals, rows, cols); + initialize(matrix, jacMatrix); if (analysis_done == false) { if (!analyze_matrix()) { return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } - update_system(vals, b, wellContribs); + update_system(matrix->nnzValues, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); } else { - update_system(vals, b, wellContribs); + update_system(matrix->nnzValues, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp index d356520e6..cc9eb4fe0 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp @@ -63,12 +63,15 @@ private: std::vector devices; + bool useJacMatrix = false; + std::unique_ptr > prec; // can perform blocked ILU0 and AMG on pressure component bool is_root; // allow for nested solvers, the root solver is called by BdaBridge int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings bool analysis_done = false; - std::unique_ptr mat = nullptr; // original matrix + std::shared_ptr mat = nullptr; // original matrix + std::shared_ptr jacMat = nullptr; // matrix for preconditioner BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv ILUReorder opencl_ilu_reorder; // reordering strategy std::vector events; @@ -130,13 +133,9 @@ private: void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res); /// Initialize GPU and allocate memory - /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); + /// \param[in] matrix matrix A + /// \param[in] jacMatrix matrix for preconditioner + void initialize(std::shared_ptr matrix, std::shared_ptr jacMatrix); /// Clean memory void finalize(); @@ -189,21 +188,14 @@ public: ~openclSolverBackend(); /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] N number of rows, divide by dim to get number of blockrows - /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks - /// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks - /// \param[in] dim size of block - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - /// \param[in] vals_prec array of nonzeroes for preconditioner - /// \param[in] rows_prec array of rowPointers for preconditioner - /// \param[in] cols_prec array of columnIndices for preconditioner + /// \param[in] matrix matrix A /// \param[in] b input vector, contains N values + /// \param[in] jacMatrix matrix for preconditioner /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result /// \return status code - SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + SolverStatus solve_system(std::shared_ptr matrix, double *b, + std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; /// Solve scalar linear system, for example a coarse system of an AMG preconditioner /// Data is already on the GPU @@ -218,7 +210,7 @@ public: /// \param[in] context the opencl context to be used /// \param[in] queue the opencl queue to be used void setOpencl(std::shared_ptr& context, std::shared_ptr& queue); - + }; // end class openclSolverBackend } // namespace Accelerator diff --git a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp index c6314c54a..6076a088e 100644 --- a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp +++ b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp @@ -35,12 +35,6 @@ using Accelerator::OpenclKernels; void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) { this->context = context_; this->queue = queue_; - } - -void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_, - Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) { - this->kernel = kernel_; - this->kernel_no_reorder = kernel_no_reorder_; } void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_) diff --git a/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp b/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp index c2be1a843..82215563c 100644 --- a/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp +++ b/opm/simulators/linalg/bda/opencl/openclWellContributions.hpp @@ -35,8 +35,6 @@ namespace Opm class WellContributionsOCL : public WellContributions { public: - void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_, - Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect @@ -56,8 +54,6 @@ protected: cl::Context* context; cl::CommandQueue* queue; - Opm::Accelerator::stdwell_apply_kernel_type* kernel; - Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder; std::vector events; std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; diff --git a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp index e8d37237f..21d512172 100644 --- a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp +++ b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp @@ -39,9 +39,9 @@ namespace detail /// \param useWellConn Boolean that is true when UseWellContribusion is true /// \param wellGraph Cell IDs of well cells stored in a graph. template - void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector>& wellGraph) + void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector>& wellGraph, int numJacobiBlocks) { - if ( grid.comm().size() > 1) + if ( grid.comm().size() > 1 || numJacobiBlocks > 1) { Dune::CartesianIndexMapper< Grid > cartMapper( grid ); const int numCells = cartMapper.compressedSize(); // grid.numCells() diff --git a/tests/test_cusparseSolver.cpp b/tests/test_cusparseSolver.cpp index 6c533134e..132989cd2 100644 --- a/tests/test_cusparseSolver.cpp +++ b/tests/test_cusparseSolver.cpp @@ -116,7 +116,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix& matrix, V bridge = std::make_unique, Vector, bz> >(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder, linsolver); auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory // matrix created by readMatrixMarket() did not have contiguous memory - bridge->solve_system(&mat2, rhs, *wellContribs, result); + bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result); bridge->get_result(x); return x; diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp index 94709ce3c..7705d2ec8 100644 --- a/tests/test_openclSolver.cpp +++ b/tests/test_openclSolver.cpp @@ -118,7 +118,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, Matrix& matrix, Vec } auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory // matrix created by readMatrixMarket() did not have contiguous memory - bridge->solve_system(&mat2, rhs, *wellContribs, result); + bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result); bridge->get_result(x); return x;