From e360c00b73dca89be9f1716c9a704905576b7792 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Thu, 21 Apr 2022 17:18:32 +0200 Subject: [PATCH] add block-jacobi partitioner option. Add block-jacobi matrix for use in OpenCL preconditioner Rebased --- ebos/eclbasevanguard.hh | 11 + ebos/eclcpgridvanguard.hh | 3 +- ebos/eclgenericcpgridvanguard.cc | 84 ++++--- ebos/eclgenericcpgridvanguard.hh | 8 +- ebos/eclgenericvanguard.hh | 7 + opm/simulators/linalg/ISTLSolverEbos.hpp | 94 +++++++- opm/simulators/linalg/bda/BdaBridge.cpp | 24 +- opm/simulators/linalg/bda/BdaBridge.hpp | 2 +- opm/simulators/linalg/bda/BdaSolver.hpp | 5 + .../linalg/bda/FPGASolverBackend.cpp | 9 +- .../linalg/bda/FPGASolverBackend.hpp | 5 + .../linalg/bda/amgclSolverBackend.cpp | 10 + .../linalg/bda/amgclSolverBackend.hpp | 5 + .../linalg/bda/cuda/cusparseSolverBackend.cu | 9 +- .../linalg/bda/cuda/cusparseSolverBackend.hpp | 5 + opm/simulators/linalg/bda/opencl/BILU0.cpp | 220 +++++++++++++++++- opm/simulators/linalg/bda/opencl/BILU0.hpp | 5 + .../linalg/bda/opencl/Preconditioner.cpp | 15 +- .../linalg/bda/opencl/Preconditioner.hpp | 4 + .../linalg/bda/opencl/openclSolverBackend.cpp | 96 +++++++- .../linalg/bda/opencl/openclSolverBackend.hpp | 12 +- .../linalg/findOverlapRowsAndColumns.hpp | 4 +- tests/test_openclSolver.cpp | 2 +- 23 files changed, 582 insertions(+), 57 deletions(-) diff --git a/ebos/eclbasevanguard.hh b/ebos/eclbasevanguard.hh index 2b29f9282..30a3c1aa8 100644 --- a/ebos/eclbasevanguard.hh +++ b/ebos/eclbasevanguard.hh @@ -85,6 +85,10 @@ struct EdgeWeightsMethod { using type = UndefinedProperty; }; template +struct NumJacobiBlocks { + using type = UndefinedProperty; +}; +template struct OwnerCellsFirst { using type = UndefinedProperty; }; @@ -133,6 +137,10 @@ struct EdgeWeightsMethod { static constexpr int value = 1; }; template +struct NumJacobiBlocks { + static constexpr int value = 0; +}; +template struct OwnerCellsFirst { static constexpr bool value = true; }; @@ -211,6 +219,8 @@ public: "When restarting: should we try to initialize wells and groups from historical SCHEDULE section."); EWOMS_REGISTER_PARAM(TypeTag, int, EdgeWeightsMethod, "Choose edge-weighing strategy: 0=uniform, 1=trans, 2=log(trans)."); + EWOMS_REGISTER_PARAM(TypeTag, int, NumJacobiBlocks, + "Number of blocks to be created for the Block-Jacobi preconditioner."); EWOMS_REGISTER_PARAM(TypeTag, bool, OwnerCellsFirst, "Order cells owned by rank before ghost/overlap cells."); EWOMS_REGISTER_PARAM(TypeTag, bool, SerialPartitioning, @@ -235,6 +245,7 @@ public: { fileName_ = EWOMS_GET_PARAM(TypeTag, std::string, EclDeckFileName); edgeWeightsMethod_ = Dune::EdgeWeightMethod(EWOMS_GET_PARAM(TypeTag, int, EdgeWeightsMethod)); + numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks); ownersFirst_ = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst); serialPartitioning_ = EWOMS_GET_PARAM(TypeTag, bool, SerialPartitioning); zoltanImbalanceTol_ = EWOMS_GET_PARAM(TypeTag, double, ZoltanImbalanceTol); diff --git a/ebos/eclcpgridvanguard.hh b/ebos/eclcpgridvanguard.hh index c35fed0bd..d399327f1 100644 --- a/ebos/eclcpgridvanguard.hh +++ b/ebos/eclcpgridvanguard.hh @@ -136,7 +136,7 @@ public: this->serialPartitioning(), this->enableDistributedWells(), this->zoltanImbalanceTol(), this->gridView(), this->schedule(), this->centroids_, - this->eclState(), this->parallelWells_); + this->eclState(), this->parallelWells_, this->numJacobiBlocks()); #endif this->updateGridView_(); @@ -192,6 +192,7 @@ protected: } std::unique_ptr globalTrans_; + //std::vector cell_part_; }; } // namespace Opm diff --git a/ebos/eclgenericcpgridvanguard.cc b/ebos/eclgenericcpgridvanguard.cc index 27025a882..57ead7bcc 100644 --- a/ebos/eclgenericcpgridvanguard.cc +++ b/ebos/eclgenericcpgridvanguard.cc @@ -82,12 +82,13 @@ void EclGenericCpGridVanguard::doLoadBalance_(Dun const Schedule& schedule, std::vector& centroids, EclipseState& eclState1, - EclGenericVanguard::ParallelWellStruct& parallelWells) + EclGenericVanguard::ParallelWellStruct& parallelWells, + int numJacobiBlocks) { int mpiSize = 1; MPI_Comm_size(grid_->comm(), &mpiSize); - - if (mpiSize > 1) { + + if (mpiSize > 1 || numJacobiBlocks > 0) { // the CpGrid's loadBalance() method likes to have the transmissibilities as // its edge weights. since this is (kind of) a layering violation and // transmissibilities are relatively expensive to compute, we only do it if @@ -131,53 +132,60 @@ void EclGenericCpGridVanguard::doLoadBalance_(Dun } //distribute the grid and switch to the distributed view. - { - const auto wells = schedule.getWellsatEnd(); - - try + if (mpiSize > 1) { { - auto& eclState = dynamic_cast(eclState1); - const EclipseGrid* eclGrid = nullptr; + const auto wells = schedule.getWellsatEnd(); - if (grid_->comm().rank() == 0) + try { - eclGrid = &eclState.getInputGrid(); - } + auto& eclState = dynamic_cast(eclState1); + const EclipseGrid* eclGrid = nullptr; - PropsCentroidsDataHandle handle(*grid_, eclState, eclGrid, centroids, - cartesianIndexMapper()); - if (loadBalancerSet) - { - std::vector parts; if (grid_->comm().rank() == 0) { - parts = (*externalLoadBalancer)(*grid_); + eclGrid = &eclState.getInputGrid(); } - parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1)); - } - else - { - parallelWells = - std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning, - faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol, - enableDistributedWells)); - } - } - catch(const std::bad_cast& e) - { - std::ostringstream message; - message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState (" - << e.what() <<")"<switchToDistributedView(); + PropsCentroidsDataHandle handle(*grid_, eclState, eclGrid, centroids, + cartesianIndexMapper()); + if (loadBalancerSet) + { + std::vector parts; + if (grid_->comm().rank() == 0) + { + parts = (*externalLoadBalancer)(*grid_); + } + parallelWells = std::get<1>(grid_->loadBalance(handle, parts, &wells, ownersFirst, false, 1)); + } + else + { + parallelWells = + std::get<1>(grid_->loadBalance(handle, edgeWeightsMethod, &wells, serialPartitioning, + faceTrans.data(), ownersFirst, false, 1, true, zoltanImbalanceTol, + enableDistributedWells)); + } + } + catch(const std::bad_cast& e) + { + std::ostringstream message; + message << "Parallel simulator setup is incorrect as it does not use ParallelEclipseState (" + << e.what() <<")"<switchToDistributedView(); + } + // Calling Schedule::filterConnections would remove any perforated // cells that exist only on other ranks even in the case of distributed wells // But we need all connections to figure out the first cell of a well (e.g. for // pressure). Hence this is now skipped. Rank 0 had everything even before. + if (numJacobiBlocks > 0 && mpiSize == 1) { + const auto wells = schedule.getWellsatEnd(); + cell_part_.resize(grid_->numCells()); + cell_part_ = grid_->zoltanPartitionWithoutScatter(&wells, faceTrans.data(), numJacobiBlocks, zoltanImbalanceTol); + } } } diff --git a/ebos/eclgenericcpgridvanguard.hh b/ebos/eclgenericcpgridvanguard.hh index 310c2a14c..13a7172e0 100644 --- a/ebos/eclgenericcpgridvanguard.hh +++ b/ebos/eclgenericcpgridvanguard.hh @@ -101,6 +101,10 @@ public: */ const CartesianIndexMapper& equilCartesianIndexMapper() const; + std::vector cellPartition() const + { + return cell_part_; + } protected: /*! * \brief Distribute the simulation grid over multiple processes @@ -114,7 +118,8 @@ protected: const GridView& gridv, const Schedule& schedule, std::vector& centroids, EclipseState& eclState, - EclGenericVanguard::ParallelWellStruct& parallelWells); + EclGenericVanguard::ParallelWellStruct& parallelWells, + int numJacobiBlocks); void distributeFieldProps_(EclipseState& eclState); #endif @@ -137,6 +142,7 @@ protected: std::unique_ptr equilCartesianIndexMapper_; int mpiRank; + std::vector cell_part_; }; } // namespace Opm diff --git a/ebos/eclgenericvanguard.hh b/ebos/eclgenericvanguard.hh index 05b441584..200ecb508 100644 --- a/ebos/eclgenericvanguard.hh +++ b/ebos/eclgenericvanguard.hh @@ -249,6 +249,12 @@ public: Dune::EdgeWeightMethod edgeWeightsMethod() const { return edgeWeightsMethod_; } + /*! + * \brief Number of blocks in the Block-Jacobi preconditioner. + */ + int numJacobiBlocks() const + { return numJacobiBlocks_; } + /*! * \brief Parameter that decide if cells owned by rank are ordered before ghost cells. */ @@ -323,6 +329,7 @@ protected: std::string caseName_; std::string fileName_; Dune::EdgeWeightMethod edgeWeightsMethod_; + int numJacobiBlocks_; bool ownersFirst_; bool serialPartitioning_; double zoltanImbalanceTol_; diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index a77737b87..6fade71a4 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -161,7 +161,15 @@ namespace Opm // Set it up manually ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout()); detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_); - + numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks); + useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); + if (numJacobiBlocks_ > 1) { + const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd(); + detail::setWellConnections(simulator_.vanguard().grid(), wellsForConn, useWellConn_, + wellConnectionsGraph_, numJacobiBlocks_); + std::cout << "Create block-Jacobi pattern" << std::endl; + blockJacobiAdjacency(); + } useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); #if HAVE_FPGA // check usage of MatrixAddWellContributions: for FPGA they must be included @@ -273,8 +281,15 @@ namespace Opm } #endif + if (numJacobiBlocks_ > 1) { + copyMatToBlockJac(getMatrix(), *blockJacobiForGPUILU0_); // Const_cast needed since the CUDA stuff overwrites values for better matrix condition.. - bdaBridge->solve_system(const_cast(&getMatrix()), *rhs_, *wellContribs, result); + bdaBridge->solve_system(const_cast(&getMatrix()), &*blockJacobiForGPUILU0_, + numJacobiBlocks_, *rhs_, *wellContribs, result); + } + else + bdaBridge->solve_system(const_cast(&getMatrix()), const_cast(&getMatrix()), + numJacobiBlocks_, *rhs_, *wellContribs, result); if (result.converged) { // get result vector x from non-Dune backend, iff solve was successful bdaBridge->get_result(x); @@ -497,6 +512,78 @@ namespace Opm } } + /// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid. + void blockJacobiAdjacency() + { + const auto& grid = simulator_.vanguard().grid(); + std::vector cell_part = simulator_.vanguard().cellPartition(); + + typedef typename Matrix::size_type size_type; + size_type numCells = grid.size( 0 ); + blockJacobiForGPUILU0_.reset(new Matrix(numCells, numCells, Matrix::random)); + + std::vector> pattern; + pattern.resize(numCells); + + const auto& lid = grid.localIdSet(); + const auto& gridView = grid.leafGridView(); + auto elemIt = gridView.template begin<0>(); + const auto& elemEndIt = gridView.template end<0>(); + + //Loop over cells + for (; elemIt != elemEndIt; ++elemIt) + { + + const auto& elem = *elemIt; + size_type idx = lid.id(elem); + pattern[idx].insert(idx); + + // Add well non-zero connections + for (auto wc = wellConnectionsGraph_[idx].begin(); wc!=wellConnectionsGraph_[idx].end(); ++wc) + pattern[idx].insert(*wc); + + int locPart = cell_part[idx]; + + //Add neighbor if it is on the same part + auto isend = gridView.iend(elem); + for (auto is = gridView.ibegin(elem); is!=isend; ++is) + { + //check if face has neighbor + if (is->neighbor()) + { + size_type nid = lid.id(is->outside()); + int nabPart = cell_part[nid]; + if (locPart == nabPart) + pattern[idx].insert(nid); + } + + blockJacobiForGPUILU0_->setrowsize(idx, pattern[idx].size()); + } + } + blockJacobiForGPUILU0_->endrowsizes(); + for (size_type dofId = 0; dofId < numCells; ++dofId) + { + auto nabIdx = pattern[dofId].begin(); + auto endNab = pattern[dofId].end(); + for (; nabIdx != endNab; ++nabIdx) + { + blockJacobiForGPUILU0_->addindex(dofId, *nabIdx); + } + } + + blockJacobiForGPUILU0_->endindices(); + } + + void copyMatToBlockJac(Matrix& mat, Matrix& blockJac) + { + auto rbegin = blockJac.begin(); + auto rend = blockJac.end(); + for (auto row = rbegin; row != rend; ++row) { + for (auto col = (*row).begin(); col != (*row).end(); ++col) { + blockJac[row.index()][col.index()] = mat[row.index()][col.index()]; + } + } + } Matrix& getMatrix() { @@ -517,6 +604,8 @@ namespace Opm Matrix* matrix_; Vector *rhs_; + std::unique_ptr blockJacobiForGPUILU0_; + std::unique_ptr flexibleSolver_; std::unique_ptr linearOperatorForFlexibleSolver_; std::unique_ptr> wellOperator_; @@ -530,6 +619,7 @@ namespace Opm FlowLinearSolverParameters parameters_; PropertyTree prm_; bool scale_variables_; + int numJacobiBlocks_; std::shared_ptr< CommunicationType > comm_; }; // end ISTLSolver diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index ee7c6e7c8..11080b99c 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -193,6 +193,8 @@ void BdaBridge::copySparsityPatternFromI template void BdaBridge::solve_system([[maybe_unused]] BridgeMatrix* mat, + [[maybe_unused]] BridgeMatrix* blockMat, + [[maybe_unused]] int numJacobiBlocks, [[maybe_unused]] BridgeVector& b, [[maybe_unused]] WellContributions& wellContribs, [[maybe_unused]] InverseOperatorResult& res) @@ -209,6 +211,11 @@ void BdaBridge::solve_system([[maybe_unu const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back(); const int nnz = nnzb * dim * dim; + static std::vector bm_h_rows; + static std::vector bm_h_cols; + const int bm_nnzb = (bm_h_rows.empty()) ? blockMat->nonzeroes() : bm_h_rows.back(); + const int bm_nnz = bm_nnzb * dim * dim; + if (dim != 3) { OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); use_gpu = use_fpga = false; @@ -221,8 +228,15 @@ void BdaBridge::solve_system([[maybe_unu copySparsityPatternFromISTL(*mat, h_rows, h_cols); } + if (bm_h_rows.capacity() == 0) { + bm_h_rows.reserve(Nb+1); + bm_h_cols.reserve(bm_nnzb); + copySparsityPatternFromISTL(*blockMat, bm_h_rows, bm_h_cols); + } + Dune::Timer t_zeros; int numZeros = checkZeroDiagonal(*mat); + int bm_numZeros = checkZeroDiagonal(*blockMat); if (verbosity >= 2) { std::ostringstream out; out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros"; @@ -232,9 +246,17 @@ void BdaBridge::solve_system([[maybe_unu ///////////////////////// // actually solve + SolverStatus status; + if (numJacobiBlocks > 1) + status = backend->solve_system2(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), + bm_nnz, static_cast(&(((*blockMat)[0][0][0][0]))), bm_h_rows.data(), bm_h_cols.data(), + wellContribs, result); + else + status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); + // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour - SolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); + //SolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { case SolverStatus::BDA_SOLVER_SUCCESS: //OpmLog::info("BdaSolver converged"); diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 3d24051bb..29e09aa46 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -65,7 +65,7 @@ public: /// \param[in] b vector b, should be of type Dune::BlockVector /// \param[in] wellContribs contains all WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] result summary of solver result - void solve_system(BridgeMatrix *mat, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); + void solve_system(BridgeMatrix *mat, BridgeMatrix *blockMat, int numJacobiBlocks, BridgeVector &b, WellContributions& wellContribs, InverseOperatorResult &result); /// Get the resulting x vector /// \param[inout] x vector x, should be of type Dune::BlockVector diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index d84652d28..7a65d458b 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -89,6 +89,11 @@ namespace Accelerator { double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) = 0; + virtual SolverStatus solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) = 0; + virtual void get_result(double *x) = 0; }; // end class BdaSolver diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp index 9087d4074..6ce712a9d 100644 --- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp +++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp @@ -223,7 +223,6 @@ void FpgaSolverBackend::get_result(double *x_) } } // end get_result() - template SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) { @@ -250,6 +249,14 @@ SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int d return SolverStatus::BDA_SOLVER_SUCCESS; } +template +SolverStatus FpgaSolverBackend::solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) +{ + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; +} template void FpgaSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp index 4d1e766f3..12293e2e7 100644 --- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp +++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp @@ -255,6 +255,11 @@ public: /// \return status code SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + SolverStatus solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) override; + /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp index 957575fbf..859320163 100644 --- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp @@ -359,6 +359,15 @@ void amgclSolverBackend::get_result(double *x_) { } // end get_result() +template +SolverStatus amgclSolverBackend::solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) +{ + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; +} + template SolverStatus amgclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions&, BdaResult &res) { if (initialized == false) { @@ -371,6 +380,7 @@ SolverStatus amgclSolverBackend::solve_system(int N_, int nnz_, int } + #define INSTANTIATE_BDA_FUNCTIONS(n) \ template amgclSolverBackend::amgclSolverBackend(int, int, double, unsigned int, unsigned int); \ diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp index 930f81ff9..95f3c928f 100644 --- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp @@ -145,6 +145,11 @@ public: /// \return status code SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + SolverStatus solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) override; + /// Get result after linear solve, and peform postprocessing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu index 46f345784..7439e39c5 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu @@ -500,7 +500,14 @@ SolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int } return SolverStatus::BDA_SOLVER_SUCCESS; } - +template +SolverStatus cusparseSolverBackend::solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) +{ + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ template cusparseSolverBackend::cusparseSolverBackend(int, int, double, unsigned int); \ diff --git a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp index 791bf32fc..8b05e1a8a 100644 --- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp +++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp @@ -138,6 +138,11 @@ public: /// \return status code SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + SolverStatus solve_system2(int N_, int nnz_, int dim, + double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) override; + /// Get resulting vector x after linear solve, also includes post processing if necessary /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array void get_result(double *x) override; diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index 0c2bc6364..f10d025c8 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -149,7 +149,127 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat) } return true; -} // end init() +} + + +template +bool BILU0::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat) +{ + const unsigned int bs = block_size; + + this->N = mat->Nb * block_size; + this->Nb = mat->Nb; + this->nnz = mat->nnzbs * block_size * block_size; + this->nnzb = mat->nnzbs; + + this->nnz_jm = jacMat->nnzbs * block_size * block_size; + this->nnzbs_jm = jacMat->nnzbs; + + int *CSCRowIndices = nullptr; + int *CSCColPointers = nullptr; + + if (opencl_ilu_reorder == ILUReorder::NONE) { + LUmat = std::make_unique(*mat); + } else { + toOrder.resize(Nb); + fromOrder.resize(Nb); + CSCRowIndices = new int[nnzbs_jm]; + CSCColPointers = new int[Nb + 1]; + rmat = std::make_shared(mat->Nb, mat->nnzbs, block_size); + rJacMat = std::make_shared(jacMat->Nb, jacMat->nnzbs, block_size); + LUmat = std::make_unique(*rJacMat); + + Timer t_convert; + csrPatternToCsc(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb); + if(verbosity >= 3){ + std::ostringstream out; + out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s"; + OpmLog::info(out.str()); + } + } + + Timer t_analysis; + std::ostringstream out; + if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { + out << "BILU0 reordering strategy: " << "level_scheduling\n"; + findLevelScheduling(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + for (int iii = 0; iii < numColors; ++iii) { out << "rpc: "<< rowsPerColor[iii] << "\n";} + out << "numColors: "<< numColors << "\n"; + } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { + out << "BILU0 reordering strategy: " << "graph_coloring\n"; + findGraphColoring(jacMat->colIndices, jacMat->rowPointers, CSCRowIndices, CSCColPointers, jacMat->Nb, jacMat->Nb, jacMat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + for (int iii = 0; iii < numColors; ++iii) { out << "rpc: "<< rowsPerColor[iii] << "\n";} + out << "numColors: "<< numColors << "\n"; + } else if (opencl_ilu_reorder == ILUReorder::NONE) { + out << "BILU0 reordering strategy: none\n"; + // numColors = 1; + // rowsPerColor.emplace_back(Nb); + numColors = Nb; + for(int i = 0; i < Nb; ++i){ + rowsPerColor.emplace_back(1); + } + } else { + OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); + } + if(verbosity >= 1){ + out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n"; + } +#if CHOW_PATEL + out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU; +#endif + OpmLog::info(out.str()); + + + if (opencl_ilu_reorder != ILUReorder::NONE) { + delete[] CSCRowIndices; + delete[] CSCColPointers; + } + + diagIndex.resize(mat->Nb); + invDiagVals.resize(mat->Nb * bs * bs); + +#if CHOW_PATEL + Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); + Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); +#endif + + LUmat->nnzValues = new double[jacMat->nnzbs * bs * bs]; + + s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); + s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1)); + s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb); +#if CHOW_PATEL + s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Lrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); + s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); + s.Ucols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); + s.Urows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Lmat->Nb + 1)); +#else + s.LUvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * LUmat->nnzbs); + s.LUcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->nnzbs); + s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1)); +#endif + + events.resize(2); + err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]); + + rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0 + for (int i = 0; i < numColors; ++i) { + rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i]; + } + err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } + + return true; +} + template @@ -245,6 +365,104 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat) return true; } // end create_preconditioner() + +template +bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat) +{ + const unsigned int bs = block_size; + auto *m = mat; + auto *jm = jacMat; + + if (opencl_ilu_reorder != ILUReorder::NONE) { + m = rmat.get(); + jm = rJacMat.get(); + Timer t_reorder; + reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get()); + reorderBlockedMatrixByPattern(jacMat, toOrder.data(), fromOrder.data(), rJacMat.get()); + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 reorder matrix: " << t_reorder.stop() << " s"; + OpmLog::info(out.str()); + } + } + + // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp + // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy + Timer t_copy; + memcpy(LUmat->nnzValues, jm->nnzValues, sizeof(double) * bs * bs * jm->nnzbs); + + if (verbosity >= 3){ + std::ostringstream out; + out << "BILU0 memcpy: " << t_copy.stop() << " s"; + OpmLog::info(out.str()); + } + +#if CHOW_PATEL + chowPatelIlu.decomposition(queue, context, + LUmat.get(), Lmat.get(), Umat.get(), + invDiagVals, diagIndex, + s.diagIndex, s.invDiagVals, + s.Lvals, s.Lcols, s.Lrows, + s.Uvals, s.Ucols, s.Urows); +#else + Timer t_copyToGpu; + + events.resize(1); + queue->enqueueWriteBuffer(s.LUvals, CL_FALSE, 0, LUmat->nnzbs * bs * bs * sizeof(double), LUmat->nnzValues, nullptr, &events[0]); + + std::call_once(pattern_uploaded, [&](){ + // find the positions of each diagonal block + // must be done after reordering + for (int row = 0; row < Nb; ++row) { + int rowStart = LUmat->rowPointers[row]; + int rowEnd = LUmat->rowPointers[row+1]; + + auto candidate = std::find(LUmat->colIndices + rowStart, LUmat->colIndices + rowEnd, row); + assert(candidate != LUmat->colIndices + rowEnd); + diagIndex[row] = candidate - LUmat->colIndices; + } + events.resize(4); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); + queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); + queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); + }); + + cl::WaitForEvents(events); + events.clear(); + if (err != CL_SUCCESS) { + // enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL + OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); + } + + if (verbosity >= 3) { + std::ostringstream out; + out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s"; + OpmLog::info(out.str()); + } + + Timer t_decomposition; + std::ostringstream out; + cl::Event event; + for (int color = 0; color < numColors; ++color) { + const unsigned int firstRow = rowsPerColorPrefix[color]; + const unsigned int lastRow = rowsPerColorPrefix[color+1]; + if (verbosity >= 4) { + out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; + } + OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size); + } + + if (verbosity >= 3) { + out << "BILU0 decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } +#endif // CHOW_PATEL + + return true; +} // end create_preconditioner() + + // kernels are blocking on an NVIDIA GPU, so waiting for events is not needed // however, if individual kernel calls are timed, waiting for events is needed // behavior on other GPUs is untested diff --git a/opm/simulators/linalg/bda/opencl/BILU0.hpp b/opm/simulators/linalg/bda/opencl/BILU0.hpp index b3b9ce125..e2a6b1a52 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.hpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp @@ -53,8 +53,11 @@ class BILU0 : public Preconditioner using Base::err; private: + int nnz_jm; // number of nonzeroes of the matrix (scalar) + int nnzbs_jm; // number of blocks of the matrix std::unique_ptr LUmat = nullptr; std::shared_ptr rmat = nullptr; // only used with PAR_SIM + std::shared_ptr rJacMat = nullptr; #if CHOW_PATEL std::unique_ptr Lmat = nullptr, Umat = nullptr; #endif @@ -92,9 +95,11 @@ public: // analysis, find reordering if specified bool analyze_matrix(BlockedMatrix *mat) override; + bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat); // ilu_decomposition bool create_preconditioner(BlockedMatrix *mat) override; + bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat); // apply preconditioner, x = prec(y) void apply(const cl::Buffer& y, cl::Buffer& x) override; diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp index f7256039b..8bf670b8c 100644 --- a/opm/simulators/linalg/bda/opencl/Preconditioner.cpp +++ b/opm/simulators/linalg/bda/opencl/Preconditioner.cpp @@ -52,10 +52,21 @@ std::unique_ptr > Preconditioner::create( } } +template +bool Preconditioner::analyze_matrix(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) { + return analyze_matrix(mat); +} + +template +bool Preconditioner::create_preconditioner(BlockedMatrix *mat, [[maybe_unused]] BlockedMatrix *jacMat) { + return create_preconditioner(mat); +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ -template std::unique_ptr > Preconditioner::create(PreconditionerType, int, ILUReorder); \ -template void Preconditioner::setOpencl(std::shared_ptr&, std::shared_ptr&); +template std::unique_ptr > Preconditioner::create(PreconditionerType, int, ILUReorder); \ +template void Preconditioner::setOpencl(std::shared_ptr&, std::shared_ptr&); \ +template bool Preconditioner::analyze_matrix(BlockedMatrix *, BlockedMatrix *); \ +template bool Preconditioner::create_preconditioner(BlockedMatrix *, BlockedMatrix *); INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp index a288f952d..f0af75d25 100644 --- a/opm/simulators/linalg/bda/opencl/Preconditioner.hpp +++ b/opm/simulators/linalg/bda/opencl/Preconditioner.hpp @@ -68,10 +68,14 @@ public: // analyze matrix, e.g. the sparsity pattern // probably only called once + // the version with two params can be overloaded, if not, it will default to using the one param version virtual bool analyze_matrix(BlockedMatrix *mat) = 0; + virtual bool analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat); // create/update preconditioner, probably used every linear solve + // the version with two params can be overloaded, if not, it will default to using the one param version virtual bool create_preconditioner(BlockedMatrix *mat) = 0; + virtual bool create_preconditioner(BlockedMatrix *mat, BlockedMatrix *jacMat); // get reordering mappings virtual int* getToOrder() = 0; diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp index 855631887..20ecdde97 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp @@ -441,6 +441,69 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub initialized = true; } // end initialize() +template +void openclSolverBackend::initialize2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, + int nnz2, double *vals2, int *rows2, int *cols2) { + this->N = N_; + this->nnz = nnz_; + this->nnzb = nnz_ / block_size / block_size; + + this->jac_nnz = nnz2; + this->jac_nnzb = nnz2 / block_size / block_size; + + Nb = (N + dim - 1) / dim; + std::ostringstream out; + out << "Initializing GPU, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n"; + out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; + out << "PlatformID: " << platformID << ", deviceID: " << deviceID << "\n"; + OpmLog::info(out.str()); + out.str(""); + out.clear(); + + try { + prec->setOpencl(context, queue); + +#if COPY_ROW_BY_ROW + vals_contiguous = new double[N]; +#endif + mat.reset(new BlockedMatrix(Nb, nnzb, block_size, vals, cols, rows)); + jacMat.reset(new BlockedMatrix(Nb, jac_nnzb, block_size, vals2, cols2, rows2)); + + d_x = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_b = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_rb = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_r = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_rw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_p = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_pw = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_s = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_t = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_v = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + d_tmp = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * N); + + d_Avals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * nnz); + d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); + d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); + + bool reorder = (opencl_ilu_reorder != ILUReorder::NONE); + if (reorder) { + rb = new double[N]; + d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); + } + + } catch (const cl::Error& error) { + std::ostringstream oss; + oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; + oss << getErrorString(error.err()); + // rethrow exception + OPM_THROW(std::logic_error, oss.str()); + } catch (const std::logic_error& error) { + // rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs + throw error; + } + + initialized = true; +} // end initialize() template void openclSolverBackend::finalize() { @@ -527,8 +590,7 @@ template bool openclSolverBackend::analyze_matrix() { Timer t; - // bool success = bilu0->init(mat.get()); - bool success = prec->analyze_matrix(mat.get()); + bool success = prec->analyze_matrix(mat.get(), jacMat.get()); if (opencl_ilu_reorder == ILUReorder::NONE) { rmat = mat.get(); @@ -579,7 +641,7 @@ template bool openclSolverBackend::create_preconditioner() { Timer t; - bool result = prec->create_preconditioner(mat.get()); + bool result = prec->create_preconditioner(mat.get(), jacMat.get()); if (verbosity > 2) { std::ostringstream out; @@ -662,7 +724,33 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int solve_system(wellContribs, res); return SolverStatus::BDA_SOLVER_SUCCESS; } - +template +SolverStatus openclSolverBackend::solve_system2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) +{ + if (initialized == false) { + initialize2(N_, nnz_, dim, vals, rows, cols, nnz2, vals2, rows2, cols2); + if (analysis_done == false) { + if (!analyze_matrix()) { + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; + } + } + update_system(vals, b, wellContribs); + if (!create_preconditioner()) { + return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + } + copy_system_to_gpu(); + } else { + update_system(vals, b, wellContribs); + if (!create_preconditioner()) { + return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + } + update_system_on_gpu(); + } + solve_system(wellContribs, res); + return SolverStatus::BDA_SOLVER_SUCCESS; +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ template openclSolverBackend::openclSolverBackend( \ diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp index d356520e6..55cc42d2e 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp @@ -63,12 +63,15 @@ private: std::vector devices; + int jac_nnz; + int jac_nnzb; std::unique_ptr > prec; // can perform blocked ILU0 and AMG on pressure component bool is_root; // allow for nested solvers, the root solver is called by BdaBridge int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings bool analysis_done = false; std::unique_ptr mat = nullptr; // original matrix + std::unique_ptr jacMat = nullptr; // matrix for preconditioner BlockedMatrix *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv ILUReorder opencl_ilu_reorder; // reordering strategy std::vector events; @@ -138,6 +141,9 @@ private: /// \param[in] cols array of columnIndices, contains nnz values void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); + void initialize2(int N, int nnz, int dim, double *vals, int *rows, int *cols, + int nnz2, double *vals2, int *rows2, int *cols2); + /// Clean memory void finalize(); @@ -205,6 +211,10 @@ public: /// \return status code SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + SolverStatus solve_system2(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, + int nnz2, double *vals2, int *rows2, int *cols2, + WellContributions& wellContribs, BdaResult &res) override; + /// Solve scalar linear system, for example a coarse system of an AMG preconditioner /// Data is already on the GPU // SolverStatus solve_system(BdaResult &res); @@ -218,7 +228,7 @@ public: /// \param[in] context the opencl context to be used /// \param[in] queue the opencl queue to be used void setOpencl(std::shared_ptr& context, std::shared_ptr& queue); - + }; // end class openclSolverBackend } // namespace Accelerator diff --git a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp index e8d37237f..21d512172 100644 --- a/opm/simulators/linalg/findOverlapRowsAndColumns.hpp +++ b/opm/simulators/linalg/findOverlapRowsAndColumns.hpp @@ -39,9 +39,9 @@ namespace detail /// \param useWellConn Boolean that is true when UseWellContribusion is true /// \param wellGraph Cell IDs of well cells stored in a graph. template - void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector>& wellGraph) + void setWellConnections(const Grid& grid, const W& wells, bool useWellConn, std::vector>& wellGraph, int numJacobiBlocks) { - if ( grid.comm().size() > 1) + if ( grid.comm().size() > 1 || numJacobiBlocks > 1) { Dune::CartesianIndexMapper< Grid > cartMapper( grid ); const int numCells = cartMapper.compressedSize(); // grid.numCells() diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp index 94709ce3c..edfa6b3ae 100644 --- a/tests/test_openclSolver.cpp +++ b/tests/test_openclSolver.cpp @@ -118,7 +118,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, Matrix& matrix, Vec } auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory // matrix created by readMatrixMarket() did not have contiguous memory - bridge->solve_system(&mat2, rhs, *wellContribs, result); + bridge->solve_system(&mat2, &mat2, 0, rhs, *wellContribs, result); bridge->get_result(x); return x;