mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #5408 from BigDataAccelerate/cpr_rocsparse
rocsparse CPR initial version
This commit is contained in:
commit
ca2ef490aa
@ -247,29 +247,36 @@ endif()
|
||||
|
||||
if(USE_BDA_BRIDGE)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp
|
||||
opm/simulators/linalg/bda/CprCreation.cpp
|
||||
opm/simulators/linalg/bda/Misc.cpp
|
||||
opm/simulators/linalg/bda/WellContributions.cpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
|
||||
opm/simulators/linalg/ISTLSolverBda.cpp)
|
||||
if(OPENCL_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclBILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/ChowPatelIlu.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BISAI.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/CPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclBISAI.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclCPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/opencl.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclKernels.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclWellContributions.cpp)
|
||||
endif()
|
||||
if(ROCALUTION_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.cpp)
|
||||
endif()
|
||||
if(rocsparse_FOUND AND rocblas_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseWellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseWellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/hipKernels.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp)
|
||||
endif()
|
||||
if(CUDA_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
|
||||
@ -660,25 +667,33 @@ if (USE_BDA_BRIDGE)
|
||||
opm/simulators/linalg/bda/BdaBridge.hpp
|
||||
opm/simulators/linalg/bda/BdaResult.hpp
|
||||
opm/simulators/linalg/bda/BdaSolver.hpp
|
||||
opm/simulators/linalg/bda/opencl/BILU0.hpp
|
||||
opm/simulators/linalg/bda/CprCreation.hpp
|
||||
opm/simulators/linalg/bda/Preconditioner.hpp
|
||||
opm/simulators/linalg/bda/Misc.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclBILU0.hpp
|
||||
opm/simulators/linalg/bda/BlockedMatrix.hpp
|
||||
opm/simulators/linalg/bda/opencl/CPR.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclCPR.hpp
|
||||
opm/simulators/linalg/bda/cuda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp
|
||||
opm/simulators/linalg/bda/opencl/BISAI.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclBISAI.hpp
|
||||
opm/simulators/linalg/bda/Reorder.hpp
|
||||
opm/simulators/linalg/bda/opencl/opencl.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclKernels.hpp
|
||||
opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp
|
||||
opm/simulators/linalg/bda/opencl/Preconditioner.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/opencl/openclWellContributions.hpp
|
||||
opm/simulators/linalg/bda/Matrix.hpp
|
||||
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
|
||||
opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocsparseWellContributions.hpp
|
||||
opm/simulators/linalg/bda/rocm/hipKernels.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp
|
||||
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
|
||||
opm/simulators/linalg/bda/WellContributions.hpp
|
||||
opm/simulators/linalg/ISTLSolverBda.hpp
|
||||
)
|
||||
|
@ -43,11 +43,11 @@
|
||||
|
||||
#include <opm/grid/polyhedralgrid.hh>
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <thread>
|
||||
#include <omp.h>
|
||||
|
||||
std::shared_ptr<std::thread> copyThread;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif // HAVE_OPENMP
|
||||
|
||||
namespace Opm::detail {
|
||||
@ -113,9 +113,11 @@ apply(Vector& rhs,
|
||||
}
|
||||
#endif
|
||||
|
||||
bool use_multithreading = false;
|
||||
bool use_multithreading = true;
|
||||
#if HAVE_OPENMP
|
||||
use_multithreading = omp_get_max_threads() > 1;
|
||||
// if user manually sets --threads-per-process=1, do not use multithreading
|
||||
if (omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif // HAVE_OPENMP
|
||||
|
||||
if (numJacobiBlocks_ > 1) {
|
||||
@ -123,9 +125,9 @@ apply(Vector& rhs,
|
||||
//NOTE: copyThread can safely write to jacMat because in solve_system both matrix and *blockJacobiForGPUILU0_ diagonal entries
|
||||
//are checked and potentially overwritten in replaceZeroDiagonal() by mainThread. However, no matter the thread writing sequence,
|
||||
//the final entry in jacMat is correct.
|
||||
#if HAVE_OPENMP
|
||||
//#if HAVE_OPENMP
|
||||
copyThread = std::make_shared<std::thread>([&](){this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);});
|
||||
#endif // HAVE_OPENMP
|
||||
//#endif // HAVE_OPENMP
|
||||
}
|
||||
else {
|
||||
this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
|
||||
|
@ -43,11 +43,11 @@
|
||||
#endif
|
||||
|
||||
#if HAVE_ROCALUTION
|
||||
#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
#if HAVE_ROCSPARSE
|
||||
#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
@ -115,7 +115,7 @@ BdaBridge(std::string accelerator_mode_,
|
||||
use_gpu = true; // should be replaced by a 'use_bridge' boolean
|
||||
using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
|
||||
backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
|
||||
tolerance, platformID, deviceID);
|
||||
tolerance, platformID, deviceID, linsolver);
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error rocsparseSolver was chosen, but rocsparse/rocblas was not found by CMake");
|
||||
#endif
|
||||
|
295
opm/simulators/linalg/bda/CprCreation.cpp
Normal file
295
opm/simulators/linalg/bda/CprCreation.cpp
Normal file
@ -0,0 +1,295 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/CprCreation.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
CprCreation<Scalar, block_size>::CprCreation()
|
||||
{
|
||||
diagIndices.resize(1);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void CprCreation<Scalar, block_size>::
|
||||
create_preconditioner_amg(BlockedMatrix<Scalar> *mat_)
|
||||
{
|
||||
mat = mat_;
|
||||
cprNb = mat_->Nb;
|
||||
cprnnzb = mat_->nnzbs;
|
||||
cprN = cprNb * block_size;
|
||||
cprnnz = cprnnzb * block_size * block_size;
|
||||
|
||||
coarse_vals.resize(cprnnzb);
|
||||
coarse_x.resize(cprNb);
|
||||
coarse_y.resize(cprNb);
|
||||
weights.resize(cprN);
|
||||
|
||||
try{
|
||||
Scalar rhs[] = {0, 0, 0};
|
||||
rhs[pressure_idx] = 1;
|
||||
|
||||
// find diagonal index for each row
|
||||
if (diagIndices[0].empty()) {
|
||||
diagIndices[0].resize(cprNb);
|
||||
for (int row = 0; row < cprNb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
|
||||
assert(candidate != mat->colIndices + end);
|
||||
diagIndices[0][row] = candidate - mat->colIndices;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate weights for each row
|
||||
for (int row = 0; row < cprNb; ++row) {
|
||||
// solve to find weights
|
||||
Scalar *row_weights = weights.data() + block_size * row; // weights for this row
|
||||
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
|
||||
|
||||
// normalize weights for this row
|
||||
Scalar abs_max = get_absmax(row_weights, block_size);
|
||||
for(unsigned int i = 0; i < block_size; i++){
|
||||
row_weights[i] /= abs_max;
|
||||
}
|
||||
}
|
||||
|
||||
// extract pressure
|
||||
// transform blocks to scalars to create scalar linear system
|
||||
for (int row = 0; row < cprNb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
Scalar *block = mat->nnzValues + idx * block_size * block_size;
|
||||
Scalar *row_weights = weights.data() + block_size * row;
|
||||
Scalar value = 0.0;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
value += block[block_size * i + pressure_idx] * row_weights[i];
|
||||
}
|
||||
coarse_vals[idx] = value;
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_MPI
|
||||
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
|
||||
#else
|
||||
using Communication = Dune::Amg::SequentialInformation;
|
||||
#endif
|
||||
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
|
||||
if (recalculate_aggregates) {
|
||||
dune_coarse = std::make_unique<DuneMat>(cprNb, cprNb, cprnnzb, DuneMat::row_wise);
|
||||
|
||||
// typedef DuneMat::CreateIterator Iter;
|
||||
using Iter = typename DuneMat::CreateIterator;
|
||||
|
||||
// setup sparsity pattern
|
||||
for(Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row){
|
||||
int start = mat->rowPointers[row.index()];
|
||||
int end = mat->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
row.insert(col);
|
||||
}
|
||||
}
|
||||
|
||||
// set values
|
||||
for (int row = 0; row < cprNb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
|
||||
Dune::Amg::SequentialInformation seqinfo;
|
||||
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
|
||||
|
||||
Opm::PropertyTree property_tree;
|
||||
property_tree.put("alpha", 0.333333333333);
|
||||
|
||||
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
|
||||
// Yet a SymmetricDependency is used in AMGCPR
|
||||
// An UnSymmetricCriterion is also available
|
||||
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
|
||||
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
const Criterion c = Opm::AMGHelper<MatrixOperator,Dune::Amg::SequentialInformation,DuneMat,DuneVec>::criterion(property_tree);
|
||||
num_pre_smooth_steps = c.getNoPreSmoothSteps();
|
||||
num_post_smooth_steps = c.getNoPostSmoothSteps();
|
||||
|
||||
dune_amg->template build<OverlapFlags>(c);
|
||||
|
||||
analyzeHierarchy();
|
||||
analyzeAggregateMaps();
|
||||
|
||||
recalculate_aggregates = false;
|
||||
} else {
|
||||
// update values of coarsest level in AMG
|
||||
// this works because that level is actually a reference to the DuneMat held by dune_coarse
|
||||
for (int row = 0; row < cprNb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
// update the rest of the AMG hierarchy
|
||||
dune_amg->recalculateGalerkin(OverlapFlags());
|
||||
analyzeHierarchy();
|
||||
}
|
||||
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "Caught exception: " << ex.what() << std::endl;
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void CprCreation<Scalar, block_size>::
|
||||
analyzeHierarchy()
|
||||
{
|
||||
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
|
||||
// store coarsest AMG level in umfpack format, also performs LU decomposition
|
||||
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
|
||||
|
||||
num_levels = dune_amg->levels();
|
||||
level_sizes.resize(num_levels);
|
||||
diagIndices.resize(num_levels);
|
||||
|
||||
Amatrices.reserve(num_levels);
|
||||
Rmatrices.reserve(num_levels - 1); // coarsest level does not need one
|
||||
invDiags.reserve(num_levels);
|
||||
|
||||
Amatrices.clear();
|
||||
invDiags.clear();
|
||||
|
||||
// matrixIter.dereference() returns MatrixAdapter
|
||||
// matrixIter.dereference().getmat() returns BCRSMat
|
||||
typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for(int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
const auto& A = matrixIter.dereference().getmat();
|
||||
level_sizes[level] = A.N();
|
||||
diagIndices[level].reserve(A.N());
|
||||
|
||||
// extract matrix A
|
||||
Amatrices.emplace_back(A.N(), A.nonzeroes());
|
||||
// contiguous copy is not possible
|
||||
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
|
||||
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
|
||||
int nnz_idx = 0;
|
||||
const bool fillDiagIndices = diagIndices[level].empty();
|
||||
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
|
||||
if (fillDiagIndices && r.index() == c.index()) {
|
||||
diagIndices[level].emplace_back(nnz_idx);
|
||||
}
|
||||
nnz_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
Opm::BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers, Amatrices.back().colIndices);
|
||||
|
||||
// compute inverse diagonal values for current level
|
||||
invDiags.emplace_back(A.N());
|
||||
for (unsigned int row = 0; row < A.N(); ++row) {
|
||||
invDiags.back()[row] = 1 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void CprCreation<Scalar, block_size>::
|
||||
analyzeAggregateMaps()
|
||||
{
|
||||
PcolIndices.resize(num_levels - 1);
|
||||
Rmatrices.clear();
|
||||
|
||||
const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
|
||||
typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for(int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
typename DuneAmg::AggregatesMap *map = *mapIter;
|
||||
|
||||
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
|
||||
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
|
||||
|
||||
// get indices for each row of P and R
|
||||
std::vector<std::vector<unsigned> > indicesR(level_sizes[level+1]);
|
||||
PcolIndices[level].resize(level_sizes[level]);
|
||||
|
||||
using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
|
||||
for(AggregateIterator ai = map->begin(); ai != map->end(); ++ai){
|
||||
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
|
||||
const long int diff = ai - map->begin();
|
||||
PcolIndices[level][diff] = *ai;
|
||||
indicesR[*ai].emplace_back(diff);
|
||||
}
|
||||
}
|
||||
|
||||
int col_idx = 0;
|
||||
// set sparsity pattern of R
|
||||
Rmatrices.back().rowPointers[0] = 0;
|
||||
for (unsigned int i = 0; i < indicesR.size(); ++i) {
|
||||
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
|
||||
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
|
||||
Rmatrices.back().colIndices[col_idx++] = *it;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class CprCreation<T,1>; \
|
||||
template class CprCreation<T,2>; \
|
||||
template class CprCreation<T,3>; \
|
||||
template class CprCreation<T,4>; \
|
||||
template class CprCreation<T,5>; \
|
||||
template class CprCreation<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
|
86
opm/simulators/linalg/bda/CprCreation.hpp
Normal file
86
opm/simulators/linalg/bda/CprCreation.hpp
Normal file
@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_CPRCREATION_HPP
|
||||
#define OPM_CPRCREATION_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <dune/istl/paamg/matrixhierarchy.hh>
|
||||
#include <dune/istl/umfpack.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class CprCreation
|
||||
{
|
||||
int cprN;
|
||||
int cprNb;
|
||||
int cprnnz;
|
||||
int cprnnzb;
|
||||
|
||||
public:
|
||||
CprCreation();
|
||||
|
||||
protected:
|
||||
|
||||
int num_levels;
|
||||
std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<std::vector<Scalar> > invDiags; // inverse of diagonal of Amatrices
|
||||
|
||||
BlockedMatrix<Scalar> *mat = nullptr; // input matrix, blocked
|
||||
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
|
||||
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
|
||||
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
|
||||
std::unique_ptr<DuneAmg> dune_amg;
|
||||
std::unique_ptr<DuneMat> dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy
|
||||
std::shared_ptr<MatrixOperator> dune_op; // operator, input to Dune AMG
|
||||
std::vector<int> level_sizes; // size of each level in the AMG hierarchy
|
||||
std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
|
||||
Dune::UMFPack<DuneMat> umfpack; // dune/istl/umfpack object used to solve the coarsest level of AMG
|
||||
bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
|
||||
bool recalculate_aggregates = true; // only rerecalculate if true
|
||||
const int pressure_idx = 1; // hardcoded to mimic OPM
|
||||
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
|
||||
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
void analyzeHierarchy();
|
||||
|
||||
// Analyze the aggregateMaps from the AMG hierarchy
|
||||
// These can be reused, so only use when recalculate_aggregates is true
|
||||
void analyzeAggregateMaps();
|
||||
|
||||
void create_preconditioner_amg(BlockedMatrix<Scalar> *mat);
|
||||
};
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
62
opm/simulators/linalg/bda/Misc.cpp
Normal file
62
opm/simulators/linalg/bda/Misc.cpp
Normal file
@ -0,0 +1,62 @@
|
||||
#include <cmath>
|
||||
#include <algorithm>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int ceilDivision(const unsigned int A,
|
||||
const unsigned int B)
|
||||
{
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
// return the absolute value of the N elements for which the absolute value is highest
|
||||
template<class Scalar>
|
||||
Scalar get_absmax(const Scalar* data,
|
||||
const int N)
|
||||
{
|
||||
return std::abs(*std::max_element(data, data + N,
|
||||
[](Scalar a, Scalar b)
|
||||
{ return std::fabs(a) < std::fabs(b); }));
|
||||
}
|
||||
|
||||
// solve A^T * x = b
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar* A,
|
||||
const Scalar* b,
|
||||
Scalar* x)
|
||||
{
|
||||
const int B = 3;
|
||||
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
|
||||
Scalar t4 = A[0+0*B] * A[1+1*B];
|
||||
Scalar t6 = A[0+0*B] * A[1+2*B];
|
||||
Scalar t8 = A[0+1*B] * A[1+0*B];
|
||||
Scalar t10 = A[0+2*B] * A[1+0*B];
|
||||
Scalar t12 = A[0+1*B] * A[2+0*B];
|
||||
Scalar t14 = A[0+2*B] * A[2+0*B];
|
||||
|
||||
Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
|
||||
|
||||
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
|
||||
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
|
||||
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
|
||||
|
||||
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
|
||||
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
|
||||
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
|
||||
|
||||
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
|
||||
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
|
||||
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T)\
|
||||
template void solve_transposed_3x3<T>(const T* A, const T* b, T* x);\
|
||||
template T get_absmax<T>(const T* data, const int N);
|
||||
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
}
|
62
opm/simulators/linalg/bda/Misc.hpp
Normal file
62
opm/simulators/linalg/bda/Misc.hpp
Normal file
@ -0,0 +1,62 @@
|
||||
#ifndef OPM_MISC_HPP
|
||||
#define OPM_MISC_HPP
|
||||
|
||||
#ifdef HAVE_ROCSPARSE
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_version.h>
|
||||
#include <sstream>
|
||||
|
||||
#define HIP_CHECK(STAT) \
|
||||
do { \
|
||||
const hipError_t stat = (STAT); \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::hip "; \
|
||||
oss << "error: " << hipGetErrorString(stat); \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCSPARSE_CHECK(STAT) \
|
||||
do { \
|
||||
const rocsparse_status stat = (STAT); \
|
||||
if(stat != rocsparse_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocsparse "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCBLAS_CHECK(STAT) \
|
||||
do { \
|
||||
const rocblas_status stat = (STAT); \
|
||||
if(stat != rocblas_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocblas "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
unsigned int ceilDivision(const unsigned int A,
|
||||
const unsigned int B);
|
||||
|
||||
template<class Scalar>
|
||||
Scalar get_absmax(const Scalar *data,
|
||||
const int N);
|
||||
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar *A,
|
||||
const Scalar *b,
|
||||
Scalar *x);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
@ -20,12 +20,18 @@
|
||||
#ifndef OPM_PRECONDITIONER_HEADER_INCLUDED
|
||||
#define OPM_PRECONDITIONER_HEADER_INCLUDED
|
||||
|
||||
#if HAVE_OPENCL
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
|
||||
#include <memory>
|
||||
#endif
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
enum PreconditionerType {
|
||||
BILU0,
|
||||
CPR,
|
||||
BISAI
|
||||
};
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
@ -38,47 +44,38 @@ protected:
|
||||
int nnzb = 0; // number of blocks of the matrix
|
||||
int verbosity = 0;
|
||||
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
|
||||
Preconditioner(int verbosity_) :
|
||||
verbosity(verbosity_)
|
||||
{};
|
||||
|
||||
public:
|
||||
enum class Type {
|
||||
BILU0,
|
||||
CPR,
|
||||
BISAI
|
||||
};
|
||||
|
||||
static std::unique_ptr<Preconditioner> create(Type type,
|
||||
virtual ~Preconditioner() = default;
|
||||
|
||||
static std::unique_ptr<Preconditioner> create(PreconditionerType type,
|
||||
bool opencl_ilu_parallel,
|
||||
int verbosity);
|
||||
|
||||
virtual ~Preconditioner() = default;
|
||||
|
||||
// nested Preconditioners might need to override this
|
||||
virtual void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
#if HAVE_OPENCL
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
|
||||
#endif
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(Scalar& y, Scalar& x) = 0;
|
||||
|
||||
// analyze matrix, e.g. the sparsity pattern
|
||||
// probably only called once
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat) = 0;
|
||||
virtual bool analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat);
|
||||
BlockedMatrix<Scalar>* jacMat) = 0;
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat) = 0;
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat);
|
||||
BlockedMatrix<Scalar>* jacMat) = 0;
|
||||
};
|
||||
|
||||
} // namespace Opm::Accelerator
|
@ -34,7 +34,7 @@
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ROCSPARSE
|
||||
#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm {
|
||||
|
@ -38,10 +38,11 @@
|
||||
// otherwise, the nonzeroes of the matrix are assumed to be in a contiguous array, and a single GPU memcpy is enough
|
||||
#define COPY_ROW_BY_ROW 0
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <thread>
|
||||
#include <omp.h>
|
||||
extern std::shared_ptr<std::thread> copyThread;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif // HAVE_OPENMP
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
@ -342,11 +343,17 @@ copy_system_to_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
|
||||
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||
if (useJacMatrix) {
|
||||
|
||||
bool use_multithreading = true;
|
||||
#if HAVE_OPENMP
|
||||
if(omp_get_max_threads() > 1)
|
||||
copyThread->join();
|
||||
if(omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif
|
||||
|
||||
if (useJacMatrix) {
|
||||
if(use_multithreading)
|
||||
copyThread->join();
|
||||
|
||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
|
||||
nnzbs_prec * block_size * block_size * sizeof(Scalar),
|
||||
cudaMemcpyHostToDevice, stream);
|
||||
@ -399,12 +406,17 @@ update_system_on_gpu(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
#else
|
||||
cudaMemcpyAsync(d_bVals, matrix->nnzValues,
|
||||
nnz * sizeof(Scalar), cudaMemcpyHostToDevice, stream);
|
||||
if (useJacMatrix) {
|
||||
|
||||
bool use_multithreading = true;
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() > 1) {
|
||||
copyThread->join();
|
||||
}
|
||||
if (omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif
|
||||
|
||||
if (useJacMatrix) {
|
||||
if (use_multithreading)
|
||||
copyThread->join();
|
||||
|
||||
cudaMemcpyAsync(d_mVals, jacMatrix->nnzValues,
|
||||
nnzbs_prec * block_size * block_size * sizeof(Scalar),
|
||||
cudaMemcpyHostToDevice, stream);
|
||||
|
@ -1,580 +0,0 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Dune::Timer;
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
CPR<Scalar,block_size>::CPR(bool opencl_ilu_parallel_, int verbosity_)
|
||||
: Base(verbosity_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
bilu0 = std::make_unique<BILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
|
||||
diagIndices.resize(1);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
bilu0->setOpencl(context, queue);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_);
|
||||
mat = mat_;
|
||||
return success;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_, jacMat);
|
||||
mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool CPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
create_preconditioner_amg(mat); // already points to bilu0::rmat if needed
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "CPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// return the absolute value of the N elements for which the absolute value is highest
|
||||
template<class Scalar>
|
||||
Scalar get_absmax(const Scalar* data, const int N)
|
||||
{
|
||||
return std::abs(*std::max_element(data, data + N,
|
||||
[](Scalar a, Scalar b)
|
||||
{ return std::fabs(a) < std::fabs(b); }));
|
||||
}
|
||||
|
||||
// solve A^T * x = b
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x)
|
||||
{
|
||||
const int B = 3;
|
||||
// from dune-common/densematrix.hh, but transposed, so replace [r*B+c] with [r+c*B]
|
||||
Scalar t4 = A[0+0*B] * A[1+1*B];
|
||||
Scalar t6 = A[0+0*B] * A[1+2*B];
|
||||
Scalar t8 = A[0+1*B] * A[1+0*B];
|
||||
Scalar t10 = A[0+2*B] * A[1+0*B];
|
||||
Scalar t12 = A[0+1*B] * A[2+0*B];
|
||||
Scalar t14 = A[0+2*B] * A[2+0*B];
|
||||
|
||||
Scalar d = (t4*A[2+2*B]-t6*A[2+1*B]-t8*A[2+2*B]+
|
||||
t10*A[2+1*B]+t12*A[1+2*B]-t14*A[1+1*B]); // determinant
|
||||
|
||||
x[0] = (b[0]*A[1+1*B]*A[2+2*B] - b[0]*A[2+1*B]*A[1+2*B]
|
||||
- b[1] *A[0+1*B]*A[2+2*B] + b[1]*A[2+1*B]*A[0+2*B]
|
||||
+ b[2] *A[0+1*B]*A[1+2*B] - b[2]*A[1+1*B]*A[0+2*B]) / d;
|
||||
|
||||
x[1] = (A[0+0*B]*b[1]*A[2+2*B] - A[0+0*B]*b[2]*A[1+2*B]
|
||||
- A[1+0*B] *b[0]*A[2+2*B] + A[1+0*B]*b[2]*A[0+2*B]
|
||||
+ A[2+0*B] *b[0]*A[1+2*B] - A[2+0*B]*b[1]*A[0+2*B]) / d;
|
||||
|
||||
x[2] = (A[0+0*B]*A[1+1*B]*b[2] - A[0+0*B]*A[2+1*B]*b[1]
|
||||
- A[1+0*B] *A[0+1*B]*b[2] + A[1+0*B]*A[2+1*B]*b[0]
|
||||
+ A[2+0*B] *A[0+1*B]*b[1] - A[2+0*B]*A[1+1*B]*b[0]) / d;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar, block_size>::init_opencl_buffers()
|
||||
{
|
||||
d_Amatrices.reserve(num_levels);
|
||||
d_Rmatrices.reserve(num_levels - 1);
|
||||
d_invDiags.reserve(num_levels - 1);
|
||||
for (Matrix<Scalar>& m : Amatrices) {
|
||||
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
}
|
||||
for (Matrix<Scalar>& m : Rmatrices) {
|
||||
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
|
||||
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
|
||||
}
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * N);
|
||||
d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), Nb, Nb, nnzb, block_size);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * Nb);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::opencl_upload()
|
||||
{
|
||||
d_mat->upload(queue.get(), mat);
|
||||
|
||||
err = CL_SUCCESS;
|
||||
events.resize(2 * Rmatrices.size() + 1);
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
|
||||
sizeof(Scalar) * N, weights.data(), nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(queue.get(), &Amatrices[i]);
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
|
||||
sizeof(Scalar) * Amatrices[i].N, invDiags[i].data(),
|
||||
nullptr, &events[2*i+1]);
|
||||
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
|
||||
sizeof(int) * Amatrices[i].N, PcolIndices[i].data(),
|
||||
nullptr, &events[2*i+2]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
for (unsigned int i = 0; i < Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(queue.get(), &Rmatrices[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::
|
||||
create_preconditioner_amg(BlockedMatrix<Scalar>* mat_)
|
||||
{
|
||||
this->mat = mat_;
|
||||
|
||||
coarse_vals.resize(nnzb);
|
||||
coarse_x.resize(Nb);
|
||||
coarse_y.resize(Nb);
|
||||
weights.resize(N);
|
||||
|
||||
try {
|
||||
Scalar rhs[] = {0, 0, 0};
|
||||
rhs[pressure_idx] = 1;
|
||||
|
||||
// find diagonal index for each row
|
||||
if (diagIndices[0].empty()) {
|
||||
diagIndices[0].resize(Nb);
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
auto candidate = std::find(mat->colIndices + start, mat->colIndices + end, row);
|
||||
assert(candidate != mat->colIndices + end);
|
||||
diagIndices[0][row] = candidate - mat->colIndices;
|
||||
}
|
||||
}
|
||||
|
||||
// calculate weights for each row
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
// solve to find weights
|
||||
Scalar* row_weights = weights.data() + block_size * row; // weights for this row
|
||||
solve_transposed_3x3(mat->nnzValues + block_size * block_size * diagIndices[0][row], rhs, row_weights);
|
||||
|
||||
// normalize weights for this row
|
||||
Scalar abs_max = get_absmax(row_weights, block_size);
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
row_weights[i] /= abs_max;
|
||||
}
|
||||
}
|
||||
|
||||
// extract pressure
|
||||
// transform blocks to scalars to create scalar linear system
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
Scalar* block = mat->nnzValues + idx * block_size * block_size;
|
||||
Scalar* row_weights = weights.data() + block_size * row;
|
||||
Scalar value = 0.0;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
value += block[block_size * i + pressure_idx] * row_weights[i];
|
||||
}
|
||||
coarse_vals[idx] = value;
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_MPI
|
||||
using Communication = Dune::OwnerOverlapCopyCommunication<int, int>;
|
||||
#else
|
||||
using Communication = Dune::Amg::SequentialInformation;
|
||||
#endif
|
||||
using OverlapFlags = Dune::NegateSet<Communication::OwnerSet>;
|
||||
if (recalculate_aggregates) {
|
||||
dune_coarse = std::make_unique<DuneMat>(Nb, Nb, nnzb, DuneMat::row_wise);
|
||||
|
||||
using Iter = typename DuneMat::CreateIterator;
|
||||
|
||||
// setup sparsity pattern
|
||||
for (Iter row = dune_coarse->createbegin(); row != dune_coarse->createend(); ++row) {
|
||||
int start = mat->rowPointers[row.index()];
|
||||
int end = mat->rowPointers[row.index() + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
row.insert(col);
|
||||
}
|
||||
}
|
||||
|
||||
// set values
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
dune_op = std::make_shared<MatrixOperator>(*dune_coarse);
|
||||
Dune::Amg::SequentialInformation seqinfo;
|
||||
dune_amg = std::make_unique<DuneAmg>(dune_op, Dune::stackobject_to_shared_ptr(seqinfo));
|
||||
|
||||
PropertyTree property_tree;
|
||||
property_tree.put("alpha", 0.333333333333);
|
||||
|
||||
// The matrix has a symmetric sparsity pattern, but the values are not symmetric
|
||||
// Yet a SymmetricDependency is used in AMGCPR
|
||||
// An UnSymmetricCriterion is also available
|
||||
// using Criterion = Dune::Amg::CoarsenCriterion<Dune::Amg::UnSymmetricCriterion<DuneMat, Dune::Amg::FirstDiagonal> >;
|
||||
using CriterionBase = Dune::Amg::AggregationCriterion<Dune::Amg::SymmetricDependency<DuneMat, Dune::Amg::FirstDiagonal>>;
|
||||
using Criterion = Dune::Amg::CoarsenCriterion<CriterionBase>;
|
||||
const Criterion c = Opm::AMGHelper<MatrixOperator,Dune::Amg::SequentialInformation,DuneMat,DuneVec>::criterion(property_tree);
|
||||
num_pre_smooth_steps = c.getNoPreSmoothSteps();
|
||||
num_post_smooth_steps = c.getNoPostSmoothSteps();
|
||||
|
||||
dune_amg->template build<OverlapFlags>(c);
|
||||
|
||||
analyzeHierarchy();
|
||||
analyzeAggregateMaps();
|
||||
|
||||
recalculate_aggregates = false;
|
||||
} else {
|
||||
// update values of coarsest level in AMG
|
||||
// this works because that level is actually a reference to the DuneMat held by dune_coarse
|
||||
for (int row = 0; row < Nb; ++row) {
|
||||
int start = mat->rowPointers[row];
|
||||
int end = mat->rowPointers[row + 1];
|
||||
for (int idx = start; idx < end; ++idx) {
|
||||
int col = mat->colIndices[idx];
|
||||
(*dune_coarse)[row][col] = coarse_vals[idx];
|
||||
}
|
||||
}
|
||||
|
||||
// update the rest of the AMG hierarchy
|
||||
dune_amg->recalculateGalerkin(OverlapFlags());
|
||||
analyzeHierarchy();
|
||||
}
|
||||
|
||||
// initialize OpenclMatrices and Buffers if needed
|
||||
auto init_func = std::bind(&CPR::init_opencl_buffers, this);
|
||||
std::call_once(opencl_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
opencl_upload();
|
||||
|
||||
} catch (const std::exception& ex) {
|
||||
std::cerr << "Caught exception: " << ex.what() << std::endl;
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::analyzeHierarchy()
|
||||
{
|
||||
const typename DuneAmg::ParallelMatrixHierarchy& matrixHierarchy = dune_amg->matrices();
|
||||
|
||||
// store coarsest AMG level in umfpack format, also performs LU decomposition
|
||||
umfpack.setMatrix((*matrixHierarchy.coarsest()).getmat());
|
||||
|
||||
num_levels = dune_amg->levels();
|
||||
level_sizes.resize(num_levels);
|
||||
diagIndices.resize(num_levels);
|
||||
|
||||
Amatrices.reserve(num_levels);
|
||||
Rmatrices.reserve(num_levels - 1); // coarsest level does not need one
|
||||
invDiags.reserve(num_levels);
|
||||
|
||||
Amatrices.clear();
|
||||
invDiags.clear();
|
||||
|
||||
// matrixIter.dereference() returns MatrixAdapter
|
||||
// matrixIter.dereference().getmat() returns BCRSMat
|
||||
typename DuneAmg::ParallelMatrixHierarchy::ConstIterator matrixIter = matrixHierarchy.finest();
|
||||
for (int level = 0; level < num_levels; ++matrixIter, ++level) {
|
||||
const auto& A = matrixIter.dereference().getmat();
|
||||
level_sizes[level] = A.N();
|
||||
diagIndices[level].reserve(A.N());
|
||||
|
||||
// extract matrix A
|
||||
Amatrices.emplace_back(A.N(), A.nonzeroes());
|
||||
// contiguous copy is not possible
|
||||
// std::copy(&(A[0][0][0][0]), &(A[0][0][0][0]) + A.nonzeroes(), Amatrices.back().nnzValues.data());
|
||||
// also update diagonal indices if needed, level 0 is already filled in create_preconditioner()
|
||||
int nnz_idx = 0;
|
||||
const bool fillDiagIndices = diagIndices[level].empty();
|
||||
for (typename DuneMat::const_iterator r = A.begin(); r != A.end(); ++r) {
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
Amatrices.back().nnzValues[nnz_idx] = A[r.index()][c.index()];
|
||||
if (fillDiagIndices && r.index() == c.index()) {
|
||||
diagIndices[level].emplace_back(nnz_idx);
|
||||
}
|
||||
nnz_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
BdaBridge<DuneMat, DuneVec, 1>::copySparsityPatternFromISTL(A, Amatrices.back().rowPointers,
|
||||
Amatrices.back().colIndices);
|
||||
|
||||
// compute inverse diagonal values for current level
|
||||
invDiags.emplace_back(A.N());
|
||||
for (unsigned int row = 0; row < A.N(); ++row) {
|
||||
invDiags.back()[row] = 1.0 / Amatrices.back().nnzValues[diagIndices[level][row]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::analyzeAggregateMaps()
|
||||
{
|
||||
PcolIndices.resize(num_levels - 1);
|
||||
Rmatrices.clear();
|
||||
|
||||
const typename DuneAmg::AggregatesMapList& aggregatesMaps = dune_amg->aggregatesMaps();
|
||||
|
||||
typename DuneAmg::AggregatesMapList::const_iterator mapIter = aggregatesMaps.begin();
|
||||
for (int level = 0; level < num_levels - 1; ++mapIter, ++level) {
|
||||
typename DuneAmg::AggregatesMap* map = *mapIter;
|
||||
|
||||
Rmatrices.emplace_back(level_sizes[level+1], level_sizes[level], level_sizes[level]);
|
||||
std::fill(Rmatrices.back().nnzValues.begin(), Rmatrices.back().nnzValues.end(), 1.0);
|
||||
|
||||
// get indices for each row of P and R
|
||||
std::vector<std::vector<unsigned>> indicesR(level_sizes[level+1]);
|
||||
PcolIndices[level].resize(level_sizes[level]);
|
||||
|
||||
using AggregateIterator = typename DuneAmg::AggregatesMap::const_iterator;
|
||||
for (AggregateIterator ai = map->begin(); ai != map->end(); ++ai) {
|
||||
if (*ai != DuneAmg::AggregatesMap::ISOLATED) {
|
||||
const long int diff = ai - map->begin();
|
||||
PcolIndices[level][diff] = *ai;
|
||||
indicesR[*ai].emplace_back(diff);
|
||||
}
|
||||
}
|
||||
|
||||
int col_idx = 0;
|
||||
// set sparsity pattern of R
|
||||
Rmatrices.back().rowPointers[0] = 0;
|
||||
for (unsigned int i = 0; i < indicesR.size(); ++i) {
|
||||
Rmatrices.back().rowPointers[i + 1] = Rmatrices.back().rowPointers[i] + indicesR[i].size();
|
||||
for (auto it = indicesR[i].begin(); it != indicesR[i].end(); ++it) {
|
||||
Rmatrices.back().colIndices[col_idx++] = *it;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
OpenclMatrix<Scalar>* A = &d_Amatrices[level];
|
||||
OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
|
||||
if (level == num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueReadBuffer error");
|
||||
}
|
||||
|
||||
// solve coarsest level using umfpack
|
||||
umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
return;
|
||||
}
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
cl::Buffer& t = d_t[level];
|
||||
cl::Buffer& f = d_f[level];
|
||||
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < num_pre_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < num_post_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
|
||||
x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
}
|
||||
|
||||
// x = prec(y)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
// 0-initialize u and x vectors
|
||||
events.resize(d_u.size() + 1);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
|
||||
sizeof(Scalar) * Nb, nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
|
||||
sizeof(Scalar) * Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
|
||||
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void CPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bilu0->apply(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "CPR apply bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
apply_amg(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "CPR apply amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class CPR<T,1>; \
|
||||
template class CPR<T,2>; \
|
||||
template class CPR<T,3>; \
|
||||
template class CPR<T,4>; \
|
||||
template class CPR<T,5>; \
|
||||
template class CPR<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
@ -1,87 +0,0 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void Preconditioner<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
std::unique_ptr<Preconditioner<Scalar,block_size>>
|
||||
Preconditioner<Scalar,block_size>::create(Type type, bool opencl_ilu_parallel, int verbosity)
|
||||
{
|
||||
switch (type ) {
|
||||
case Type::BILU0:
|
||||
return std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case Type::CPR:
|
||||
return std::make_unique<CPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case Type::BISAI:
|
||||
return std::make_unique<BISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
}
|
||||
|
||||
OPM_THROW(std::logic_error,
|
||||
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool Preconditioner<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
return analyze_matrix(mat);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool Preconditioner<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
[[maybe_unused]] BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
return create_preconditioner(mat);
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class Preconditioner<T,1>; \
|
||||
template class Preconditioner<T,2>; \
|
||||
template class Preconditioner<T,3>; \
|
||||
template class Preconditioner<T,4>; \
|
||||
template class Preconditioner<T,5>; \
|
||||
template class Preconditioner<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
@ -24,19 +24,26 @@
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include <thread>
|
||||
extern std::shared_ptr<std::thread> copyThread;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif //HAVE_OPENMP
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Dune::Timer;
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
|
||||
openclBILU0<Scalar,block_size>::openclBILU0(bool opencl_ilu_parallel_, int verbosity_)
|
||||
: Base(verbosity_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
@ -46,13 +53,13 @@ BILU0<Scalar,block_size>::BILU0(bool opencl_ilu_parallel_, int verbosity_)
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
|
||||
bool openclBILU0<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
|
||||
{
|
||||
return analyze_matrix(mat, nullptr);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BILU0<Scalar,block_size>::
|
||||
bool openclBILU0<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
@ -80,7 +87,7 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
CSCRowIndices.data(), CSCColPointers.data(), Nb);
|
||||
if(verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
out << "openclBILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} else {
|
||||
@ -105,11 +112,11 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
}
|
||||
|
||||
if (verbosity >= 1) {
|
||||
out << "BILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||
out << "openclBILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors\n";
|
||||
}
|
||||
|
||||
#if CHOW_PATEL
|
||||
out << "BILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||
out << "openclBILU0 CHOW_PATEL: " << CHOW_PATEL << ", CHOW_PATEL_GPU: " << CHOW_PATEL_GPU;
|
||||
#endif
|
||||
OpmLog::info(out.str());
|
||||
|
||||
@ -169,25 +176,35 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
OPM_THROW(std::logic_error, "openclBILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
|
||||
bool openclBILU0<Scalar,block_size>::create_preconditioner(BlockedMatrix<Scalar>* mat)
|
||||
{
|
||||
return create_preconditioner(mat, nullptr);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BILU0<Scalar,block_size>::
|
||||
bool openclBILU0<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
auto *matToDecompose = jacMat ? jacMat : mat;
|
||||
bool use_multithreading = true;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif
|
||||
|
||||
if (jacMat && use_multithreading) {
|
||||
copyThread->join();
|
||||
}
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
Timer t_copy;
|
||||
@ -196,7 +213,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
out << "BILU0 memcpy: " << t_copy.stop() << " s";
|
||||
out << "openclBILU0 memcpy: " << t_copy.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
@ -239,12 +256,12 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error");
|
||||
OPM_THROW(std::logic_error, "openclBILU0 OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
||||
out << "openclBILU0 copy to GPU: " << t_copyToGpu.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
@ -264,7 +281,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
out << "BILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
out << "openclBILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
#endif // CHOW_PATEL
|
||||
@ -276,7 +293,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
void openclBILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
const Scalar relaxation = 0.9;
|
||||
cl::Event event;
|
||||
@ -311,18 +328,18 @@ void BILU0<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 apply: " << t_apply.stop() << " s";
|
||||
out << "openclBILU0 apply: " << t_apply.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class BILU0<T,1>; \
|
||||
template class BILU0<T,2>; \
|
||||
template class BILU0<T,3>; \
|
||||
template class BILU0<T,4>; \
|
||||
template class BILU0<T,5>; \
|
||||
template class BILU0<T,6>;
|
||||
template class openclBILU0<T,1>; \
|
||||
template class openclBILU0<T,2>; \
|
||||
template class openclBILU0<T,3>; \
|
||||
template class openclBILU0<T,4>; \
|
||||
template class openclBILU0<T,5>; \
|
||||
template class openclBILU0<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
@ -17,15 +17,15 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BILU0_HPP
|
||||
#define BILU0_HPP
|
||||
#ifndef OPM_OPENCLBILU0_HPP
|
||||
#define OPM_OPENCLBILU0_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp>
|
||||
|
||||
|
||||
@ -35,9 +35,9 @@ namespace Opm::Accelerator {
|
||||
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
|
||||
/// The preconditioner is applied via two exact triangular solves
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class BILU0 : public Preconditioner<Scalar,block_size>
|
||||
class openclBILU0 : public openclPreconditioner<Scalar,block_size>
|
||||
{
|
||||
using Base = Preconditioner<Scalar,block_size>;
|
||||
using Base = openclPreconditioner<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -87,7 +87,7 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
BILU0(bool opencl_ilu_parallel, int verbosity);
|
||||
openclBILU0(bool opencl_ilu_parallel, int verbosity);
|
||||
|
||||
// analysis, extract parallelism if specified
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
|
||||
@ -103,6 +103,7 @@ public:
|
||||
// via Lz = y
|
||||
// and Ux = z
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
void apply(Scalar& y, Scalar& x) {}
|
||||
|
||||
std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
|
||||
get_preconditioner_structure()
|
@ -26,8 +26,8 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // disable BISAI if ChowPatel is selected
|
||||
@ -40,17 +40,17 @@ using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
BISAI<Scalar,block_size>::BISAI(bool opencl_ilu_parallel_, int verbosity_)
|
||||
openclBISAI<Scalar,block_size>::openclBISAI(bool opencl_ilu_parallel_, int verbosity_)
|
||||
: Base(verbosity_)
|
||||
{
|
||||
#if CHOW_PATEL
|
||||
OPM_THROW(std::logic_error, "Error --linear-solver=isai cannot be used if ChowPatelIlu is used, probably defined by CMake\n");
|
||||
#endif
|
||||
bilu0 = std::make_unique<BILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
|
||||
bilu0 = std::make_unique<openclBILU0<Scalar,block_size>>(opencl_ilu_parallel_, verbosity_);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BISAI<Scalar,block_size>::
|
||||
void openclBISAI<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
@ -79,13 +79,13 @@ buildCsrToCscOffsetMap(std::vector<int> colPointers, std::vector<int> rowIndices
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
|
||||
bool openclBISAI<Scalar,block_size>::analyze_matrix(BlockedMatrix<Scalar>* mat)
|
||||
{
|
||||
return analyze_matrix(mat, nullptr);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BISAI<Scalar,block_size>::
|
||||
bool openclBISAI<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
@ -108,7 +108,7 @@ analyze_matrix(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
|
||||
void openclBISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
|
||||
{
|
||||
lower.subsystemPointers.assign(Nb + 1, 0);
|
||||
|
||||
@ -138,14 +138,14 @@ void BISAI<Scalar,block_size>::buildLowerSubsystemsStructures()
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BISAI buildLowerSubsystemsStructures time: "
|
||||
out << "openclBISAI buildLowerSubsystemsStructures time: "
|
||||
<< t_buildLowerSubsystemsStructures.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
|
||||
void openclBISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
|
||||
{
|
||||
upper.subsystemPointers.assign(Nb + 1, 0);
|
||||
|
||||
@ -175,14 +175,14 @@ void BISAI<Scalar,block_size>::buildUpperSubsystemsStructures()
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BISAI buildUpperSubsystemsStructures time: "
|
||||
out << "openclBISAI buildUpperSubsystemsStructures time: "
|
||||
<< t_buildUpperSubsystemsStructures.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BISAI<Scalar,block_size>::
|
||||
bool openclBISAI<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
@ -300,7 +300,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "BISAI OpenCL enqueueWriteBuffer error");
|
||||
OPM_THROW(std::logic_error, "openclBISAI OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
});
|
||||
|
||||
@ -326,7 +326,7 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "BISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
|
||||
out << "openclBISAI createPreconditioner time: " << t_preconditioner.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
@ -334,14 +334,14 @@ create_preconditioner(BlockedMatrix<Scalar>* mat, BlockedMatrix<Scalar>* jacMat)
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool BISAI<Scalar,block_size>::
|
||||
bool openclBISAI<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat)
|
||||
{
|
||||
return create_preconditioner(mat, nullptr);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
|
||||
void openclBISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
@ -354,12 +354,12 @@ void BISAI<Scalar,block_size>::apply(const cl::Buffer& x, cl::Buffer& y)
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class BISAI<T,1>; \
|
||||
template class BISAI<T,2>; \
|
||||
template class BISAI<T,3>; \
|
||||
template class BISAI<T,4>; \
|
||||
template class BISAI<T,5>; \
|
||||
template class BISAI<T,6>;
|
||||
template class openclBISAI<T,1>; \
|
||||
template class openclBISAI<T,2>; \
|
||||
template class openclBISAI<T,3>; \
|
||||
template class openclBISAI<T,4>; \
|
||||
template class openclBISAI<T,5>; \
|
||||
template class openclBISAI<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
@ -17,14 +17,14 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BISAI_HPP
|
||||
#define BISAI_HPP
|
||||
#ifndef OPM_OPENCLBISAI_HPP
|
||||
#define OPM_OPENCLBISAI_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
@ -33,9 +33,9 @@ template<class Scalar> class BlockedMatrix;
|
||||
/// This class implements a Blocked version of the Incomplete Sparse Approximate Inverse (ISAI) preconditioner.
|
||||
/// Inspired by the paper "Incomplete Sparse Approximate Inverses for Parallel Preconditioning" by Anzt et. al.
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class BISAI : public Preconditioner<Scalar,block_size>
|
||||
class openclBISAI : public openclPreconditioner<Scalar,block_size>
|
||||
{
|
||||
using Base = Preconditioner<Scalar,block_size>;
|
||||
using Base = openclPreconditioner<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -68,7 +68,7 @@ private:
|
||||
cl::Buffer d_invL_x;
|
||||
|
||||
bool opencl_ilu_parallel;
|
||||
std::unique_ptr<BILU0<Scalar,block_size>> bilu0;
|
||||
std::unique_ptr<openclBILU0<Scalar,block_size>> bilu0;
|
||||
|
||||
/// Struct that holds the structure of the small subsystems for each column
|
||||
struct subsystemStructure {
|
||||
@ -107,7 +107,7 @@ private:
|
||||
void buildUpperSubsystemsStructures();
|
||||
|
||||
public:
|
||||
BISAI(bool opencl_ilu_parallel, int verbosity);
|
||||
openclBISAI(bool opencl_ilu_parallel, int verbosity);
|
||||
|
||||
// set own Opencl variables, but also that of the bilu0 preconditioner
|
||||
void setOpencl(std::shared_ptr<cl::Context>& context,
|
||||
@ -125,6 +125,7 @@ public:
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
void apply(Scalar& y, Scalar& x) {}
|
||||
};
|
||||
|
||||
/// Similar function to csrPatternToCsc. It gives an offset map from CSR to CSC instead of the full CSR to CSC conversion.
|
321
opm/simulators/linalg/bda/opencl/openclCPR.cpp
Normal file
321
opm/simulators/linalg/bda/opencl/openclCPR.cpp
Normal file
@ -0,0 +1,321 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Dune::Timer;
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclCPR<Scalar,block_size>::openclCPR(bool opencl_ilu_parallel_, int verbosity_)
|
||||
: Base(verbosity_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
bilu0 = std::make_unique<openclBILU0<Scalar,block_size> >(opencl_ilu_parallel, verbosity_);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_, std::shared_ptr<cl::CommandQueue>& queue_) {
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
|
||||
bilu0->setOpencl(context, queue);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclCPR<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat_) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = this->Nb * block_size;
|
||||
this->nnz = this->nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_);
|
||||
this->mat = mat_;
|
||||
return success;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclCPR<Scalar,block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = this->Nb * block_size;
|
||||
this->nnz = this->nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_, jacMat);
|
||||
this->mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclCPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_, BlockedMatrix<Scalar>* jacMat) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat); // already points to bilu0::rmat if needed
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
// initialize OpenclMatrices and Buffers if needed
|
||||
auto init_func = std::bind(&openclCPR::init_opencl_buffers, this);
|
||||
std::call_once(opencl_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
opencl_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
bool openclCPR<Scalar,block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar>* mat_) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat); // already points to bilu0::rmat if needed
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
// initialize OpenclMatrices and Buffers if needed
|
||||
auto init_func = std::bind(&openclCPR::init_opencl_buffers, this);
|
||||
std::call_once(opencl_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
opencl_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar, block_size>::
|
||||
init_opencl_buffers() {
|
||||
d_Amatrices.reserve(this->num_levels);
|
||||
d_Rmatrices.reserve(this->num_levels - 1);
|
||||
d_invDiags.reserve(this->num_levels - 1);
|
||||
for (Matrix<Scalar>& m : this->Amatrices) {
|
||||
d_Amatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
}
|
||||
for (Matrix<Scalar>& m : this->Rmatrices) {
|
||||
d_Rmatrices.emplace_back(context.get(), m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
d_u.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.N);
|
||||
|
||||
d_PcolIndices.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(int) * m.M);
|
||||
d_invDiags.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M); // create a cl::Buffer
|
||||
d_t.emplace_back(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * m.M);
|
||||
}
|
||||
d_weights = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->N);
|
||||
d_rs = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->N);
|
||||
d_mat = std::make_unique<OpenclMatrix<Scalar>>(context.get(), this->Nb, this->Nb, this->nnzb, block_size);
|
||||
d_coarse_y = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->Nb);
|
||||
d_coarse_x = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(Scalar) * this->Nb);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar,block_size>::opencl_upload()
|
||||
{
|
||||
d_mat->upload(queue.get(), this->mat);
|
||||
|
||||
err = CL_SUCCESS;
|
||||
events.resize(2 * this->Rmatrices.size() + 1);
|
||||
err |= queue->enqueueWriteBuffer(*d_weights, CL_FALSE, 0,
|
||||
sizeof(Scalar) * this->N, this->weights.data(), nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(queue.get(), &this->Amatrices[i]);
|
||||
|
||||
err |= queue->enqueueWriteBuffer(d_invDiags[i], CL_FALSE, 0,
|
||||
sizeof(Scalar) * this->Amatrices[i].N, this->invDiags[i].data(),
|
||||
nullptr, &events[2*i+1]);
|
||||
err |= queue->enqueueWriteBuffer(d_PcolIndices[i], CL_FALSE, 0,
|
||||
sizeof(int) * this->Amatrices[i].N, this->PcolIndices[i].data(),
|
||||
nullptr, &events[2*i+2]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(queue.get(), &this->Rmatrices[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar,block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
OpenclMatrix<Scalar>* A = &d_Amatrices[level];
|
||||
OpenclMatrix<Scalar>* R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
|
||||
if (level == this->num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueReadBuffer(y, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_y.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueReadBuffer error");
|
||||
}
|
||||
|
||||
// solve coarsest level using umfpack
|
||||
this->umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
events.resize(1);
|
||||
err = queue->enqueueWriteBuffer(x, CL_FALSE, 0,
|
||||
sizeof(Scalar) * Ncur, h_x.data(), nullptr, &events[0]);
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "openclCPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
return;
|
||||
}
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
cl::Buffer& t = d_t[level];
|
||||
cl::Buffer& f = d_f[level];
|
||||
cl::Buffer& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels<Scalar>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < this->num_post_smooth_steps; ++i) {
|
||||
OpenclKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers,
|
||||
x, y, t, Ncur, 1);
|
||||
OpenclKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
}
|
||||
|
||||
// x = prec(y)
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar,block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
// 0-initialize u and x vectors
|
||||
events.resize(d_u.size() + 1);
|
||||
err = queue->enqueueFillBuffer(*d_coarse_x, 0, 0,
|
||||
sizeof(Scalar) * this->Nb, nullptr, &events[0]);
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
err |= queue->enqueueFillBuffer(d_u[i], 0, 0,
|
||||
sizeof(Scalar) * this->Rmatrices[i].N, nullptr, &events[i + 1]);
|
||||
}
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
if (err != CL_SUCCESS) {
|
||||
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices,
|
||||
d_mat->rowPointers, x, y, *d_rs, this->Nb, block_size);
|
||||
OpenclKernels<Scalar>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, this->Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels<Scalar>::add_coarse_pressure_correction(*d_coarse_x, x, this->pressure_idx, this->Nb);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclCPR<Scalar,block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bilu0->apply(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR apply bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
apply_amg(y, x);
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
out << "openclCPR apply amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class openclCPR<T,1>; \
|
||||
template class openclCPR<T,2>; \
|
||||
template class openclCPR<T,3>; \
|
||||
template class openclCPR<T,4>; \
|
||||
template class openclCPR<T,5>; \
|
||||
template class openclCPR<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
@ -17,8 +17,8 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_CPR_HPP
|
||||
#define OPM_CPR_HPP
|
||||
#ifndef OPM_OPENCLCPR_HPP
|
||||
#define OPM_OPENCLCPR_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
@ -26,10 +26,11 @@
|
||||
#include <dune/istl/umfpack.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/BILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/CprCreation.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/OpenclMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp>
|
||||
|
||||
@ -39,9 +40,9 @@ template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template<class Scalar, unsigned int block_size>
|
||||
class CPR : public Preconditioner<Scalar,block_size>
|
||||
class openclCPR : public openclPreconditioner<Scalar,block_size>, public CprCreation<Scalar, block_size>
|
||||
{
|
||||
using Base = Preconditioner<Scalar,block_size>;
|
||||
using Base = openclPreconditioner<Scalar,block_size>;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
@ -54,13 +55,8 @@ class CPR : public Preconditioner<Scalar,block_size>
|
||||
using Base::err;
|
||||
|
||||
private:
|
||||
int num_levels;
|
||||
std::vector<Scalar> weights, coarse_vals, coarse_x, coarse_y;
|
||||
std::vector<Matrix<Scalar>> Amatrices, Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<OpenclMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
std::vector<std::vector<int> > PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<cl::Buffer> d_PcolIndices;
|
||||
std::vector<std::vector<Scalar>> invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<cl::Buffer> d_invDiags;
|
||||
std::vector<cl::Buffer> d_t, d_f, d_u; // intermediate vectors used during amg cycle
|
||||
std::unique_ptr<cl::Buffer> d_rs; // use before extracting the pressure
|
||||
@ -69,35 +65,11 @@ private:
|
||||
std::unique_ptr<cl::Buffer> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag opencl_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
std::unique_ptr<BILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
|
||||
BlockedMatrix<Scalar>* mat = nullptr; // input matrix, blocked
|
||||
|
||||
using DuneMat = Dune::BCRSMatrix<Dune::FieldMatrix<Scalar, 1, 1> >;
|
||||
using DuneVec = Dune::BlockVector<Dune::FieldVector<Scalar, 1> >;
|
||||
using MatrixOperator = Dune::MatrixAdapter<DuneMat, DuneVec, DuneVec>;
|
||||
using DuneAmg = Dune::Amg::MatrixHierarchy<MatrixOperator, Dune::Amg::SequentialInformation>;
|
||||
std::unique_ptr<DuneAmg> dune_amg;
|
||||
std::unique_ptr<DuneMat> dune_coarse; // extracted pressure matrix, finest level in AMG hierarchy
|
||||
std::shared_ptr<MatrixOperator> dune_op; // operator, input to Dune AMG
|
||||
std::vector<int> level_sizes; // size of each level in the AMG hierarchy
|
||||
std::vector<std::vector<int> > diagIndices; // index of diagonal value for each level
|
||||
Dune::UMFPack<DuneMat> umfpack; // dune/istl/umfpack object used to solve the coarsest level of AMG
|
||||
bool always_recalculate_aggregates = false; // OPM always reuses the aggregates by default
|
||||
bool recalculate_aggregates = true; // only rerecalculate if true
|
||||
const int pressure_idx = 1; // hardcoded to mimic OPM
|
||||
unsigned num_pre_smooth_steps; // number of Jacobi smooth steps before restriction
|
||||
unsigned num_post_smooth_steps; // number of Jacobi smooth steps after prolongation
|
||||
std::unique_ptr<openclBILU0<Scalar,block_size>> bilu0; // Blocked ILU0 preconditioner
|
||||
|
||||
std::unique_ptr<openclSolverBackend<Scalar,1>> coarse_solver; // coarse solver is scalar
|
||||
bool opencl_ilu_parallel; // whether ILU0 operation should be parallelized
|
||||
|
||||
// Analyze the AMG hierarchy build by Dune
|
||||
void analyzeHierarchy();
|
||||
|
||||
// Analyze the aggregateMaps from the AMG hierarchy
|
||||
// These can be reused, so only use when recalculate_aggregates is true
|
||||
void analyzeAggregateMaps();
|
||||
|
||||
// Initialize and allocate matrices and vectors
|
||||
void init_opencl_buffers();
|
||||
|
||||
@ -109,10 +81,8 @@ private:
|
||||
|
||||
void amg_cycle_gpu(const int level, cl::Buffer &y, cl::Buffer &x);
|
||||
|
||||
void create_preconditioner_amg(BlockedMatrix<Scalar>* mat);
|
||||
|
||||
public:
|
||||
CPR(bool opencl_ilu_parallel, int verbosity);
|
||||
openclCPR(bool opencl_ilu_parallel, int verbosity);
|
||||
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat) override;
|
||||
bool analyze_matrix(BlockedMatrix<Scalar>* mat,
|
||||
@ -125,18 +95,13 @@ public:
|
||||
// applies blocked ilu0
|
||||
// also applies amg for pressure component
|
||||
void apply(const cl::Buffer& y, cl::Buffer& x) override;
|
||||
void apply(Scalar& y, Scalar& x) {}
|
||||
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat) override;
|
||||
bool create_preconditioner(BlockedMatrix<Scalar>* mat,
|
||||
BlockedMatrix<Scalar>* jacMat) override;
|
||||
};
|
||||
|
||||
// solve A^T * x = b
|
||||
// A should represent a 3x3 matrix
|
||||
// x and b are vectors with 3 elements
|
||||
template<class Scalar>
|
||||
void solve_transposed_3x3(const Scalar* A, const Scalar* b, Scalar* x);
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
@ -26,6 +26,8 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
@ -84,12 +86,6 @@ std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
{
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::init(cl::Context *context,
|
||||
cl::CommandQueue *queue_,
|
||||
|
74
opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp
Normal file
74
opm/simulators/linalg/bda/opencl/openclPreconditioner.cpp
Normal file
@ -0,0 +1,74 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
std::unique_ptr<openclPreconditioner<Scalar,block_size>>
|
||||
openclPreconditioner<Scalar,block_size>::
|
||||
create(PreconditionerType type,
|
||||
int verbosity,
|
||||
bool opencl_ilu_parallel)
|
||||
{
|
||||
switch (type ) {
|
||||
case PreconditionerType::BILU0:
|
||||
return std::make_unique<openclBILU0<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case PreconditionerType::CPR:
|
||||
return std::make_unique<openclCPR<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
case PreconditionerType::BISAI:
|
||||
return std::make_unique<openclBISAI<Scalar,block_size>>(opencl_ilu_parallel, verbosity);
|
||||
}
|
||||
|
||||
OPM_THROW(std::logic_error,
|
||||
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclPreconditioner<Scalar,block_size>::
|
||||
setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
std::shared_ptr<cl::CommandQueue>& queue_)
|
||||
{
|
||||
context = context_;
|
||||
queue = queue_;
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class openclPreconditioner<T,1>; \
|
||||
template class openclPreconditioner<T,2>; \
|
||||
template class openclPreconditioner<T,3>; \
|
||||
template class openclPreconditioner<T,4>; \
|
||||
template class openclPreconditioner<T,5>; \
|
||||
template class openclPreconditioner<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
62
opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
Normal file
62
opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
Copyright 2021 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_OPENCLPRECONDITIONER_HEADER_INCLUDED
|
||||
#define OPM_OPENCLPRECONDITIONER_HEADER_INCLUDED
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class openclPreconditioner : public Preconditioner<Scalar, block_size>
|
||||
{
|
||||
|
||||
protected:
|
||||
std::shared_ptr<cl::Context> context;
|
||||
std::shared_ptr<cl::CommandQueue> queue;
|
||||
std::vector<cl::Event> events;
|
||||
cl_int err;
|
||||
|
||||
openclPreconditioner(int verbosity_) :
|
||||
Preconditioner<Scalar, block_size>(verbosity_)
|
||||
{};
|
||||
|
||||
public:
|
||||
virtual ~openclPreconditioner() = default;
|
||||
|
||||
static std::unique_ptr<openclPreconditioner<Scalar, block_size>> create(PreconditionerType type, int verbosity, bool opencl_ilu_parallel);
|
||||
|
||||
// nested Preconditioners might need to override this
|
||||
virtual void setOpencl(std::shared_ptr<cl::Context>& context, std::shared_ptr<cl::CommandQueue>& queue);
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(const cl::Buffer& y, cl::Buffer& x) = 0;
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat, BlockedMatrix<Scalar> *jacMat) = 0;
|
||||
};
|
||||
} //namespace Opm
|
||||
|
||||
#endif
|
@ -71,16 +71,13 @@ openclSolverBackend(int verbosity_,
|
||||
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
|
||||
}
|
||||
|
||||
using PreconditionerType = Preconditioner<Scalar,block_size>;
|
||||
using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
|
||||
if (use_cpr) {
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::CPR,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity, opencl_ilu_parallel);
|
||||
} else if (use_isai) {
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::BISAI,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BISAI, verbosity, opencl_ilu_parallel);
|
||||
} else {
|
||||
prec = PreconditionerType::create(PreconditionerType::Type::BILU0,
|
||||
opencl_ilu_parallel, verbosity);
|
||||
prec = openclPreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity, opencl_ilu_parallel);
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
@ -228,8 +225,10 @@ openclSolverBackend(int verbosity_,
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
openclSolverBackend<Scalar,block_size>::
|
||||
openclSolverBackend(int verbosity_, int maxit_,
|
||||
Scalar tolerance_, bool opencl_ilu_parallel_)
|
||||
openclSolverBackend(int verbosity_,
|
||||
int maxit_,
|
||||
Scalar tolerance_,
|
||||
bool opencl_ilu_parallel_)
|
||||
: Base(verbosity_, maxit_, tolerance_)
|
||||
, opencl_ilu_parallel(opencl_ilu_parallel_)
|
||||
{
|
||||
@ -248,7 +247,8 @@ setOpencl(std::shared_ptr<cl::Context>& context_,
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
void openclSolverBackend<Scalar,block_size>::
|
||||
gpu_pbicgstab(WellContributions<Scalar>& wellContribs, BdaResult& res)
|
||||
gpu_pbicgstab(WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res)
|
||||
{
|
||||
float it;
|
||||
Scalar rho, rhop, beta, alpha, omega, tmp1, tmp2;
|
||||
|
@ -25,7 +25,7 @@
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/Preconditioner.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclPreconditioner.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
@ -60,7 +60,7 @@ private:
|
||||
|
||||
bool useJacMatrix = false;
|
||||
|
||||
std::unique_ptr<Preconditioner<Scalar,block_size>> prec;
|
||||
std::unique_ptr<openclPreconditioner<Scalar,block_size>> prec;
|
||||
// can perform blocked ILU0 and AMG on pressure component
|
||||
bool is_root; // allow for nested solvers, the root solver is called by BdaBridge
|
||||
bool analysis_done = false;
|
||||
|
493
opm/simulators/linalg/bda/rocm/hipKernels.cpp
Normal file
493
opm/simulators/linalg/bda/rocm/hipKernels.cpp
Normal file
@ -0,0 +1,493 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
namespace Opm {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
// define static variables and kernels
|
||||
template<class Scalar> int HipKernels<Scalar>::verbosity = 0;
|
||||
template<class Scalar> bool HipKernels<Scalar>::initialized = false;
|
||||
|
||||
#ifdef __HIP__
|
||||
/// HIP kernel to multiply vector with another vector and a scalar, element-wise
|
||||
// add result to a third vector
|
||||
template<class Scalar>
|
||||
__global__ void vmul_k(const Scalar alpha,
|
||||
Scalar const *in1,
|
||||
Scalar const *in2,
|
||||
Scalar *out,
|
||||
const int N)
|
||||
{
|
||||
unsigned int NUM_THREADS = gridDim.x;
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(idx < N){
|
||||
out[idx] += alpha * in1[idx] * in2[idx];
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to transform blocked vector to scalar vector using pressure-weights
|
||||
// every workitem handles one blockrow
|
||||
template<class Scalar>
|
||||
__global__ void full_to_pressure_restriction_k(const Scalar *fine_y,
|
||||
const Scalar *weights,
|
||||
Scalar *coarse_y,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
Scalar sum = 0.0;
|
||||
unsigned int idx = block_size * target_block_row;
|
||||
for (unsigned int i = 0; i < block_size; ++i) {
|
||||
sum += fine_y[idx + i] * weights[idx + i];
|
||||
}
|
||||
coarse_y[target_block_row] = sum;
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to add the coarse pressure solution back to the finer, complete solution
|
||||
// every workitem handles one blockrow
|
||||
template<class Scalar>
|
||||
__global__ void add_coarse_pressure_correction_k(const Scalar *coarse_x,
|
||||
Scalar *fine_x,
|
||||
const unsigned int pressure_idx,
|
||||
const unsigned int Nb)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int block_size = 3;
|
||||
unsigned int target_block_row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
fine_x[target_block_row * block_size + pressure_idx] += coarse_x[target_block_row];
|
||||
target_block_row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
/// HIP kernel to prolongate vector during amg cycle
|
||||
// every workitem handles one row
|
||||
template<class Scalar>
|
||||
__global__ void prolongate_vector_k(const Scalar *in,
|
||||
Scalar *out,
|
||||
const int *cols,
|
||||
const unsigned int N)
|
||||
{
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
while(row < N){
|
||||
out[row] += in[cols[row]];
|
||||
row += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
|
||||
// res = rhs - mat * x
|
||||
// algorithm based on:
|
||||
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
template<class Scalar>
|
||||
__global__ void residual_blocked_k(const Scalar *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int Nb,
|
||||
const Scalar *x,
|
||||
const Scalar *rhs,
|
||||
Scalar *out,
|
||||
const unsigned int block_size)
|
||||
{
|
||||
extern __shared__ Scalar tmp[];
|
||||
const unsigned int warpsize = warpSize;
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
unsigned int idx = idx_b * bsize + idx_t;
|
||||
const unsigned int bs = block_size;
|
||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = gridDim.x;
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
|
||||
// for 3x3 blocks:
|
||||
// num_active_threads: 27 (CUDA) vs 63 (ROCM)
|
||||
// num_blocks_per_warp: 3 (CUDA) vs 7 (ROCM)
|
||||
int offsetTarget = warpsize == 64 ? 48 : 32;
|
||||
|
||||
while(target_block_row < Nb){
|
||||
unsigned int first_block = rows[target_block_row];
|
||||
unsigned int last_block = rows[target_block_row+1];
|
||||
unsigned int block = first_block + lane / (bs*bs);
|
||||
Scalar local_out = 0.0;
|
||||
|
||||
if(lane < num_active_threads){
|
||||
for(; block < last_block; block += num_blocks_per_warp){
|
||||
Scalar x_elem = x[cols[block]*bs + c];
|
||||
Scalar A_elem = vals[block*bs*bs + c + r*bs];
|
||||
local_out += x_elem * A_elem;
|
||||
}
|
||||
}
|
||||
|
||||
// do reduction in shared mem
|
||||
tmp[lane] = local_out;
|
||||
|
||||
for(unsigned int offset = 3; offset <= offsetTarget; offset <<= 1)
|
||||
{
|
||||
if (lane + offset < warpsize)
|
||||
{
|
||||
tmp[lane] += tmp[lane + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(lane < bs){
|
||||
unsigned int row = target_block_row*bs + lane;
|
||||
out[row] = rhs[row] - tmp[lane];
|
||||
}
|
||||
target_block_row += num_warps_in_grid;
|
||||
}
|
||||
}
|
||||
|
||||
// KERNEL residual_k
|
||||
// res = rhs - mat * x
|
||||
// algorithm based on:
|
||||
// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
|
||||
// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
|
||||
// template <unsigned shared_mem_size>
|
||||
template<class Scalar>
|
||||
__global__ void residual_k(const Scalar *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int N,
|
||||
const Scalar *x,
|
||||
const Scalar *rhs,
|
||||
Scalar *out)
|
||||
{
|
||||
extern __shared__ Scalar tmp[];
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
const unsigned int num_workgroups = gridDim.x;
|
||||
|
||||
int row = idx_b;
|
||||
|
||||
while (row < N) {
|
||||
int rowStart = rows[row];
|
||||
int rowEnd = rows[row+1];
|
||||
int rowLength = rowEnd - rowStart;
|
||||
Scalar local_sum = 0.0;
|
||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||
int col = cols[j];
|
||||
local_sum += vals[j] * x[col];
|
||||
}
|
||||
|
||||
tmp[idx_t] = local_sum;
|
||||
__syncthreads();
|
||||
|
||||
int offset = bsize / 2;
|
||||
while(offset > 0) {
|
||||
if (idx_t < offset) {
|
||||
tmp[idx_t] += tmp[idx_t + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
offset = offset / 2;
|
||||
}
|
||||
|
||||
if (idx_t == 0) {
|
||||
out[row] = rhs[row] - tmp[idx_t];
|
||||
}
|
||||
|
||||
row += num_workgroups;
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
__global__ void spmv_k(const Scalar *vals,
|
||||
const int *cols,
|
||||
const int *rows,
|
||||
const int N,
|
||||
const Scalar *x,
|
||||
Scalar *out)
|
||||
{
|
||||
extern __shared__ Scalar tmp[];
|
||||
const unsigned int bsize = blockDim.x;
|
||||
const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const unsigned int idx_b = gid / bsize;
|
||||
const unsigned int idx_t = threadIdx.x;
|
||||
const unsigned int num_workgroups = gridDim.x;
|
||||
|
||||
int row = idx_b;
|
||||
|
||||
while (row < N) {
|
||||
int rowStart = rows[row];
|
||||
int rowEnd = rows[row+1];
|
||||
int rowLength = rowEnd - rowStart;
|
||||
Scalar local_sum = 0.0;
|
||||
for (int j = rowStart + idx_t; j < rowEnd; j += bsize) {
|
||||
int col = cols[j];
|
||||
local_sum += vals[j] * x[col];
|
||||
}
|
||||
|
||||
tmp[idx_t] = local_sum;
|
||||
__syncthreads();
|
||||
|
||||
int offset = bsize / 2;
|
||||
while(offset > 0) {
|
||||
if (idx_t < offset) {
|
||||
tmp[idx_t] += tmp[idx_t + offset];
|
||||
}
|
||||
__syncthreads();
|
||||
offset = offset / 2;
|
||||
}
|
||||
|
||||
if (idx_t == 0) {
|
||||
out[row] = tmp[idx_t];
|
||||
}
|
||||
|
||||
row += num_workgroups;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
init(int verbosity_)
|
||||
{
|
||||
if (initialized) {
|
||||
OpmLog::debug("Warning HipKernels is already initialized");
|
||||
return;
|
||||
}
|
||||
|
||||
verbosity = verbosity_;
|
||||
|
||||
initialized = true;
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
vmul(const Scalar alpha,
|
||||
Scalar* in1,
|
||||
Scalar* in2,
|
||||
Scalar* out,
|
||||
int N,
|
||||
hipStream_t stream)
|
||||
{
|
||||
Timer t_vmul;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
vmul_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(alpha, in1, in2, out, N);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error vmul for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels vmul() time: " << t_vmul.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
full_to_pressure_restriction(const Scalar* fine_y,
|
||||
Scalar* weights,
|
||||
Scalar* coarse_y,
|
||||
int Nb,
|
||||
hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
full_to_pressure_restriction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(fine_y, weights, coarse_y, Nb);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error full_to_pressure_restriction for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels full_to_pressure_restriction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
add_coarse_pressure_correction(Scalar* coarse_x,
|
||||
Scalar* fine_x,
|
||||
int pressure_idx,
|
||||
int Nb,
|
||||
hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
add_coarse_pressure_correction_k<<<dim3(gridDim), dim3(blockDim), 0, stream>>>(coarse_x, fine_x, pressure_idx, Nb);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error add_coarse_pressure_correction for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels add_coarse_pressure_correction() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
prolongate_vector(const Scalar* in,
|
||||
Scalar* out,
|
||||
const int* cols,
|
||||
int N,
|
||||
hipStream_t stream)
|
||||
{
|
||||
Timer t;
|
||||
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
unsigned blocks = Accelerator::ceilDivision(N, blockDim);
|
||||
unsigned gridDim = blocks * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(Scalar);
|
||||
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
prolongate_vector_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(in, out, cols, N);
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error prolongate_vector for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels prolongate_vector() time: " << t.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
residual(Scalar* vals,
|
||||
int* cols,
|
||||
int* rows,
|
||||
Scalar* x,
|
||||
const Scalar* rhs,
|
||||
Scalar* out,
|
||||
int Nb,
|
||||
unsigned int block_size,
|
||||
hipStream_t stream)
|
||||
{
|
||||
Timer t_residual;
|
||||
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = num_work_groups * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(Scalar);
|
||||
|
||||
if (block_size > 1) {
|
||||
// dim3(N) will create a vector {N, 1, 1}
|
||||
residual_blocked_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out, block_size);
|
||||
} else {
|
||||
residual_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, rhs, out);
|
||||
}
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error residual for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels residual() time: " << t_residual.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template<class Scalar>
|
||||
void HipKernels<Scalar>::
|
||||
spmv(Scalar* vals,
|
||||
int* cols,
|
||||
int* rows,
|
||||
Scalar* x,
|
||||
Scalar* y,
|
||||
int Nb,
|
||||
unsigned int block_size,
|
||||
hipStream_t stream)
|
||||
{//NOTE: block_size not used since I use this kernel only for block sizes 1, other uses use rocsparse!
|
||||
Timer t_spmv;
|
||||
#ifdef __HIP__
|
||||
unsigned blockDim = 64;
|
||||
const unsigned int num_work_groups = Accelerator::ceilDivision(Nb, blockDim);
|
||||
unsigned gridDim = num_work_groups * blockDim;
|
||||
unsigned shared_mem_size = blockDim * sizeof(Scalar);
|
||||
|
||||
spmv_k<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(vals, cols, rows, Nb, x, y);
|
||||
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error spmv for rocsparse only supported when compiling with hipcc");
|
||||
#endif
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream oss;
|
||||
oss << std::scientific << "HipKernels spmv_blocked() time: " << t_spmv.stop() << " s";
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
}
|
||||
|
||||
template class HipKernels<double>;
|
||||
|
||||
} // namespace Opm
|
138
opm/simulators/linalg/bda/rocm/hipKernels.hpp
Normal file
138
opm/simulators/linalg/bda/rocm/hipKernels.hpp
Normal file
@ -0,0 +1,138 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_HIPKERNELS_HPP
|
||||
#define OPM_HIPKERNELS_HPP
|
||||
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <cstddef>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
namespace Opm {
|
||||
|
||||
template<class Scalar>
|
||||
class HipKernels
|
||||
{
|
||||
private:
|
||||
static int verbosity;
|
||||
static bool initialized;
|
||||
|
||||
HipKernels();
|
||||
|
||||
public:
|
||||
/// Initialize verbosity level for the HIP kernels
|
||||
/// \param[in] verbosity verbosity level
|
||||
static void init(int verbosity);
|
||||
|
||||
/// Transform blocked vector to scalar vector using pressure-weights, where every workitem handles one blockrow
|
||||
/// \param[in] fine_y Input y vector
|
||||
/// \param[in] weights Weights used to combine cells
|
||||
/// \param[out] course_y Output y vector
|
||||
/// \param[in] Nb Number of blocks in the original matrix
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void full_to_pressure_restriction(const Scalar* fine_y,
|
||||
Scalar* weights,
|
||||
Scalar* coarse_y,
|
||||
int Nb,
|
||||
hipStream_t stream);
|
||||
|
||||
/// Add the coarse pressure solution back to the finer, complete solution; every workitem handles one blockrow
|
||||
/// \param[in] coarse_x Input scalar x vector
|
||||
/// \param[out] fine_x Output blocked x vector
|
||||
/// \param[in] pressure_idx Pressure index
|
||||
/// \param[in] Nb Number of blocks in the original matrix
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void add_coarse_pressure_correction(Scalar* coarse_x,
|
||||
Scalar* fine_x,
|
||||
int pressure_idx,
|
||||
int Nb,
|
||||
hipStream_t stream);
|
||||
|
||||
|
||||
/// Function to multiply vector with another vector and a scalar, element-wise and add the result to a third vector (out = alpha * in1 + in2)
|
||||
/// \param[in] alpha Input scalar
|
||||
/// \param[in] in1 First input vector
|
||||
/// \param[in] in2 Second input vector
|
||||
/// \param[out] out Output vector
|
||||
/// \param[in] N Size of the vector
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void vmul(const Scalar alpha,
|
||||
Scalar* in1,
|
||||
Scalar* in2,
|
||||
Scalar* out,
|
||||
int N,
|
||||
hipStream_t stream);
|
||||
|
||||
/// Function to prolongate vector during amg cycle, every workitem handles one row
|
||||
/// \param[in] in Input fine-grained vector
|
||||
/// \param[out] out Output course-graned vector
|
||||
/// \param[in] cols Column indexes
|
||||
/// \param[in] N Size of the vector
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void prolongate_vector(const Scalar* in,
|
||||
Scalar* out,
|
||||
const int* cols,
|
||||
int N,
|
||||
hipStream_t stream);
|
||||
|
||||
/// Function to perform res = rhs - mat * x
|
||||
/// \param[in] vals Matrix values
|
||||
/// \param[in] cols Column indexes
|
||||
/// \param[in] rows Row pointers
|
||||
/// \param[in] x X vector
|
||||
/// \param[in] rhs Rhs vector
|
||||
/// \param[out] out Output res vector
|
||||
/// \param[in] Nb Number of non-zero blocks in the original matrix
|
||||
/// \param[in] block_size Block size
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void residual(Scalar* vals,
|
||||
int* cols,
|
||||
int* rows,
|
||||
Scalar* x,
|
||||
const Scalar* rhs,
|
||||
Scalar* out,
|
||||
int Nb,
|
||||
unsigned int block_size,
|
||||
hipStream_t stream);
|
||||
|
||||
/// Function to perform sparse matrix vector multipliation
|
||||
/// \param[in] vals Matrix values
|
||||
/// \param[in] cols Column indexes
|
||||
/// \param[in] rows Row pointers
|
||||
/// \param[in] x Input x vector
|
||||
/// \param[out] y Output y vector
|
||||
/// \param[in] Nb Number of non-zero blocks in the original matrix
|
||||
/// \param[in] block_size Block size
|
||||
/// \param[in] stream Hip stream to use for the computations
|
||||
static void spmv(Scalar* vals,
|
||||
int* cols,
|
||||
int* rows,
|
||||
Scalar* x,
|
||||
Scalar* y,
|
||||
int Nb,
|
||||
unsigned int block_size,
|
||||
hipStream_t stream);
|
||||
};
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
@ -37,7 +37,7 @@
|
||||
|
||||
#undef HAVE_CUDA
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocalutionSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocalutionSolverBackend.hpp>
|
||||
|
||||
#include <rocalution.hpp>
|
||||
#include <base/matrix_formats_ind.hpp> // check if blocks are interpreted as row-major or column-major
|
286
opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
Normal file
286
opm/simulators/linalg/bda/rocm/rocsparseBILU0.cpp
Normal file
@ -0,0 +1,286 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include <thread>
|
||||
extern std::shared_ptr<std::thread> copyThread;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <omp.h>
|
||||
#endif //HAVE_OPENMP
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
rocsparseBILU0<Scalar, block_size>::
|
||||
rocsparseBILU0(int verbosity_) :
|
||||
Base(verbosity_)
|
||||
{
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseBILU0<Scalar, block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
rocsparse_int *d_Arows,
|
||||
rocsparse_int *d_Acols)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
this->nnzbs_prec = this->nnzb;
|
||||
|
||||
if (jacMatrix) {
|
||||
this->useJacMatrix = true;
|
||||
this->nnzbs_prec = jacMatrix->nnzbs;
|
||||
this->jacMat = jacMatrix;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_t, sizeof(Scalar) * this->N));
|
||||
|
||||
if (this->useJacMatrix) {
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * this->nnzbs_prec));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
|
||||
} else { // preconditioner matrix is same
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size));
|
||||
d_Mcols = d_Acols;
|
||||
d_Mrows = d_Arows;
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end initialize()
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseBILU0<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat) {
|
||||
return analyze_matrix(mat, &(*this->jacMat));
|
||||
}
|
||||
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseBILU0<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat)
|
||||
{
|
||||
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||
Timer t;
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
|
||||
#if HIP_VERSION >= 50400000
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
|
||||
#endif
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
|
||||
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(this->handle, this->dir, this->operation, Nb, this->nnzbs_prec,
|
||||
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
|
||||
|
||||
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(this->handle, this->dir, \
|
||||
Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
int zero_position = 0;
|
||||
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
|
||||
if (rocsparse_status_success != status) {
|
||||
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
|
||||
return false;
|
||||
}
|
||||
|
||||
// analysis of ilu apply
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
|
||||
Nb, this->nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(this->handle, this->dir, this->operation, \
|
||||
Nb, this->nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseBILU0::analyze_matrix(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseBILU0<Scalar, block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar> *mat) {
|
||||
return create_preconditioner(mat, &*this->jacMat);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseBILU0<Scalar, block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat)
|
||||
{
|
||||
Timer t;
|
||||
bool result = true;
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0(this->handle, this->dir, Nb, this->nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
// Check for zero pivot
|
||||
int zero_position = 0;
|
||||
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(this->handle, ilu_info, &zero_position);
|
||||
if(rocsparse_status_success != status)
|
||||
{
|
||||
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseBILU0::create_preconditioner(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
} // end create_preconditioner()
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseBILU0<Scalar, block_size>::
|
||||
copy_system_to_gpu(Scalar *d_Avals) {
|
||||
Timer t;
|
||||
bool use_multithreading = true;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif
|
||||
|
||||
if (this->useJacMatrix) {
|
||||
if (use_multithreading) {
|
||||
copyThread->join();
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mrows, this->jacMat->rowPointers, sizeof(rocsparse_int) * (Nb + 1), hipMemcpyHostToDevice, this->stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mcols, this->jacMat->colIndices, sizeof(rocsparse_int) * this->nnzbs_prec, hipMemcpyHostToDevice, this->stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseBILU0::copy_system_to_gpu(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseBILU0<Scalar, block_size>::
|
||||
update_system_on_gpu(Scalar *d_Avals) {
|
||||
Timer t;
|
||||
bool use_multithreading = true;
|
||||
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() == 1)
|
||||
use_multithreading = false;
|
||||
#endif
|
||||
|
||||
if (this->useJacMatrix) {
|
||||
if (use_multithreading) {
|
||||
copyThread->join();
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, this->jacMat->nnzValues, sizeof(Scalar) * this->nnzbs_prec * block_size * block_size, hipMemcpyHostToDevice, this->stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals, sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, this->stream));
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseSolver::update_system_on_gpu(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseBILU0<Scalar, block_size>::
|
||||
apply(Scalar& y, Scalar& x) {
|
||||
Scalar zero = 0.0;
|
||||
Scalar one = 1.0;
|
||||
|
||||
Timer t_apply;
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
|
||||
this->operation, Nb, this->nnzbs_prec, &one, \
|
||||
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &y, d_t, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(this->handle, this->dir, \
|
||||
this->operation, Nb, this->nnzbs_prec, &one, \
|
||||
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, &x, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
out << "rocsparseBILU0 apply: " << t_apply.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class rocsparseBILU0<T,1>; \
|
||||
template class rocsparseBILU0<T,2>; \
|
||||
template class rocsparseBILU0<T,3>; \
|
||||
template class rocsparseBILU0<T,4>; \
|
||||
template class rocsparseBILU0<T,5>; \
|
||||
template class rocsparseBILU0<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
} // namespace Opm
|
125
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
Normal file
125
opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp
Normal file
@ -0,0 +1,125 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCSPARSEBILU0_HPP
|
||||
#define OPM_ROCSPARSEBILU0_HPP
|
||||
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
|
||||
|
||||
#include <rocblas/rocblas.h>
|
||||
#include <rocsparse/rocsparse.h>
|
||||
|
||||
#include <hip/hip_version.h>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
/// This class implements a Blocked ILU0 preconditioner
|
||||
/// The decomposition is done on GPU, using exact decomposition, or ChowPatel decomposition
|
||||
/// The preconditioner is applied via two exact triangular solves
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class rocsparseBILU0 : public rocsparsePreconditioner<Scalar, block_size>
|
||||
{
|
||||
typedef rocsparsePreconditioner<Scalar, block_size> Base;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
|
||||
private:
|
||||
|
||||
rocsparse_mat_descr descr_M, descr_L, descr_U;
|
||||
rocsparse_mat_info ilu_info;
|
||||
#if HIP_VERSION >= 50400000
|
||||
rocsparse_mat_info spmv_info;
|
||||
#endif
|
||||
|
||||
rocsparse_int *d_Mrows, *d_Mcols;
|
||||
Scalar *d_Mvals, *d_t;
|
||||
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
|
||||
|
||||
public:
|
||||
|
||||
rocsparseBILU0(int verbosity_);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
rocsparse_int *d_Arows,
|
||||
rocsparse_int *d_Acols);
|
||||
|
||||
/// Analysis, extract parallelism if specified
|
||||
/// \param[in] mat matrix A
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
|
||||
|
||||
/// Analysis, extract parallelism if specified
|
||||
/// \param[in] mat matrix A
|
||||
/// \param[in] jacMat matrix for preconditioner, analyze this as well
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat);
|
||||
|
||||
/// ILU decomposition
|
||||
/// \param[in] mat matrix A to decompose
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat) override;
|
||||
|
||||
/// ILU decomposition
|
||||
/// \param[in] mat matrix A
|
||||
/// \param[in] jacMat matrix for preconditioner, decompose this one if used
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat) override;
|
||||
|
||||
/// Apply preconditioner, x = prec(y)
|
||||
/// via Lz = y
|
||||
/// and Ux = z
|
||||
/// \param[in] y Input y vector
|
||||
/// \param[out] x Output x vector
|
||||
void apply(Scalar& y,
|
||||
Scalar& x) override;
|
||||
|
||||
#if HAVE_OPENCL
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y,
|
||||
cl::Buffer& x) {}
|
||||
#endif
|
||||
|
||||
/// Copy matrix A values to GPU
|
||||
/// \param[in] mVals Input values
|
||||
void copy_system_to_gpu(Scalar *mVals);
|
||||
|
||||
/// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
|
||||
// /// \param[in] vals New values
|
||||
// /// \param[in] b New b vector
|
||||
// void update_system(Scalar *vals, Scalar *b);
|
||||
|
||||
/// Update GPU values after a new assembly is done
|
||||
/// \param[in] b New b vector
|
||||
void update_system_on_gpu(Scalar *b);
|
||||
};
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
338
opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
Normal file
338
opm/simulators/linalg/bda/rocm/rocsparseCPR.cpp
Normal file
@ -0,0 +1,338 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <dune/common/shared_ptr.hh>
|
||||
|
||||
#include <opm/simulators/linalg/PreconditionerFactory.hpp>
|
||||
#include <opm/simulators/linalg/PropertyTree.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/hipKernels.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
rocsparseCPR<Scalar, block_size>::rocsparseCPR(int verbosity_) :
|
||||
rocsparsePreconditioner<Scalar, block_size>(verbosity_)
|
||||
{
|
||||
bilu0 = std::make_unique<rocsparseBILU0<Scalar, block_size> >(verbosity_);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
rocsparse_int *d_Arows,
|
||||
rocsparse_int *d_Acols)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bilu0->set_context(this->handle, this->dir, this->operation, this->stream);
|
||||
bilu0->setJacMat(*this->jacMat.get());
|
||||
bilu0->initialize(matrix,jacMatrix,d_Arows,d_Acols);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
copy_system_to_gpu(Scalar *b) {
|
||||
bilu0->copy_system_to_gpu(b);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
update_system_on_gpu(Scalar *vals) {
|
||||
bilu0->update_system_on_gpu(vals);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat_) {
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_);
|
||||
|
||||
this->mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
analyze_matrix(BlockedMatrix<Scalar> *mat_,
|
||||
BlockedMatrix<Scalar> *jacMat)
|
||||
{
|
||||
this->Nb = mat_->Nb;
|
||||
this->nnzb = mat_->nnzbs;
|
||||
this->N = Nb * block_size;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
|
||||
bool success = bilu0->analyze_matrix(mat_, jacMat);
|
||||
this->mat = mat_;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar> *mat_,
|
||||
BlockedMatrix<Scalar> *jacMat)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_, jacMat);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
|
||||
std::call_once(rocm_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
rocm_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
bool rocsparseCPR<Scalar, block_size>::
|
||||
create_preconditioner(BlockedMatrix<Scalar> *mat_) {
|
||||
Dune::Timer t_bilu0;
|
||||
bool result = bilu0->create_preconditioner(mat_);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
this->create_preconditioner_amg(this->mat);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR create_preconditioner_amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
auto init_func = std::bind(&rocsparseCPR::init_rocm_buffers, this);
|
||||
std::call_once(rocm_buffers_allocated, init_func);
|
||||
|
||||
// upload matrices and vectors to GPU
|
||||
rocm_upload();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
init_rocm_buffers() {
|
||||
d_Amatrices.reserve(this->num_levels);
|
||||
d_Rmatrices.reserve(this->num_levels - 1);
|
||||
d_invDiags.reserve(this->num_levels - 1);
|
||||
for (Matrix<Scalar>& m : this->Amatrices) {
|
||||
d_Amatrices.emplace_back(m.N, m.M, m.nnzs, 1);
|
||||
}
|
||||
|
||||
for (Matrix<Scalar>& m : this->Rmatrices) {
|
||||
d_Rmatrices.emplace_back(m.N, m.M, m.nnzs, 1);
|
||||
d_f.emplace_back(m.N);
|
||||
d_u.emplace_back(m.N);
|
||||
d_PcolIndices.emplace_back(m.M);
|
||||
d_invDiags.emplace_back(m.M);
|
||||
d_t.emplace_back(m.M);
|
||||
}
|
||||
|
||||
d_weights.emplace_back(this->N);
|
||||
d_rs.emplace_back(this->N);
|
||||
d_mat = std::make_unique<RocmMatrix<Scalar>>(this->Nb, this->Nb, this->nnzb, block_size);
|
||||
|
||||
d_coarse_y.emplace_back(this->Nb);
|
||||
d_coarse_x.emplace_back(this->Nb);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
rocm_upload() {
|
||||
d_mat->upload(this->mat, this->stream);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_weights.data()->nnzValues, this->weights.data(), sizeof(Scalar) * this->N, hipMemcpyHostToDevice, this->stream));
|
||||
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Amatrices[i].upload(&this->Amatrices[i], this->stream);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(d_invDiags[i].nnzValues, this->invDiags[i].data(), sizeof(Scalar) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_PcolIndices[i].nnzValues, this->PcolIndices[i].data(), sizeof(int) * this->Amatrices[i].N, hipMemcpyHostToDevice, this->stream));
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < this->Rmatrices.size(); ++i) {
|
||||
d_Rmatrices[i].upload(&this->Rmatrices[i], this->stream);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
amg_cycle_gpu(const int level,
|
||||
Scalar &y,
|
||||
Scalar &x)
|
||||
{
|
||||
RocmMatrix<Scalar> *A = &d_Amatrices[level];
|
||||
RocmMatrix<Scalar> *R = &d_Rmatrices[level];
|
||||
int Ncur = A->Nb;
|
||||
Scalar zero = 0.0;
|
||||
Scalar one = 1.0;
|
||||
|
||||
rocsparse_mat_info spmv_info;
|
||||
rocsparse_mat_descr descr_R;
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_R));
|
||||
|
||||
if (level == this->num_levels - 1) {
|
||||
// solve coarsest level
|
||||
std::vector<Scalar> h_y(Ncur), h_x(Ncur, 0);
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(h_y.data(), &y, sizeof(Scalar) * Ncur, hipMemcpyDeviceToHost, this->stream));
|
||||
|
||||
// solve coarsest level using umfpack
|
||||
this->umfpack.apply(h_x.data(), h_y.data());
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(&x, h_x.data(), sizeof(Scalar) * Ncur, hipMemcpyHostToDevice, this->stream));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
int Nnext = d_Amatrices[level+1].Nb;
|
||||
|
||||
RocmVector<Scalar>& t = d_t[level];
|
||||
RocmVector<Scalar>& f = d_f[level];
|
||||
RocmVector<Scalar>& u = d_u[level]; // u was 0-initialized earlier
|
||||
|
||||
// presmooth
|
||||
Scalar jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < this->num_pre_smooth_steps; ++i){
|
||||
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
|
||||
// TODO: understand why rocsparse spmv library function does not here.
|
||||
// ROCSPARSE_CHECK(rocsparse_dbsrmv(this->handle, this->dir, this->operation,
|
||||
// R->Nb, R->Mb, R->nnzbs, &one, descr_R,
|
||||
// R->nnzValues, R->rowPointers, R->colIndices, 1,
|
||||
// t.nnzValues, &zero, f.nnzValues));
|
||||
HipKernels<Scalar>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t.nnzValues, f.nnzValues, Nnext, 1, this->stream);
|
||||
|
||||
amg_cycle_gpu(level + 1, *f.nnzValues, *u.nnzValues);
|
||||
HipKernels<Scalar>::prolongate_vector(u.nnzValues, &x, d_PcolIndices[level].nnzValues, Ncur, this->stream);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < this->num_post_smooth_steps; ++i){
|
||||
HipKernels<Scalar>::residual(A->nnzValues, A->colIndices, A->rowPointers, &x, &y, t.nnzValues, Ncur, 1, this->stream);
|
||||
HipKernels<Scalar>::vmul(jacobi_damping, d_invDiags[level].nnzValues, t.nnzValues, &x, Ncur, this->stream);
|
||||
}
|
||||
}
|
||||
|
||||
// x = prec(y)
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
apply_amg(const Scalar& y,
|
||||
Scalar& x)
|
||||
{
|
||||
HIP_CHECK(hipMemsetAsync(d_coarse_x.data()->nnzValues, 0, sizeof(Scalar) * this->Nb, this->stream));
|
||||
|
||||
for (unsigned int i = 0; i < d_u.size(); ++i) {
|
||||
d_u[i].upload(this->Rmatrices[i].nnzValues.data(), this->stream);
|
||||
}
|
||||
|
||||
HipKernels<Scalar>::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, &x, &y, d_rs.data()->nnzValues, this->Nb, block_size, this->stream);
|
||||
|
||||
HipKernels<Scalar>::full_to_pressure_restriction(d_rs.data()->nnzValues, d_weights.data()->nnzValues, d_coarse_y.data()->nnzValues, Nb, this->stream);
|
||||
|
||||
amg_cycle_gpu(0, *(d_coarse_y.data()->nnzValues), *(d_coarse_x.data()->nnzValues));
|
||||
|
||||
HipKernels<Scalar>::add_coarse_pressure_correction(d_coarse_x.data()->nnzValues, &x, this->pressure_idx, Nb, this->stream);
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparseCPR<Scalar, block_size>::
|
||||
apply(Scalar& y,
|
||||
Scalar& x)
|
||||
{
|
||||
Dune::Timer t_bilu0;
|
||||
|
||||
bilu0->apply(y, x);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(this->stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR apply bilu0(): " << t_bilu0.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
Dune::Timer t_amg;
|
||||
apply_amg(y, x);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "rocsparseCPR apply amg(): " << t_amg.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class rocsparseCPR<T,1>; \
|
||||
template class rocsparseCPR<T,2>; \
|
||||
template class rocsparseCPR<T,3>; \
|
||||
template class rocsparseCPR<T,4>; \
|
||||
template class rocsparseCPR<T,5>; \
|
||||
template class rocsparseCPR<T,6>;
|
||||
|
||||
INSTANCE_TYPE(double)
|
||||
} // namespace Opm
|
||||
|
||||
|
142
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
Normal file
142
opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp
Normal file
@ -0,0 +1,142 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCSPARSECPR_HPP
|
||||
#define OPM_ROCSPARSECPR_HPP
|
||||
|
||||
#include <mutex>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/CprCreation.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This class implements a Constrained Pressure Residual (CPR) preconditioner
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class rocsparseCPR : public rocsparsePreconditioner<Scalar, block_size>, public CprCreation<Scalar, block_size>
|
||||
{
|
||||
typedef rocsparsePreconditioner<Scalar, block_size> Base;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
|
||||
private:
|
||||
std::vector<RocmMatrix<Scalar>> d_Amatrices, d_Rmatrices; // scalar matrices that represent the AMG hierarchy
|
||||
|
||||
std::vector<RocmVector<int>> d_PcolIndices; // prolongation does not need a full matrix, only store colIndices
|
||||
std::vector<RocmVector<Scalar>> d_invDiags; // inverse of diagonal of Amatrices
|
||||
std::vector<RocmVector<Scalar>> d_t, d_f; // intermediate vectors used during amg cycle
|
||||
std::vector<RocmVector<Scalar>> d_u; // intermediate vectors used during amg cycle
|
||||
std::vector<RocmVector<Scalar>> d_rs; // use before extracting the pressure
|
||||
std::vector<RocmVector<Scalar>> d_weights; // the quasiimpes weights, used to extract pressure
|
||||
std::unique_ptr<RocmMatrix<Scalar>> d_mat; // stores blocked matrix
|
||||
std::vector<RocmVector<Scalar>> d_coarse_y, d_coarse_x; // stores the scalar vectors
|
||||
std::once_flag rocm_buffers_allocated; // only allocate OpenCL Buffers once
|
||||
|
||||
std::unique_ptr<rocsparseBILU0<Scalar, block_size> > bilu0; // Blocked ILU0 preconditioner
|
||||
|
||||
std::unique_ptr<rocsparseSolverBackend<Scalar, 1> > coarse_solver; // coarse solver is scalar
|
||||
|
||||
// Initialize and allocate matrices and vectors
|
||||
void init_rocm_buffers();
|
||||
|
||||
// Copy matrices and vectors to GPU
|
||||
void rocm_upload();
|
||||
|
||||
// apply pressure correction to vector
|
||||
void apply_amg(const Scalar& y, Scalar& x);
|
||||
|
||||
// Apply the AMG preconditioner
|
||||
void amg_cycle_gpu(const int level, Scalar &y, Scalar &x);
|
||||
|
||||
public:
|
||||
|
||||
rocsparseCPR(int verbosity);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] matrix matrix A
|
||||
/// \param[in] jacMatrix matrix for preconditioner
|
||||
bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
rocsparse_int *d_Arows,
|
||||
rocsparse_int *d_Acols);
|
||||
|
||||
|
||||
/// Analysis, extract parallelism if specified
|
||||
/// \param[in] mat matrix A
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat);
|
||||
|
||||
/// Analysis, extract parallelism if specified
|
||||
/// \param[in] mat matrix A
|
||||
/// \param[in] jacMat matrix for preconditioner, analyze this as well
|
||||
bool analyze_matrix(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat);
|
||||
|
||||
/// Create AMG preconditioner and perform ILU decomposition
|
||||
/// \param[in] mat matrix A
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat);
|
||||
|
||||
/// Create AMG preconditioner and perform ILU decomposition
|
||||
/// \param[in] mat matrix A
|
||||
/// \param[in] jacMat matrix for preconditioner, decompose this one if used
|
||||
bool create_preconditioner(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat);
|
||||
|
||||
/// Apply preconditioner, x = prec(y)
|
||||
/// applies blocked ilu0
|
||||
/// also applies amg for pressure component
|
||||
/// \param[in] y Input y vector
|
||||
/// \param[out] x Output x vector
|
||||
void apply(Scalar& y,
|
||||
Scalar& x) override;
|
||||
|
||||
#if HAVE_OPENCL
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(const cl::Buffer& y,
|
||||
cl::Buffer& x) {}
|
||||
#endif
|
||||
|
||||
/// Copy matrix A values to GPU
|
||||
/// \param[in] mVals Input values
|
||||
void copy_system_to_gpu(Scalar *b);
|
||||
|
||||
/// Reassign pointers, in case the addresses of the Dune variables have changed --> TODO: check when/if we need this method
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector b, contains N values
|
||||
// void update_system(Scalar *vals, Scalar *b);
|
||||
|
||||
/// Update linear system to GPU
|
||||
/// \param[in] b input vector, contains N values
|
||||
void update_system_on_gpu(Scalar *b);
|
||||
};
|
||||
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
||||
|
113
opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
Normal file
113
opm/simulators/linalg/bda/rocm/rocsparseMatrix.cpp
Normal file
@ -0,0 +1,113 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Matrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar>
|
||||
RocmMatrix<Scalar>::
|
||||
RocmMatrix(int Nb_,
|
||||
int Mb_,
|
||||
int nnzbs_,
|
||||
unsigned int block_size_)
|
||||
: Nb(Nb_),
|
||||
Mb(Mb_),
|
||||
nnzbs(nnzbs_),
|
||||
block_size(block_size_)
|
||||
{
|
||||
HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * block_size * block_size * nnzbs));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&colIndices, sizeof(int) * nnzbs));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&rowPointers, sizeof(int) * (Nb + 1)));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(Scalar *vals,
|
||||
int *cols,
|
||||
int *rows,
|
||||
hipStream_t stream)
|
||||
{
|
||||
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * block_size * block_size * nnzbs, hipMemcpyHostToDevice, stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(colIndices, cols, sizeof(int) * nnzbs, hipMemcpyHostToDevice, stream));
|
||||
|
||||
HIP_CHECK(hipMemcpyAsync(rowPointers, rows, sizeof(int) * (Nb + 1), hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(Matrix<Scalar> *matrix,
|
||||
hipStream_t stream)
|
||||
{
|
||||
if (block_size != 1) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(matrix->nnzValues.data(), matrix->colIndices.data(), matrix->rowPointers.data(), stream);
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmMatrix<Scalar>::
|
||||
upload(BlockedMatrix<Scalar> *matrix,
|
||||
hipStream_t stream)
|
||||
{
|
||||
if (matrix->block_size != block_size) {
|
||||
OPM_THROW(std::logic_error, "Error trying to upload a BlockedMatrix to RocmMatrix with different block_size");
|
||||
}
|
||||
|
||||
upload(matrix->nnzValues, matrix->colIndices, matrix->rowPointers, stream);
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
RocmVector<Scalar>::RocmVector(int N)
|
||||
: size(N)
|
||||
{
|
||||
HIP_CHECK(hipMalloc((void**)&nnzValues, sizeof(Scalar) * N));
|
||||
}
|
||||
|
||||
template <class Scalar>
|
||||
void RocmVector<Scalar>::
|
||||
upload(Scalar *vals,
|
||||
hipStream_t stream)
|
||||
{
|
||||
HIP_CHECK(hipMemcpyAsync(nnzValues, vals, sizeof(Scalar) * size, hipMemcpyHostToDevice, stream));
|
||||
}
|
||||
|
||||
#define INSTANCE_TYPE(T) \
|
||||
template class RocmVector<T>;\
|
||||
template class RocmMatrix<T>;
|
||||
|
||||
INSTANCE_TYPE(int);
|
||||
INSTANCE_TYPE(double);
|
||||
|
||||
} // namespace Opm
|
73
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
Normal file
73
opm/simulators/linalg/bda/rocm/rocsparseMatrix.hpp
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCMMATRIX_HEADER_INCLUDED
|
||||
#define OPM_ROCMMATRIX_HEADER_INCLUDED
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class Matrix;
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
/// This struct resembles a csr matrix
|
||||
template<class Scalar>
|
||||
class RocmMatrix {
|
||||
public:
|
||||
|
||||
RocmMatrix(int Nb_, int Mb_, int nnzbs_, unsigned int block_size_);
|
||||
|
||||
void upload(Scalar *vals,
|
||||
int *cols,
|
||||
int *rows,
|
||||
hipStream_t stream);
|
||||
|
||||
void upload(Matrix<Scalar> *matrix,
|
||||
hipStream_t stream);
|
||||
|
||||
void upload(BlockedMatrix<Scalar> *matrix,
|
||||
hipStream_t stream);
|
||||
|
||||
Scalar* nnzValues;
|
||||
int* colIndices;
|
||||
int* rowPointers;
|
||||
int Nb, Mb;
|
||||
int nnzbs;
|
||||
unsigned int block_size;
|
||||
};
|
||||
|
||||
template <typename Scalar>
|
||||
class RocmVector {
|
||||
public:
|
||||
|
||||
RocmVector(int N);
|
||||
|
||||
void upload(Scalar *vals,
|
||||
hipStream_t stream);
|
||||
|
||||
void upload(Matrix<Scalar> *matrix,
|
||||
hipStream_t stream);
|
||||
|
||||
Scalar* nnzValues;
|
||||
int size;
|
||||
};
|
||||
} // namespace Opm
|
||||
|
||||
#endif
|
86
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
Normal file
86
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <memory>
|
||||
#include <opm/common/TimingMacros.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseBILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
std::unique_ptr<rocsparsePreconditioner<Scalar,block_size> > rocsparsePreconditioner<Scalar,block_size>::
|
||||
create(PreconditionerType type,
|
||||
int verbosity)
|
||||
{
|
||||
switch (type ) {
|
||||
case PreconditionerType::BILU0:
|
||||
return std::make_unique<Opm::Accelerator::rocsparseBILU0<Scalar, block_size> >(verbosity);
|
||||
case PreconditionerType::CPR:
|
||||
return std::make_unique<Opm::Accelerator::rocsparseCPR<Scalar, block_size> >(verbosity);
|
||||
}
|
||||
|
||||
OPM_THROW(std::logic_error,
|
||||
"Invalid preconditioner type " + std::to_string(static_cast<int>(type)));
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparsePreconditioner<Scalar, block_size>::
|
||||
set_matrix_analysis(rocsparse_mat_descr descr_L,
|
||||
rocsparse_mat_descr descr_U)
|
||||
{
|
||||
descr_L = descr_L;
|
||||
descr_U = descr_U;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparsePreconditioner<Scalar, block_size>::
|
||||
set_context(rocsparse_handle handle,
|
||||
rocsparse_direction dir,
|
||||
rocsparse_operation operation,
|
||||
hipStream_t stream)
|
||||
{
|
||||
this->handle = handle;
|
||||
this->dir = dir;
|
||||
this->operation = operation;
|
||||
this->stream = stream;
|
||||
}
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
void rocsparsePreconditioner<Scalar, block_size>::
|
||||
setJacMat(BlockedMatrix<Scalar> jacMat) {
|
||||
this->jacMat = std::make_shared<BlockedMatrix<Scalar>>(jacMat);
|
||||
}
|
||||
|
||||
#define INSTANTIATE_TYPE(T) \
|
||||
template class rocsparsePreconditioner<T,1>; \
|
||||
template class rocsparsePreconditioner<T,2>; \
|
||||
template class rocsparsePreconditioner<T,3>; \
|
||||
template class rocsparsePreconditioner<T,4>; \
|
||||
template class rocsparsePreconditioner<T,5>; \
|
||||
template class rocsparsePreconditioner<T,6>;
|
||||
|
||||
INSTANTIATE_TYPE(double)
|
||||
|
||||
} //namespace Opm
|
||||
|
91
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
Normal file
91
opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp
Normal file
@ -0,0 +1,91 @@
|
||||
/*
|
||||
Copyright 2024 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
|
||||
#define OPM_ROCSPARSEPRECONDITIONER_HEADER_INCLUDED
|
||||
|
||||
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
|
||||
|
||||
#include <rocsparse/rocsparse.h>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
template<class Scalar> class BlockedMatrix;
|
||||
|
||||
template <class Scalar, unsigned int block_size>
|
||||
class rocsparsePreconditioner : public Preconditioner<Scalar, block_size>
|
||||
{
|
||||
|
||||
protected:
|
||||
rocsparse_handle handle;
|
||||
rocsparse_direction dir = rocsparse_direction_row;
|
||||
rocsparse_operation operation = rocsparse_operation_none;
|
||||
rocsparse_mat_descr descr_L, descr_U;
|
||||
|
||||
hipStream_t stream;
|
||||
|
||||
rocsparsePreconditioner(int verbosity_) :
|
||||
Preconditioner<Scalar, block_size>(verbosity_)
|
||||
{};
|
||||
|
||||
public:
|
||||
|
||||
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
|
||||
bool useJacMatrix = false;
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
|
||||
|
||||
virtual ~rocsparsePreconditioner() = default;
|
||||
|
||||
static std::unique_ptr<rocsparsePreconditioner<Scalar, block_size>> create(PreconditionerType type,
|
||||
int verbosity);
|
||||
|
||||
// apply preconditioner, x = prec(y)
|
||||
virtual void apply(Scalar& y, Scalar& x) = 0;
|
||||
|
||||
// create/update preconditioner, probably used every linear solve
|
||||
// the version with two params can be overloaded, if not, it will default to using the one param version
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat) = 0;
|
||||
|
||||
virtual bool create_preconditioner(BlockedMatrix<Scalar> *mat,
|
||||
BlockedMatrix<Scalar> *jacMat) = 0;
|
||||
|
||||
virtual bool initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix,
|
||||
rocsparse_int *d_Arows,
|
||||
rocsparse_int *d_Acols) = 0;
|
||||
|
||||
virtual void copy_system_to_gpu(Scalar *b)=0;
|
||||
|
||||
/// Update linear system to GPU
|
||||
/// \param[in] b input vector, contains N values
|
||||
virtual void update_system_on_gpu(Scalar *b)=0;
|
||||
|
||||
void set_matrix_analysis(rocsparse_mat_descr descr_L,
|
||||
rocsparse_mat_descr descr_U);
|
||||
|
||||
void set_context(rocsparse_handle handle,
|
||||
rocsparse_direction dir,
|
||||
rocsparse_operation operation,
|
||||
hipStream_t stream);
|
||||
|
||||
void setJacMat(BlockedMatrix<Scalar> jacMat);
|
||||
};
|
||||
} //namespace Opm
|
||||
|
||||
#endif
|
@ -36,63 +36,22 @@
|
||||
|
||||
#undef HAVE_CUDA
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_version.h>
|
||||
#include <opm/simulators/linalg/bda/Preconditioner.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
#ifdef HIP_HAVE_CUDA_DEFINED
|
||||
#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
|
||||
#undef HIP_HAVE_CUDA_DEFINED
|
||||
#endif
|
||||
|
||||
#define HIP_CHECK(STAT) \
|
||||
do { \
|
||||
const hipError_t stat = (STAT); \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::hip "; \
|
||||
oss << "error: " << hipGetErrorString(stat); \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCSPARSE_CHECK(STAT) \
|
||||
do { \
|
||||
const rocsparse_status stat = (STAT); \
|
||||
if(stat != rocsparse_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocsparse "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define ROCBLAS_CHECK(STAT) \
|
||||
do { \
|
||||
const rocblas_status stat = (STAT); \
|
||||
if(stat != rocblas_status_success) \
|
||||
{ \
|
||||
std::ostringstream oss; \
|
||||
oss << "rocsparseSolverBackend::rocblas "; \
|
||||
oss << "error: " << stat; \
|
||||
OPM_THROW(std::logic_error, oss.str()); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#if HAVE_OPENMP
|
||||
#include <thread>
|
||||
#include <omp.h>
|
||||
extern std::shared_ptr<std::thread> copyThread;
|
||||
#endif //HAVE_OPENMP
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Dune::Timer;
|
||||
@ -100,10 +59,26 @@ using Dune::Timer;
|
||||
template<class Scalar, unsigned int block_size>
|
||||
rocsparseSolverBackend<Scalar,block_size>::
|
||||
rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
|
||||
unsigned int platformID_, unsigned int deviceID_)
|
||||
unsigned int platformID_, unsigned int deviceID_, std::string linsolver)
|
||||
: Base(verbosity_, maxit_, tolerance_, platformID_, deviceID_)
|
||||
{
|
||||
int numDevices = 0;
|
||||
bool use_cpr, use_isai;
|
||||
|
||||
if (linsolver.compare("ilu0") == 0) {
|
||||
use_cpr = false;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("cpr_quasiimpes") == 0) {
|
||||
use_cpr = true;
|
||||
use_isai = false;
|
||||
} else if (linsolver.compare("isai") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=isai");
|
||||
} else if (linsolver.compare("cpr_trueimpes") == 0) {
|
||||
OPM_THROW(std::logic_error, "Error rocsparseSolver does not support --linerar-solver=cpr_trueimpes");
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for argument --linear-solver, " + linsolver);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
if (static_cast<int>(deviceID) >= numDevices) {
|
||||
OPM_THROW(std::runtime_error, "Invalid HIP device ID");
|
||||
@ -124,6 +99,15 @@ rocsparseSolverBackend(int verbosity_, int maxit_, Scalar tolerance_,
|
||||
HIP_CHECK(hipStreamCreate(&stream));
|
||||
ROCSPARSE_CHECK(rocsparse_set_stream(handle, stream));
|
||||
ROCBLAS_CHECK(rocblas_set_stream(blas_handle, stream));
|
||||
|
||||
using PreconditionerType = typename Opm::Accelerator::PreconditionerType;
|
||||
if (use_cpr) {
|
||||
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::CPR, verbosity);
|
||||
} else {
|
||||
prec = rocsparsePreconditioner<Scalar, block_size>::create(PreconditionerType::BILU0, verbosity);
|
||||
}
|
||||
|
||||
prec->set_context(handle, dir, operation, stream);
|
||||
}
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
||||
@ -170,17 +154,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
// HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
|
||||
#if HIP_VERSION >= 60000000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_x, &zero, d_r));
|
||||
#elif HIP_VERSION >= 50400000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_x, &zero, d_r));
|
||||
#else
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
d_x, &zero, d_r));
|
||||
#endif
|
||||
@ -216,13 +200,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
t_prec.start();
|
||||
}
|
||||
|
||||
// apply ilu0
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_p, d_t, rocsparse_solve_policy_auto, d_buffer));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_pw, rocsparse_solve_policy_auto, d_buffer));
|
||||
// apply ilu0
|
||||
prec->apply(*d_p, *d_pw);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
t_prec.stop();
|
||||
@ -232,17 +212,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
// spmv
|
||||
#if HIP_VERSION >= 60000000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_pw, &zero, d_v));
|
||||
#elif HIP_VERSION >= 50400000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_pw, &zero, d_v));
|
||||
#else
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
d_pw, &zero, d_v));
|
||||
#endif
|
||||
@ -283,12 +263,9 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
if (verbosity >= 3) {
|
||||
t_prec.start();
|
||||
}
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_r, d_t, rocsparse_solve_policy_auto, d_buffer));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_solve(handle, dir, \
|
||||
operation, Nb, nnzbs_prec, &one, \
|
||||
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, d_t, d_s, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
prec->apply(*d_r, *d_s);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
t_prec.stop();
|
||||
@ -298,17 +275,17 @@ gpu_pbicgstab([[maybe_unused]] WellContributions<Scalar>& wellContribs,
|
||||
// spmv
|
||||
#if HIP_VERSION >= 60000000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_s, &zero, d_t));
|
||||
#elif HIP_VERSION >= 50400000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
spmv_info, d_s, &zero, d_t));
|
||||
#else
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv(handle, dir, operation,
|
||||
Nb, Nb, nnzb, &one, descr_M,
|
||||
Nb, Nb, nnzb, &one, descr_A,
|
||||
d_Avals, d_Arows, d_Acols, block_size,
|
||||
d_s, &zero, d_t));
|
||||
#endif
|
||||
@ -384,14 +361,18 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMatrix)
|
||||
{
|
||||
this->Nb = matrix->Nb;
|
||||
this->N = Nb * block_size;
|
||||
this->N = this->Nb * block_size;
|
||||
this->nnzb = matrix->nnzbs;
|
||||
this->nnz = nnzb * block_size * block_size;
|
||||
nnzbs_prec = nnzb;
|
||||
this->nnz = this->nnzb * block_size * block_size;
|
||||
|
||||
if (jacMatrix) {
|
||||
useJacMatrix = true;
|
||||
nnzbs_prec = jacMatrix->nnzbs;
|
||||
prec->useJacMatrix = true;
|
||||
prec->jacMat = jacMatrix;
|
||||
prec->nnzbs_prec = jacMatrix->nnzbs;
|
||||
} else {
|
||||
prec->useJacMatrix = false;
|
||||
prec->jacMat = matrix;
|
||||
prec->nnzbs_prec = this->nnzb;
|
||||
}
|
||||
|
||||
std::ostringstream out;
|
||||
@ -408,7 +389,6 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
out.clear();
|
||||
|
||||
mat = matrix;
|
||||
jacMat = jacMatrix;
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_r, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_rw, sizeof(Scalar) * N));
|
||||
@ -424,15 +404,7 @@ initialize(std::shared_ptr<BlockedMatrix<Scalar>> matrix,
|
||||
HIP_CHECK(hipMalloc((void**)&d_x, sizeof(Scalar) * N));
|
||||
HIP_CHECK(hipMalloc((void**)&d_b, sizeof(Scalar) * N));
|
||||
|
||||
if (useJacMatrix) {
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mrows, sizeof(rocsparse_int) * (Nb + 1)));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mcols, sizeof(rocsparse_int) * nnzbs_prec));
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
|
||||
} else { // preconditioner matrix is same
|
||||
HIP_CHECK(hipMalloc((void**)&d_Mvals, sizeof(Scalar) * nnzbs_prec * block_size * block_size));
|
||||
d_Mcols = d_Acols;
|
||||
d_Mrows = d_Arows;
|
||||
}
|
||||
prec->initialize(matrix, jacMatrix, d_Arows, d_Acols);//TODO: do we need all parameters?
|
||||
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
@ -453,28 +425,10 @@ copy_system_to_gpu(Scalar *b)
|
||||
sizeof(Scalar) * nnz,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemsetAsync(d_x, 0, N * sizeof(Scalar), stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar) * N,
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, N * sizeof(Scalar),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
if (useJacMatrix) {
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() > 1) {
|
||||
copyThread->join();
|
||||
}
|
||||
#endif
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mrows, jacMat->rowPointers,
|
||||
sizeof(rocsparse_int) * (Nb + 1),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mcols, jacMat->colIndices,
|
||||
sizeof(rocsparse_int) * nnzbs_prec,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
|
||||
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
|
||||
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
prec->copy_system_to_gpu(d_Avals);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
@ -500,19 +454,8 @@ update_system_on_gpu(Scalar* b)
|
||||
HIP_CHECK(hipMemcpyAsync(d_b, b, N* sizeof(Scalar),
|
||||
hipMemcpyHostToDevice, stream));
|
||||
|
||||
if (useJacMatrix) {
|
||||
#if HAVE_OPENMP
|
||||
if (omp_get_max_threads() > 1) {
|
||||
copyThread->join();
|
||||
}
|
||||
#endif
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, jacMat->nnzValues,
|
||||
sizeof(Scalar) * nnzbs_prec * block_size * block_size,
|
||||
hipMemcpyHostToDevice, stream));
|
||||
} else {
|
||||
HIP_CHECK(hipMemcpyAsync(d_Mvals, d_Avals,
|
||||
sizeof(Scalar) * nnz, hipMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
prec->update_system_on_gpu(d_Avals);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
|
||||
@ -528,58 +471,15 @@ template<class Scalar, unsigned int block_size>
|
||||
bool rocsparseSolverBackend<Scalar,block_size>::
|
||||
analyze_matrix()
|
||||
{
|
||||
std::size_t d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||
Timer t;
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&ilu_info));
|
||||
#if HIP_VERSION >= 50400000
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_info(&spmv_info));
|
||||
#endif
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_A));
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_M));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_L));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_create_mat_descr(&descr_U));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper));
|
||||
ROCSPARSE_CHECK(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit));
|
||||
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0_buffer_size(handle, dir, Nb, nnzbs_prec,
|
||||
descr_M, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_M));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
|
||||
descr_L, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_L));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_buffer_size(handle, dir, operation, Nb, nnzbs_prec,
|
||||
descr_U, d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, &d_bufferSize_U));
|
||||
|
||||
d_bufferSize = std::max(d_bufferSize_M,
|
||||
std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
HIP_CHECK(hipMalloc((void**)&d_buffer, d_bufferSize));
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0_analysis(handle, dir, \
|
||||
Nb, nnzbs_prec, descr_M, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
int zero_position = 0;
|
||||
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
|
||||
if (rocsparse_status_success != status) {
|
||||
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
|
||||
return false;
|
||||
}
|
||||
|
||||
// analysis of ilu apply
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
|
||||
Nb, nnzbs_prec, descr_L, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrsv_analysis(handle, dir, operation, \
|
||||
Nb, nnzbs_prec, descr_U, d_Mvals, d_Mrows, d_Mcols, \
|
||||
block_size, ilu_info, rocsparse_analysis_policy_reuse, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
#if HIP_VERSION >= 60000000
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrmv_analysis(handle, dir, operation,
|
||||
@ -593,6 +493,13 @@ analyze_matrix()
|
||||
block_size, spmv_info));
|
||||
#endif
|
||||
|
||||
if(!prec->analyze_matrix(&*mat)) {
|
||||
std::ostringstream out;
|
||||
out << "Warning: rocsparseSolver matrix analysis failed!";
|
||||
OpmLog::info(out.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
std::ostringstream out;
|
||||
@ -609,28 +516,7 @@ template<class Scalar, unsigned int block_size>
|
||||
bool rocsparseSolverBackend<Scalar,block_size>::
|
||||
create_preconditioner()
|
||||
{
|
||||
Timer t;
|
||||
|
||||
bool result = true;
|
||||
ROCSPARSE_CHECK(rocsparse_dbsrilu0(handle, dir, Nb, nnzbs_prec, descr_M,
|
||||
d_Mvals, d_Mrows, d_Mcols, block_size, ilu_info, rocsparse_solve_policy_auto, d_buffer));
|
||||
|
||||
// Check for zero pivot
|
||||
int zero_position = 0;
|
||||
rocsparse_status status = rocsparse_bsrilu0_zero_pivot(handle, ilu_info, &zero_position);
|
||||
if(rocsparse_status_success != status)
|
||||
{
|
||||
printf("L has structural and/or numerical zero at L(%d,%d)\n", zero_position, zero_position);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
std::ostringstream out;
|
||||
out << "rocsparseSolver::create_preconditioner(): " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return result;
|
||||
return prec->create_preconditioner(&*mat);
|
||||
} // end create_preconditioner()
|
||||
|
||||
template<class Scalar, unsigned int block_size>
|
@ -26,6 +26,8 @@
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparsePreconditioner.hpp>
|
||||
|
||||
#include <rocblas/rocblas.h>
|
||||
#include <rocsparse/rocsparse.h>
|
||||
|
||||
@ -57,29 +59,26 @@ private:
|
||||
|
||||
bool analysis_done = false;
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> mat{}; // original matrix
|
||||
std::shared_ptr<BlockedMatrix<Scalar>> jacMat{}; // matrix for preconditioner
|
||||
int nnzbs_prec = 0; // number of nnz blocks in preconditioner matrix M
|
||||
|
||||
rocsparse_direction dir = rocsparse_direction_row;
|
||||
rocsparse_operation operation = rocsparse_operation_none;
|
||||
rocsparse_handle handle;
|
||||
rocblas_handle blas_handle;
|
||||
rocsparse_mat_descr descr_A, descr_M, descr_L, descr_U;
|
||||
rocsparse_mat_info ilu_info;
|
||||
rocsparse_mat_descr descr_A;
|
||||
#if HIP_VERSION >= 50400000
|
||||
rocsparse_mat_info spmv_info;
|
||||
#endif
|
||||
hipStream_t stream;
|
||||
|
||||
rocsparse_int *d_Arows, *d_Mrows;
|
||||
rocsparse_int *d_Acols, *d_Mcols;
|
||||
Scalar *d_Avals, *d_Mvals;
|
||||
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
|
||||
rocsparse_int *d_Arows, *d_Acols;
|
||||
Scalar *d_Avals;
|
||||
Scalar *d_x, *d_b, *d_r, *d_rw, *d_p; // vectors, used during linear solve
|
||||
Scalar *d_pw, *d_s, *d_t, *d_v;
|
||||
void *d_buffer; // buffer space, used by rocsparse ilu0 analysis
|
||||
int ver;
|
||||
char rev[64];
|
||||
|
||||
std::unique_ptr<rocsparsePreconditioner<Scalar, block_size> > prec; // can perform blocked ILU0 and AMG on pressure component
|
||||
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
@ -119,14 +118,16 @@ public:
|
||||
/// \param[in] tolerance required relative tolerance for rocsparseSolver
|
||||
/// \param[in] platformID the OpenCL platform to be used
|
||||
/// \param[in] deviceID the device to be used
|
||||
/// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument
|
||||
rocsparseSolverBackend(int linear_solver_verbosity,
|
||||
int maxit,
|
||||
Scalar tolerance,
|
||||
unsigned int platformID,
|
||||
unsigned int deviceID);
|
||||
unsigned int deviceID,
|
||||
std::string linsolver);
|
||||
|
||||
/// For the CPR coarse solver
|
||||
// rocsparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
|
||||
rocsparseSolverBackend(int linear_solver_verbosity, int maxit, Scalar tolerance, bool opencl_ilu_reorder);
|
||||
|
||||
/// Destroy a openclSolver, and free memory
|
||||
~rocsparseSolverBackend();
|
||||
@ -144,10 +145,6 @@ public:
|
||||
WellContributions<Scalar>& wellContribs,
|
||||
BdaResult& res) override;
|
||||
|
||||
/// Solve scalar linear system, for example a coarse system of an AMG preconditioner
|
||||
/// Data is already on the GPU
|
||||
// SolverStatus solve_system(BdaResult &res);
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(Scalar* x) override;
|
@ -29,7 +29,7 @@
|
||||
|
||||
#undef HAVE_CUDA
|
||||
|
||||
#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
|
||||
#include <opm/simulators/linalg/bda/rocm/rocsparseWellContributions.hpp>
|
||||
|
||||
#ifdef HIP_HAVE_CUDA_DEFINED
|
||||
#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
|
||||
@ -40,17 +40,9 @@
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#define HIP_CHECK(stat) \
|
||||
{ \
|
||||
if(stat != hipSuccess) \
|
||||
{ \
|
||||
OPM_THROW(std::logic_error, "HIP error"); \
|
||||
} \
|
||||
}
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
@ -9,7 +9,7 @@
|
||||
#include <dune/istl/bcrsmatrix.hh>
|
||||
#include <dune/istl/matrixmarket.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/BISAI.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclBISAI.hpp>
|
||||
|
||||
BOOST_AUTO_TEST_CASE(testcsrtocscoffsetmap){
|
||||
|
||||
|
@ -31,7 +31,8 @@
|
||||
|
||||
#include <dune/istl/bcrsmatrix.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/CPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclCPR.hpp>
|
||||
#include <opm/simulators/linalg/bda/Misc.hpp>
|
||||
|
||||
BOOST_AUTO_TEST_CASE(testsolvetransposed3x3)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user