mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-01-01 12:06:54 -06:00
Merge pull request #2209 from Tongdongq/master
Added cusparseSolver, needs GPU to be used
This commit is contained in:
commit
b9062396be
@ -12,10 +12,20 @@
|
||||
###########################################################################
|
||||
|
||||
# Mandatory call to project
|
||||
|
||||
project(opm-simulators C CXX)
|
||||
|
||||
cmake_minimum_required (VERSION 2.8)
|
||||
|
||||
|
||||
find_package(CUDA)
|
||||
|
||||
if(CUDA_FOUND)
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
enable_language(CUDA)
|
||||
set(HAVE_CUDA 1)
|
||||
endif()
|
||||
|
||||
option(SIBLING_SEARCH "Search for other modules in sibling directories?" ON)
|
||||
set( USE_OPENMP_DEFAULT OFF ) # Use of OpenMP is considered experimental
|
||||
option(BUILD_FLOW "Build the production oriented flow simulator?" ON)
|
||||
@ -286,3 +296,10 @@ endif()
|
||||
if (OPM_ENABLE_PYTHON)
|
||||
add_subdirectory(python)
|
||||
endif()
|
||||
|
||||
# must link libraries after target 'flow' has been defined
|
||||
if(CUDA_FOUND)
|
||||
target_link_libraries( opmsimulators ${CUDA_cublas_LIBRARY} )
|
||||
target_link_libraries( opmsimulators ${CUDA_cusparse_LIBRARY} )
|
||||
endif()
|
||||
|
||||
|
@ -29,6 +29,7 @@ list (APPEND MAIN_SOURCE_FILES
|
||||
opm/simulators/flow/MissingFeatures.cpp
|
||||
opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
|
||||
opm/simulators/linalg/setupPropertyTree.cpp
|
||||
opm/simulators/linalg/bda/BdaBridge.cpp
|
||||
opm/simulators/timestepping/TimeStepControl.cpp
|
||||
opm/simulators/timestepping/AdaptiveSimulatorTimer.cpp
|
||||
opm/simulators/timestepping/SimulatorTimer.cpp
|
||||
@ -41,6 +42,10 @@ list (APPEND MAIN_SOURCE_FILES
|
||||
opm/simulators/wells/VFPInjProperties.cpp
|
||||
)
|
||||
|
||||
if(CUDA_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
|
||||
endif()
|
||||
|
||||
# originally generated with the command:
|
||||
# find tests -name '*.cpp' -a ! -wholename '*/not-unit/*' -printf '\t%p\n' | sort
|
||||
list (APPEND TEST_SOURCE_FILES
|
||||
@ -129,6 +134,10 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/aquifers/AquiferFetkovich.hpp
|
||||
opm/simulators/aquifers/BlackoilAquiferModel.hpp
|
||||
opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
|
||||
opm/simulators/linalg/bda/BdaBridge.hpp
|
||||
opm/simulators/linalg/bda/BdaResult.hpp
|
||||
opm/simulators/linalg/bda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/BlackoilAmg.hpp
|
||||
opm/simulators/linalg/BlackoilAmgCpr.hpp
|
||||
opm/simulators/linalg/amgcpr.hh
|
||||
|
@ -5,6 +5,7 @@ set (opm-simulators_CONFIG_VAR
|
||||
HAVE_EWOMS
|
||||
HAVE_MPI
|
||||
HAVE_PETSC
|
||||
HAVE_CUDA
|
||||
HAVE_SUITESPARSE_UMFPACK_H
|
||||
HAVE_DUNE_ISTL
|
||||
DUNE_ISTL_VERSION_MAJOR
|
||||
|
@ -67,6 +67,7 @@ NEW_PROP_TAG(CprMaxEllIter);
|
||||
NEW_PROP_TAG(CprEllSolvetype);
|
||||
NEW_PROP_TAG(CprReuseSetup);
|
||||
NEW_PROP_TAG(LinearSolverConfigurationJsonFile);
|
||||
NEW_PROP_TAG(UseGpu);
|
||||
|
||||
SET_SCALAR_PROP(FlowIstlSolverParams, LinearSolverReduction, 1e-2);
|
||||
SET_SCALAR_PROP(FlowIstlSolverParams, IluRelaxation, 0.9);
|
||||
@ -92,6 +93,7 @@ SET_INT_PROP(FlowIstlSolverParams, CprMaxEllIter, 20);
|
||||
SET_INT_PROP(FlowIstlSolverParams, CprEllSolvetype, 0);
|
||||
SET_INT_PROP(FlowIstlSolverParams, CprReuseSetup, 0);
|
||||
SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfigurationJsonFile, "none");
|
||||
SET_BOOL_PROP(FlowIstlSolverParams, UseGpu, false);
|
||||
|
||||
|
||||
|
||||
@ -163,6 +165,7 @@ namespace Opm
|
||||
std::string system_strategy_;
|
||||
bool scale_linear_system_;
|
||||
std::string linear_solver_configuration_json_file_;
|
||||
bool use_gpu_;
|
||||
|
||||
template <class TypeTag>
|
||||
void init()
|
||||
@ -190,6 +193,7 @@ namespace Opm
|
||||
cpr_ell_solvetype_ = EWOMS_GET_PARAM(TypeTag, int, CprEllSolvetype);
|
||||
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
|
||||
linear_solver_configuration_json_file_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile);
|
||||
use_gpu_ = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
}
|
||||
|
||||
template <class TypeTag>
|
||||
@ -217,6 +221,7 @@ namespace Opm
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprEllSolvetype, "Solver type of elliptic pressure solve (0: bicgstab, 1: cg, 2: only amg preconditioner)");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse Amg Setup");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile, "Filename of JSON configuration for flexible linear solver system.");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, bool, UseGpu, "Use GPU cusparseSolver as the linear solver");
|
||||
}
|
||||
|
||||
FlowLinearSolverParameters() { reset(); }
|
||||
@ -238,6 +243,7 @@ namespace Opm
|
||||
ilu_milu_ = MILU_VARIANT::ILU;
|
||||
ilu_redblack_ = false;
|
||||
ilu_reorder_sphere_ = true;
|
||||
use_gpu_ = false;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -45,6 +45,8 @@
|
||||
|
||||
#include <opm/common/utility/platform_dependent/reenable_warnings.h>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
|
||||
BEGIN_PROPERTIES
|
||||
|
||||
NEW_TYPE_TAG(FlowIstlSolver, INHERITS_FROM(FlowIstlSolverParams));
|
||||
@ -222,6 +224,10 @@ protected:
|
||||
enum { pressureVarIndex = Indices::pressureSwitchIdx };
|
||||
static const int numEq = Indices::numEq;
|
||||
|
||||
#if HAVE_CUDA
|
||||
std::unique_ptr<BdaBridge> bdaBridge;
|
||||
#endif
|
||||
|
||||
public:
|
||||
typedef Dune::AssembledLinearOperator< Matrix, Vector, Vector > AssembledLinearOperatorType;
|
||||
|
||||
@ -239,6 +245,22 @@ protected:
|
||||
converged_(false)
|
||||
{
|
||||
parameters_.template init<TypeTag>();
|
||||
#if HAVE_CUDA
|
||||
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
|
||||
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
|
||||
const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||
if (use_gpu && !matrix_add_well_contributions) {
|
||||
OPM_THROW(std::logic_error,"Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
|
||||
}
|
||||
bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
|
||||
#else
|
||||
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
if (use_gpu) {
|
||||
OPM_THROW(std::logic_error,"Error cannot use GPU solver since CUDA was not found during compilation");
|
||||
}
|
||||
#endif
|
||||
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
|
||||
const auto& gridForConn = simulator_.vanguard().grid();
|
||||
const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
|
||||
@ -436,11 +458,30 @@ protected:
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// tries to solve linear system
|
||||
// solve_system() does nothing if Dune is selected
|
||||
#if HAVE_CUDA
|
||||
bdaBridge->solve_system(matrix_.get(), istlb, result);
|
||||
|
||||
if (result.converged) {
|
||||
// get result vector x from non-Dune backend, iff solve was successful
|
||||
bdaBridge->get_result(x);
|
||||
} else {
|
||||
// CPU fallback, or default case for Dune
|
||||
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
|
||||
if (use_gpu) {
|
||||
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
|
||||
}
|
||||
auto precond = constructPrecond(linearOperator, parallelInformation_arg);
|
||||
solve(linearOperator, x, istlb, *sp, *precond, result);
|
||||
} // end Dune call
|
||||
#else
|
||||
// Construct preconditioner.
|
||||
auto precond = constructPrecond(linearOperator, parallelInformation_arg);
|
||||
|
||||
// Solve.
|
||||
solve(linearOperator, x, istlb, *sp, *precond, result);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
240
opm/simulators/linalg/bda/BdaBridge.cpp
Normal file
240
opm/simulators/linalg/bda/BdaBridge.cpp
Normal file
@ -0,0 +1,240 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/material/common/Unused.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
|
||||
#define PRINT_TIMERS_BRIDGE 0
|
||||
|
||||
typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
BdaBridge::BdaBridge(bool use_gpu_ OPM_UNUSED, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED) : use_gpu(use_gpu_) {
|
||||
#if HAVE_CUDA
|
||||
if (use_gpu) {
|
||||
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
#if HAVE_CUDA
|
||||
template <class BridgeMatrix>
|
||||
int checkZeroDiagonal(BridgeMatrix& mat) {
|
||||
static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
|
||||
int numZeros = 0;
|
||||
const int dim = 3; // might be replaced with mat[0][0].N() or BridgeMatrix::block_type::size()
|
||||
const double zero_replace = 1e-15;
|
||||
if (diag_indices.size() == 0) {
|
||||
int N = mat.N();
|
||||
diag_indices.reserve(N);
|
||||
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
auto diag = r->find(r.index()); // diag is an iterator
|
||||
assert(diag.index() == r.index());
|
||||
for (int rr = 0; rr < dim; ++rr) {
|
||||
auto& val = (*diag)[rr][rr]; // reference to easily change the value
|
||||
if (val == 0.0) { // could be replaced by '< 1e-30' or similar
|
||||
val = zero_replace;
|
||||
++numZeros;
|
||||
}
|
||||
}
|
||||
diag_indices.emplace_back(diag.offset());
|
||||
}
|
||||
}else{
|
||||
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
|
||||
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
|
||||
for (int rr = 0; rr < dim; ++rr) {
|
||||
auto& val = diag_block[rr][rr];
|
||||
if (val == 0.0) { // could be replaced by '< 1e-30' or similar
|
||||
val = zero_replace;
|
||||
++numZeros;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return numZeros;
|
||||
}
|
||||
|
||||
|
||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||
// sparsity pattern should stay the same due to matrix-add-well-contributions
|
||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||
template <class BridgeMatrix>
|
||||
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
int sum_nnzs = 0;
|
||||
|
||||
// convert colIndices and rowPointers
|
||||
if (h_rows.size() == 0) {
|
||||
h_rows.emplace_back(0);
|
||||
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
|
||||
int size_row = 0;
|
||||
for (auto c = r->begin(); c != r->end(); ++c) {
|
||||
h_cols.emplace_back(c.index());
|
||||
size_row++;
|
||||
}
|
||||
sum_nnzs += size_row;
|
||||
h_rows.emplace_back(sum_nnzs);
|
||||
}
|
||||
|
||||
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
|
||||
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
|
||||
OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::getSparsityPattern()");
|
||||
}
|
||||
}
|
||||
} // end getSparsityPattern()
|
||||
|
||||
#endif
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector>
|
||||
void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
|
||||
{
|
||||
|
||||
#if HAVE_CUDA
|
||||
if (use_gpu) {
|
||||
BdaResult result;
|
||||
result.converged = false;
|
||||
static std::vector<int> h_rows;
|
||||
static std::vector<int> h_cols;
|
||||
const int dim = (*mat)[0][0].N();
|
||||
const int N = mat->N()*dim;
|
||||
const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
|
||||
|
||||
if (dim != 3) {
|
||||
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
|
||||
use_gpu = false;
|
||||
}
|
||||
|
||||
if (h_rows.capacity() == 0) {
|
||||
h_rows.reserve(N+1);
|
||||
h_cols.reserve(nnz);
|
||||
#if PRINT_TIMERS_BRIDGE
|
||||
Dune::Timer t;
|
||||
#endif
|
||||
getSparsityPattern(*mat, h_rows, h_cols);
|
||||
#if PRINT_TIMERS_BRIDGE
|
||||
std::ostringstream out;
|
||||
out << "getSparsityPattern() took: " << t.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
#endif
|
||||
}
|
||||
|
||||
#if PRINT_TIMERS_BRIDGE
|
||||
Dune::Timer t_zeros;
|
||||
int numZeros = checkZeroDiagonal(*mat);
|
||||
std::ostringstream out;
|
||||
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
|
||||
OpmLog::info(out.str());
|
||||
#else
|
||||
checkZeroDiagonal(*mat);
|
||||
#endif
|
||||
|
||||
|
||||
/////////////////////////
|
||||
// actually solve
|
||||
|
||||
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
|
||||
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
|
||||
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
|
||||
switch(status) {
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
|
||||
//OpmLog::info("cusparseSolver converged");
|
||||
break;
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
|
||||
OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
break;
|
||||
case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
|
||||
OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
|
||||
break;
|
||||
default:
|
||||
OpmLog::warning("cusparseSolver returned unknown status code");
|
||||
}
|
||||
|
||||
res.iterations = result.iterations;
|
||||
res.reduction = result.reduction;
|
||||
res.converged = result.converged;
|
||||
res.conv_rate = result.conv_rate;
|
||||
res.elapsed = result.elapsed;
|
||||
}else{
|
||||
res.converged = false;
|
||||
}
|
||||
#endif // HAVE_CUDA
|
||||
}
|
||||
|
||||
|
||||
template <class BridgeVector>
|
||||
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
|
||||
#if HAVE_CUDA
|
||||
if (use_gpu) {
|
||||
backend->post_process(static_cast<double*>(&(x[0][0])));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template void BdaBridge::solve_system< \
|
||||
Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
|
||||
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
|
||||
InverseOperatorResult &res);
|
||||
|
||||
template void BdaBridge::solve_system< \
|
||||
Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
|
||||
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
|
||||
InverseOperatorResult &res);
|
||||
|
||||
template void BdaBridge::solve_system< \
|
||||
Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
|
||||
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
|
||||
InverseOperatorResult &res);
|
||||
|
||||
|
||||
template void BdaBridge::get_result< \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
|
||||
(Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);
|
||||
|
||||
template void BdaBridge::get_result< \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
|
||||
(Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);
|
||||
|
||||
template void BdaBridge::get_result< \
|
||||
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
|
||||
(Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
73
opm/simulators/linalg/bda/BdaBridge.hpp
Normal file
73
opm/simulators/linalg/bda/BdaBridge.hpp
Normal file
@ -0,0 +1,73 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BDABRIDGE_HEADER_INCLUDED
|
||||
#define BDABRIDGE_HEADER_INCLUDED
|
||||
|
||||
#include <config.h>
|
||||
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
|
||||
|
||||
#include "dune/istl/bcrsmatrix.hh"
|
||||
#include <opm/simulators/linalg/matrixblock.hh>
|
||||
|
||||
#if HAVE_CUDA
|
||||
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
typedef Dune::InverseOperatorResult InverseOperatorResult;
|
||||
|
||||
/// BdaBridge acts as interface between opm-simulators with the cusparseSolver
|
||||
/// if CUDA was not found during CMake, function bodies of this class are empty
|
||||
class BdaBridge
|
||||
{
|
||||
private:
|
||||
#if HAVE_CUDA
|
||||
std::unique_ptr<cusparseSolverBackend> backend;
|
||||
#endif
|
||||
bool use_gpu;
|
||||
|
||||
public:
|
||||
/// Construct a BdaBridge
|
||||
/// \param[in] use_gpu true iff the cusparseSolver is used, is passed via command-line: '--use-gpu=[true|false]'
|
||||
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
||||
/// \param[in] maxit maximum number of iterations for cusparseSolver
|
||||
/// \param[in] tolerance required relative tolerance for cusparseSolver
|
||||
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
|
||||
/// \param[in] b vector b, should be of type Dune::BlockVector
|
||||
/// \param[in] result summary of solver result
|
||||
template <class BridgeMatrix, class BridgeVector>
|
||||
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
|
||||
|
||||
/// Get the resulting x vector
|
||||
/// \param[inout] x vector x, should be of type Dune::BlockVector
|
||||
template <class BridgeVector>
|
||||
void get_result(BridgeVector &x);
|
||||
|
||||
}; // end class BdaBridge
|
||||
|
||||
}
|
||||
|
||||
#endif
|
44
opm/simulators/linalg/bda/BdaResult.hpp
Normal file
44
opm/simulators/linalg/bda/BdaResult.hpp
Normal file
@ -0,0 +1,44 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef BDARESULT_HEADER_INCLUDED
|
||||
#define BDARESULT_HEADER_INCLUDED
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
/// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
|
||||
/// It is needed to prevent a compile error in basearray.hh, the nvcc compiler might not support all features in there
|
||||
class BdaResult
|
||||
{
|
||||
|
||||
public:
|
||||
int iterations = 0; // number of iterations
|
||||
double reduction = 0.0; // reduction of norm, norm_start / norm_final
|
||||
bool converged = false; // true iff the linear solver reached the desired norm within maxit iterations
|
||||
double conv_rate = 0.0; // average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
|
||||
double elapsed = 0.0; // time in seconds to run the linear solver
|
||||
|
||||
// Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult
|
||||
|
||||
}; // end class BdaResult
|
||||
|
||||
}
|
||||
|
||||
#endif
|
41
opm/simulators/linalg/bda/cuda_header.hpp
Normal file
41
opm/simulators/linalg/bda/cuda_header.hpp
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef CUDA_HEADER_HEADER_INCLUDED
|
||||
#define CUDA_HEADER_HEADER_INCLUDED
|
||||
|
||||
#include <iostream>
|
||||
|
||||
/// Runtime error checking of CUDA functions
|
||||
/// Usage:
|
||||
/// cudaMalloc(...);
|
||||
/// cudaCheckLastError("Error could not allocate memory");
|
||||
///
|
||||
#define cudaCheckLastError(msg) __cudaCheckError( __FILE__, __LINE__, #msg )
|
||||
|
||||
inline void __cudaCheckError(const char *file, const int line, const char *msg){
|
||||
cudaError err = cudaGetLastError();
|
||||
if (cudaSuccess != err){
|
||||
std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
|
||||
std::cerr << "BDA error message: " << msg << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
499
opm/simulators/linalg/bda/cusparseSolverBackend.cu
Normal file
499
opm/simulators/linalg/bda/cusparseSolverBackend.cu
Normal file
@ -0,0 +1,499 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef __NVCC__
|
||||
#error "Cannot compile for cusparse: NVIDIA compiler not found"
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
#include <sys/time.h>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/BdaResult.hpp>
|
||||
#include <opm/simulators/linalg/bda/cuda_header.hpp>
|
||||
|
||||
#include "cublas_v2.h"
|
||||
#include "cusparse_v2.h"
|
||||
// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
|
||||
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
|
||||
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
|
||||
|
||||
double second(void) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
|
||||
}
|
||||
|
||||
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) {
|
||||
}
|
||||
|
||||
cusparseSolverBackend::~cusparseSolverBackend() {
|
||||
finalize();
|
||||
}
|
||||
|
||||
void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
|
||||
double t_total1, t_total2;
|
||||
int n = N;
|
||||
double rho = 1.0, rhop;
|
||||
double alpha, nalpha, beta;
|
||||
double omega, nomega, tmp1, tmp2;
|
||||
double norm, norm_0;
|
||||
double zero = 0.0;
|
||||
double one = 1.0;
|
||||
double mone = -1.0;
|
||||
float it;
|
||||
|
||||
t_total1 = second();
|
||||
|
||||
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
|
||||
|
||||
cublasDscal(cublasHandle, n, &mone, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
|
||||
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
for (it = 0.5; it < maxit; it+=0.5) {
|
||||
rhop = rho;
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho/rhop) * (alpha/omega);
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
|
||||
cublasDscal(cublasHandle, n, &beta, d_p, 1);
|
||||
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
|
||||
}
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, \
|
||||
&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
|
||||
|
||||
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
|
||||
alpha = rho / tmp1;
|
||||
nalpha = -alpha;
|
||||
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
|
||||
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
if (norm < tolerance * norm_0 && it > minit) {
|
||||
break;
|
||||
}
|
||||
|
||||
it += 0.5;
|
||||
|
||||
// apply ilu0
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
|
||||
cusparseDbsrsv2_solve(cusparseHandle, order, \
|
||||
operation, Nb, nnzb, &one, \
|
||||
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
|
||||
|
||||
// spmv
|
||||
cusparseDbsrmv(cusparseHandle, order, \
|
||||
operation, Nb, Nb, nnzb, &one, descr_M, \
|
||||
d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
|
||||
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
|
||||
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
|
||||
omega = tmp1 / tmp2;
|
||||
nomega = -omega;
|
||||
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
|
||||
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
|
||||
|
||||
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
|
||||
|
||||
|
||||
if (norm < tolerance * norm_0 && it > minit) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (verbosity > 1) {
|
||||
std::ostringstream out;
|
||||
out << "it: " << it << std::scientific << ", norm: " << norm;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
t_total2 = second();
|
||||
|
||||
res.iterations = std::min(it, (float)maxit);
|
||||
res.reduction = norm/norm_0;
|
||||
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
|
||||
res.elapsed = t_total2 - t_total1;
|
||||
res.converged = (it != (maxit + 0.5));
|
||||
|
||||
if (verbosity > 0) {
|
||||
std::ostringstream out;
|
||||
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
|
||||
", time per iteration: " << res.elapsed/it << ", iterations: " << it;
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
|
||||
this->N = N;
|
||||
this->nnz = nnz;
|
||||
this->BLOCK_SIZE = dim;
|
||||
this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
std::ostringstream out;
|
||||
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
|
||||
OpmLog::info(out.str());
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
OpmLog::info(out.str());
|
||||
|
||||
int deviceID = 0;
|
||||
cudaSetDevice(deviceID);
|
||||
cudaCheckLastError("Could not get device");
|
||||
struct cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, deviceID);
|
||||
cudaCheckLastError("Could not get device properties");
|
||||
out.str("");
|
||||
out.clear();
|
||||
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
|
||||
OpmLog::info(out.str());
|
||||
|
||||
cudaStreamCreate(&stream);
|
||||
cudaCheckLastError("Could not create stream");
|
||||
|
||||
cublasCreate(&cublasHandle);
|
||||
cudaCheckLastError("Could not create cublasHandle");
|
||||
|
||||
cusparseCreate(&cusparseHandle);
|
||||
cudaCheckLastError("Could not create cusparseHandle");
|
||||
|
||||
cudaMalloc((void**)&d_x, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_b, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_r, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_rw,sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_p, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_pw,sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_s, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_t, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_v, sizeof(double) * N);
|
||||
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
|
||||
cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
|
||||
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
|
||||
cudaCheckLastError("Could not allocate enough memory on GPU");
|
||||
|
||||
cublasSetStream(cublasHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cublas");
|
||||
cusparseSetStream(cusparseHandle, stream);
|
||||
cudaCheckLastError("Could not set stream to cusparse");
|
||||
|
||||
cudaMallocHost((void**)&x, sizeof(double) * N);
|
||||
cudaCheckLastError("Could not allocate pinned host memory");
|
||||
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
void cusparseSolverBackend::finalize() {
|
||||
cudaFree(d_x);
|
||||
cudaFree(d_b);
|
||||
cudaFree(d_r);
|
||||
cudaFree(d_rw);
|
||||
cudaFree(d_p);
|
||||
cudaFree(d_pw);
|
||||
cudaFree(d_s);
|
||||
cudaFree(d_t);
|
||||
cudaFree(d_v);
|
||||
cudaFree(d_mVals);
|
||||
cudaFree(d_bVals);
|
||||
cudaFree(d_bCols);
|
||||
cudaFree(d_bRows);
|
||||
cudaFree(d_buffer);
|
||||
cusparseDestroyBsrilu02Info(info_M);
|
||||
cusparseDestroyBsrsv2Info(info_L);
|
||||
cusparseDestroyBsrsv2Info(info_U);
|
||||
cusparseDestroyMatDescr(descr_B);
|
||||
cusparseDestroyMatDescr(descr_M);
|
||||
cusparseDestroyMatDescr(descr_L);
|
||||
cusparseDestroyMatDescr(descr_U);
|
||||
cusparseDestroy(cusparseHandle);
|
||||
cublasDestroy(cublasHandle);
|
||||
cudaHostUnregister(vals);
|
||||
cudaHostUnregister(cols);
|
||||
cudaHostUnregister(rows);
|
||||
cudaStreamDestroy(stream);
|
||||
cudaFreeHost(x);
|
||||
} // end finalize()
|
||||
|
||||
|
||||
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
|
||||
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
|
||||
cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
|
||||
cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
|
||||
cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
|
||||
cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
this->vals = vals;
|
||||
this->cols = cols;
|
||||
this->rows = rows;
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end copy_system_to_gpu()
|
||||
|
||||
|
||||
// don't copy rowpointers and colindices, they stay the same
|
||||
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
|
||||
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end update_system_on_gpu()
|
||||
|
||||
|
||||
void cusparseSolverBackend::reset_prec_on_gpu() {
|
||||
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
|
||||
}
|
||||
|
||||
|
||||
bool cusparseSolverBackend::analyse_matrix() {
|
||||
|
||||
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
|
||||
double t1, t2;
|
||||
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cusparseCreateMatDescr(&descr_B);
|
||||
cusparseCreateMatDescr(&descr_M);
|
||||
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
|
||||
|
||||
cusparseSetMatIndexBase(descr_B, base_type);
|
||||
cusparseSetMatIndexBase(descr_M, base_type);
|
||||
|
||||
cusparseCreateMatDescr(&descr_L);
|
||||
cusparseSetMatIndexBase(descr_L, base_type);
|
||||
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
|
||||
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
|
||||
|
||||
cusparseCreateMatDescr(&descr_U);
|
||||
cusparseSetMatIndexBase(descr_U, base_type);
|
||||
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
|
||||
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
|
||||
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
|
||||
cudaCheckLastError("Could not initialize matrix descriptions");
|
||||
|
||||
cusparseCreateBsrilu02Info(&info_M);
|
||||
cusparseCreateBsrsv2Info(&info_L);
|
||||
cusparseCreateBsrsv2Info(&info_U);
|
||||
cudaCheckLastError("Could not create analysis info");
|
||||
|
||||
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
|
||||
descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
|
||||
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
|
||||
descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
|
||||
cudaCheckLastError();
|
||||
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
|
||||
|
||||
cudaMalloc((void**)&d_buffer, d_bufferSize);
|
||||
|
||||
// analysis of ilu LU decomposition
|
||||
cusparseDbsrilu02_analysis(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
|
||||
BLOCK_SIZE, info_M, policy, d_buffer);
|
||||
|
||||
int structural_zero;
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// analysis of ilu apply
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
|
||||
BLOCK_SIZE, info_L, policy, d_buffer);
|
||||
|
||||
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
|
||||
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
|
||||
BLOCK_SIZE, info_U, policy, d_buffer);
|
||||
cudaCheckLastError("Could not analyse level information");
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end analyse_matrix()
|
||||
|
||||
bool cusparseSolverBackend::create_preconditioner() {
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
d_mCols = d_bCols;
|
||||
d_mRows = d_bRows;
|
||||
cusparseDbsrilu02(cusparseHandle, order, \
|
||||
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
|
||||
BLOCK_SIZE, info_M, policy, d_buffer);
|
||||
|
||||
int structural_zero;
|
||||
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
|
||||
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
|
||||
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (verbosity > 2) {
|
||||
cudaStreamSynchronize(stream);
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
void cusparseSolverBackend::solve_system(BdaResult &res) {
|
||||
// actually solve
|
||||
gpu_pbicgstab(res);
|
||||
cudaStreamSynchronize(stream);
|
||||
cudaCheckLastError("Something went wrong during the GPU solve");
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
void cusparseSolverBackend::post_process(double *x) {
|
||||
|
||||
if (!initialized) {
|
||||
cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
|
||||
}
|
||||
|
||||
double t1, t2;
|
||||
if (verbosity > 2) {
|
||||
t1 = second();
|
||||
}
|
||||
|
||||
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
|
||||
if (verbosity > 2) {
|
||||
t2 = second();
|
||||
std::ostringstream out;
|
||||
out << "cusparseSolver::post_process(): " << t2-t1 << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
} // end post_process()
|
||||
|
||||
|
||||
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
|
||||
|
||||
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
|
||||
if (initialized == false) {
|
||||
initialize(N, nnz, dim);
|
||||
copy_system_to_gpu(vals, rows, cols, b);
|
||||
}else{
|
||||
update_system_on_gpu(vals, b);
|
||||
}
|
||||
if (analysis_done == false) {
|
||||
if (!analyse_matrix()) {
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
reset_prec_on_gpu();
|
||||
if (create_preconditioner()) {
|
||||
solve_system(res);
|
||||
}else{
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
151
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
Normal file
151
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
Normal file
@ -0,0 +1,151 @@
|
||||
/*
|
||||
Copyright 2019 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_CUSPARSESOLVER_BACKEND_HEADER_INCLUDED
|
||||
#define OPM_CUSPARSESOLVER_BACKEND_HEADER_INCLUDED
|
||||
|
||||
|
||||
#include "cublas_v2.h"
|
||||
#include "cusparse_v2.h"
|
||||
|
||||
#include "opm/simulators/linalg/bda/BdaResult.hpp"
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
|
||||
class cusparseSolverBackend{
|
||||
|
||||
private:
|
||||
|
||||
int minit;
|
||||
int maxit;
|
||||
double tolerance;
|
||||
|
||||
cublasHandle_t cublasHandle;
|
||||
cusparseHandle_t cusparseHandle;
|
||||
cudaStream_t stream;
|
||||
cusparseMatDescr_t descr_B, descr_M, descr_L, descr_U;
|
||||
bsrilu02Info_t info_M;
|
||||
bsrsv2Info_t info_L, info_U;
|
||||
// b: bsr matrix, m: preconditioner
|
||||
double *d_bVals, *d_mVals;
|
||||
int *d_bCols, *d_mCols;
|
||||
int *d_bRows, *d_mRows;
|
||||
double *d_x, *d_b, *d_r, *d_rw, *d_p;
|
||||
double *d_pw, *d_s, *d_t, *d_v;
|
||||
double *vals;
|
||||
int *cols, *rows;
|
||||
double *x, *b;
|
||||
void *d_buffer;
|
||||
int N, Nb, nnz, nnzb;
|
||||
|
||||
int BLOCK_SIZE;
|
||||
|
||||
bool initialized = false;
|
||||
bool analysis_done = false;
|
||||
|
||||
// verbosity
|
||||
// 0: print nothing during solves, only when initializing
|
||||
// 1: print number of iterations and final norm
|
||||
// 2: also print norm each iteration
|
||||
// 3: also print timings of different backend functions
|
||||
|
||||
int verbosity = 0;
|
||||
|
||||
/// Solve linear system using ilu0-bicgstab
|
||||
/// \param[inout] res summary of solver result
|
||||
void gpu_pbicgstab(BdaResult& res);
|
||||
|
||||
/// Initialize GPU and allocate memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
void initialize(int N, int nnz, int dim);
|
||||
|
||||
/// Clean memory
|
||||
void finalize();
|
||||
|
||||
/// Copy linear system to GPU
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] b input vector, contains N values
|
||||
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
|
||||
|
||||
// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector, contains N values
|
||||
void update_system_on_gpu(double *vals, double *b);
|
||||
|
||||
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
|
||||
void reset_prec_on_gpu();
|
||||
|
||||
/// Analyse sparsity pattern to extract parallelism
|
||||
/// \return true iff analysis was successful
|
||||
bool analyse_matrix();
|
||||
|
||||
/// Perform ilu0-decomposition
|
||||
/// \return true iff decomposition was successful
|
||||
bool create_preconditioner();
|
||||
|
||||
/// Solve linear system
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(BdaResult &res);
|
||||
|
||||
public:
|
||||
|
||||
enum class cusparseSolverStatus {
|
||||
CUSPARSE_SOLVER_SUCCESS,
|
||||
CUSPARSE_SOLVER_ANALYSIS_FAILED,
|
||||
CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
|
||||
CUSPARSE_SOLVER_UNKNOWN_ERROR
|
||||
};
|
||||
|
||||
/// Construct a cusparseSolver
|
||||
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
|
||||
/// \param[in] maxit maximum number of iterations for cusparseSolver
|
||||
/// \param[in] tolerance required relative tolerance for cusparseSolver
|
||||
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
|
||||
|
||||
/// Destroy a cusparseSolver, and free memory
|
||||
~cusparseSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
|
||||
|
||||
/// Post processing after linear solve, now only copies resulting x vector back
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void post_process(double *x);
|
||||
|
||||
}; // end class cusparseSolverBackend
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user