Merge pull request #4273 from Tongdongq/add-test

Add test for cusparseSolver
This commit is contained in:
Markus Blatt 2022-11-22 13:55:11 +01:00 committed by GitHub
commit 42fdc38ae0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 107 additions and 63 deletions

View File

@ -162,6 +162,7 @@ endif()
if(CUDA_FOUND)
set(HAVE_CUDA 1)
set(COMPILE_BDA_BRIDGE 1)
include_directories(${CUDA_INCLUDE_DIRS})
endif()
@ -173,6 +174,7 @@ if(OpenCL_FOUND)
find_file(CL2_HPP CL/cl2.hpp HINTS ${OpenCL_INCLUDE_DIRS})
if(CL2_HPP)
set(HAVE_OPENCL 1)
set(COMPILE_BDA_BRIDGE 1)
include_directories(${OpenCL_INCLUDE_DIRS})
find_file(OPENCL_HPP CL/opencl.hpp HINTS ${OpenCL_INCLUDE_DIRS})
if(OPENCL_HPP)
@ -206,6 +208,7 @@ endif()
find_package(amgcl)
if(amgcl_FOUND)
set(HAVE_AMGCL 1)
set(COMPILE_BDA_BRIDGE 1)
# Linking to target angcl::amgcl drags in OpenMP and -fopenmp as a compile
# flag. With that nvcc fails as it does not that flag.
# Hence we set AMGCL_INCLUDE_DIRS.
@ -217,6 +220,7 @@ if(OpenCL_FOUND)
find_package(VexCL)
if(VexCL_FOUND)
set(HAVE_VEXCL 1)
set(COMPILE_BDA_BRIDGE 1)
# generator expressions in vexcl do not seem to work and therefore
# we cannot use the imported target. Hence we exract the needed info
# from the targets
@ -235,6 +239,7 @@ endif()
find_package(rocalution)
if(ROCALUTION_FOUND)
set(HAVE_ROCALUTION 1)
set(COMPILE_BDA_BRIDGE 1)
endif()

View File

@ -148,7 +148,7 @@ endif()
if(ROCALUTION_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
endif()
if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR amgcl_FOUND OR ROCALUTION_FOUND)
if(COMPILE_BDA_BRIDGE)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
endif()
if(amgcl_FOUND)

View File

@ -5,6 +5,7 @@ set (opm-simulators_CONFIG_VAR
HAVE_EWOMS
HAVE_MPI
HAVE_PETSC
COMPILE_BDA_BRIDGE
HAVE_CUDA
HAVE_OPENCL
HAVE_OPENCL_HPP

View File

@ -31,7 +31,7 @@
#include <opm/simulators/linalg/ParallelIstlInformation.hpp>
#include <opm/simulators/utils/ParallelCommunication.hpp>
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#endif
@ -149,7 +149,7 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
this->op_ = std::move(pop);
this->solver_ = std::move(sol);
}
#endif
#endif
} else {
if (!wellOperator_) {
using SeqOperatorType = Dune::MatrixAdapter<Matrix, Vector, Vector>;
@ -175,7 +175,7 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
}
}
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
template<class Matrix, class Vector>
BdaSolverInfo<Matrix,Vector>::
BdaSolverInfo(const std::string& accelerator_mode,
@ -329,7 +329,7 @@ copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
}
}
}
#endif
#endif // COMPILE_BDA_BRIDGE
template<int Dim>
using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
@ -346,7 +346,7 @@ using CommunicationType = Dune::CollectiveCommunication<int>;
template void makeOverlapRowsInvalid<BM<Dim>>(BM<Dim>&, const std::vector<int>&); \
template struct FlexibleSolverInfo<BM<Dim>,BV<Dim>,CommunicationType>;
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
#define INSTANCE_GRID(Dim, Grid) \
template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
@ -376,7 +376,7 @@ using CommunicationType = Dune::CollectiveCommunication<int>;
#else
#define INSTANCE(Dim) \
INSTANCE_FLEX(Dim)
#endif
#endif // COMPILE_BDA_BRIDGE
INSTANCE(1)
INSTANCE(2)

View File

@ -84,7 +84,7 @@ public:
namespace Opm
{
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
template<class Matrix, class Vector, int block_size> class BdaBridge;
class WellContributions;
#endif
@ -113,7 +113,7 @@ struct FlexibleSolverInfo
size_t interiorCellNum_ = 0;
};
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
template<class Matrix, class Vector>
struct BdaSolverInfo
{
@ -245,7 +245,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
EWOMS_PARAM_IS_SET(TypeTag, int, LinearSolverMaxIter),
EWOMS_PARAM_IS_SET(TypeTag, int, CprMaxEllIter));
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
{
std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
@ -383,7 +383,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
// Solve system.
Dune::InverseOperatorResult result;
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
std::function<void(WellContributions&)> getContribs =
[this](WellContributions& w)
{
@ -555,7 +555,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
Matrix* matrix_;
Vector *rhs_;
#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
#if COMPILE_BDA_BRIDGE
std::unique_ptr<detail::BdaSolverInfo<Matrix, Vector>> bdaBridge;
#endif

View File

@ -52,7 +52,31 @@ const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
template <unsigned int block_size>
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {}
cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {
// initialize CUDA device, stream and libraries
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
struct cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
std::ostringstream out;
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
cudaCheckLastError("Could not create stream");
cublasCreate(&cublasHandle);
cudaCheckLastError("Could not create cublasHandle");
cusparseCreate(&cusparseHandle);
cudaCheckLastError("Could not create cusparseHandle");
cublasSetStream(cublasHandle, stream);
cudaCheckLastError("Could not set stream to cublas");
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
}
template <unsigned int block_size>
cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
@ -209,25 +233,6 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
OpmLog::info(out.str());
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
struct cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
out.str("");
out.clear();
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
cudaCheckLastError("Could not create stream");
cublasCreate(&cublasHandle);
cudaCheckLastError("Could not create cublasHandle");
cusparseCreate(&cusparseHandle);
cudaCheckLastError("Could not create cusparseHandle");
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
@ -251,11 +256,6 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
}
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
cudaCheckLastError("Could not set stream to cublas");
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
#if COPY_ROW_BY_ROW
cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
cudaCheckLastError("Could not allocate pinned memory");

View File

@ -52,6 +52,11 @@ rocalutionSolverBackend<block_size>::rocalutionSolverBackend(int verbosity_, int
template <unsigned int block_size>
rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
// normally, these rocalution variables are destroyed after the destructor automatically,
// but sometimes it segfaults, both with test_rocalutionSolver and with an actual case
// release both variables here to prevent that segfault
roc_prec.release();
roc_solver.release();
rocalution::stop_rocalution();
}

View File

@ -60,7 +60,6 @@ private:
int *tmp_colindices; // store matrix on host, this pointer is given to and freed by rocalution
double *tmp_nnzvalues; // store matrix on host, this pointer is given to and freed by rocalution
// must be declared in this order, to prevent a segfault during the test
std::unique_ptr<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_prec;
std::unique_ptr<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_solver;

View File

@ -87,24 +87,18 @@ getDuneSolution(Matrix<bz>& matrix, Vector<bz>& rhs)
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vector<bz>& rhs)
void
createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge)
{
const int linear_solver_verbosity = prm.get<int>("verbosity");
const int maxit = prm.get<int>("maxiter");
const double tolerance = prm.get<double>("tol");
const bool opencl_ilu_parallel(true); // unused
const int platformID = 0; // unused
const bool opencl_ilu_parallel(true);
const int platformID = 0;
const int deviceID = 0;
const std::string accelerator_mode("cusparse");
const std::string linsolver("ilu0");
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,
linear_solver_verbosity,
@ -114,12 +108,6 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
deviceID,
opencl_ilu_parallel,
linsolver);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
bridge->get_result(x);
return x;
} catch (const std::logic_error& error) {
BOOST_WARN_MESSAGE(true, error.what());
if (strstr(error.what(), "Could not get device") != nullptr)
@ -129,6 +117,38 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
}
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testCusparseSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
bridge.get_result(x);
return x;
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testCusparseSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("cusparse", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
// the sparsity pattern and values are actually the same
bridge.solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
bridge.get_result(x);
return x;
}
namespace pt = boost::property_tree;
void test3(const pt::ptree& prm)
@ -136,17 +156,31 @@ void test3(const pt::ptree& prm)
const int bz = 3;
Matrix<bz> matrix;
Vector<bz> rhs;
std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
readLinearSystem("matr33.txt", "rhs3.txt", matrix, rhs);
Vector<bz> rhs2 = rhs; // deep copy, getDuneSolution() changes values in rhs vector
auto duneSolution = getDuneSolution<bz>(matrix, rhs);
auto sol = testCusparseSolver<bz>(prm, matrix, rhs2);
createBridge(prm, bridge); // create bridge with openclSolver
// should create bridge only once
// test cusparseSolver without Jacobi matrix
auto sol = testCusparseSolver<bz>(*bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
for (size_t i = 0; i < sol.size(); ++i) {
for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(sol[i][row], duneSolution[i][row], 1e-3);
}
}
// test cusparseSolver with Jacobi matrix
auto solJacobi = testCusparseSolverJacobi<bz>(*bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
for (size_t i = 0; i < solJacobi.size(); ++i) {
for (int row = 0; row < bz; ++row) {
BOOST_CHECK_CLOSE(solJacobi[i][row], duneSolution[i][row], 1e-3);
}
}
}

View File

@ -116,22 +116,22 @@ createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBri
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testOpenclSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
testOpenclSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
auto wellContribs = Opm::WellContributions::create("opencl", false);
auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
// matrix created by readMatrixMarket() did not have contiguous memory
bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
bridge->get_result(x);
bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
bridge.get_result(x);
return x;
}
template <int bz>
Dune::BlockVector<Dune::FieldVector<double, bz>>
testOpenclSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
testOpenclSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
{
Dune::InverseOperatorResult result;
Vector<bz> x(rhs.size());
@ -140,8 +140,8 @@ testOpenclSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz
// matrix created by readMatrixMarket() did not have contiguous memory
auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
// the sparsity pattern and values are actually the same
bridge->solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
bridge->get_result(x);
bridge.solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
bridge.get_result(x);
return x;
}
@ -162,7 +162,7 @@ void test3(const pt::ptree& prm)
// should create bridge only once
// test openclSolver without Jacobi matrix
auto sol = testOpenclSolver<bz>(bridge, matrix, rhs2);
auto sol = testOpenclSolver<bz>(*bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
for (size_t i = 0; i < sol.size(); ++i) {
for (int row = 0; row < bz; ++row) {
@ -171,7 +171,7 @@ void test3(const pt::ptree& prm)
}
// test openclSolver with Jacobi matrix
auto solJacobi = testOpenclSolverJacobi<bz>(bridge, matrix, rhs2);
auto solJacobi = testOpenclSolverJacobi<bz>(*bridge, matrix, rhs2);
BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
for (size_t i = 0; i < solJacobi.size(); ++i) {
for (int row = 0; row < bz; ++row) {