Merge pull request #4273 from Tongdongq/add-test

Add test for cusparseSolver
2025-02-25 18:55:30 -06:00 · 2022-11-22 13:55:11 +01:00
parent ab3da9815d a8e568f4c5
commit 42fdc38ae0
10 changed files with 107 additions and 63 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -162,6 +162,7 @@ endif()
 if(CUDA_FOUND)
  set(HAVE_CUDA 1)
  set(COMPILE_BDA_BRIDGE 1)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()
@@ -173,6 +174,7 @@ if(OpenCL_FOUND)
  find_file(CL2_HPP CL/cl2.hpp HINTS ${OpenCL_INCLUDE_DIRS})
  if(CL2_HPP)
    set(HAVE_OPENCL 1)
    set(COMPILE_BDA_BRIDGE 1)
    include_directories(${OpenCL_INCLUDE_DIRS})
    find_file(OPENCL_HPP CL/opencl.hpp HINTS ${OpenCL_INCLUDE_DIRS})
    if(OPENCL_HPP)
@@ -206,6 +208,7 @@ endif()
 find_package(amgcl)
 if(amgcl_FOUND)
  set(HAVE_AMGCL 1)
  set(COMPILE_BDA_BRIDGE 1)
  # Linking to target angcl::amgcl drags in OpenMP and -fopenmp as a compile
  # flag. With that nvcc fails as it does not that flag.
  # Hence we set AMGCL_INCLUDE_DIRS.
@@ -217,6 +220,7 @@ if(OpenCL_FOUND)
  find_package(VexCL)
  if(VexCL_FOUND)
    set(HAVE_VEXCL 1)
    set(COMPILE_BDA_BRIDGE 1)
    # generator expressions in vexcl do not seem to work and therefore
    # we cannot use the imported target. Hence we exract the needed info
    # from the targets
@@ -235,6 +239,7 @@ endif()
 find_package(rocalution)
 if(ROCALUTION_FOUND)
  set(HAVE_ROCALUTION 1)
  set(COMPILE_BDA_BRIDGE 1)
 endif()
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -148,7 +148,7 @@ endif()
 if(ROCALUTION_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
 endif()
-if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR amgcl_FOUND OR ROCALUTION_FOUND)
+if(COMPILE_BDA_BRIDGE)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
 endif()
 if(amgcl_FOUND)
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@@ -5,6 +5,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_EWOMS
  HAVE_MPI
  HAVE_PETSC
  COMPILE_BDA_BRIDGE
  HAVE_CUDA
  HAVE_OPENCL
  HAVE_OPENCL_HPP
--- a/opm/simulators/linalg/ISTLSolverEbos.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.cpp
@@ -31,7 +31,7 @@
 #include <opm/simulators/linalg/ParallelIstlInformation.hpp>
 #include <opm/simulators/utils/ParallelCommunication.hpp>
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 #endif
@@ -175,7 +175,7 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
    }
 }
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
 template<class Matrix, class Vector>
 BdaSolverInfo<Matrix,Vector>::
 BdaSolverInfo(const std::string& accelerator_mode,
@@ -329,7 +329,7 @@ copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
        }
    }
 }
-#endif
+#endif // COMPILE_BDA_BRIDGE
 template<int Dim>
 using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
@@ -346,7 +346,7 @@ using CommunicationType = Dune::CollectiveCommunication<int>;
    template void makeOverlapRowsInvalid<BM<Dim>>(BM<Dim>&, const std::vector<int>&); \
    template struct FlexibleSolverInfo<BM<Dim>,BV<Dim>,CommunicationType>;
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
 #define INSTANCE_GRID(Dim, Grid) \
    template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
@@ -376,7 +376,7 @@ using CommunicationType = Dune::CollectiveCommunication<int>;
 #else
 #define INSTANCE(Dim) \
    INSTANCE_FLEX(Dim)
-#endif
+#endif // COMPILE_BDA_BRIDGE
 INSTANCE(1)
 INSTANCE(2)
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -84,7 +84,7 @@ public:
 namespace Opm
 {
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
 template<class Matrix, class Vector, int block_size> class BdaBridge;
 class WellContributions;
 #endif
@@ -113,7 +113,7 @@ struct FlexibleSolverInfo
    size_t interiorCellNum_ = 0;
 };
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
 template<class Matrix, class Vector>
 struct BdaSolverInfo
 {
@@ -245,7 +245,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
                                     EWOMS_PARAM_IS_SET(TypeTag, int, LinearSolverMaxIter),
                                     EWOMS_PARAM_IS_SET(TypeTag, int, CprMaxEllIter));
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
            {
                std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
                if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
@@ -383,7 +383,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
            // Solve system.
            Dune::InverseOperatorResult result;
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
            std::function<void(WellContributions&)> getContribs =
                [this](WellContributions& w)
                {
@@ -555,7 +555,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
        Matrix* matrix_;
        Vector *rhs_;
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION
+#if COMPILE_BDA_BRIDGE
        std::unique_ptr<detail::BdaSolverInfo<Matrix, Vector>> bdaBridge;
 #endif
--- a/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu
@@ -52,7 +52,31 @@ const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
 template <unsigned int block_size>
-cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {}
+cusparseSolverBackend<block_size>::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, deviceID_) {
    // initialize CUDA device, stream and libraries
    cudaSetDevice(deviceID);
    cudaCheckLastError("Could not get device");
    struct cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceID);
    cudaCheckLastError("Could not get device properties");
    std::ostringstream out;
    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
    OpmLog::info(out.str());
    cudaStreamCreate(&stream);
    cudaCheckLastError("Could not create stream");
    cublasCreate(&cublasHandle);
    cudaCheckLastError("Could not create cublasHandle");
    cusparseCreate(&cusparseHandle);
    cudaCheckLastError("Could not create cusparseHandle");
    cublasSetStream(cublasHandle, stream);
    cudaCheckLastError("Could not set stream to cublas");
    cusparseSetStream(cusparseHandle, stream);
    cudaCheckLastError("Could not set stream to cusparse");
 }
 template <unsigned int block_size>
 cusparseSolverBackend<block_size>::~cusparseSolverBackend() {
@@ -209,25 +233,6 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
    OpmLog::info(out.str());
    cudaSetDevice(deviceID);
    cudaCheckLastError("Could not get device");
    struct cudaDeviceProp props;
    cudaGetDeviceProperties(&props, deviceID);
    cudaCheckLastError("Could not get device properties");
    out.str("");
    out.clear();
    out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
    OpmLog::info(out.str());
    cudaStreamCreate(&stream);
    cudaCheckLastError("Could not create stream");
    cublasCreate(&cublasHandle);
    cudaCheckLastError("Could not create cublasHandle");
    cusparseCreate(&cusparseHandle);
    cudaCheckLastError("Could not create cusparseHandle");
    cudaMalloc((void**)&d_x, sizeof(double) * N);
    cudaMalloc((void**)&d_b, sizeof(double) * N);
    cudaMalloc((void**)&d_r, sizeof(double) * N);
@@ -251,11 +256,6 @@ void cusparseSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix
    }
    cudaCheckLastError("Could not allocate enough memory on GPU");
    cublasSetStream(cublasHandle, stream);
    cudaCheckLastError("Could not set stream to cublas");
    cusparseSetStream(cusparseHandle, stream);
    cudaCheckLastError("Could not set stream to cusparse");
 #if COPY_ROW_BY_ROW
    cudaMallocHost((void**)&vals_contiguous, sizeof(double) * nnz);
    cudaCheckLastError("Could not allocate pinned memory");
--- a/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocalutionSolverBackend.cpp
@@ -52,6 +52,11 @@ rocalutionSolverBackend<block_size>::rocalutionSolverBackend(int verbosity_, int
 template <unsigned int block_size>
 rocalutionSolverBackend<block_size>::~rocalutionSolverBackend() {
    // normally, these rocalution variables are destroyed after the destructor automatically,
    // but sometimes it segfaults, both with test_rocalutionSolver and with an actual case
    // release both variables here to prevent that segfault
    roc_prec.release();
    roc_solver.release();
    rocalution::stop_rocalution();
 }
--- a/opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
@@ -60,7 +60,6 @@ private:
    int *tmp_colindices;     // store matrix on host, this pointer is given to and freed by rocalution
    double *tmp_nnzvalues;   // store matrix on host, this pointer is given to and freed by rocalution
    // must be declared in this order, to prevent a segfault during the test
    std::unique_ptr<rocalution::ILU<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_prec;
    std::unique_ptr<rocalution::BiCGStab<rocalution::LocalMatrix<double>, rocalution::LocalVector<double>, double> > roc_solver;
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@@ -87,24 +87,18 @@ getDuneSolution(Matrix<bz>& matrix, Vector<bz>& rhs)
 }
 template <int bz>
-Dune::BlockVector<Dune::FieldVector<double, bz>>
+void
-testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, Vector<bz>& rhs)
+createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge)
 {
    const int linear_solver_verbosity = prm.get<int>("verbosity");
    const int maxit = prm.get<int>("maxiter");
    const double tolerance = prm.get<double>("tol");
-    const bool opencl_ilu_parallel(true); // unused
+    const bool opencl_ilu_parallel(true);
-    const int platformID = 0;                     // unused
+    const int platformID = 0;
    const int deviceID = 0;
    const std::string accelerator_mode("cusparse");
    const std::string linsolver("ilu0");
    Dune::InverseOperatorResult result;
    Vector<bz> x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >(accelerator_mode,
                                                                               linear_solver_verbosity,
@@ -114,12 +108,6 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
                                                                               deviceID,
                                                                               opencl_ilu_parallel,
                                                                               linsolver);
        auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                            // matrix created by readMatrixMarket() did not have contiguous memory
        bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
        bridge->get_result(x);
        return x;
    } catch (const std::logic_error& error) {
        BOOST_WARN_MESSAGE(true, error.what());
        if (strstr(error.what(), "Could not get device") != nullptr)
@@ -129,6 +117,38 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix<bz>& matrix, V
    }
 }
 template <int bz>
 Dune::BlockVector<Dune::FieldVector<double, bz>>
 testCusparseSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
 {
    Dune::InverseOperatorResult result;
    Vector<bz> x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                        // matrix created by readMatrixMarket() did not have contiguous memory
    bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
    bridge.get_result(x);
    return x;
 }
 template <int bz>
 Dune::BlockVector<Dune::FieldVector<double, bz>>
 testCusparseSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
 {
    Dune::InverseOperatorResult result;
    Vector<bz> x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("cusparse", false);
    auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                        // matrix created by readMatrixMarket() did not have contiguous memory
    auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
                        // the sparsity pattern and values are actually the same
    bridge.solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
    bridge.get_result(x);
    return x;
 }
 namespace pt = boost::property_tree;
 void test3(const pt::ptree& prm)
@@ -136,17 +156,31 @@ void test3(const pt::ptree& prm)
    const int bz = 3;
    Matrix<bz> matrix;
    Vector<bz> rhs;
    std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> > bridge;
    readLinearSystem("matr33.txt", "rhs3.txt", matrix, rhs);
    Vector<bz> rhs2 = rhs; // deep copy, getDuneSolution() changes values in rhs vector
    auto duneSolution = getDuneSolution<bz>(matrix, rhs);
    auto sol = testCusparseSolver<bz>(prm, matrix, rhs2);
    createBridge(prm, bridge); // create bridge with openclSolver
                               // should create bridge only once
    // test cusparseSolver without Jacobi matrix
    auto sol = testCusparseSolver<bz>(*bridge, matrix, rhs2);
    BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
    for (size_t i = 0; i < sol.size(); ++i) {
        for (int row = 0; row < bz; ++row) {
            BOOST_CHECK_CLOSE(sol[i][row], duneSolution[i][row], 1e-3);
        }
    }
    // test cusparseSolver with Jacobi matrix
    auto solJacobi = testCusparseSolverJacobi<bz>(*bridge, matrix, rhs2);
    BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
    for (size_t i = 0; i < solJacobi.size(); ++i) {
        for (int row = 0; row < bz; ++row) {
            BOOST_CHECK_CLOSE(solJacobi[i][row], duneSolution[i][row], 1e-3);
        }
    }
 }
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@@ -116,22 +116,22 @@ createBridge(const boost::property_tree::ptree& prm, std::unique_ptr<Opm::BdaBri
 template <int bz>
 Dune::BlockVector<Dune::FieldVector<double, bz>>
-testOpenclSolver(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
+testOpenclSolver(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
 {
    Dune::InverseOperatorResult result;
    Vector<bz> x(rhs.size());
    auto wellContribs = Opm::WellContributions::create("opencl", false);
    auto mat2 = matrix; // deep copy to make sure nnz values are in contiguous memory
                        // matrix created by readMatrixMarket() did not have contiguous memory
-    bridge->solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
+    bridge.solve_system(&mat2, &mat2, /*numJacobiBlocks=*/0, rhs, *wellContribs, result);
-    bridge->get_result(x);
+    bridge.get_result(x);
    return x;
 }
 template <int bz>
 Dune::BlockVector<Dune::FieldVector<double, bz>>
-testOpenclSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz> >& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
+testOpenclSolverJacobi(Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz>& bridge, Matrix<bz>& matrix, Vector<bz>& rhs)
 {
    Dune::InverseOperatorResult result;
    Vector<bz> x(rhs.size());
@@ -140,8 +140,8 @@ testOpenclSolverJacobi(std::unique_ptr<Opm::BdaBridge<Matrix<bz>, Vector<bz>, bz
                        // matrix created by readMatrixMarket() did not have contiguous memory
    auto mat3 = matrix; // another deep copy, to make sure Jacobi matrix memory is different
                        // the sparsity pattern and values are actually the same
-    bridge->solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
+    bridge.solve_system(&mat2, &mat3, /*numJacobiBlocks=*/2, rhs, *wellContribs, result);
-    bridge->get_result(x);
+    bridge.get_result(x);
    return x;
 }
@@ -162,7 +162,7 @@ void test3(const pt::ptree& prm)
                               // should create bridge only once
    // test openclSolver without Jacobi matrix
-    auto sol = testOpenclSolver<bz>(bridge, matrix, rhs2);
+    auto sol = testOpenclSolver<bz>(*bridge, matrix, rhs2);
    BOOST_REQUIRE_EQUAL(sol.size(), duneSolution.size());
    for (size_t i = 0; i < sol.size(); ++i) {
        for (int row = 0; row < bz; ++row) {
@@ -171,7 +171,7 @@ void test3(const pt::ptree& prm)
    }
    // test openclSolver with Jacobi matrix
-    auto solJacobi = testOpenclSolverJacobi<bz>(bridge, matrix, rhs2);
+    auto solJacobi = testOpenclSolverJacobi<bz>(*bridge, matrix, rhs2);
    BOOST_REQUIRE_EQUAL(solJacobi.size(), duneSolution.size());
    for (size_t i = 0; i < solJacobi.size(); ++i) {
        for (int row = 0; row < bz; ++row) {