diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec7a476..39dff6180 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -213,6 +213,35 @@ if(OpenCL_FOUND) endif() endif() +find_package(amgcl) +if(amgcl_FOUND) + set(HAVE_AMGCL 1) + # Linking to target angcl::amgcl drags in OpenMP and -fopenmp as a compile + # flag. With that nvcc fails as it does not that flag. + # Hence we set AMGCL_INCLUDE_DIRS. + get_property(AMGCL_INCLUDE_DIRS TARGET amgcl::amgcl PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${AMGCL_INCLUDE_DIRS}) +endif() + +if(OpenCL_FOUND) + find_package(VexCL) + if(VexCL_FOUND) + set(HAVE_VEXCL 1) + # generator expressions in vexcl do not seem to work and therefore + # we cannot use the imported target. Hence we exract the needed info + # from the targets + get_property(VEXCL_INCLUDE_DIRS TARGET VexCL::Common PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + get_property(VEXCL_LINK_LIBRARIES TARGET VexCL::Common PROPERTY INTERFACE_LINK_LIBRARIES) + get_property(VEXCL_COMPILE_DEFINITIONS TARGET VexCL::OpenCL PROPERTY INTERFACE_COMPILE_DEFINITIONS) + set(VEXCL_LINK_LIBRARIES "${VEXCL_LINK_LIBRARIES};OpenCL::OpenCL") + add_library(OPM::VexCL::OpenCL INTERFACE IMPORTED) + set_target_properties(OPM::VexCL::OpenCL PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "${VEXCL_COMPILE_DEFINITIONS}" + INTERFACE_LINK_LIBRARIES "${VEXCL_LINK_LIBRARIES}") + target_include_directories(OPM::VexCL::OpenCL SYSTEM INTERFACE "${VEXCL_INCLUDE_DIRS}") + endif() +endif() + # read the list of components from this file (in the project directory); # it should set various lists with the names of the files to include include (CMakeLists_files.cmake) @@ -528,6 +557,9 @@ if(OpenCL_FOUND) target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} ) endif() +if(VexCL_FOUND) + target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL ) +endif() if(HAVE_FPGA) add_dependencies(opmsimulators FPGA_library) ExternalProject_Get_Property(FPGA_library binary_dir) diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index b4602a7a2..d72a0b420 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -114,7 +114,15 @@ if(HAVE_FPGA) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) endif() - +if(HAVE_AMGCL) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp) + if(CUDA_FOUND) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cu) + endif() +endif() if(MPI_FOUND) list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp opm/simulators/utils/ParallelSerialization.cpp) @@ -233,6 +241,7 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/aquifers/AquiferNumerical.hpp opm/simulators/aquifers/BlackoilAquiferModel.hpp opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp + opm/simulators/linalg/bda/amgclSolverBackend.hpp opm/simulators/linalg/bda/BdaBridge.hpp opm/simulators/linalg/bda/BdaResult.hpp opm/simulators/linalg/bda/BdaSolver.hpp diff --git a/opm-simulators-prereqs.cmake b/opm-simulators-prereqs.cmake index e66fddaa8..3a7958f7c 100644 --- a/opm-simulators-prereqs.cmake +++ b/opm-simulators-prereqs.cmake @@ -8,6 +8,8 @@ set (opm-simulators_CONFIG_VAR HAVE_CUDA HAVE_OPENCL HAVE_FPGA + HAVE_AMGCL + HAVE_VEXCL HAVE_SUITESPARSE_UMFPACK_H HAVE_DUNE_ISTL DUNE_ISTL_VERSION_MAJOR diff --git a/opm/simulators/linalg/FlowLinearSolverParameters.hpp b/opm/simulators/linalg/FlowLinearSolverParameters.hpp index 1c1afde76..7fe037779 100644 --- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp +++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp @@ -314,7 +314,7 @@ namespace Opm EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver"); EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate"); EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'"); - EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'"); + EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]'"); EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs"); EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly."); diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index 53dcd8957..7a1f5a022 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -35,7 +35,7 @@ #include -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL #include #endif @@ -93,7 +93,7 @@ namespace Opm using ElementMapper = GetPropType; constexpr static std::size_t pressureIndex = GetPropType::pressureSwitchIdx; -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL static const unsigned int block_size = Matrix::block_type::rows; std::unique_ptr> bdaBridge; #endif @@ -130,7 +130,7 @@ namespace Opm EWOMS_PARAM_IS_SET(TypeTag, int, LinearSolverMaxIter), EWOMS_PARAM_IS_SET(TypeTag, int, CprMaxEllIter)); -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL { std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) { @@ -150,7 +150,7 @@ namespace Opm } #else if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") { - OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled"); + OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake and FPGA was not enabled"); } #endif extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_); @@ -257,17 +257,20 @@ namespace Opm // Use GPU if: available, chosen by user, and successful. // Use FPGA if: support compiled, chosen by user, and successful. -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL bool use_gpu = bdaBridge->getUseGpu(); bool use_fpga = bdaBridge->getUseFpga(); if (use_gpu || use_fpga) { const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); - WellContributions wellContribs(accelerator_mode); + WellContributions wellContribs(accelerator_mode, useWellConn_); bdaBridge->initWellContributions(wellContribs); + // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl +#if HAVE_CUDA || HAVE_OPENCL if (!useWellConn_) { simulator_.problem().wellModel().getWellContributions(wellContribs); } +#endif // Const_cast needed since the CUDA stuff overwrites values for better matrix condition.. bdaBridge->solve_system(const_cast(&getMatrix()), *rhs_, wellContribs, result); diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index a20b36524..192b7c87a 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -38,8 +38,9 @@ #include #endif - -#define PRINT_TIMERS_BRIDGE 0 +#if HAVE_AMGCL +#include +#endif typedef Dune::InverseOperatorResult InverseOperatorResult; @@ -59,7 +60,7 @@ BdaBridge::BdaBridge(std::string acceler [[maybe_unused]] unsigned int platformID, unsigned int deviceID, [[maybe_unused]] std::string opencl_ilu_reorder) -: accelerator_mode(accelerator_mode_) +: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_) { if (accelerator_mode.compare("cusparse") == 0) { #if HAVE_CUDA @@ -103,12 +104,19 @@ BdaBridge::BdaBridge(std::string acceler backend.reset(new bda::FpgaSolverBackend(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder)); #else OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake"); +#endif + } else if (accelerator_mode.compare("amgcl") == 0) { +#if HAVE_AMGCL + use_gpu = true; // should be replaced by a 'use_bridge' boolean + backend.reset(new bda::amgclSolverBackend(linear_solver_verbosity, maxit, tolerance, platformID, deviceID)); +#else + OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake"); #endif } else if (accelerator_mode.compare("none") == 0) { use_gpu = false; use_fpga = false; } else { - OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]"); + OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]"); } } @@ -197,40 +205,30 @@ void BdaBridge::solve_system(BridgeMatri const int nnz = nnzb * dim * dim; if (dim != 3) { - OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); - use_gpu = false; + OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); + use_gpu = use_fpga = false; return; } if (h_rows.capacity() == 0) { h_rows.reserve(Nb+1); h_cols.reserve(nnzb); -#if PRINT_TIMERS_BRIDGE - Dune::Timer t; -#endif getSparsityPattern(*mat, h_rows, h_cols); -#if PRINT_TIMERS_BRIDGE - std::ostringstream out; - out << "getSparsityPattern() took: " << t.stop() << " s"; - OpmLog::info(out.str()); -#endif } -#if PRINT_TIMERS_BRIDGE Dune::Timer t_zeros; int numZeros = checkZeroDiagonal(*mat); - std::ostringstream out; - out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros"; - OpmLog::info(out.str()); -#else - checkZeroDiagonal(*mat); -#endif + if (verbosity >= 2) { + std::ostringstream out; + out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros"; + OpmLog::info(out.str()); + } ///////////////////////// // actually solve - // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour + // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour SolverStatus status = backend->solve_system(N, nnz, dim, static_cast(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast(&(b[0][0])), wellContribs, result); switch(status) { case SolverStatus::BDA_SOLVER_SUCCESS: diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index c3e91db47..b5faa94b8 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -45,6 +45,7 @@ template class BdaBridge { private: + int verbosity = 0; bool use_gpu = false; bool use_fpga = false; std::string accelerator_mode; diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index 3ab48e59f..d31c65096 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -28,7 +28,7 @@ namespace Opm { -WellContributions::WellContributions(std::string accelerator_mode){ +WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){ if(accelerator_mode.compare("cusparse") == 0){ cuda_gpu = true; } @@ -38,6 +38,11 @@ WellContributions::WellContributions(std::string accelerator_mode){ else if(accelerator_mode.compare("fpga") == 0){ // unused for FPGA, but must be defined to avoid error } + else if(accelerator_mode.compare("amgcl") == 0){ + if (!useWellConn) { + OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true"); + } + } else{ OPM_THROW(std::logic_error, "Invalid accelerator mode"); } diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index 87ed49a72..90afb3683 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -173,7 +173,9 @@ public: void alloc(); /// Create a new WellContributions - WellContributions(std::string accelerator_mode); + /// \param[in] accelerator_mode string indicating which solverBackend is used + /// \param[in] useWellConn true iff wellcontributions are added to the matrix + WellContributions(std::string accelerator_mode, bool useWellConn); /// Destroy a WellContributions, and free memory ~WellContributions(); diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp new file mode 100644 index 000000000..c013604d8 --- /dev/null +++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp @@ -0,0 +1,383 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#if HAVE_VEXCL +#include +#include +#endif + +namespace bda +{ + +using Opm::OpmLog; +using Dune::Timer; + +template +amgclSolverBackend::amgclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {} + + +template +amgclSolverBackend::~amgclSolverBackend() {} + + +template +void amgclSolverBackend::initialize(int N_, int nnz_, int dim) { + this->N = N_; + this->nnz = nnz_; + this->nnzb = nnz_ / block_size / block_size; + + Nb = (N + dim - 1) / dim; + std::ostringstream out; + out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n"; + out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n"; + out << "DeviceID: " << deviceID << "\n"; + OpmLog::info(out.str()); + out.str(""); + out.clear(); + + A_vals.resize(nnz); + A_cols.resize(nnz); + A_rows.resize(N + 1); + rhs.resize(N); + x.resize(N); + + // try to read amgcl parameters via json file + std::string filename = "amgcl_options.json"; + std::ifstream file(filename); + std::string backend_type_string; + + if (file.is_open()) { // if file exists, read parameters from file + try { + boost::property_tree::read_json(file, prm); + } catch (boost::property_tree::json_parser::json_parser_error e) { + OPM_THROW(std::logic_error, "Error cannot parse json file '" + filename + "'"); + } + + // the prm.get reads data from the file, with default values if not specified + // the prm.put puts the data in the property_tree, so it gets printed + backend_type_string = prm.get("backend_type", "cpu"); + prm.put("backend_type", backend_type_string); + std::string t1 = prm.get("precond.class", "relaxation"); + prm.put("precond.class", t1); + t1 = prm.get("precond.type", "ilu0"); + prm.put("precond.type", t1); + double t2 = prm.get("precond.damping", 0.9); + prm.put("precond.damping", t2); + t1 = prm.get("solver.type", "bicgstab"); + prm.put("solver.type", t1); + t2 = prm.get("solver.tol", tolerance); + prm.put("solver.tol", t2); + int t3 = prm.get("solver.maxiter", maxit); + prm.put("solver.maxiter", t3); + bool t4 = prm.get("solver.verbose", verbosity >= 2); + prm.put("solver.verbose", t4); + out << "Using parameters from " << filename << " (with default values for omitted parameters):\n"; + } else { // otherwise use default parameters, same as Dune + prm.put("backend_type", "cpu"); // put it in the tree so it gets printed + prm.put("precond.class", "relaxation"); + prm.put("precond.type", "ilu0"); + prm.put("precond.damping", 0.9); + prm.put("solver.type", "bicgstab"); + prm.put("solver.tol", tolerance); + prm.put("solver.maxiter", maxit); + prm.put("solver.verbose", verbosity >= 2); + backend_type_string = prm.get("backend_type", "cpu"); + out << "Using default amgcl parameters:\n"; + } + + boost::property_tree::write_json(out, prm); // print amgcl parameters + prm.erase("backend_type"); // delete custom parameter, otherwise amgcl prints a warning + + if (backend_type_string == "cpu") { + backend_type = Amgcl_backend_type::cpu; + } else if (backend_type_string == "cuda") { + backend_type = Amgcl_backend_type::cuda; + } else if (backend_type_string == "vexcl") { + backend_type = Amgcl_backend_type::vexcl; + } else { + OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]"); + } + + if (backend_type == Amgcl_backend_type::cuda) { +#if !HAVE_CUDA + OPM_THROW(std::logic_error, "Error amgcl is trying to use CUDA, but CUDA was not found by CMake"); +#endif + } + if (backend_type == Amgcl_backend_type::vexcl) { +#if !HAVE_VEXCL + OPM_THROW(std::logic_error, "Error amgcl is trying to use VexCL, but VexCL was not found by CMake"); +#endif + } + OpmLog::info(out.str()); + + initialized = true; +} // end initialize() + + +template +void amgclSolverBackend::convert_sparsity_pattern(int *rows, int *cols) { + Timer t; + const unsigned int bs = block_size; + int idx = 0; // indicates the unblocked write index + + A_rows[0] = 0; + for (int row = 0; row < Nb; ++row) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + for (unsigned r = 0; r < bs; ++r) { + for (int ij = rowStart; ij < rowEnd; ++ij) { + for (unsigned c = 0; c < bs; ++c) { + A_cols[idx] = cols[ij] * bs + c; + idx++; + } + } + A_rows[row*bs + r + 1] = idx; + } + } + + if (verbosity >= 3) { + std::ostringstream out; + out << "amgclSolverBackend::convert_sparsity_pattern(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end convert_sparsity_pattern() + + +template +void amgclSolverBackend::convert_data(double *vals, int *rows) { + Timer t; + const unsigned int bs = block_size; + int idx = 0; // indicates the unblocked write index + + for (int row = 0; row < Nb; ++row) { + int rowStart = rows[row]; + int rowEnd = rows[row+1]; + for (unsigned r = 0; r < bs; ++r) { + for (int ij = rowStart; ij < rowEnd; ++ij) { + for (unsigned c = 0; c < bs; ++c) { + A_vals[idx] = vals[ij*bs*bs + r*bs + c]; + idx++; + } + } + } + } + + if (verbosity >= 3) { + std::ostringstream out; + out << "amgclSolverBackend::convert_data(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end convert_data() + +#if HAVE_VEXCL +void initialize_vexcl(std::vector& ctx, unsigned int platformID, unsigned int deviceID) { + std::vector platforms; + std::vector devices; + cl::Platform::get(&platforms); + + if (platforms.size() <= platformID) { + OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID"); + } + + std::string platform_name, device_name; + platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_name); + platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices); + + if (devices.size() <= deviceID){ + OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID"); + } + + devices[deviceID].getInfo(CL_DEVICE_NAME, &device_name); + + cl::Context c(devices[deviceID]); + cl::CommandQueue q(c, devices[deviceID]); + ctx.push_back(q); + + std::ostringstream out; + out << "Using VexCL on " << device_name << " (" << platform_name << ")\n"; + OpmLog::info(out.str()); +} + +template +void solve_vexcl( + const auto& A, + const boost::property_tree::ptree prm, + const std::vector& ctx, + double *b, + std::vector& x, + const int N, + int& iters, + double& error) +{ + typedef amgcl::backend::vexcl Backend; + typedef amgcl::make_solver, amgcl::runtime::solver::wrapper > Solver; + + typename Solver::backend_params bprm; + bprm.q = ctx; // set vexcl context + + Solver solve(A, prm, bprm); // create solver + + auto b_ptr = reinterpret_cast(b); + auto x_ptr = reinterpret_cast(x.data()); + vex::vector B(ctx, N / block_size, b_ptr); + vex::vector X(ctx, N / block_size, x_ptr); + + std::tie(iters, error) = solve(B, X); // actually perform solve + + vex::copy(X, x_ptr); +} +#endif + +template +void amgclSolverBackend::solve_system(double *b, BdaResult &res) { + Timer t; + + try { + if (backend_type == Amgcl_backend_type::cuda) { // use CUDA +#if HAVE_CUDA + solve_cuda(b); +#endif + } else if (backend_type == Amgcl_backend_type::cpu) { // use builtin backend (CPU) + // create matrix object + auto Atmp = std::tie(N, A_rows, A_cols, A_vals); + auto A = amgcl::adapter::block_matrix(Atmp); + + // create solver and construct preconditioner + // don't reuse this unless the preconditioner can be reused + CPU_Solver solve(A, prm); + + // print solver structure (once) + std::call_once(print_info, [&](){ + std::ostringstream out; + out << solve << std::endl; + OpmLog::info(out.str()); + }); + + // reset x vector + std::fill(x.begin(), x.end(), 0.0); + + // create blocked vectors + auto b_ptr = reinterpret_cast(b); + auto x_ptr = reinterpret_cast(x.data()); + auto B = amgcl::make_iterator_range(b_ptr, b_ptr + N / block_size); + auto X = amgcl::make_iterator_range(x_ptr, x_ptr + N / block_size); + + // actually solve + std::tie(iters, error) = solve(B, X); + } else if (backend_type == Amgcl_backend_type::vexcl) { +#if HAVE_VEXCL + static std::vector ctx; // using CommandQueue directly instead of vex::Context + std::call_once(vexcl_initialize, [&](){ + initialize_vexcl(ctx, platformID, deviceID); + }); + if constexpr(block_size == 1){ + auto A = std::tie(N, A_rows, A_cols, A_vals); + + solve_vexcl(A, prm, ctx, b, x, N, iters, error); + } else { + // allow vexcl to use blocked matrices + vex::scoped_program_header h1(ctx, amgcl::backend::vexcl_static_matrix_declaration()); + + auto Atmp = std::tie(N, A_rows, A_cols, A_vals); + auto A = amgcl::adapter::block_matrix(Atmp); + + solve_vexcl(A, prm, ctx, b, x, N, iters, error); + } +#endif + } + } catch (const std::exception& ex) { + std::cerr << "Caught exception: " << ex.what() << std::endl; + throw ex; + } + + double time_elapsed = t.stop(); + + res.iterations = iters; + res.reduction = 0.0; + res.elapsed = time_elapsed; + res.converged = (iters != maxit); + + if (verbosity >= 1) { + std::ostringstream out; + out << "=== converged: " << res.converged << ", time: " << res.elapsed << \ + ", time per iteration: " << res.elapsed / iters << ", iterations: " << iters; + OpmLog::info(out.str()); + } + if (verbosity >= 3) { + std::ostringstream out; + out << "amgclSolverBackend::solve_system(): " << time_elapsed << " s"; + OpmLog::info(out.str()); + } + +} // end solve_system() + + +// copy result to host memory +// caller must be sure that x is a valid array +template +void amgclSolverBackend::get_result(double *x_) { + Timer t; + + std::copy(x.begin(), x.end(), x_); + + if (verbosity >= 3) { + std::ostringstream out; + out << "amgclSolverBackend::get_result(): " << t.stop() << " s"; + OpmLog::info(out.str()); + } +} // end get_result() + + +template +SolverStatus amgclSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res) { + if (initialized == false) { + initialize(N_, nnz_, dim); + convert_sparsity_pattern(rows, cols); + } + convert_data(vals, rows); + solve_system(b, res); + return SolverStatus::BDA_SOLVER_SUCCESS; +} + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template amgclSolverBackend::amgclSolverBackend(int, int, double, unsigned int, unsigned int); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace bda + diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cu b/opm/simulators/linalg/bda/amgclSolverBackend.cu new file mode 100644 index 000000000..a19901c01 --- /dev/null +++ b/opm/simulators/linalg/bda/amgclSolverBackend.cu @@ -0,0 +1,89 @@ +/* + Copyright 2021 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include + +#include + +#include +#include +#include + +/// This file is only compiled when both amgcl and CUDA are found by CMake + +namespace bda +{ + +using Opm::OpmLog; + + +template +void amgclSolverBackend::solve_cuda(double *b) { + typedef amgcl::backend::cuda CUDA_Backend; + typedef amgcl::make_solver, amgcl::runtime::solver::wrapper > CUDA_Solver; + + static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle + + // initialize cusparse handle for amgcl, cannot merge this call_once with 'print solver structure' + std::call_once(cuda_initialize, [&](){ + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, deviceID); + std::ostringstream out; + out << prop.name << std::endl; + OpmLog::info(out.str()); + cusparseCreate(&CUDA_bprm.cusparse_handle); + }); + + // create matrix object + auto A = std::tie(N, A_rows, A_cols, A_vals); + + // create solver and construct preconditioner + // don't reuse this unless the preconditioner can be reused + CUDA_Solver solve(A, prm, CUDA_bprm); + + // print solver structure (once) + std::call_once(print_info, [&](){ + std::ostringstream out; + out << solve << std::endl; + OpmLog::info(out.str()); + }); + + thrust::device_vector B(b, b + N); + thrust::device_vector X(N, 0.0); + + // actually solve + std::tie(iters, error) = solve(B, X); + + thrust::copy(X.begin(), X.end(), x.begin()); +} + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template void amgclSolverBackend::solve_cuda(double*); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} // namespace bda + diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp new file mode 100644 index 000000000..ffee9f1d3 --- /dev/null +++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp @@ -0,0 +1,156 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_AMGCLSOLVER_BACKEND_HEADER_INCLUDED +#define OPM_AMGCLSOLVER_BACKEND_HEADER_INCLUDED + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace bda +{ + +/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving +/// Note amgcl also implements blocked solvers, but looks like it needs unblocked input data +template +class amgclSolverBackend : public BdaSolver +{ + typedef BdaSolver Base; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + using Base::platformID; + using Base::deviceID; + using Base::maxit; + using Base::tolerance; + using Base::initialized; + + typedef amgcl::static_matrix dmat_type; // matrix value type in double precision + typedef amgcl::static_matrix dvec_type; // the corresponding vector value type + typedef amgcl::backend::builtin CPU_Backend; + + typedef amgcl::make_solver, amgcl::runtime::solver::wrapper > CPU_Solver; + +private: + + // amgcl can use different backends, this lets the user choose + enum Amgcl_backend_type { + cpu, + cuda, + vexcl + }; + + // store matrix in CSR format + std::vector A_rows, A_cols; + std::vector A_vals, rhs; + std::vector x; + std::once_flag print_info; + Amgcl_backend_type backend_type = cpu; + + boost::property_tree::ptree prm; // amgcl parameters + int iters = 0; + double error = 0.0; + +#if HAVE_CUDA + std::once_flag cuda_initialize; + void solve_cuda(double *b); +#endif + +#if HAVE_VEXCL + std::once_flag vexcl_initialize; +#endif + /// Initialize host memory and determine amgcl parameters + /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] dim size of block + void initialize(int N, int nnz, int dim); + + /// Convert the BCSR sparsity pattern to a CSR one + /// \param[in] rows array of rowPointers, contains N/dim+1 values + /// \param[in] cols array of columnIndices, contains nnz values + void convert_sparsity_pattern(int *rows, int *cols); + + /// Convert the BCSR nonzero data to a CSR format + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values + void convert_data(double *vals, int *rows); + + /// Solve linear system + /// \param[in] b pointer to b vector + /// \param[inout] res summary of solver result + void solve_system(double *b, BdaResult &res); + +public: + /// Construct a openclSolver + /// \param[in] linear_solver_verbosity verbosity of openclSolver + /// \param[in] maxit maximum number of iterations for openclSolver + /// \param[in] tolerance required relative tolerance for openclSolver + /// \param[in] platformID the OpenCL platform to be used + /// \param[in] deviceID the device to be used + amgclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID); + + /// Destroy a openclSolver, and free memory + ~amgclSolverBackend(); + + /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format + /// \param[in] N number of rows, divide by dim to get number of blockrows + /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] nnz_prec number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks + /// \param[in] dim size of block + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values + /// \param[in] cols array of columnIndices, contains nnz values + /// \param[in] vals_prec array of nonzeroes for preconditioner + /// \param[in] rows_prec array of rowPointers for preconditioner + /// \param[in] cols_prec array of columnIndices for preconditioner + /// \param[in] b input vector, contains N values + /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A + /// \param[inout] res summary of solver result + /// \return status code + SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + + /// Get result after linear solve, and peform postprocessing if necessary + /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array + void get_result(double *x) override; + +}; // end class amgclSolverBackend + +} // namespace bda + +#endif + + diff --git a/tests/test_cusparseSolver.cpp b/tests/test_cusparseSolver.cpp index 6aace8aa2..0d40ab199 100644 --- a/tests/test_cusparseSolver.cpp +++ b/tests/test_cusparseSolver.cpp @@ -80,7 +80,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma Dune::InverseOperatorResult result; Vector x(rhs.size()); - Opm::WellContributions wellContribs("cusparse"); + Opm::WellContributions wellContribs("cusparse", false); std::unique_ptr > bridge; try { bridge = std::make_unique >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder); diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp index 94bb0b88d..7c611405b 100644 --- a/tests/test_openclSolver.cpp +++ b/tests/test_openclSolver.cpp @@ -79,7 +79,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr Dune::InverseOperatorResult result; Vector x(rhs.size()); - Opm::WellContributions wellContribs("opencl"); + Opm::WellContributions wellContribs("opencl", false); std::unique_ptr > bridge; try { bridge = std::make_unique >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);