Merge pull request #3346 from Tongdongq/amgcl-support

Amgcl support
2025-02-25 18:55:30 -06:00 · 2021-07-21 15:02:25 +02:00 · 2021-07-21 15:02:25 +02:00 · ea4e4a1520
commit ea4e4a1520
parent bb9f941f2f 796d1b7f8a
14 changed files with 714 additions and 34 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -213,6 +213,35 @@ if(OpenCL_FOUND)
  endif()
 endif()

+find_package(amgcl)
+if(amgcl_FOUND)
+  set(HAVE_AMGCL 1)
+  # Linking to target angcl::amgcl drags in OpenMP and -fopenmp as a compile
+  # flag. With that nvcc fails as it does not that flag.
+  # Hence we set AMGCL_INCLUDE_DIRS.
+  get_property(AMGCL_INCLUDE_DIRS TARGET amgcl::amgcl PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+  include_directories(SYSTEM ${AMGCL_INCLUDE_DIRS})
+endif()
+
+if(OpenCL_FOUND)
+  find_package(VexCL)
+  if(VexCL_FOUND)
+    set(HAVE_VEXCL 1)
+    # generator expressions in vexcl do not seem to work and therefore
+    # we cannot use the imported target. Hence we exract the needed info
+    # from the targets
+    get_property(VEXCL_INCLUDE_DIRS TARGET VexCL::Common PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
+    get_property(VEXCL_LINK_LIBRARIES TARGET VexCL::Common PROPERTY INTERFACE_LINK_LIBRARIES)
+    get_property(VEXCL_COMPILE_DEFINITIONS TARGET VexCL::OpenCL PROPERTY INTERFACE_COMPILE_DEFINITIONS)
+    set(VEXCL_LINK_LIBRARIES "${VEXCL_LINK_LIBRARIES};OpenCL::OpenCL")
+    add_library(OPM::VexCL::OpenCL INTERFACE IMPORTED)
+    set_target_properties(OPM::VexCL::OpenCL PROPERTIES
+      INTERFACE_COMPILE_DEFINITIONS "${VEXCL_COMPILE_DEFINITIONS}"
+      INTERFACE_LINK_LIBRARIES "${VEXCL_LINK_LIBRARIES}")
+    target_include_directories(OPM::VexCL::OpenCL SYSTEM INTERFACE "${VEXCL_INCLUDE_DIRS}")
+  endif()
+endif()
+
 # read the list of components from this file (in the project directory);
 # it should set various lists with the names of the files to include
 include (CMakeLists_files.cmake)
@ -528,6 +557,9 @@ if(OpenCL_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
 endif()

+if(VexCL_FOUND)
+  target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
+endif()
 if(HAVE_FPGA)
  add_dependencies(opmsimulators FPGA_library)
  ExternalProject_Get_Property(FPGA_library binary_dir)
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -114,7 +114,15 @@ if(HAVE_FPGA)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
 endif()
-
+if(HAVE_AMGCL)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
+  if(CUDA_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cu)
+  endif()
+endif()
 if(MPI_FOUND)
  list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
                                opm/simulators/utils/ParallelSerialization.cpp)
@ -233,6 +241,7 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/aquifers/AquiferNumerical.hpp
  opm/simulators/aquifers/BlackoilAquiferModel.hpp
  opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
+  opm/simulators/linalg/bda/amgclSolverBackend.hpp
  opm/simulators/linalg/bda/BdaBridge.hpp
  opm/simulators/linalg/bda/BdaResult.hpp
  opm/simulators/linalg/bda/BdaSolver.hpp
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@ -8,6 +8,8 @@ set (opm-simulators_CONFIG_VAR
  HAVE_CUDA
  HAVE_OPENCL
  HAVE_FPGA
+  HAVE_AMGCL
+  HAVE_VEXCL
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -314,7 +314,7 @@ namespace Opm
            EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
            EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]'");
            EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
            EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -35,7 +35,7 @@
 #include <opm/simulators/linalg/setupPropertyTree.hpp>


-#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
 #endif

@ -93,7 +93,7 @@ namespace Opm
        using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
        constexpr static std::size_t pressureIndex = GetPropType<TypeTag, Properties::Indices>::pressureSwitchIdx;

-#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL
        static const unsigned int block_size = Matrix::block_type::rows;
        std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
 #endif
@ -130,7 +130,7 @@ namespace Opm
                                     EWOMS_PARAM_IS_SET(TypeTag, int, LinearSolverMaxIter),
                                     EWOMS_PARAM_IS_SET(TypeTag, int, CprMaxEllIter));

-#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL
            {
                std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
                if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
@ -150,7 +150,7 @@ namespace Opm
            }
 #else
            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
-                OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
+                OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake and FPGA was not enabled");
            }
 #endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@ -257,17 +257,20 @@ namespace Opm

            // Use GPU if: available, chosen by user, and successful.
            // Use FPGA if: support compiled, chosen by user, and successful.
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL
            bool use_gpu = bdaBridge->getUseGpu();
            bool use_fpga = bdaBridge->getUseFpga();
            if (use_gpu || use_fpga) {
                const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-                WellContributions wellContribs(accelerator_mode);
+                WellContributions wellContribs(accelerator_mode, useWellConn_);
                bdaBridge->initWellContributions(wellContribs);

+                // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
+#if HAVE_CUDA || HAVE_OPENCL
                if (!useWellConn_) {
                    simulator_.problem().wellModel().getWellContributions(wellContribs);
                }
+#endif

                // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -38,8 +38,9 @@
 #include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
 #endif

-
-#define PRINT_TIMERS_BRIDGE 0
+#if HAVE_AMGCL
+#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>
+#endif

 typedef Dune::InverseOperatorResult InverseOperatorResult;

@ -59,7 +60,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
                                                             [[maybe_unused]] unsigned int platformID,
                                                             unsigned int deviceID,
                                                             [[maybe_unused]] std::string opencl_ilu_reorder)
-: accelerator_mode(accelerator_mode_)
+: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
 {
    if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
@ -103,12 +104,19 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
        backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
 #else
        OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
+#endif
+    } else if (accelerator_mode.compare("amgcl") == 0) {
+#if HAVE_AMGCL
+        use_gpu = true; // should be replaced by a 'use_bridge' boolean
+        backend.reset(new bda::amgclSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, platformID, deviceID));
+#else
+        OPM_THROW(std::logic_error, "Error amgclSolver was chosen, but amgcl was not found by CMake");
 #endif
    } else if (accelerator_mode.compare("none") == 0) {
        use_gpu = false;
        use_fpga = false;
    } else {
-        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
+        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]");
    }
 }

@ -197,40 +205,30 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
        const int nnz = nnzb * dim * dim;

        if (dim != 3) {
-            OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
-            use_gpu = false;
+            OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
+            use_gpu = use_fpga = false;
            return;
        }

        if (h_rows.capacity() == 0) {
            h_rows.reserve(Nb+1);
            h_cols.reserve(nnzb);
-#if PRINT_TIMERS_BRIDGE
-            Dune::Timer t;
-#endif
            getSparsityPattern(*mat, h_rows, h_cols);
-#if PRINT_TIMERS_BRIDGE
-            std::ostringstream out;
-            out << "getSparsityPattern() took: " << t.stop() << " s";
-            OpmLog::info(out.str());
-#endif
        }

-#if PRINT_TIMERS_BRIDGE
        Dune::Timer t_zeros;
        int numZeros = checkZeroDiagonal(*mat);
-        std::ostringstream out;
-        out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
-        OpmLog::info(out.str());
-#else
-        checkZeroDiagonal(*mat);
-#endif
+        if (verbosity >= 2) {
+            std::ostringstream out;
+            out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
+            OpmLog::info(out.str());
+        }


        /////////////////////////
        // actually solve

-        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
+        // assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
        SolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), wellContribs, result);
        switch(status) {
        case SolverStatus::BDA_SOLVER_SUCCESS:
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -45,6 +45,7 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 class BdaBridge
 {
 private:
+    int verbosity = 0;
    bool use_gpu = false;
    bool use_fpga = false;
    std::string accelerator_mode;
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -28,7 +28,7 @@
 namespace Opm
 {

-WellContributions::WellContributions(std::string accelerator_mode){
+WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
    if(accelerator_mode.compare("cusparse") == 0){
        cuda_gpu = true;
    }
@ -38,6 +38,11 @@ WellContributions::WellContributions(std::string accelerator_mode){
    else if(accelerator_mode.compare("fpga") == 0){
        // unused for FPGA, but must be defined to avoid error
    }
+    else if(accelerator_mode.compare("amgcl") == 0){
+        if (!useWellConn) {
+            OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");
+        }
+    }
    else{
        OPM_THROW(std::logic_error, "Invalid accelerator mode");
    }
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -173,7 +173,9 @@ public:
    void alloc();

    /// Create a new WellContributions
-    WellContributions(std::string accelerator_mode);
+    /// \param[in] accelerator_mode    string indicating which solverBackend is used
+    /// \param[in] useWellConn         true iff wellcontributions are added to the matrix
+    WellContributions(std::string accelerator_mode, bool useWellConn);

    /// Destroy a WellContributions, and free memory
    ~WellContributions();
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
@ -0,0 +1,383 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <dune/common/timer.hh>
+#include <opm/material/common/Unused.hpp>
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>
+
+#include <boost/property_tree/json_parser.hpp>
+
+#if HAVE_VEXCL
+#include <amgcl/backend/vexcl.hpp>
+#include <amgcl/backend/vexcl_static_matrix.hpp>
+#endif
+
+namespace bda
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+template <unsigned int block_size>
+amgclSolverBackend<block_size>::amgclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {}
+
+
+template <unsigned int block_size>
+amgclSolverBackend<block_size>::~amgclSolverBackend() {}
+
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
+    this->N = N_;
+    this->nnz = nnz_;
+    this->nnzb = nnz_ / block_size / block_size;
+
+    Nb = (N + dim - 1) / dim;
+    std::ostringstream out;
+    out << "Initializing amgclSolverBackend, matrix size: " << N << " blocks, nnzb: " << nnzb << "\n";
+    out << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance << "\n";
+    out << "DeviceID: " << deviceID << "\n";
+    OpmLog::info(out.str());
+    out.str("");
+    out.clear();
+
+    A_vals.resize(nnz);
+    A_cols.resize(nnz);
+    A_rows.resize(N + 1);
+    rhs.resize(N);
+    x.resize(N);
+
+    // try to read amgcl parameters via json file
+    std::string filename = "amgcl_options.json";
+    std::ifstream file(filename);
+    std::string backend_type_string;
+
+    if (file.is_open()) { // if file exists, read parameters from file
+        try {
+            boost::property_tree::read_json(file, prm);
+        } catch (boost::property_tree::json_parser::json_parser_error e) {
+            OPM_THROW(std::logic_error, "Error cannot parse json file '" + filename + "'");
+        }
+
+        // the prm.get reads data from the file, with default values if not specified
+        // the prm.put puts the data in the property_tree, so it gets printed
+        backend_type_string = prm.get("backend_type", "cpu");
+        prm.put("backend_type", backend_type_string);
+        std::string t1 = prm.get("precond.class", "relaxation");
+        prm.put("precond.class", t1);
+        t1 = prm.get("precond.type", "ilu0");
+        prm.put("precond.type", t1);
+        double t2 = prm.get("precond.damping", 0.9);
+        prm.put("precond.damping", t2);
+        t1 = prm.get("solver.type", "bicgstab");
+        prm.put("solver.type", t1);
+        t2 = prm.get("solver.tol", tolerance);
+        prm.put("solver.tol", t2);
+        int t3 = prm.get("solver.maxiter", maxit);
+        prm.put("solver.maxiter", t3);
+        bool t4 = prm.get("solver.verbose", verbosity >= 2);
+        prm.put("solver.verbose", t4);
+        out << "Using parameters from " << filename << " (with default values for omitted parameters):\n";
+    } else { // otherwise use default parameters, same as Dune
+        prm.put("backend_type", "cpu"); // put it in the tree so it gets printed
+        prm.put("precond.class", "relaxation");
+        prm.put("precond.type", "ilu0");
+        prm.put("precond.damping", 0.9);
+        prm.put("solver.type", "bicgstab");
+        prm.put("solver.tol", tolerance);
+        prm.put("solver.maxiter", maxit);
+        prm.put("solver.verbose", verbosity >= 2);
+        backend_type_string = prm.get("backend_type", "cpu");
+        out << "Using default amgcl parameters:\n";
+    }
+
+    boost::property_tree::write_json(out, prm); // print amgcl parameters
+    prm.erase("backend_type");                  // delete custom parameter, otherwise amgcl prints a warning
+
+    if (backend_type_string == "cpu") {
+        backend_type = Amgcl_backend_type::cpu;
+    } else if (backend_type_string == "cuda") {
+        backend_type = Amgcl_backend_type::cuda;
+    } else if (backend_type_string == "vexcl") {
+        backend_type = Amgcl_backend_type::vexcl;
+    } else {
+        OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
+    }
+
+    if (backend_type == Amgcl_backend_type::cuda) {
+#if !HAVE_CUDA
+        OPM_THROW(std::logic_error, "Error amgcl is trying to use CUDA, but CUDA was not found by CMake");
+#endif
+    }
+    if (backend_type == Amgcl_backend_type::vexcl) {
+#if !HAVE_VEXCL
+        OPM_THROW(std::logic_error, "Error amgcl is trying to use VexCL, but VexCL was not found by CMake");
+#endif
+    }
+    OpmLog::info(out.str());
+
+    initialized = true;
+} // end initialize()
+
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::convert_sparsity_pattern(int *rows, int *cols) {
+    Timer t;
+    const unsigned int bs = block_size;
+    int idx = 0; // indicates the unblocked write index
+
+    A_rows[0] = 0;
+    for (int row = 0; row < Nb; ++row) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        for (unsigned r = 0; r < bs; ++r) {
+            for (int ij = rowStart; ij < rowEnd; ++ij) {
+                for (unsigned c = 0; c < bs; ++c) {
+                    A_cols[idx] = cols[ij] * bs + c;
+                    idx++;
+                }
+            }
+            A_rows[row*bs + r + 1] = idx;
+        }
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "amgclSolverBackend::convert_sparsity_pattern(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end convert_sparsity_pattern()
+
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::convert_data(double *vals, int *rows) {
+    Timer t;
+    const unsigned int bs = block_size;
+    int idx = 0; // indicates the unblocked write index
+
+    for (int row = 0; row < Nb; ++row) {
+        int rowStart = rows[row];
+        int rowEnd = rows[row+1];
+        for (unsigned r = 0; r < bs; ++r) {
+            for (int ij = rowStart; ij < rowEnd; ++ij) {
+                for (unsigned c = 0; c < bs; ++c) {
+                    A_vals[idx] = vals[ij*bs*bs + r*bs + c];
+                    idx++;
+                }
+            }
+        }
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "amgclSolverBackend::convert_data(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end convert_data()
+
+#if HAVE_VEXCL
+void initialize_vexcl(std::vector<cl::CommandQueue>& ctx, unsigned int platformID, unsigned int deviceID) {
+    std::vector<cl::Platform> platforms;
+    std::vector<cl::Device> devices;
+    cl::Platform::get(&platforms);
+
+    if (platforms.size() <= platformID) {
+        OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
+    }
+
+    std::string platform_name, device_name;
+    platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_name);
+    platforms[platformID].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+
+    if (devices.size() <= deviceID){
+        OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
+    }
+
+    devices[deviceID].getInfo(CL_DEVICE_NAME, &device_name);
+
+    cl::Context c(devices[deviceID]);
+    cl::CommandQueue q(c, devices[deviceID]);
+    ctx.push_back(q);
+
+    std::ostringstream out;
+    out << "Using VexCL on " << device_name << " (" << platform_name << ")\n";
+    OpmLog::info(out.str());
+}
+
+template <typename vexcl_matrix_type, typename vexcl_vector_type, unsigned int block_size>
+void solve_vexcl(
+    const auto& A,
+    const boost::property_tree::ptree prm,
+    const std::vector<cl::CommandQueue>& ctx,
+    double *b,
+    std::vector<double>& x,
+    const int N,
+    int& iters,
+    double& error)
+{
+    typedef amgcl::backend::vexcl<vexcl_matrix_type> Backend;
+    typedef amgcl::make_solver<amgcl::runtime::preconditioner<Backend>, amgcl::runtime::solver::wrapper<Backend> > Solver;
+
+    typename Solver::backend_params bprm;
+    bprm.q = ctx;  // set vexcl context
+
+    Solver solve(A, prm, bprm); // create solver
+
+    auto b_ptr = reinterpret_cast<vexcl_vector_type*>(b);
+    auto x_ptr = reinterpret_cast<vexcl_vector_type*>(x.data());
+    vex::vector<vexcl_vector_type> B(ctx, N / block_size, b_ptr);
+    vex::vector<vexcl_vector_type> X(ctx, N / block_size, x_ptr);
+
+    std::tie(iters, error) = solve(B, X); // actually perform solve
+
+    vex::copy(X, x_ptr);
+}
+#endif
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
+    Timer t;
+
+    try {
+        if (backend_type == Amgcl_backend_type::cuda) { // use CUDA
+#if HAVE_CUDA
+            solve_cuda(b);
+#endif
+        } else if (backend_type == Amgcl_backend_type::cpu) { // use builtin backend (CPU)
+            // create matrix object
+            auto Atmp = std::tie(N, A_rows, A_cols, A_vals);
+            auto A = amgcl::adapter::block_matrix<dmat_type>(Atmp);
+
+            // create solver and construct preconditioner
+            // don't reuse this unless the preconditioner can be reused
+            CPU_Solver solve(A, prm);
+
+            // print solver structure (once)
+            std::call_once(print_info, [&](){
+                std::ostringstream out;
+                out << solve << std::endl;
+                OpmLog::info(out.str());
+            });
+
+            // reset x vector
+            std::fill(x.begin(), x.end(), 0.0);
+
+            // create blocked vectors
+            auto b_ptr = reinterpret_cast<dvec_type*>(b);
+            auto x_ptr = reinterpret_cast<dvec_type*>(x.data());
+            auto B = amgcl::make_iterator_range(b_ptr, b_ptr + N / block_size);
+            auto X = amgcl::make_iterator_range(x_ptr, x_ptr + N / block_size);
+
+            // actually solve
+            std::tie(iters, error) = solve(B, X);
+        } else if (backend_type == Amgcl_backend_type::vexcl) {
+#if HAVE_VEXCL
+            static std::vector<cl::CommandQueue> ctx; // using CommandQueue directly instead of vex::Context
+            std::call_once(vexcl_initialize, [&](){
+                initialize_vexcl(ctx, platformID, deviceID);
+            });
+            if constexpr(block_size == 1){
+                auto A = std::tie(N, A_rows, A_cols, A_vals);
+
+                solve_vexcl<double, double, block_size>(A, prm, ctx, b, x, N, iters, error);
+            } else {
+                // allow vexcl to use blocked matrices
+                vex::scoped_program_header h1(ctx, amgcl::backend::vexcl_static_matrix_declaration<double, block_size>());
+
+                auto Atmp = std::tie(N, A_rows, A_cols, A_vals);
+                auto A = amgcl::adapter::block_matrix<dmat_type>(Atmp);
+
+                solve_vexcl<dmat_type, dvec_type, block_size>(A, prm, ctx, b, x, N, iters, error);
+            }
+#endif
+        }
+    } catch (const std::exception& ex) {
+        std::cerr << "Caught exception: " << ex.what() << std::endl;
+        throw ex;
+    }
+
+    double time_elapsed = t.stop();
+
+    res.iterations = iters;
+    res.reduction = 0.0;
+    res.elapsed = time_elapsed;
+    res.converged = (iters != maxit);
+
+    if (verbosity >= 1) {
+        std::ostringstream out;
+        out << "=== converged: " << res.converged << ", time: " << res.elapsed << \
+            ", time per iteration: " << res.elapsed / iters << ", iterations: " << iters;
+        OpmLog::info(out.str());
+    }
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "amgclSolverBackend::solve_system(): " << time_elapsed << " s";
+        OpmLog::info(out.str());
+    }
+
+} // end solve_system()
+
+
+// copy result to host memory
+// caller must be sure that x is a valid array
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::get_result(double *x_) {
+    Timer t;
+
+    std::copy(x.begin(), x.end(), x_);
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "amgclSolverBackend::get_result(): " << t.stop() << " s";
+        OpmLog::info(out.str());
+    }
+} // end get_result()
+
+
+template <unsigned int block_size>
+SolverStatus amgclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res) {
+    if (initialized == false) {
+        initialize(N_, nnz_, dim);
+        convert_sparsity_pattern(rows, cols);
+    }
+    convert_data(vals, rows);
+    solve_system(b, res);
+    return SolverStatus::BDA_SOLVER_SUCCESS;
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                \
+template amgclSolverBackend<n>::amgclSolverBackend(int, int, double, unsigned int, unsigned int);   \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda
+
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cu
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cu
@ -0,0 +1,89 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+
+#include <amgcl/backend/cuda.hpp>
+#include <amgcl/relaxation/cusparse_ilu0.hpp>
+#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>
+
+/// This file is only compiled when both amgcl and CUDA are found by CMake
+
+namespace bda
+{
+
+using Opm::OpmLog;
+
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::solve_cuda(double *b) {
+    typedef amgcl::backend::cuda<double> CUDA_Backend;
+    typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
+
+    static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle
+
+    // initialize cusparse handle for amgcl, cannot merge this call_once with 'print solver structure'
+    std::call_once(cuda_initialize, [&](){
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, deviceID);
+        std::ostringstream out;
+        out << prop.name << std::endl;
+        OpmLog::info(out.str());
+        cusparseCreate(&CUDA_bprm.cusparse_handle);
+    });
+
+    // create matrix object
+    auto A = std::tie(N, A_rows, A_cols, A_vals);
+
+    // create solver and construct preconditioner
+    // don't reuse this unless the preconditioner can be reused
+    CUDA_Solver solve(A, prm, CUDA_bprm);
+
+    // print solver structure (once)
+    std::call_once(print_info, [&](){
+        std::ostringstream out;
+        out << solve << std::endl;
+        OpmLog::info(out.str());
+    });
+
+    thrust::device_vector<double> B(b, b + N);
+    thrust::device_vector<double> X(N, 0.0);
+
+    // actually solve
+    std::tie(iters, error) = solve(B, X);
+
+    thrust::copy(X.begin(), X.end(), x.begin());
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                      \
+template void amgclSolverBackend<n>::solve_cuda(double*); \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda
+
--- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
@ -0,0 +1,156 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_AMGCLSOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_AMGCLSOLVER_BACKEND_HEADER_INCLUDED
+
+#include <mutex>
+
+#include <opm/simulators/linalg/bda/BdaResult.hpp>
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#include <boost/property_tree/ptree.hpp>
+
+#include <amgcl/amg.hpp>
+#include <amgcl/backend/builtin.hpp>
+#include <amgcl/adapter/crs_tuple.hpp>
+#include <amgcl/make_block_solver.hpp>
+#include <amgcl/relaxation/as_preconditioner.hpp>
+#include <amgcl/relaxation/ilu0.hpp>
+#include <amgcl/solver/bicgstab.hpp>
+#include <amgcl/preconditioner/runtime.hpp>
+#include <amgcl/value_type/static_matrix.hpp>
+
+namespace bda
+{
+
+/// This class does not implement a solver, but converts the BCSR format to normal CSR and uses amgcl for solving
+/// Note amgcl also implements blocked solvers, but looks like it needs unblocked input data
+template <unsigned int block_size>
+class amgclSolverBackend : public BdaSolver<block_size>
+{
+    typedef BdaSolver<block_size> Base;
+
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::platformID;
+    using Base::deviceID;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::initialized;
+
+    typedef amgcl::static_matrix<double, block_size, block_size> dmat_type; // matrix value type in double precision
+    typedef amgcl::static_matrix<double, block_size, 1> dvec_type; // the corresponding vector value type
+    typedef amgcl::backend::builtin<dmat_type> CPU_Backend;
+
+    typedef amgcl::make_solver<amgcl::runtime::preconditioner<CPU_Backend>, amgcl::runtime::solver::wrapper<CPU_Backend> > CPU_Solver;
+
+private:
+
+    // amgcl can use different backends, this lets the user choose
+    enum Amgcl_backend_type {
+        cpu,
+        cuda,
+        vexcl
+    };
+
+    // store matrix in CSR format
+    std::vector<unsigned> A_rows, A_cols;
+    std::vector<double> A_vals, rhs;
+    std::vector<double> x;
+    std::once_flag print_info;
+    Amgcl_backend_type backend_type = cpu;
+
+    boost::property_tree::ptree prm;         // amgcl parameters
+    int iters = 0;
+    double error = 0.0;
+
+#if HAVE_CUDA
+    std::once_flag cuda_initialize;
+    void solve_cuda(double *b);
+#endif
+
+#if HAVE_VEXCL
+    std::once_flag vexcl_initialize;
+#endif
+    /// Initialize host memory and determine amgcl parameters
+    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    void initialize(int N, int nnz, int dim);
+
+    /// Convert the BCSR sparsity pattern to a CSR one
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    void convert_sparsity_pattern(int *rows, int *cols);
+
+    /// Convert the BCSR nonzero data to a CSR format
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    void convert_data(double *vals, int *rows);
+
+    /// Solve linear system
+    /// \param[in] b              pointer to b vector
+    /// \param[inout] res         summary of solver result
+    void solve_system(double *b, BdaResult &res);
+
+public:
+    /// Construct a openclSolver
+    /// \param[in] linear_solver_verbosity    verbosity of openclSolver
+    /// \param[in] maxit                      maximum number of iterations for openclSolver
+    /// \param[in] tolerance                  required relative tolerance for openclSolver
+    /// \param[in] platformID                 the OpenCL platform to be used
+    /// \param[in] deviceID                   the device to be used
+    amgclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID);
+
+    /// Destroy a openclSolver, and free memory
+    ~amgclSolverBackend();
+
+    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] nnz_prec       number of nonzeroes of matrix for ILU0, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] vals_prec      array of nonzeroes for preconditioner
+    /// \param[in] rows_prec      array of rowPointers for preconditioner
+    /// \param[in] cols_prec      array of columnIndices for preconditioner
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   WellContributions, to apply them separately, instead of adding them to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+
+    /// Get result after linear solve, and peform postprocessing if necessary
+    /// \param[inout] x          resulting x vector, caller must guarantee that x points to a valid array
+    void get_result(double *x) override;
+
+}; // end class amgclSolverBackend
+
+} // namespace bda
+
+#endif
+
+
--- a/tests/test_cusparseSolver.cpp
+++ b/tests/test_cusparseSolver.cpp
@ -80,7 +80,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("cusparse");
+    Opm::WellContributions wellContribs("cusparse", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
--- a/tests/test_openclSolver.cpp
+++ b/tests/test_openclSolver.cpp
@ -79,7 +79,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
    Dune::InverseOperatorResult result;

    Vector x(rhs.size());
-    Opm::WellContributions wellContribs("opencl");
+    Opm::WellContributions wellContribs("opencl", false);
    std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
    try {
        bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);