Merge pull request #4784 from hnil/gpu_solver_by_inheritance

- moded all bda/gpu spesific tings to separete class
2025-02-25 18:55:30 -06:00 · 2023-08-11 15:32:20 +02:00 · 2023-08-11 15:32:20 +02:00 · e6597b6b20
commit e6597b6b20
parent d6f2f1e3ff 0883d46d50
12 changed files with 642 additions and 403 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -31,6 +31,7 @@ option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires
 option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU" OFF)
 option(BUILD_FLOW_ALU_GRID "Build flow blackoil with alu grid" OFF)
 option(USE_DAMARIS_LIB "Use the Damaris library for asynchronous I/O?" OFF)
+option(USE_BDA_BRIDGE "Enable the BDA bridge (GPU/AMGCL solvers)" ON)

 # The following was copied from CMakeLists.txt in opm-common.
 # TODO: factor out the common parts in opm-common and opm-simulator as a cmake module
@ -102,6 +103,10 @@ if(USE_MPI)
  set(HDF5_PREFER_PARALLEL TRUE)
 endif()

+if(USE_BDA_BRIDGE)
+  set(COMPILE_BDA_BRIDGE 1)
+endif()
+
 # not the same location as most of the other projects? this hook overrides
 macro (dir_hook)
 endmacro (dir_hook)
@ -170,7 +175,6 @@ endif()

 if(CUDA_FOUND)
  set(HAVE_CUDA 1)
-  set(COMPILE_BDA_BRIDGE 1)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()

@ -182,7 +186,6 @@ if(OpenCL_FOUND)
  find_file(CL2_HPP CL/cl2.hpp HINTS ${OpenCL_INCLUDE_DIRS})
  if(CL2_HPP)
    set(HAVE_OPENCL 1)
-    set(COMPILE_BDA_BRIDGE 1)
    include_directories(${OpenCL_INCLUDE_DIRS})
    find_file(OPENCL_HPP CL/opencl.hpp HINTS ${OpenCL_INCLUDE_DIRS})
    if(OPENCL_HPP)
@ -216,7 +219,6 @@ endif()
 find_package(amgcl)
 if(amgcl_FOUND)
  set(HAVE_AMGCL 1)
-  set(COMPILE_BDA_BRIDGE 1)
  # Linking to target angcl::amgcl drags in OpenMP and -fopenmp as a compile
  # flag. With that nvcc fails as it does not that flag.
  # Hence we set AMGCL_INCLUDE_DIRS.
@ -228,7 +230,6 @@ if(OpenCL_FOUND)
  find_package(VexCL)
  if(VexCL_FOUND)
    set(HAVE_VEXCL 1)
-    set(COMPILE_BDA_BRIDGE 1)
    # generator expressions in vexcl do not seem to work and therefore
    # we cannot use the imported target. Hence we exract the needed info
    # from the targets
@ -287,14 +288,12 @@ macro (files_hook)
    else()
      if(rocsparse_FOUND AND rocblas_FOUND)
        set(HAVE_ROCSPARSE 1)
-        set(COMPILE_BDA_BRIDGE 1)
      else()
        unset(HAVE_ROCSPARSE)
      endif()
    endif()
    if(ROCALUTION_FOUND)
      set(HAVE_ROCALUTION 1)
-      set(COMPILE_BDA_BRIDGE 1)
    endif()
  endif()
  if(MPI_FOUND AND HDF5_FOUND AND NOT HDF5_IS_PARALLEL)
@ -632,7 +631,9 @@ add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
 if(CUDA_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
-  set_tests_properties(cusparseSolver PROPERTIES LABELS gpu_cuda)
+  if(USE_BDA_BRIDGE)
+    set_tests_properties(cusparseSolver PROPERTIES LABELS gpu_cuda)
+  endif()

  # CUISTL
  set_tests_properties(cusparse_safe_call
@ -649,25 +650,27 @@ if(CUDA_FOUND)
                       PROPERTIES LABELS gpu_cuda)
 endif()

-if(OpenCL_FOUND)
-  target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
-  set_tests_properties(openclSolver solvetransposed3x3 csrToCscOffsetMap
-                       PROPERTIES LABELS gpu_opencl)
-endif()
+if(USE_BDA_BRIDGE)
+  if(OpenCL_FOUND)
+    target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
+    set_tests_properties(openclSolver solvetransposed3x3 csrToCscOffsetMap
+                         PROPERTIES LABELS gpu_opencl)
+  endif()

-if(ROCALUTION_FOUND)
-  target_include_directories(opmsimulators PUBLIC ${rocalution_INCLUDE_DIR}/rocalution)
-  set_tests_properties(rocalutionSolver PROPERTIES LABELS gpu_rocm)
-endif()
+  if(ROCALUTION_FOUND)
+    target_include_directories(opmsimulators PUBLIC ${rocalution_INCLUDE_DIR}/rocalution)
+    set_tests_properties(rocalutionSolver PROPERTIES LABELS gpu_rocm)
+  endif()

-if(rocsparse_FOUND AND rocblas_FOUND)
-  target_link_libraries( opmsimulators PUBLIC roc::rocsparse )
-  target_link_libraries( opmsimulators PUBLIC roc::rocblas )
-  set_tests_properties(rocsparseSolver PROPERTIES LABELS gpu_rocm)
-endif()
+  if(rocsparse_FOUND AND rocblas_FOUND)
+    target_link_libraries( opmsimulators PUBLIC roc::rocsparse )
+    target_link_libraries( opmsimulators PUBLIC roc::rocblas )
+    set_tests_properties(rocsparseSolver PROPERTIES LABELS gpu_rocm)
+  endif()

-if(VexCL_FOUND)
-  target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
+  if(VexCL_FOUND)
+    target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
+  endif()
 endif()

 if(Damaris_FOUND)
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -53,8 +53,6 @@ list (APPEND MAIN_SOURCE_FILES
  opm/simulators/flow/SimulatorFullyImplicitBlackoilEbos.cpp
  opm/simulators/flow/ValidationFunctions.cpp
  opm/simulators/flow/partitionCells.cpp
-  opm/simulators/linalg/bda/WellContributions.cpp
-  opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
  opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
  opm/simulators/linalg/FlexibleSolver1.cpp
  opm/simulators/linalg/FlexibleSolver2.cpp
@ -143,14 +141,17 @@ list (APPEND MAIN_SOURCE_FILES
  opm/simulators/wells/WGState.cpp
  )

+
 if (DAMARIS_FOUND AND MPI_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/DamarisOutputModule.cpp) 
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/DamarisOutputModule.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/DamarisKeywords.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/initDamarisXmlFile.cpp)
 endif()
 if(CUDA_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cuWellContributions.cu)
+  if(USE_BDA_BRIDGE)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cuWellContributions.cu)
+  endif()

  # CUISTL SOURCE
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.cpp)
@ -176,7 +177,7 @@ if(CUDA_FOUND)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/safe_conversion.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp)
-  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp)  
+  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/vector_operations.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/has_function.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/preconditioner_should_call_post_pre.hpp)
@ -191,33 +192,36 @@ if(CUDA_FOUND)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/set_device.hpp)

 endif()
-if(OPENCL_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BILU0.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/ChowPatelIlu.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BISAI.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/CPR.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/opencl.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclKernels.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclWellContributions.cpp)
-endif()
-if(ROCALUTION_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
-endif()
-if(rocsparse_FOUND AND rocblas_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
-endif()
-if(COMPILE_BDA_BRIDGE)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
-endif()
-if(amgcl_FOUND)
-  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
-  if(CUDA_FOUND)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/amgclSolverBackend.cu)
+if(USE_BDA_BRIDGE)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp
+                                 opm/simulators/linalg/bda/WellContributions.cpp
+                                 opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
+                                 opm/simulators/linalg/ISTLSolverEbosBda.cpp)
+  if(OPENCL_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BlockedMatrix.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BILU0.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/Reorder.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/ChowPatelIlu.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/BISAI.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/CPR.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/opencl.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclKernels.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/OpenclMatrix.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/Preconditioner.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl/openclWellContributions.cpp)
+  endif()
+  if(ROCALUTION_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocalutionSolverBackend.cpp)
+  endif()
+  if(rocsparse_FOUND AND rocblas_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
+  endif()
+  if(amgcl_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
+    if(CUDA_FOUND)
+      list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/amgclSolverBackend.cu)
+    endif()
  endif()
 endif()
 if(MPI_FOUND)
@ -271,7 +275,9 @@ if(MPI_FOUND)
 endif()

 if(CUDA_FOUND)
-  list(APPEND TEST_SOURCE_FILES tests/test_cusparseSolver.cpp)
+  if(USE_BDA_BRIDGE)
+    list(APPEND TEST_SOURCE_FILES tests/test_cusparseSolver.cpp)
+  endif()
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_safe_call.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_safe_call.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuda_safe_call.cpp)
@ -288,17 +294,21 @@ if(CUDA_FOUND)


 endif()
-if(OPENCL_FOUND)
-  list(APPEND TEST_SOURCE_FILES tests/test_openclSolver.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/test_solvetransposed3x3.cpp)
+
+if(USE_BDA_BRIDGE)
+  if(OPENCL_FOUND)
+    list(APPEND TEST_SOURCE_FILES tests/test_openclSolver.cpp)
+    list(APPEND TEST_SOURCE_FILES tests/test_solvetransposed3x3.cpp)
  list(APPEND TEST_SOURCE_FILES tests/test_csrToCscOffsetMap.cpp)
+  endif()
+  if(ROCALUTION_FOUND)
+    list(APPEND TEST_SOURCE_FILES tests/test_rocalutionSolver.cpp)
+  endif()
+  if(rocsparse_FOUND AND rocblas_FOUND)
+    list(APPEND TEST_SOURCE_FILES tests/test_rocsparseSolver.cpp)
+  endif()
 endif()
-if(ROCALUTION_FOUND)
-  list(APPEND TEST_SOURCE_FILES tests/test_rocalutionSolver.cpp)
-endif()
-if(rocsparse_FOUND AND rocblas_FOUND)
-  list(APPEND TEST_SOURCE_FILES tests/test_rocsparseSolver.cpp)
-endif()
+
 if(HDF5_FOUND)
  list(APPEND TEST_SOURCE_FILES tests/test_HDF5File.cpp)
  list(APPEND TEST_SOURCE_FILES tests/test_HDF5Serializer.cpp)
@ -464,6 +474,7 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/FlowLinearSolverParameters.hpp
  opm/simulators/linalg/GraphColoring.hpp
  opm/simulators/linalg/ISTLSolverEbos.hpp
+  opm/simulators/linalg/ISTLSolverEbosBda.hpp
  opm/simulators/linalg/MatrixMarketSpecializations.hpp
  opm/simulators/linalg/OwningBlockPreconditioner.hpp
  opm/simulators/linalg/OwningTwoLevelPreconditioner.hpp
--- a/opm/simulators/flow/BlackoilModelEbosNldd.hpp
+++ b/opm/simulators/flow/BlackoilModelEbosNldd.hpp
@ -34,7 +34,12 @@
 #include <opm/simulators/flow/SubDomain.hpp>

 #include <opm/simulators/linalg/extractMatrix.hpp>
+
+#if COMPILE_BDA_BRIDGE
+#include <opm/simulators/linalg/ISTLSolverEbosBda.hpp>
+#else
 #include <opm/simulators/linalg/ISTLSolverEbos.hpp>
+#endif

 #include <opm/simulators/timestepping/ConvergenceReport.hpp>
 #include <opm/simulators/timestepping/SimulatorReport.hpp>
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -31,10 +31,13 @@

 namespace Opm {
 template <class TypeTag>
+class ISTLSolverEbosBda;
+template <class TypeTag>
 class ISTLSolverEbos;
 }


+
 namespace Opm::Properties {

 namespace TTag {
@ -50,7 +53,7 @@ template<class TypeTag, class MyTypeTag>
 struct RelaxedLinearSolverReduction {
    using type = UndefinedProperty;
 };
-    
+
 template<class TypeTag, class MyTypeTag>
 struct LinearSolverMaxIter {
    using type = UndefinedProperty;
@ -135,7 +138,7 @@ template<class TypeTag>
 struct RelaxedLinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
    using type = GetPropType<TypeTag, Scalar>;
    static constexpr type value = 1e-2;
-};    
+};
 template<class TypeTag>
 struct LinearSolverMaxIter<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr int value = 200;
@ -217,9 +220,12 @@ struct OpenclIluParallel<TypeTag, TTag::FlowIstlSolverParams> {
 // Set the backend to be used.
 template<class TypeTag>
 struct LinearSolverBackend<TypeTag, TTag::FlowIstlSolverParams> {
+#if COMPLE_BDA_BRIDGE
+    using type = ISTLSolverEbosBda<TypeTag>;
+#else
    using type = ISTLSolverEbos<TypeTag>;
+#endif
 };
-
 } // namespace Opm::Properties

 namespace Opm
--- a/opm/simulators/linalg/ISTLSolverEbos.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.cpp
@ -202,169 +202,6 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
    }
 }

-#if COMPILE_BDA_BRIDGE
-template<class Matrix, class Vector>
-BdaSolverInfo<Matrix,Vector>::
-BdaSolverInfo(const std::string& accelerator_mode,
-              const int linear_solver_verbosity,
-              const int maxit,
-              const double tolerance,
-              const int platformID,
-              const int deviceID,
-              const bool opencl_ilu_parallel,
-              const std::string& linsolver)
-    : bridge_(std::make_unique<Bridge>(accelerator_mode,
-                                       linear_solver_verbosity, maxit,
-                                       tolerance, platformID, deviceID,
-                                       opencl_ilu_parallel, linsolver))
-    , accelerator_mode_(accelerator_mode)
-{}
-
-template<class Matrix, class Vector>
-BdaSolverInfo<Matrix,Vector>::~BdaSolverInfo() = default;
-
-template<class Matrix, class Vector>
-template<class Grid>
-void BdaSolverInfo<Matrix,Vector>::
-prepare(const Grid& grid,
-        const Dune::CartesianIndexMapper<Grid>& cartMapper,
-        const std::vector<Well>& wellsForConn,
-        const std::vector<int>& cellPartition,
-        const size_t nonzeroes,
-        const bool useWellConn)
-{
-    if (numJacobiBlocks_ > 1) {
-      detail::setWellConnections(grid, cartMapper, wellsForConn,
-                                 useWellConn,
-                                 wellConnectionsGraph_,
-                                 numJacobiBlocks_);
-      this->blockJacobiAdjacency(grid, cellPartition, nonzeroes);
-    }
-}
-
-template<class Matrix, class Vector>
-bool BdaSolverInfo<Matrix,Vector>::
-apply(Vector& rhs,
-      const bool useWellConn,
-      WellContribFunc getContribs,
-      const int rank,
-      Matrix& matrix,
-      Vector& x,
-      Dune::InverseOperatorResult& result)
-{
-    bool use_gpu = bridge_->getUseGpu();
-    if (use_gpu) {
-        auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
-        bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
-
-        // the WellContributions can only be applied separately with CUDA or OpenCL, not with amgcl or rocalution
-#if HAVE_CUDA || HAVE_OPENCL
-        if (!useWellConn) {
-            getContribs(*wellContribs);
-        }
-#endif
-
-        if (numJacobiBlocks_ > 1) {
-            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
-            // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-            bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
-                                  numJacobiBlocks_, rhs, *wellContribs, result);
-        }
-        else
-            bridge_->solve_system(&matrix, &matrix,
-                                  numJacobiBlocks_, rhs, *wellContribs, result);
-        if (result.converged) {
-            // get result vector x from non-Dune backend, iff solve was successful
-            bridge_->get_result(x);
-            return true;
-        } else {
-            // warn about CPU fallback
-            // BdaBridge might have disabled its BdaSolver for this simulation due to some error
-            // in that case the BdaBridge is disabled and flexibleSolver is always used
-            // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
-            if (rank == 0) {
-                OpmLog::warning(bridge_->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
-            }
-        }
-    }
-
-    return false;
-}
-
-template<class Matrix, class Vector>
-bool BdaSolverInfo<Matrix,Vector>::
-gpuActive()
-{
-    return bridge_->getUseGpu();
-}
-
-template<class Matrix, class Vector>
-template<class Grid>
-void BdaSolverInfo<Matrix,Vector>::
-blockJacobiAdjacency(const Grid& grid,
-                     const std::vector<int>& cell_part,
-                     size_t nonzeroes)
-{
-    using size_type = typename Matrix::size_type;
-    using Iter = typename Matrix::CreateIterator;
-    size_type numCells = grid.size(0);
-    blockJacobiForGPUILU0_ = std::make_unique<Matrix>(numCells, numCells,
-                                                      nonzeroes, Matrix::row_wise);
-
-    const auto& lid = grid.localIdSet();
-    const auto& gridView = grid.leafGridView();
-    auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
-
-    //Loop over cells
-    for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
-    {
-        const auto& elem = *elemIt;
-        size_type idx = lid.id(elem);
-        row.insert(idx);
-
-        // Add well non-zero connections
-        for (const auto wc : wellConnectionsGraph_[idx]) {
-            row.insert(wc);
-        }
-
-        int locPart = cell_part[idx];
-
-        //Add neighbor if it is on the same part
-        auto isend = gridView.iend(elem);
-        for (auto is = gridView.ibegin(elem); is!=isend; ++is)
-        {
-            //check if face has neighbor
-            if (is->neighbor())
-            {
-                size_type nid = lid.id(is->outside());
-                int nabPart = cell_part[nid];
-                if (locPart == nabPart) {
-                    row.insert(nid);
-                }
-            }
-        }
-    }
-}
-
-template<class Matrix, class Vector>
-void BdaSolverInfo<Matrix,Vector>::
-copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
-{
-    auto rbegin = blockJac.begin();
-    auto rend = blockJac.end();
-    auto outerRow = mat.begin();
-    for (auto row = rbegin; row != rend; ++row, ++outerRow) {
-        auto outerCol = (*outerRow).begin();
-        for (auto col = (*row).begin(); col != (*row).end(); ++col) {
-            // outerRow is guaranteed to have all column entries that row has!
-            while(outerCol.index() < col.index()) ++outerCol;
-            assert(outerCol.index() == col.index());
-            *col = *outerCol; // copy nonzero block
-        }
-    }
-}
-#endif // COMPILE_BDA_BRIDGE
-
 template<int Dim>
 using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
 template<int Dim>
@ -380,46 +217,12 @@ using CommunicationType = Dune::CollectiveCommunication<int>;
    template void makeOverlapRowsInvalid<BM<Dim>>(BM<Dim>&, const std::vector<int>&); \
    template struct FlexibleSolverInfo<BM<Dim>,BV<Dim>,CommunicationType>;

-#if COMPILE_BDA_BRIDGE
-
-#define INSTANCE_GRID(Dim, Grid) \
-    template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
-    prepare(const Grid&, \
-            const Dune::CartesianIndexMapper<Grid>&, \
-            const std::vector<Well>&, \
-            const std::vector<int>&, \
-            const size_t, const bool);
-using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
-#if HAVE_DUNE_ALUGRID
-#if HAVE_MPI
-    using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridMPIComm>;
-#else
-    using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridNoComm>;
-#endif //HAVE_MPI
-#define INSTANCE(Dim) \
-    template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
-    INSTANCE_GRID(Dim,Dune::CpGrid) \
-    INSTANCE_GRID(Dim,ALUGrid3CN) \
-    INSTANCE_GRID(Dim,PolyHedralGrid3D) \
-    INSTANCE_FLEX(Dim)
-#else
-#define INSTANCE(Dim) \
-    template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
-    INSTANCE_GRID(Dim,Dune::CpGrid) \
-    INSTANCE_GRID(Dim,PolyHedralGrid3D)  \
-    INSTANCE_FLEX(Dim)
-#endif
-#else
-#define INSTANCE(Dim) \
-    INSTANCE_FLEX(Dim)
-#endif // COMPILE_BDA_BRIDGE
-
-INSTANCE(1)
-INSTANCE(2)
-INSTANCE(3)
-INSTANCE(4)
-INSTANCE(5)
-INSTANCE(6)
+INSTANCE_FLEX(1)
+INSTANCE_FLEX(2)
+INSTANCE_FLEX(3)
+INSTANCE_FLEX(4)
+INSTANCE_FLEX(5)
+INSTANCE_FLEX(6)

 }
 }
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -87,10 +87,6 @@ public:
 namespace Opm
 {

-#if COMPILE_BDA_BRIDGE
-template<class Matrix, class Vector, int block_size> class BdaBridge;
-class WellContributions;
-#endif

 namespace detail
 {
@ -116,60 +112,6 @@ struct FlexibleSolverInfo
    size_t interiorCellNum_ = 0;
 };

-#if COMPILE_BDA_BRIDGE
-template<class Matrix, class Vector>
-struct BdaSolverInfo
-{
-  using WellContribFunc = std::function<void(WellContributions&)>;
-  using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
-
-  BdaSolverInfo(const std::string& accelerator_mode,
-                const int linear_solver_verbosity,
-                const int maxit,
-                const double tolerance,
-                const int platformID,
-                const int deviceID,
-                const bool opencl_ilu_parallel,
-                const std::string& linsolver);
-
-  ~BdaSolverInfo();
-
-  template<class Grid>
-  void prepare(const Grid& grid,
-               const Dune::CartesianIndexMapper<Grid>& cartMapper,
-               const std::vector<Well>& wellsForConn,
-               const std::vector<int>& cellPartition,
-               const size_t nonzeroes,
-               const bool useWellConn);
-
-  bool apply(Vector& rhs,
-             const bool useWellConn,
-             WellContribFunc getContribs,
-             const int rank,
-             Matrix& matrix,
-             Vector& x,
-             Dune::InverseOperatorResult& result);
-
-  bool gpuActive();
-
-  int numJacobiBlocks_ = 0;
-
-private:
-  /// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
-  /// Do not initialize the values, that is done in copyMatToBlockJac()
-  template<class Grid>
-  void blockJacobiAdjacency(const Grid& grid,
-                            const std::vector<int>& cell_part,
-                            size_t nonzeroes);
-
-  void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
-
-  std::unique_ptr<Bridge> bridge_;
-  std::string accelerator_mode_;
-  std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
-  std::vector<std::set<int>> wellConnectionsGraph_;
-};
-#endif

 #ifdef HAVE_MPI
 /// Copy values in parallel.
@ -259,7 +201,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
            initialize();
        }

-        void initialize()
+        void initialize(bool have_gpu = false)
        {
            OPM_TIMEBLOCK(IstlSolverEbos);
            prm_ = setupPropertyTree(parameters_,
@ -270,37 +212,6 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
 #if HAVE_MPI
            comm_.reset( new CommunicationType( simulator_.vanguard().grid().comm() ) );
 #endif
-
-#if COMPILE_BDA_BRIDGE
-            {
-                std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-                if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
-                    if (on_io_rank) {
-                        OpmLog::warning("Cannot use GPU with MPI, GPU are disabled");
-                    }
-                    accelerator_mode = "none";
-                }
-                const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
-                const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
-                const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
-                const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
-                const bool opencl_ilu_parallel = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel);
-                const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-                std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolver);
-                bdaBridge = std::make_unique<detail::BdaSolverInfo<Matrix,Vector>>(accelerator_mode,
-                                                                                   linear_solver_verbosity,
-                                                                                   maxit,
-                                                                                   tolerance,
-                                                                                   platformID,
-                                                                                   deviceID,
-                                                                                   opencl_ilu_parallel,
-                                                                                   linsolver);
-            }
-#else
-            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
-                OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake");
-            }
-#endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);

            // For some reason simulator_.model().elementMapper() is not initialized at this stage
@ -327,6 +238,12 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
                prm_.write_json(os, true);
                OpmLog::note(os.str());
            }
+            if(have_gpu){
+                if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
+                    OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake");
+                }
+            }
+
        }

        // nothing to clean here
@ -341,7 +258,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,

        void prepare(const Matrix& M, Vector& b)
        {
-            OPM_TIMEBLOCK(istlSolverEbosPrepare);        
+            OPM_TIMEBLOCK(istlSolverEbosPrepare);
            const bool firstcall = (matrix_ == nullptr);
 #if HAVE_MPI
            if (firstcall) {
@ -359,14 +276,6 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,

                useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
                // setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
-#if HAVE_OPENCL
-                bdaBridge->numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
-                bdaBridge->prepare(simulator_.vanguard().grid(),
-                                   simulator_.vanguard().cartesianIndexMapper(),
-                                   simulator_.vanguard().schedule().getWellsatEnd(),
-                                   simulator_.vanguard().cellPartition(),
-                                   getMatrix().nonzeroes(), useWellConn_);
-#endif
            } else {
                // Pointers should not change
                if ( &M != matrix_ ) {
@ -379,13 +288,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
            if (isParallel() && prm_.get<std::string>("preconditioner.type") != "ParOverILU0") {
                detail::makeOverlapRowsInvalid(getMatrix(), overlapRows_);
            }
-#if COMPILE_BDA_BRIDGE
-            if(!bdaBridge->gpuActive()){
-                prepareFlexibleSolver();
-            }
-#else
            prepareFlexibleSolver();
-#endif
        }


@ -406,7 +309,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,

        bool solve(Vector& x)
        {
-            OPM_TIMEBLOCK(istlSolverEbosSolve);        
+            OPM_TIMEBLOCK(istlSolverEbosSolve);
            calls_ += 1;
            // Write linear system if asked for.
            const int verbosity = prm_.get<int>("verbosity", 0);
@ -420,25 +323,8 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,

            // Solve system.
            Dune::InverseOperatorResult result;
-
-#if COMPILE_BDA_BRIDGE
-            std::function<void(WellContributions&)> getContribs =
-                [this](WellContributions& w)
-                {
-                    this->simulator_.problem().wellModel().getWellContributions(w);
-                };
-            if (!bdaBridge->apply(*rhs_, useWellConn_, getContribs,
-                                  simulator_.gridView().comm().rank(),
-                                  const_cast<Matrix&>(this->getMatrix()),
-                                  x, result))
-#endif
            {
                OPM_TIMEBLOCK(flexibleSolverApply);
-#if COMPILE_BDA_BRIDGE
-                if(bdaBridge->gpuActive()){
-                    prepareFlexibleSolver();
-                }
-#endif
                assert(flexibleSolver_.solver_);
                flexibleSolver_.solver_->apply(x, *rhs_, result);
            }
@ -521,7 +407,7 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
            }
            else
            {
-                OPM_TIMEBLOCK(flexibleSolverUpdate);        
+                OPM_TIMEBLOCK(flexibleSolverUpdate);
                flexibleSolver_.pre_->update();
            }
        }
@ -609,10 +495,6 @@ std::unique_ptr<Matrix> blockJacobiAdjacency(const Grid& grid,
        Matrix* matrix_;
        Vector *rhs_;

-#if COMPILE_BDA_BRIDGE
-        std::unique_ptr<detail::BdaSolverInfo<Matrix, Vector>> bdaBridge;
-#endif
-
        detail::FlexibleSolverInfo<Matrix,Vector,CommunicationType> flexibleSolver_;
        std::vector<int> overlapRows_;
        std::vector<int> interiorRows_;
--- a/opm/simulators/linalg/ISTLSolverEbosBda.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbosBda.cpp
@ -0,0 +1,249 @@
+/*
+  Copyright 2016 IRIS AS
+  Copyright 2019, 2020 Equinor ASA
+  Copyright 2020 SINTEF Digital, Mathematics and Cybernetics
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/simulators/linalg/ISTLSolverEbosBda.hpp>
+
+#include <dune/istl/schwarz.hh>
+
+#include <opm/grid/CpGrid.hpp>
+
+#include <opm/simulators/linalg/FlexibleSolver.hpp>
+#include <opm/simulators/linalg/ParallelIstlInformation.hpp>
+#include <opm/simulators/utils/ParallelCommunication.hpp>
+
+#include <fmt/format.h>
+
+#include <opm/simulators/linalg/bda/BdaBridge.hpp>
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+#if HAVE_DUNE_ALUGRID
+#include <dune/alugrid/grid.hh>
+#include <ebos/alucartesianindexmapper.hh>
+#endif // HAVE_DUNE_ALUGRID
+
+#include <opm/grid/polyhedralgrid.hh>
+
+namespace Opm {
+namespace detail {
+
+template<class Matrix, class Vector>
+BdaSolverInfo<Matrix,Vector>::
+BdaSolverInfo(const std::string& accelerator_mode,
+              const int linear_solver_verbosity,
+              const int maxit,
+              const double tolerance,
+              const int platformID,
+              const int deviceID,
+              const bool opencl_ilu_parallel,
+              const std::string& linsolver)
+    : bridge_(std::make_unique<Bridge>(accelerator_mode,
+                                       linear_solver_verbosity, maxit,
+                                       tolerance, platformID, deviceID,
+                                       opencl_ilu_parallel, linsolver))
+    , accelerator_mode_(accelerator_mode)
+{}
+
+template<class Matrix, class Vector>
+BdaSolverInfo<Matrix,Vector>::~BdaSolverInfo() = default;
+
+template<class Matrix, class Vector>
+template<class Grid>
+void BdaSolverInfo<Matrix,Vector>::
+prepare(const Grid& grid,
+        const Dune::CartesianIndexMapper<Grid>& cartMapper,
+        const std::vector<Well>& wellsForConn,
+        const std::vector<int>& cellPartition,
+        const size_t nonzeroes,
+        const bool useWellConn)
+{
+    if (numJacobiBlocks_ > 1) {
+      detail::setWellConnections(grid, cartMapper, wellsForConn,
+                                 useWellConn,
+                                 wellConnectionsGraph_,
+                                 numJacobiBlocks_);
+      this->blockJacobiAdjacency(grid, cellPartition, nonzeroes);
+    }
+}
+
+template<class Matrix, class Vector>
+bool BdaSolverInfo<Matrix,Vector>::
+apply(Vector& rhs,
+      const bool useWellConn,
+      WellContribFunc getContribs,
+      const int rank,
+      Matrix& matrix,
+      Vector& x,
+      Dune::InverseOperatorResult& result)
+{
+    bool use_gpu = bridge_->getUseGpu();
+    if (use_gpu) {
+        auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
+        bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
+
+        // the WellContributions can only be applied separately with CUDA or OpenCL, not with amgcl or rocalution
+#if HAVE_CUDA || HAVE_OPENCL
+        if (!useWellConn) {
+            getContribs(*wellContribs);
+        }
+#endif
+
+        if (numJacobiBlocks_ > 1) {
+            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+            // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
+            bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
+                                  numJacobiBlocks_, rhs, *wellContribs, result);
+        }
+        else
+            bridge_->solve_system(&matrix, &matrix,
+                                  numJacobiBlocks_, rhs, *wellContribs, result);
+        if (result.converged) {
+            // get result vector x from non-Dune backend, iff solve was successful
+            bridge_->get_result(x);
+            return true;
+        } else {
+            // warn about CPU fallback
+            // BdaBridge might have disabled its BdaSolver for this simulation due to some error
+            // in that case the BdaBridge is disabled and flexibleSolver is always used
+            // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
+            if (rank == 0) {
+                OpmLog::warning(bridge_->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
+            }
+        }
+    }
+
+    return false;
+}
+
+template<class Matrix, class Vector>
+bool BdaSolverInfo<Matrix,Vector>::
+gpuActive()
+{
+    return bridge_->getUseGpu();
+}
+
+template<class Matrix, class Vector>
+template<class Grid>
+void BdaSolverInfo<Matrix,Vector>::
+blockJacobiAdjacency(const Grid& grid,
+                     const std::vector<int>& cell_part,
+                     size_t nonzeroes)
+{
+    using size_type = typename Matrix::size_type;
+    using Iter = typename Matrix::CreateIterator;
+    size_type numCells = grid.size(0);
+    blockJacobiForGPUILU0_ = std::make_unique<Matrix>(numCells, numCells,
+                                                      nonzeroes, Matrix::row_wise);
+
+    const auto& lid = grid.localIdSet();
+    const auto& gridView = grid.leafGridView();
+    auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
+
+    //Loop over cells
+    for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
+    {
+        const auto& elem = *elemIt;
+        size_type idx = lid.id(elem);
+        row.insert(idx);
+
+        // Add well non-zero connections
+        for (const auto wc : wellConnectionsGraph_[idx]) {
+            row.insert(wc);
+        }
+
+        int locPart = cell_part[idx];
+
+        //Add neighbor if it is on the same part
+        auto isend = gridView.iend(elem);
+        for (auto is = gridView.ibegin(elem); is!=isend; ++is)
+        {
+            //check if face has neighbor
+            if (is->neighbor())
+            {
+                size_type nid = lid.id(is->outside());
+                int nabPart = cell_part[nid];
+                if (locPart == nabPart) {
+                    row.insert(nid);
+                }
+            }
+        }
+    }
+}
+
+template<class Matrix, class Vector>
+void BdaSolverInfo<Matrix,Vector>::
+copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
+{
+    auto rbegin = blockJac.begin();
+    auto rend = blockJac.end();
+    auto outerRow = mat.begin();
+    for (auto row = rbegin; row != rend; ++row, ++outerRow) {
+        auto outerCol = (*outerRow).begin();
+        for (auto col = (*row).begin(); col != (*row).end(); ++col) {
+            // outerRow is guaranteed to have all column entries that row has!
+            while(outerCol.index() < col.index()) ++outerCol;
+            assert(outerCol.index() == col.index());
+            *col = *outerCol; // copy nonzero block
+        }
+    }
+}
+
+template<int Dim>
+using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
+template<int Dim>
+using BV = Dune::BlockVector<Dune::FieldVector<double,Dim>>;
+
+
+#define INSTANCE_GRID(Dim, Grid)                   \
+    template void BdaSolverInfo<BM<Dim>,BV<Dim>>:: \
+    prepare(const Grid&, \
+            const Dune::CartesianIndexMapper<Grid>&, \
+            const std::vector<Well>&, \
+            const std::vector<int>&, \
+            const size_t, const bool);
+using PolyHedralGrid3D = Dune::PolyhedralGrid<3, 3>;
+#if HAVE_DUNE_ALUGRID
+#if HAVE_MPI
+    using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridMPIComm>;
+#else
+    using ALUGrid3CN = Dune::ALUGrid<3, 3, Dune::cube, Dune::nonconforming, Dune::ALUGridNoComm>;
+#endif //HAVE_MPI
+#define INSTANCE(Dim) \
+    template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
+    INSTANCE_GRID(Dim,Dune::CpGrid) \
+    INSTANCE_GRID(Dim,ALUGrid3CN) \
+    INSTANCE_GRID(Dim,PolyHedralGrid3D)
+#else
+#define INSTANCE(Dim) \
+    template struct BdaSolverInfo<BM<Dim>,BV<Dim>>; \
+    INSTANCE_GRID(Dim,Dune::CpGrid) \
+    INSTANCE_GRID(Dim,PolyHedralGrid3D)
+#endif
+INSTANCE(1)
+INSTANCE(2)
+INSTANCE(3)
+INSTANCE(4)
+INSTANCE(5)
+INSTANCE(6)
+
+} // namespace detail
+} // namespace Opm
--- a/opm/simulators/linalg/ISTLSolverEbosBda.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbosBda.hpp
@ -0,0 +1,265 @@
+/*
+  Copyright 2016 IRIS AS
+  Copyright 2019, 2020 Equinor ASA
+  Copyright 2020 SINTEF Digital, Mathematics and Cybernetics
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_ISTLSOLVER_EBOS_WITH_BDA_INCLUDED
+#define OPM_ISTLSOLVER_EBOS_WITH_BDA_INCLUDED
+
+#include <opm/simulators/linalg/ISTLSolverEbos.hpp>
+
+namespace Opm {
+
+class Well;
+
+template<class Matrix, class Vector, int block_size> class BdaBridge;
+class WellContributions;
+namespace detail {
+
+template<class Matrix, class Vector>
+struct BdaSolverInfo
+{
+  using WellContribFunc = std::function<void(WellContributions&)>;
+  using Bridge = BdaBridge<Matrix,Vector,Matrix::block_type::rows>;
+
+  BdaSolverInfo(const std::string& accelerator_mode,
+                const int linear_solver_verbosity,
+                const int maxit,
+                const double tolerance,
+                const int platformID,
+                const int deviceID,
+                const bool opencl_ilu_parallel,
+                const std::string& linsolver);
+
+  ~BdaSolverInfo();
+
+  template<class Grid>
+  void prepare(const Grid& grid,
+               const Dune::CartesianIndexMapper<Grid>& cartMapper,
+               const std::vector<Well>& wellsForConn,
+               const std::vector<int>& cellPartition,
+               const size_t nonzeroes,
+               const bool useWellConn);
+
+  bool apply(Vector& rhs,
+             const bool useWellConn,
+             WellContribFunc getContribs,
+             const int rank,
+             Matrix& matrix,
+             Vector& x,
+             Dune::InverseOperatorResult& result);
+
+  bool gpuActive();
+
+  int numJacobiBlocks_ = 0;
+
+private:
+  /// Create sparsity pattern for block-Jacobi matrix based on partitioning of grid.
+  /// Do not initialize the values, that is done in copyMatToBlockJac()
+  template<class Grid>
+  void blockJacobiAdjacency(const Grid& grid,
+                            const std::vector<int>& cell_part,
+                            size_t nonzeroes);
+
+  void copyMatToBlockJac(const Matrix& mat, Matrix& blockJac);
+
+  std::unique_ptr<Bridge> bridge_;
+  std::string accelerator_mode_;
+  std::unique_ptr<Matrix> blockJacobiForGPUILU0_;
+  std::vector<std::set<int>> wellConnectionsGraph_;
+};
+
+}
+
+/// This class solves the fully implicit black-oil system by
+/// solving the reduced system (after eliminating well variables)
+/// as a block-structured matrix (one block for all cell variables) for a fixed
+/// number of cell variables np .
+template <class TypeTag>
+class ISTLSolverEbosBda : public ISTLSolverEbos<TypeTag>
+{
+protected:
+    using ParentType = ISTLSolverEbos<TypeTag>;
+    using GridView = GetPropType<TypeTag, Properties::GridView>;
+    using Scalar = GetPropType<TypeTag, Properties::Scalar>;
+    using SparseMatrixAdapter = GetPropType<TypeTag, Properties::SparseMatrixAdapter>;
+    using Vector = GetPropType<TypeTag, Properties::GlobalEqVector>;
+    using Indices = GetPropType<TypeTag, Properties::Indices>;
+    using WellModel = GetPropType<TypeTag, Properties::EclWellModel>;
+    using Simulator = GetPropType<TypeTag, Properties::Simulator>;
+    using Matrix = typename SparseMatrixAdapter::IstlMatrix;
+    using ThreadManager = GetPropType<TypeTag, Properties::ThreadManager>;
+    using ElementContext = GetPropType<TypeTag, Properties::ElementContext>;
+    using AbstractSolverType = Dune::InverseOperator<Vector, Vector>;
+    using AbstractOperatorType = Dune::AssembledLinearOperator<Matrix, Vector, Vector>;
+    using AbstractPreconditionerType = Dune::PreconditionerWithUpdate<Vector, Vector>;
+    using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
+    using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
+    constexpr static std::size_t pressureIndex = GetPropType<TypeTag, Properties::Indices>::pressureSwitchIdx;
+
+#if HAVE_MPI
+    using CommunicationType = Dune::OwnerOverlapCopyCommunication<int,int>;
+#else
+    using CommunicationType = Dune::CollectiveCommunication<int>;
+#endif
+
+public:
+    using AssembledLinearOperatorType = Dune::AssembledLinearOperator< Matrix, Vector, Vector >;
+
+    /// Construct a system solver.
+    /// \param[in] simulator   The opm-models simulator object
+    /// \param[in] parameters  Explicit parameters for solver setup, do not
+    ///                        read them from command line parameters.
+    ISTLSolverEbosBda(const Simulator& simulator, const FlowLinearSolverParameters& parameters)
+        : ParentType(simulator, parameters)
+    {
+        bool have_gpu = true;
+        this->initialize(have_gpu);
+    }
+
+    /// Construct a system solver.
+    /// \param[in] simulator   The opm-models simulator object
+    explicit ISTLSolverEbosBda(const Simulator& simulator)
+    : ParentType(simulator)
+    {
+    }
+
+    void initialize()
+    {
+        OPM_TIMEBLOCK(initialize);
+        ParentType::initialize(false);
+        const bool on_io_rank = (this->simulator_.gridView().comm().rank() == 0);
+        {
+            std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
+            if ((this->simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
+                if (on_io_rank) {
+                    OpmLog::warning("Cannot use GPU with MPI, GPU is disabled");
+                }
+                accelerator_mode = "none";
+            }
+            const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
+            const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
+            const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
+            const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
+            const bool opencl_ilu_parallel = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel);
+            const int linear_solver_verbosity = this->parameters_.linear_solver_verbosity_;
+            std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolver);
+            bdaBridge_ = std::make_unique<detail::BdaSolverInfo<Matrix,Vector>>(accelerator_mode,
+                                                                               linear_solver_verbosity,
+                                                                               maxit,
+                                                                               tolerance,
+                                                                               platformID,
+                                                                               deviceID,
+                                                                               opencl_ilu_parallel,
+                                                                               linsolver);
+        }
+    }
+
+    void prepare(const Matrix& M, Vector& b)
+    {
+        OPM_TIMEBLOCK(prepare);
+        ParentType::prepare(M,b);
+        const bool firstcall = (this->matrix_ == nullptr);
+        // update matrix entries for solvers.
+        if (firstcall) {
+            // ebos will not change the matrix object. Hence simply store a pointer
+            // to the original one with a deleter that does nothing.
+            // Outch! We need to be able to scale the linear system! Hence const_cast
+            // setup sparsity pattern for jacobi matrix for preconditioner (only used for openclSolver)
+#if HAVE_OPENCL
+            bdaBridge_->numJacobiBlocks_ = EWOMS_GET_PARAM(TypeTag, int, NumJacobiBlocks);
+            bdaBridge_->prepare(this->simulator_.vanguard().grid(),
+                               this->simulator_.vanguard().cartesianIndexMapper(),
+                               this->simulator_.vanguard().schedule().getWellsatEnd(),
+                               this->simulator_.vanguard().cellPartition(),
+                               this->getMatrix().nonzeroes(), this->useWellConn_);
+#endif
+        }
+    }
+
+
+    void setResidual(Vector& /* b */)
+    {
+        // rhs_ = &b; // Must be handled in prepare() instead.
+    }
+
+    void getResidual(Vector& b) const
+    {
+        b = *(this->rhs_);
+    }
+
+    void setMatrix(const SparseMatrixAdapter& /* M */)
+    {
+        // matrix_ = &M.istlMatrix(); // Must be handled in prepare() instead.
+    }
+
+    bool solve(Vector& x)
+    {
+        OPM_TIMEBLOCK(solve);
+        this->calls_ += 1;
+        // Write linear system if asked for.
+        const int verbosity = this->prm_.template get<int>("verbosity", 0);
+        const bool write_matrix = verbosity > 10;
+        if (write_matrix) {
+            Helper::writeSystem(this->simulator_, //simulator is only used to get names
+                                this->getMatrix(),
+                                *(this->rhs_),
+                                this->comm_.get());
+        }
+
+        // Solve system.
+        Dune::InverseOperatorResult result;
+
+        std::function<void(WellContributions&)> getContribs =
+            [this](WellContributions& w)
+            {
+                this->simulator_.problem().wellModel().getWellContributions(w);
+            };
+        if (!bdaBridge_->apply(*(this->rhs_), this->useWellConn_, getContribs,
+                              this->simulator_.gridView().comm().rank(),
+                              const_cast<Matrix&>(this->getMatrix()),
+                              x, result))
+        {
+            if(bdaBridge_->gpuActive()){
+                // bda solve fails use istl solver setup need to be done since it is not setup in prepare
+                ParentType::prepareFlexibleSolver();
+            }
+            assert(this->flexibleSolver_.solver_);
+            this->flexibleSolver_.solver_->apply(x, *(this->rhs_), result);
+        }
+
+        // Check convergence, iterations etc.
+        this->checkConvergence(result);
+
+        return this->converged_;
+    }
+
+protected:
+    void prepareFlexibleSolver()
+    {
+        if(bdaBridge_->gpuActive()){
+            ParentType::prepareFlexibleSolver();
+        }
+    }
+    std::unique_ptr<detail::BdaSolverInfo<Matrix, Vector>> bdaBridge_;
+}; // end ISTLSolver
+
+} // namespace Opm
+
+#endif // OPM_ISTLSOLVER_EBOS_BDA_HEADER_INCLUDED
--- a/opm/simulators/wells/MultisegmentWellEquations.cpp
+++ b/opm/simulators/wells/MultisegmentWellEquations.cpp
@ -28,7 +28,10 @@

 #include <opm/input/eclipse/Schedule/MSW/WellSegments.hpp>

+#if COMPILE_BDA_BRIDGE
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
+#endif
+
 #include <opm/simulators/linalg/istlsparsematrixadapter.hh>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <opm/simulators/linalg/SmallDenseMatrixUtils.hpp>
@ -189,6 +192,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
    xw = mswellhelpers::applyUMFPack(*duneDSolver_, resWell);
 }

+#if COMPILE_BDA_BRIDGE
 template<class Scalar, int numWellEq, int numEq>
 void MultisegmentWellEquations<Scalar,numWellEq,numEq>::
 extract(WellContributions& wellContribs) const
@ -255,7 +259,7 @@ extract(WellContributions& wellContribs) const
                                                 Drows,
                                                 Cvals);
 }
-
+#endif

 template<class Scalar, int numWellEq, int numEq>
 template<class SparseMatrixAdapter>
--- a/opm/simulators/wells/MultisegmentWellEquations.hpp
+++ b/opm/simulators/wells/MultisegmentWellEquations.hpp
@ -38,7 +38,9 @@ namespace Opm

 template<class Scalar, int numWellEq, int numEq> class MultisegmentWellEquationAccess;
 template<class Scalar> class MultisegmentWellGeneric;
+#if COMPILE_BDA_BRIDGE
 class WellContributions;
+#endif
 class WellInterfaceGeneric;
 class WellState;

@ -98,8 +100,10 @@ public:
    //! \details xw = inv(D)*(rw - C*x)
    void recoverSolutionWell(const BVector& x, BVectorWell& xw) const;

+#if COMPILE_BDA_BRIDGE
    //! \brief Add the matrices of this well to the WellContributions object.
    void extract(WellContributions& wellContribs) const;
+#endif

    //! \brief Add the matrices of this well to the sparse matrix adapter.
    template<class SparseMatrixAdapter>
--- a/opm/simulators/wells/StandardWellEquations.cpp
+++ b/opm/simulators/wells/StandardWellEquations.cpp
@ -24,9 +24,10 @@
 #include <opm/common/TimingMacros.hpp>
 #include <opm/simulators/wells/StandardWellEquations.hpp>

-
-
+#if COMPILE_BDA_BRIDGE
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
+#endif
+
 #include <opm/simulators/linalg/istlsparsematrixadapter.hh>
 #include <opm/simulators/linalg/matrixblock.hh>
 #include <opm/simulators/linalg/SmallDenseMatrixUtils.hpp>
@ -187,6 +188,7 @@ recoverSolutionWell(const BVector& x, BVectorWell& xw) const
    invDuneD_.mv(resWell, xw);
 }

+#if COMPILE_BDA_BRIDGE
 template<class Scalar, int numEq>
 void StandardWellEquations<Scalar,numEq>::
 extract(const int numStaticWellEq,
@ -240,6 +242,7 @@ extract(const int numStaticWellEq,
    wellContribs.addMatrix(WellContributions::MatrixType::B,
                           colIndices.data(), nnzValues.data(), duneB_.nonzeroes());
 }
+#endif

 template<class Scalar, int numEq>
 template<class SparseMatrixAdapter>
--- a/opm/simulators/wells/StandardWellEquations.hpp
+++ b/opm/simulators/wells/StandardWellEquations.hpp
@ -36,7 +36,9 @@ namespace Opm

 class ParallelWellInfo;
 template<class Scalar, int numEq> class StandardWellEquationAccess;
+#if COMPILE_BDA_BRIDGE
 class WellContributions;
+#endif
 class WellInterfaceGeneric;
 class WellState;

@ -94,9 +96,11 @@ public:
    //! \details xw = inv(D)*(rw - C*x)
    void recoverSolutionWell(const BVector& x, BVectorWell& xw) const;

+#if COMPILE_BDA_BRIDGE
    //! \brief Add the matrices of this well to the WellContributions object.
    void extract(const int numStaticWellEq,
                 WellContributions& wellContribs) const;
+#endif

    //! \brief Add the matrices of this well to the sparse matrix adapter.
    template<class SparseMatrixAdapter>