diff --git a/CMakeLists.txt b/CMakeLists.txt
index d722402a7..39dff6180 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -221,9 +221,6 @@ if(amgcl_FOUND)
   # Hence we set AMGCL_INCLUDE_DIRS.
   get_property(AMGCL_INCLUDE_DIRS TARGET amgcl::amgcl PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
   include_directories(SYSTEM ${AMGCL_INCLUDE_DIRS})
-  if(CUDA_FOUND)
-    set_source_files_properties(opm/simulators/linalg/bda/amgclSolverBackend.cpp PROPERTIES LANGUAGE CUDA)
-  endif()
 endif()
 
 if(OpenCL_FOUND)
diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 202fc1cdc..d72a0b420 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -119,6 +119,9 @@ if(HAVE_AMGCL)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
+  if(CUDA_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cu)
+  endif()
 endif()
 if(MPI_FOUND)
   list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cpp b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
index dfb39065f..9602f26c0 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cpp
@@ -30,6 +30,10 @@
 
 #include <boost/property_tree/json_parser.hpp>
 
+#if HAVE_VEXCL
+#include <amgcl/backend/vexcl.hpp>
+#include <amgcl/backend/vexcl_static_matrix.hpp>
+#endif
 
 namespace bda
 {
@@ -74,10 +78,10 @@ void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
     if (file.is_open()) { // if file exists, read parameters from file
         boost::property_tree::read_json(file, prm);
 
-        backend_type_string = prm.get<std::string>("backend_type"); // defaults to cpu if not specified        
+        backend_type_string = prm.get("backend_type", "cpu"); // defaults to cpu if not specified
         out << "Using parameters from " << filename << ":\n";
     } else { // otherwise use default parameters, same as Dune
-        prm.put("backend_type", "cpu");
+        prm.put("backend_type", "cpu"); // put it in the tree so it gets printed
         prm.put("precond.class", "relaxation");
         prm.put("precond.type", "ilu0");
         prm.put("precond.damping", 0.9);
@@ -85,6 +89,7 @@ void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
         prm.put("solver.tol", tolerance);
         prm.put("solver.maxiter", maxit);
         prm.put("solver.verbose", verbosity >= 2);
+        backend_type_string = prm.get("backend_type", "cpu");
         out << "Using default amgcl parameters:\n";
     }
 
@@ -98,16 +103,11 @@ void amgclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim) {
     } else if (backend_type_string == "vexcl") {
         backend_type = Amgcl_backend_type::vexcl;
     } else {
-        OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type'");
+        OPM_THROW(std::logic_error, "Error unknown value for amgcl parameter 'backend_type', use [cpu|cuda|vexcl]");
     }
 
     if (backend_type == Amgcl_backend_type::cuda) {
-#if HAVE_CUDA
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, deviceID);
-        out << prop.name << std::endl;
-        cusparseCreate(&CUDA_bprm.cusparse_handle);
-#else
+#if !HAVE_CUDA
         OPM_THROW(std::logic_error, "Error amgcl is trying to use CUDA, but CUDA was not found by CMake");
 #endif
     }
@@ -211,33 +211,11 @@ void solve_vexcl(
 template <unsigned int block_size>
 void amgclSolverBackend<block_size>::solve_system(double *b, BdaResult &res) {
     Timer t;
-    int iters = 0;
-    double error = 0.0;
 
     try {
         if (backend_type == Amgcl_backend_type::cuda) { // use CUDA
 #if HAVE_CUDA
-            // create matrix object
-            auto A = std::tie(N, A_rows, A_cols, A_vals);
-
-            // create solver and construct preconditioner
-            // don't reuse this unless the preconditioner can be reused
-            CUDA_Solver solve(A, prm, CUDA_bprm);
-
-            // print solver structure (once)
-            std::call_once(print_info, [&](){
-                std::ostringstream out;
-                out << solve << std::endl;
-                OpmLog::info(out.str());
-            });
-
-            thrust::device_vector<double> B(b, b + N);
-            thrust::device_vector<double> X(N, 0.0);
-
-            // actually solve
-            std::tie(iters, error) = solve(B, X);
-
-            thrust::copy(X.begin(), X.end(), x.begin());
+            solve_cuda(b);
 #endif
         } else if (backend_type == Amgcl_backend_type::cpu) { // use builtin backend (CPU)
             // create matrix object
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.cu b/opm/simulators/linalg/bda/amgclSolverBackend.cu
new file mode 100644
index 000000000..a19901c01
--- /dev/null
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.cu
@@ -0,0 +1,89 @@
+/*
+  Copyright 2021 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+
+#include <amgcl/backend/cuda.hpp>
+#include <amgcl/relaxation/cusparse_ilu0.hpp>
+#include <opm/simulators/linalg/bda/amgclSolverBackend.hpp>
+
+/// This file is only compiled when both amgcl and CUDA are found by CMake
+
+namespace bda
+{
+
+using Opm::OpmLog;
+
+
+template <unsigned int block_size>
+void amgclSolverBackend<block_size>::solve_cuda(double *b) {
+    typedef amgcl::backend::cuda<double> CUDA_Backend;
+    typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
+
+    static typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle
+
+    // initialize cusparse handle for amgcl, cannot merge this call_once with 'print solver structure'
+    std::call_once(cuda_initialize, [&](){
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, deviceID);
+        std::ostringstream out;
+        out << prop.name << std::endl;
+        OpmLog::info(out.str());
+        cusparseCreate(&CUDA_bprm.cusparse_handle);
+    });
+
+    // create matrix object
+    auto A = std::tie(N, A_rows, A_cols, A_vals);
+
+    // create solver and construct preconditioner
+    // don't reuse this unless the preconditioner can be reused
+    CUDA_Solver solve(A, prm, CUDA_bprm);
+
+    // print solver structure (once)
+    std::call_once(print_info, [&](){
+        std::ostringstream out;
+        out << solve << std::endl;
+        OpmLog::info(out.str());
+    });
+
+    thrust::device_vector<double> B(b, b + N);
+    thrust::device_vector<double> X(N, 0.0);
+
+    // actually solve
+    std::tie(iters, error) = solve(B, X);
+
+    thrust::copy(X.begin(), X.end(), x.begin());
+}
+
+
+#define INSTANTIATE_BDA_FUNCTIONS(n)                      \
+template void amgclSolverBackend<n>::solve_cuda(double*); \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} // namespace bda
+
diff --git a/opm/simulators/linalg/bda/amgclSolverBackend.hpp b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
index 54264e37b..b8bcf2e55 100644
--- a/opm/simulators/linalg/bda/amgclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/amgclSolverBackend.hpp
@@ -38,16 +38,6 @@
 #include <amgcl/preconditioner/runtime.hpp>
 #include <amgcl/value_type/static_matrix.hpp>
 
-#if HAVE_CUDA
-#include <amgcl/backend/cuda.hpp>
-#include <amgcl/relaxation/cusparse_ilu0.hpp>
-#endif
-
-#if HAVE_VEXCL
-#include <amgcl/backend/vexcl.hpp>
-#include <amgcl/backend/vexcl_static_matrix.hpp>
-#endif
-
 namespace bda
 {
 
@@ -69,11 +59,6 @@ class amgclSolverBackend : public BdaSolver<block_size>
     using Base::tolerance;
     using Base::initialized;
 
-#if HAVE_CUDA
-    typedef amgcl::backend::cuda<double> CUDA_Backend;
-    typedef amgcl::make_solver<amgcl::runtime::preconditioner<CUDA_Backend>, amgcl::runtime::solver::wrapper<CUDA_Backend> > CUDA_Solver;
-#endif
-
     typedef amgcl::static_matrix<double, block_size, block_size> dmat_type; // matrix value type in double precision
     typedef amgcl::static_matrix<double, block_size, 1> dvec_type; // the corresponding vector value type
     typedef amgcl::backend::builtin<dmat_type> CPU_Backend;
@@ -97,8 +82,12 @@ private:
     Amgcl_backend_type backend_type = cpu;
 
     boost::property_tree::ptree prm;         // amgcl parameters
+    int iters = 0;
+    double error = 0.0;
+
 #if HAVE_CUDA
-    typename CUDA_Backend::params CUDA_bprm; // amgcl backend parameters, only used for cusparseHandle
+    std::once_flag cuda_initialize;
+    void solve_cuda(double *b);
 #endif
 
     /// Initialize GPU and allocate memory