Replaced tabs with 4 spaces

T.D. (Tongdong) Qiu 2019-12-18 15:50:09 +01:00
parent b543947e66
commit 950d1c92c1
6 changed files with 590 additions and 592 deletions

View File

@@ -37,9 +37,9 @@ namespace Opm
BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED) : use_gpu(use_gpu_) {
#if HAVE_CUDA
if(use_gpu){
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
}
#endif
}
@@ -48,41 +48,41 @@ BdaBridge::BdaBridge(bool use_gpu_, int linear_solver_verbosity OPM_UNUSED, int
#if HAVE_CUDA
template <class BridgeMatrix>
int checkZeroDiagonal(BridgeMatrix& mat) {
static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
int numZeros = 0;
const int dim = 3;
const double zero_replace = 1e-15;
if(diag_indices.size() == 0){
int N = mat.N();
diag_indices.reserve(N);
int roff = 0;
for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
auto diag = r->find(r.index()); // diag is an iterator
assert(diag.index() == r.index());
for(int rr = 0; rr < dim; ++rr){
auto& val = (*diag)[rr][rr]; // reference to easily change the value
if (val == 0.0){ // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
diag_indices.emplace_back(diag.offset());
}
}else{
for(typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r){
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
for(int rr = 0; rr < dim; ++rr){
auto& val = diag_block[rr][rr];
if(val == 0.0){ // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
}
}
return numZeros;
}
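Why this guard matters: ILU0 (cusparseDbsrilu02 further down) divides by the diagonal pivots of each diagonal block, so an exact 0.0 pivot would make the factorization fail. A minimal standalone sketch of the same replacement, assuming a single plain 3x3 block rather than the Dune matrix used above:

#include <cstdio>

int main() {
    double block[3][3] = {{4.0, 1.0, 0.0},
                          {1.0, 0.0, 2.0},   // zero pivot at [1][1]
                          {0.0, 2.0, 5.0}};
    const double zero_replace = 1e-15;       // same constant as checkZeroDiagonal()
    for (int rr = 0; rr < 3; ++rr) {
        if (block[rr][rr] == 0.0) {          // could be '< 1e-30' or similar, as noted above
            block[rr][rr] = zero_replace;
        }
    }
    std::printf("pivot [1][1] is now %g\n", block[1][1]);
    return 0;
}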
@@ -91,24 +91,24 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
int sum_nnzs = 0;
// convert colIndices and rowPointers
if(h_rows.size() == 0){
h_rows.emplace_back(0);
for(typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r){
int size_row = 0;
for(auto c = r->begin(); c != r->end(); ++c){
h_cols.emplace_back(c.index());
size_row++;
}
sum_nnzs += size_row;
h_rows.emplace_back(sum_nnzs);
}
if(h_rows[mat.N()] != mat.nonzeroes()){
OPM_THROW(std::logic_error, "Error: row sizes do not sum to the number of nonzeroes in BdaBridge::getSparsityPattern()");
}
}
} // end getSparsityPattern()
#endif
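For a concrete picture of the arrays getSparsityPattern() builds, take a hypothetical 3x3 block matrix with nonzero blocks at (0,0), (0,1), (1,1), (2,0) and (2,2); the resulting BSR pattern arrays would be:

// row 0: X X .    h_rows[i+1] - h_rows[i] = number of blocks in block row i
// row 1: . X .    h_cols holds the block column of each nonzero, row by row
// row 2: X . X
std::vector<int> h_rows = {0, 2, 3, 5};
std::vector<int> h_cols = {0, 1, 1, 0, 2};
// h_rows[mat.N()] == mat.nonzeroes() is exactly the consistency check above.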
@@ -118,73 +118,73 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
{
#if HAVE_CUDA
if(use_gpu){
BdaResult result;
result.converged = false;
static std::vector<int> h_rows;
static std::vector<int> h_cols;
int dim = (*mat)[0][0].N();
int N = mat->N()*dim;
int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
if(dim != 3){
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
use_gpu = false;
}
if(h_rows.capacity() == 0){
h_rows.reserve(N+1);
h_cols.reserve(nnz);
#if PRINT_TIMERS_BRIDGE
Dune::Timer t;
#endif
getSparsityPattern(*mat, h_rows, h_cols);
#if PRINT_TIMERS_BRIDGE
std::ostringstream out;
out << "getSparsityPattern() took: " << t.stop() << " s";
OpmLog::info(out.str());
#endif
}
#if PRINT_TIMERS_BRIDGE
Dune::Timer t_zeros;
int numZeros = checkZeroDiagonal(*mat);
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
OpmLog::info(out.str());
#else
checkZeroDiagonal(*mat);
#endif
/////////////////////////
// actually solve
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
// assumes the nonzero values of mat (Dune::BCRSMatrix) are stored contiguously; if they are not, cusparseSolver will exhibit undefined behaviour
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
switch(status){
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
//OpmLog::info("cusparseSolver converged");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
default:
OpmLog::warning("cusparseSolver returned unknown status code");
}
res.iterations = result.iterations;
res.reduction = result.reduction;
res.converged = result.converged;
res.conv_rate = result.conv_rate;
res.elapsed = result.elapsed;
}else{
res.converged = false;
}
#endif // HAVE_CUDA
}
@@ -192,9 +192,9 @@ void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_U
template <class BridgeVector>
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED){
#if HAVE_CUDA
if(use_gpu){
backend->post_process(static_cast<double*>(&(x[0][0])));
}
#endif
}
@@ -202,22 +202,22 @@ template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::get_result< \

View File

@@ -40,20 +40,18 @@ class BdaBridge
{
private:
#if HAVE_CUDA
std::unique_ptr<cusparseSolverBackend> backend;
#endif
bool use_gpu;
public:
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
~BdaBridge();
template <class BridgeMatrix, class BridgeVector>
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
template <class BridgeVector>
void get_result(BridgeVector &x);
}; // end class BdaBridge
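For orientation, a hedged sketch of how a caller drives this interface; the parameter values are illustrative, and mat, b and x stand for a Dune::BCRSMatrix and Dune::BlockVectors matching the explicit instantiations in BdaBridge.cpp:

// Sketch only: construct once, then reuse for every linear solve.
BdaBridge bridge(/*use_gpu=*/true, /*linear_solver_verbosity=*/1,
                 /*maxit=*/200, /*tolerance=*/1e-2);

InverseOperatorResult result;
bridge.solve_system(&mat, b, result);   // sets result.converged = false when the GPU path is unavailable
if (result.converged) {
    bridge.get_result(x);               // copy the GPU solution back into x
}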

View File

@@ -28,13 +28,13 @@ class BdaResult
{
public:
int iterations = 0; // number of iterations
double reduction = 0.0; // reduction of norm, norm_final / norm_start
bool converged = false; // true iff the linear solver reached the desired norm within maxit iterations
double conv_rate = 0.0; // average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
double elapsed = 0.0; // time in seconds to run the linear solver
// Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult
}; // end class BdaResult
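conv_rate is thus the geometric mean of the per-iteration residual reduction. A quick standalone check of the quoted expression:

#include <cmath>
#include <cstdio>

int main() {
    double reduction = 1e-4;                            // norm_final / norm_start
    double it = 20.0;                                   // iterations used
    double conv_rate = std::pow(reduction, 1.0 / it);
    std::printf("conv_rate = %f\n", conv_rate);         // ~0.631: the norm shrinks ~37% per iteration
    return 0;
}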

View File

@@ -25,12 +25,12 @@
#define cudaCheckLastError(msg) __cudaCheckError( __FILE__, __LINE__, #msg )
inline void __cudaCheckError(const char *file, const int line, const char *msg){
cudaError err = cudaGetLastError();
if (cudaSuccess != err){
std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
std::cerr << "BDA error message: " << msg << std::endl;
exit(1);
}
}
#endif
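One subtlety: the macro stringizes its argument with #msg, so the argument becomes a string literal at expansion time. Both spellings below compile; the quoted form (the one used throughout cusparseSolverBackend.cu) simply prints the surrounding quotes as part of the message:

cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaCheckLastError("Could not allocate x");   // BDA error message: "Could not allocate x"
cudaCheckLastError(Could not allocate x);     // BDA error message: Could not allocate x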

View File

@@ -18,7 +18,7 @@
*/
#ifndef __NVCC__
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#endif
#include <cstdio>
@@ -41,456 +41,456 @@
namespace Opm
{
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
double second(void){
struct timeval tv;
gettimeofday(&tv, nullptr);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0){
}
cusparseSolverBackend::~cusparseSolverBackend(){
finalize();
}
void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res){
double t_total1, t_total2;
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
float it;
t_total1 = second();
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
cublasDscal(cublasHandle, n, &mone, d_r, 1);
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
if(verbosity > 1){
std::ostringstream out;
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
OpmLog::info(out.str());
}
for(it = 0.5; it < maxit; it+=0.5){
rhop = rho;
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
if(it > 1){
beta = (rho/rhop) * (alpha/omega);
nomega = -omega;
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
cublasDscal(cublasHandle, n, &beta, d_p, 1);
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
}
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, \
&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
alpha = rho / tmp1;
nalpha = -alpha;
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if(norm < tolerance * norm_0 && it > minit){
break;
}
it += 0.5;
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, &one, descr_M, \
d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
omega = tmp1 / tmp2;
nomega = -omega;
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if(norm < tolerance * norm_0 && it > minit){
break;
}
if(verbosity > 1){
std::ostringstream out;
out << "it: " << it << std::scientific << ", norm: " << norm;
OpmLog::info(out.str());
}
}
t_total2 = second();
res.iterations = std::min(it, (float)maxit);
res.reduction = norm/norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
res.elapsed = t_total2 - t_total1;
res.converged = (it != (maxit + 0.5));
if(verbosity > 0){
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed/it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
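For readers tracing the cuBLAS/cuSPARSE calls above, this is the structure of one full preconditioned BiCGSTAB iteration the loop implements. Each half-step performs one ILU0 application (an L-solve followed by a U-solve) and one SpMV, which is why 'it' advances by 0.5 and why convergence is tested after either half:

// Pseudocode sketch (M = ILU0 factors in d_mVals, A = matrix in d_bVals):
//   rho  = <rw, r>;  beta = (rho / rho_old) * (alpha / omega)   // skipped on the first pass
//   p    = r + beta * (p - omega * v)
//   pw   = M^-1 p                                               // first half:  it += 0.5
//   v    = A pw;  alpha = rho / <rw, v>
//   x   += alpha * pw;  r -= alpha * v;  test ||r||
//   s    = M^-1 r                                               // second half: it += 0.5
//   t    = A s;  omega = <t, r> / <t, t>
//   x   += omega * s;  r -= omega * t;  test ||r||
// If the loop exhausts maxit, 'it' ends at maxit + 0.5, which is exactly the
// value res.converged tests against.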
void cusparseSolverBackend::initialize(int N, int nnz, int dim){
this->N = N;
this->nnz = nnz;
this->BLOCK_SIZE = dim;
this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " blocks, nnz: " << nnzb << " blocks";
OpmLog::info(out.str());
out.str("");
out.clear();
out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(out.str());
int deviceID = 0;
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
struct cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
out.str("");
out.clear();
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
cudaCheckLastError("Could not create stream");
cublasCreate(&cublasHandle);
cudaCheckLastError("Could not create cublasHandle");
cusparseCreate(&cusparseHandle);
cudaCheckLastError("Could not create cusparseHandle");
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw,sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw,sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnz); // column indices and row pointers are ints, not doubles
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb+1));
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
cudaCheckLastError("Could not set stream to cublas");
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
cudaMallocHost((void**)&x, sizeof(double) * N);
cudaCheckLastError("Could not allocate pinned host memory");
initialized = true;
} // end initialize()
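The size bookkeeping at the top of initialize() is easiest to read with concrete (illustrative) numbers for blocksize 3:

// dim  = 3                          // block size coming from Flow
// N    = 3000                       // scalar rows (1000 cells x 3 unknowns)
// Nb   = (3000 + 3 - 1) / 3 = 1000  // block rows
// nnz  = 45000                      // scalar nonzeroes
// nnzb = 45000 / 3 / 3 = 5000       // nonzero 3x3 blocks stored in BSR form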
void cusparseSolverBackend::finalize(){
cudaFree(d_x);
cudaFree(d_b);
cudaFree(d_r);
cudaFree(d_rw);
cudaFree(d_p);
cudaFree(d_pw);
cudaFree(d_s);
cudaFree(d_t);
cudaFree(d_v);
cudaFree(d_mVals);
cudaFree(d_bVals);
cudaFree(d_bCols);
cudaFree(d_bRows);
cudaFree(d_buffer);
cusparseDestroyBsrilu02Info(info_M);
cusparseDestroyBsrsv2Info(info_L);
cusparseDestroyBsrsv2Info(info_U);
cusparseDestroyMatDescr(descr_B);
cusparseDestroyMatDescr(descr_M);
cusparseDestroyMatDescr(descr_L);
cusparseDestroyMatDescr(descr_U);
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaHostUnregister(vals);
cudaHostUnregister(cols);
cudaHostUnregister(rows);
cudaStreamDestroy(stream);
cudaFreeHost(x);
} // end finalize()
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
this->vals = vals;
this->cols = cols;
this->rows = rows;
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()
void cusparseSolverBackend::reset_prec_on_gpu(){
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
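reset_prec_on_gpu() is needed because bsrilu02 factorizes in place: create_preconditioner() overwrites d_mVals with the ILU0 factors, so the values must be refreshed from d_bVals before each refactorization. The resulting per-solve flow on reuse, summarizing solve_system() further down rather than adding new logic:

// update_system_on_gpu(vals, b);   // new matrix values and rhs, same sparsity pattern
// reset_prec_on_gpu();             // d_mVals <- d_bVals
// create_preconditioner();         // ILU0 refactorization, reusing info_M
// solve_system(res);               // BiCGSTAB with the fresh factors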
bool cusparseSolverBackend::analyse_matrix(){
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cusparseCreateMatDescr(&descr_B);
cusparseCreateMatDescr(&descr_M);
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
cusparseSetMatIndexBase(descr_B, base_type);
cusparseSetMatIndexBase(descr_M, base_type);
cusparseCreateMatDescr(&descr_L);
cusparseSetMatIndexBase(descr_L, base_type);
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
cusparseCreateMatDescr(&descr_U);
cusparseSetMatIndexBase(descr_U, base_type);
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
cudaCheckLastError("Could not initialize matrix descriptions");
cusparseCreateBsrilu02Info(&info_M);
cusparseCreateBsrsv2Info(&info_L);
cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
cudaMalloc((void**)&d_buffer, d_bufferSize);
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if(CUSPARSE_STATUS_ZERO_PIVOT == status){
return false;
}
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner(){
double t1, t2;
if(verbosity > 2){
t1 = second();
}
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if(CUSPARSE_STATUS_ZERO_PIVOT == status){
return false;
}
if(verbosity > 2){
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end create_preconditioner()
void cusparseSolverBackend::solve_system(BdaResult &res){
// actually solve
gpu_pbicgstab(res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
void cusparseSolverBackend::post_process(double *x){
if(!initialized){
cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
}
double t1, t2;
if(verbosity > 2){
t1 = second();
}
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if(verbosity > 2){
t2 = second();
std::ostringstream out;
out << "cusparseSolver::post_process(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end post_process()
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res){
if(initialized == false){
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
}else{
update_system_on_gpu(vals, b);
}
if(analysis_done == false){
if(!analyse_matrix()){
return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
}
}
reset_prec_on_gpu();
if(create_preconditioner()){
solve_system(res);
}else{
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
}

View File

@@ -91,10 +91,10 @@ private:
public:
enum class cusparseSolverStatus {
CUSPARSE_SOLVER_SUCCESS,
CUSPARSE_SOLVER_ANALYSIS_FAILED,
CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
CUSPARSE_SOLVER_UNKNOWN_ERROR
};
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);