Added cusparseSolver. Automatically compiled when CUDA is detected. Must be enabled at runtime by adding '--use-gpu=true'

2025-02-25 18:55:30 -06:00 · 2019-12-03 14:10:21 +01:00
parent becb29cec6
commit 4db2e7ca4e
10 changed files with 1023 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,10 +12,20 @@
 ###########################################################################

 # Mandatory call to project
+
 project(opm-simulators C CXX)

 cmake_minimum_required (VERSION 2.8)

+
+find_package(CUDA)
+
+if(CUDA_FOUND)
+  include_directories(${CUDA_INCLUDE_DIRS})
+  enable_language(CUDA)
+  set(HAVE_CUDA 1)
+endif()
+
 option(SIBLING_SEARCH "Search for other modules in sibling directories?" ON)
 set( USE_OPENMP_DEFAULT OFF ) # Use of OpenMP is considered experimental
 option(BUILD_FLOW "Build the production oriented flow simulator?" ON)
@@ -282,3 +292,10 @@ endif()
 if (OPM_ENABLE_PYTHON)
  add_subdirectory(python)
 endif()
+
+# must link libraries after target 'flow' has been defined
+if(CUDA_FOUND)
+  target_link_libraries( flow ${CUDA_cublas_LIBRARY} )
+  target_link_libraries( flow ${CUDA_cusparse_LIBRARY} )
+endif()
+
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -41,6 +41,10 @@ list (APPEND MAIN_SOURCE_FILES
  opm/simulators/wells/VFPInjProperties.cpp
  )

+if(CUDA_FOUND)
+  list (APPEND MAIN_SOURCE_FILES opm/bda/cusparseSolverBackend.cu)
+endif()
+
 # originally generated with the command:
 # find tests -name '*.cpp' -a ! -wholename '*/not-unit/*' -printf '\t%p\n' | sort
 list (APPEND TEST_SOURCE_FILES
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@@ -5,6 +5,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_EWOMS
  HAVE_MPI
  HAVE_PETSC
+  HAVE_CUDA
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@@ -1,6 +1,7 @@
 /*
  Copyright 2016 IRIS AS
  Copyright 2019 Equinor ASA
+  Copyright 2019 Big Data Accelerate

  This file is part of the Open Porous Media project (OPM).

@@ -222,6 +223,13 @@ protected:
        enum { pressureVarIndex = Indices::pressureSwitchIdx };
        static const int numEq = Indices::numEq;

+<<<<<<< HEAD:opm/simulators/linalg/ISTLSolverEbos.hpp
+=======
+#if HAVE_CUDA
+        BdaBridge *bdaBridge;
+#endif
+
+>>>>>>> 200e000... Changed cusparseSolver. Use find_package(CUDA) instead of setting a flag manually. Use HAVE_CUDA in sources to disable the BdaBridge when no GPU can be found anyway.:opm/autodiff/ISTLSolverEbos.hpp
    public:
        typedef Dune::AssembledLinearOperator< Matrix, Vector, Vector > AssembledLinearOperatorType;

@@ -239,10 +247,39 @@ protected:
              converged_(false)
        {
            parameters_.template init<TypeTag>();
+<<<<<<< HEAD:opm/simulators/linalg/ISTLSolverEbos.hpp
+=======
+#if HAVE_CUDA
+            const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+            const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
+            const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
+            const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+            if(use_gpu && !matrix_add_well_contributions){
+                std::cerr << "Error cannot use GPU solver if command line parameter --matrix-add-well-contributions is false, due to the changing sparsity pattern" << std::endl;
+                exit(1);
+            }
+            bdaBridge = new BdaBridge(use_gpu, maxit, tolerance);
+#else
+            const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
+            if(use_gpu){
+                std::cerr << "Error cannot use GPU solver since CUDA was not found during compilation" << std::endl;
+                exit(1);
+            }
+#endif
+>>>>>>> 200e000... Changed cusparseSolver. Use find_package(CUDA) instead of setting a flag manually. Use HAVE_CUDA in sources to disable the BdaBridge when no GPU can be found anyway.:opm/autodiff/ISTLSolverEbos.hpp
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
            detail::findOverlapRowsAndColumns(simulator_.vanguard().grid(),overlapRowAndColumns_);
        }

+<<<<<<< HEAD:opm/simulators/linalg/ISTLSolverEbos.hpp
+=======
+        ~ISTLSolverEbos(){
+#if HAVE_CUDA
+            delete bdaBridge;
+#endif
+        }
+
+>>>>>>> 200e000... Changed cusparseSolver. Use find_package(CUDA) instead of setting a flag manually. Use HAVE_CUDA in sources to disable the BdaBridge when no GPU can be found anyway.:opm/autodiff/ISTLSolverEbos.hpp
        // nothing to clean here
        void eraseMatrix() {
            matrix_for_preconditioner_.reset();
@@ -435,8 +472,28 @@ protected:
            else
 #endif
            {
+<<<<<<< HEAD:opm/simulators/linalg/ISTLSolverEbos.hpp
                // Construct preconditioner.
                auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+=======
+                // tries to solve linear system
+                // solve_system() does nothing if Dune is selected
+#if HAVE_CUDA
+                bdaBridge->solve_system(matrix_.get(), istlb, result);
+
+                if(result.converged){
+                    // get result vector x from non-Dune backend, iff solve was successful
+                    bdaBridge->get_result(x);
+                }else{
+                    // CPU fallback, or default case for Dune
+                    auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+                    solve(linearOperator, x, istlb, *sp, *precond, result);
+                } // end Dune call
+#else
+                auto precond = constructPrecond(linearOperator, parallelInformation_arg);
+                solve(linearOperator, x, istlb, *sp, *precond, result);
+#endif
+>>>>>>> 200e000... Changed cusparseSolver. Use find_package(CUDA) instead of setting a flag manually. Use HAVE_CUDA in sources to disable the BdaBridge when no GPU can be found anyway.:opm/autodiff/ISTLSolverEbos.hpp

                // Solve.
                solve(linearOperator, x, istlb, *sp, *precond, result);
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -0,0 +1,252 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+#include <memory>
+
+#include <opm/bda/BdaBridge.hpp>
+#include <opm/bda/BdaResult.hpp>
+
+#define PRINT_TIMERS_BRIDGE_BRIDGE 0
+
+typedef Dune::InverseOperatorResult InverseOperatorResult;
+
+namespace Opm
+{
+
+BdaBridge::BdaBridge(bool use_gpu_, int maxit, double tolerance) : use_gpu(use_gpu_){
+#if HAVE_CUDA
+    if(use_gpu){
+    	backend = new cusparseSolverBackend(maxit, tolerance);
+    }
+#endif
+}
+
+BdaBridge::~BdaBridge(){
+#if HAVE_CUDA
+	if(use_gpu){
+		delete backend;
+	}
+#endif
+}
+
+#if HAVE_CUDA
+template <class BridgeMatrix>
+int checkZeroDiagonal(BridgeMatrix& mat) {
+	static std::vector<int> diag_indices;   // contains offsets of the diagonal nnzs
+	int numZeros = 0;
+	const int dim = 3;
+	const double zero_replace = 1e-15;
+	double *nnzs = &(mat[0][0][0][0]);
+	if(diag_indices.size() == 0){
+		int N = mat.N()*dim;
+		diag_indices.reserve(N);
+		for(typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r){
+			for(auto c = r->begin(); c != r->end(); ++c){
+				if(r.index() == c.index()){
+					for(int rr = 0; rr < dim; ++rr){
+						// pointer arithmetic
+						int offset = (int)((long unsigned)&(mat[r.index()][c.index()][rr][rr]) - (long unsigned)nnzs); // in bytes
+						offset /= sizeof(double);  // convert offset to doubles
+						diag_indices.emplace_back(offset);
+						double val = nnzs[offset];
+						if(val == 0.0){ // could be replaced by '< 1e-30' or similar
+							nnzs[offset] = zero_replace;
+							++numZeros;
+						}
+					}
+					break;
+				}
+			}
+		}
+	}else{
+		for(int offset : diag_indices){
+			if(nnzs[offset] == 0.0){ // could be replaced by '< 1e-30' or similar
+				nnzs[offset] = zero_replace;
+				++numZeros;
+			}
+		}
+	}
+	return numZeros;
+}
+
+
+// convert matrix to blocked csr (bsr) arrays
+// if only_vals, do not convert rowPointers and colIndices
+// sparsity pattern should stay the same due to matrix-add-well-contributions
+template <class BridgeMatrix>
+void convertMatrixBsr(BridgeMatrix& mat, std::vector<double> &h_vals, std::vector<int> &h_rows, std::vector<int> &h_cols, int dim, bool only_vals) {
+	int sum_nnzs = 0;
+	int nnz = mat.nonzeroes()*dim*dim;
+
+	// copy nonzeros
+	memcpy(h_vals.data(), &(mat[0][0][0][0]), sizeof(double)*nnz);
+
+	// convert colIndices and rowPointers
+	if(only_vals == false){
+		h_rows.emplace_back(0);
+			for(typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r){
+				int size_row = 0;
+				for(auto c = r->begin(); c != r->end(); ++c){
+					h_cols.emplace_back(c.index());
+					size_row++;
+				}
+				sum_nnzs += size_row;
+				h_rows.emplace_back(sum_nnzs);
+			}
+		// set last rowpointer
+		h_rows[mat.N()] = mat.nonzeroes();
+	}
+} // end convertMatrixBsr()
+
+// converts the BlockVector b to a flat array
+template <class BridgeVector>
+void convertBlockVectorToArray(BridgeVector& b, std::vector<double> &h_b) {
+	memcpy(h_b.data(), &(b[0]), sizeof(double) * b.N() * b[0].dim());
+}
+#endif
+
+template <class BridgeMatrix, class BridgeVector>
+void BdaBridge::solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &res)
+{
+
+#if HAVE_CUDA
+	if(use_gpu){
+		BdaResult result;
+		result.converged = false;
+		static std::vector<double> h_vals;
+		static std::vector<double> h_b;
+		static std::vector<int> h_rows;
+		static std::vector<int> h_cols;
+		int dim = (*mat)[0][0].N();
+		int N = mat->N()*dim;
+		int nnz = mat->nonzeroes()*dim*dim;
+
+		if(dim != 3){
+			std::cerr << "Error can only use cusparseSolver with blocksize = 3 at this time" << std::endl;
+			exit(1);
+		}
+
+		if(h_vals.capacity() == 0){
+			h_vals.reserve(nnz);
+			h_vals.resize(nnz);
+			h_b.reserve(N);
+			h_b.resize(N);
+			h_rows.reserve(N+1);
+			h_cols.reserve(nnz);
+		}
+
+#if PRINT_TIMERS_BRIDGE
+		Dune::Timer t_zeros;
+		int numZeros = checkZeroDiagonal(*mat);
+		printf("Checking zeros took %f s, found %d zeros\n", t_zeros.stop(), numZeros);
+#else
+		checkZeroDiagonal(*mat);
+#endif
+
+		bool initialized = backend->isInitialized();
+
+#if PRINT_TIMERS_BRIDGE
+		Dune::Timer t;
+#endif
+
+		convertMatrixBsr(*mat, h_vals, h_rows, h_cols, dim, initialized);
+		convertBlockVectorToArray(b, h_b);
+
+#if PRINT_TIMERS_BRIDGE
+		printf("Conversion to flat arrays: %.4f s\n", t.stop());
+#endif
+
+		/////////////////////////
+		// actually solve
+
+		if(initialized == false){
+			backend->initialize(N, nnz, dim);
+			backend->copy_system_to_gpu(h_vals.data(), h_rows.data(), h_cols.data(), h_b.data());
+			backend->analyse_matrix();
+		}else{
+			backend->update_system_on_gpu(h_vals.data(), h_b.data());
+		}
+		backend->reset_prec_on_gpu();
+		if(backend->create_preconditioner()){
+			backend->solve_system(result);
+		}
+		res.iterations = result.iterations;
+		res.reduction = result.reduction;
+		res.converged = result.converged;
+		res.conv_rate = result.conv_rate;
+		res.elapsed = result.elapsed;
+	}else{
+		res.converged = false;
+	}
+#endif // HAVE_CUDA
+}
+
+
+template <class BridgeVector>
+void BdaBridge::get_result(BridgeVector &x){
+#if HAVE_CUDA
+	if(use_gpu){
+		double *h_x = backend->post_process();
+        // convert flat array to blockvector
+        memcpy(&(x[0]), h_x, sizeof(double) * x.N() * x[0].dim());
+	}
+#endif
+}
+
+#if HAVE_CUDA
+template void BdaBridge::solve_system< \
+Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 2, 2>, std::allocator<Ewoms::MatrixBlock<double, 2, 2> > > , \
+Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
+(Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 2, 2>, std::allocator<Ewoms::MatrixBlock<double, 2, 2> > > *mat, \
+	Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
+	InverseOperatorResult &res);
+
+template void BdaBridge::solve_system< \
+Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 3, 3>, std::allocator<Ewoms::MatrixBlock<double, 3, 3> > > , \
+Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
+(Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 3, 3>, std::allocator<Ewoms::MatrixBlock<double, 3, 3> > > *mat, \
+	Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
+	InverseOperatorResult &res);
+
+template void BdaBridge::solve_system< \
+Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 4, 4>, std::allocator<Ewoms::MatrixBlock<double, 4, 4> > > , \
+Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
+(Dune::BCRSMatrix<Ewoms::MatrixBlock<double, 4, 4>, std::allocator<Ewoms::MatrixBlock<double, 4, 4> > > *mat, \
+	Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
+	InverseOperatorResult &res);
+
+
+template void BdaBridge::get_result< \
+Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
+(Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);
+
+template void BdaBridge::get_result< \
+Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
+(Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);
+
+template void BdaBridge::get_result< \
+Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
+(Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);
+#endif
+
+
+}
+
+
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -0,0 +1,62 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BDABRIDGE_HEADER_INCLUDED
+#define BDABRIDGE_HEADER_INCLUDED
+
+#include <config.h>
+#include "dune/istl/solver.hh" // for struct InverseOperatorResult
+
+#include "dune/istl/bcrsmatrix.hh"
+#include <ewoms/linear/matrixblock.hh>
+
+#if HAVE_CUDA
+#include <opm/bda/cusparseSolverBackend.hpp>
+#endif
+
+namespace Opm
+{
+
+typedef Dune::InverseOperatorResult InverseOperatorResult;
+
+
+class BdaBridge
+{
+private:
+#if HAVE_CUDA
+	cusparseSolverBackend *backend;
+#endif
+	bool use_gpu;
+
+public:
+	BdaBridge(bool use_gpu, int maxit, double tolerance);
+
+	~BdaBridge();
+
+	template <class BridgeMatrix, class BridgeVector>
+	void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
+
+	template <class BridgeVector>
+	void get_result(BridgeVector &x);
+
+}; // end class BdaBridge
+
+}
+
+#endif
--- a/opm/simulators/linalg/bda/BdaResult.hpp
+++ b/opm/simulators/linalg/bda/BdaResult.hpp
@@ -0,0 +1,43 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef BDARESULT_HEADER_INCLUDED
+#define BDARESULT_HEADER_INCLUDED
+
+namespace Opm
+{
+
+// based on InverseOperatorResult struct from dune/istl/solver.hh
+class BdaResult
+{
+
+public:
+	int iterations = 0;			// number of iterations
+	double reduction = 0.0;		// reduction of norm, norm_start / norm_final
+	bool converged = false;		// true iff the linear solver reached the desired norm within maxit iterations
+	double conv_rate = 0.0;		// average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
+	double elapsed = 0.0;		// time in seconds to run the linear solver
+
+	// Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult
+
+}; // end class BdaResult
+
+}
+
+#endif
--- a/opm/simulators/linalg/bda/cuda_header.h
+++ b/opm/simulators/linalg/bda/cuda_header.h
@@ -0,0 +1,38 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef CUDA_HEADER_H
+#define CUDA_HEADER_H
+
+#include <stdio.h>
+
+typedef double Block[9];
+
+#define cudaCheckLastError(msg)    __cudaCheckError( __FILE__, __LINE__, #msg )
+
+inline void __cudaCheckError(const char *file, const int line, const char *msg){
+	cudaError err = cudaGetLastError();
+	if (cudaSuccess != err){
+		std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
+		std::cerr << "BDA error message: " << msg << std::endl;
+		exit(1);
+	}
+}
+
+#endif
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.cu
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.cu
@@ -0,0 +1,451 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __NVCC__
+	#error "Cannot compile for cusparse: NVIDIA compiler not found"
+#endif
+
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <iostream>
+#include <sys/time.h>
+
+#include <opm/bda/cusparseSolverBackend.hpp>
+#include <opm/bda/BdaResult.hpp>
+#include <opm/bda/cuda_header.h>
+
+#include "cublas_v2.h"
+#include "cusparse_v2.h"
+// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
+
+// print initial, intermediate and final norms, and used iterations
+#define VERBOSE_BACKEND 0
+
+// print more detailed timers of various solve elements and backend functions
+#define PRINT_TIMERS_BACKEND 0
+
+namespace Opm
+{
+
+	const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+	const cusparseOperation_t operation  = CUSPARSE_OPERATION_NON_TRANSPOSE;
+	const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
+
+	double second(void){
+		struct timeval tv;
+		gettimeofday(&tv, nullptr);
+		return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
+	}
+
+	cusparseSolverBackend::cusparseSolverBackend(int maxit_, double tolerance_) : maxit(maxit_), tolerance(tolerance_), minit(0){
+	}
+
+	cusparseSolverBackend::~cusparseSolverBackend(){
+		finalize();
+	}
+
+	// return true iff converged
+	bool cusparseSolverBackend::gpu_pbicgstab(BdaResult& res){
+		double t_total1, t_total2;
+		int n = N;
+		double rho = 1.0, rhop;
+		double alpha, nalpha, beta;
+		double omega, nomega, tmp1, tmp2;
+		double norm, norm_0;
+		double zero = 0.0;
+		double one  = 1.0;
+		double mone = -1.0;
+		float it;
+
+		t_total1 = second();
+
+		cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
+
+		cublasDscal(cublasHandle, n, &mone, d_r, 1);
+		cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
+		cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
+		cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1); 
+		cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
+
+#if VERBOSE_BACKEND
+		printf("Initial norm: %.5e\n", norm_0);
+		printf("Tolerance: %.0e, nnzb: %d\n", tolerance, nnzb);
+#endif
+
+		for (it = 0.5; it < maxit; it+=0.5){
+			rhop = rho;
+			cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
+
+			if (it > 1){
+				beta = (rho/rhop) * (alpha/omega);
+				nomega = -omega;
+				cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
+				cublasDscal(cublasHandle, n, &beta, d_p, 1);
+				cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
+			}
+
+			// apply ilu0
+			cusparseDbsrsv2_solve(cusparseHandle, order, \
+				operation, Nb, nnzb, &one, \
+				descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
+			cusparseDbsrsv2_solve(cusparseHandle, order, \
+				operation, Nb, nnzb, &one, \
+				descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
+
+			// spmv
+			cusparseDbsrmv(cusparseHandle, order, \
+				operation, Nb, Nb, nnzb, \
+				&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
+
+			cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
+			alpha = rho / tmp1;
+			nalpha = -(alpha);
+			cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
+			cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
+			cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+			if (norm < tolerance * norm_0 && it > minit){
+				break;
+			}
+
+			// apply ilu0
+			cusparseDbsrsv2_solve(cusparseHandle, order, \
+				operation, Nb, nnzb, &one, \
+				descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
+			cusparseDbsrsv2_solve(cusparseHandle, order, \
+				operation, Nb, nnzb, &one, \
+				descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
+
+			// spmv
+			cusparseDbsrmv(cusparseHandle, order, \
+				operation, Nb, Nb, nnzb, &one, descr_M, \
+				d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
+
+			cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
+			cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
+			omega = tmp1 / tmp2;
+			nomega = -(omega);
+			cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
+			cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
+
+			cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
+
+
+			if (norm < tolerance * norm_0 && it > minit){
+				break;
+			}
+#if VERBOSE_BACKEND
+			if(i % 1 == 0){
+				printf("it: %.1f, norm: %.5e\n", it, norm);
+			}
+#endif
+		}
+
+		t_total2 = second();
+#if PRINT_TIMERS_BACKEND
+		printf("Total solve time: %.6f s\n", t_total2-t_total1);
+#endif
+#if VERBOSE_BACKEND
+		printf("Iterations: %.1f\n", it);
+		printf("Final norm: %.5e\n", norm);
+#endif
+		res.iterations = std::min(it, (float)maxit);
+		res.reduction = norm/norm_0;
+		res.conv_rate  = static_cast<double>(pow(res.reduction,1.0/it));
+		res.elapsed = t_total2-t_total1;
+		res.converged = (it != (maxit + 0.5));
+		return res.converged;
+	}
+
+
+	void cusparseSolverBackend::initialize(int N, int nnz, int dim){
+		this->N = N;
+		this->nnz = nnz;
+		this->BLOCK_SIZE = dim;
+		this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
+		Nb = (N + dim - 1) / dim;
+		printf("Initializing GPU, N: %d, nnz: %d, Nb: %d\n", N, nnz, Nb);
+		printf("Minit: %d, maxit: %d, tolerance: %.1e\n", minit, maxit, tolerance);
+
+		int deviceID = 0;
+		cudaSetDevice(deviceID);
+		cudaCheckLastError("Could not get device");
+		struct cudaDeviceProp props;
+		cudaGetDeviceProperties(&props, deviceID);
+		cudaCheckLastError("Could not get device properties");
+		std::cout << "Name: " << props.name << "\n";
+		printf("CC: %d.%d\n", props.major, props.minor);
+
+		cudaStreamCreate(&stream);
+		cudaCheckLastError("Could not create stream");
+
+		cublasCreate(&cublasHandle);
+		cudaCheckLastError("Could not create cublasHandle");
+
+		cusparseCreate(&cusparseHandle);
+		cudaCheckLastError("Could not create cusparseHandle");
+
+		cudaMalloc((void**)&d_x, sizeof(double) * N);
+		cudaMalloc((void**)&d_b, sizeof(double) * N);
+		cudaMalloc((void**)&d_r, sizeof(double) * N);
+		cudaMalloc((void**)&d_rw,sizeof(double) * N);
+		cudaMalloc((void**)&d_p, sizeof(double) * N);
+		cudaMalloc((void**)&d_pw,sizeof(double) * N);
+		cudaMalloc((void**)&d_s, sizeof(double) * N);
+		cudaMalloc((void**)&d_t, sizeof(double) * N);
+		cudaMalloc((void**)&d_v, sizeof(double) * N);
+		cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
+		cudaMalloc((void**)&d_bCols, sizeof(double) * nnz);
+		cudaMalloc((void**)&d_bRows, sizeof(double) * (Nb+1));
+		cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
+		cudaCheckLastError("Could not allocate enough memory on GPU");
+
+		cublasSetStream(cublasHandle, stream);
+		cudaCheckLastError("Could not set stream to cublas");
+		cusparseSetStream(cusparseHandle, stream);
+		cudaCheckLastError("Could not set stream to cusparse");
+
+		cudaMallocHost((void**)&x, sizeof(double) * N);
+		cudaCheckLastError("Could not allocate pinned host memory");
+
+		initialized = true;
+	} // end initialize()
+
+	void cusparseSolverBackend::finalize(){
+		cudaFree(d_x);
+		cudaFree(d_b);
+		cudaFree(d_r);
+		cudaFree(d_rw);
+		cudaFree(d_p);
+		cudaFree(d_pw);
+		cudaFree(d_s);
+		cudaFree(d_t);
+		cudaFree(d_v);
+		cudaFree(d_mVals);
+		cudaFree(d_bVals);
+		cudaFree(d_bCols);
+		cudaFree(d_bRows);
+		cudaFree(d_buffer);
+		cusparseDestroyBsrilu02Info(info_M);
+		cusparseDestroyBsrsv2Info(info_L);
+		cusparseDestroyBsrsv2Info(info_U);
+		cusparseDestroyMatDescr(descr_B);
+		cusparseDestroyMatDescr(descr_M);
+		cusparseDestroyMatDescr(descr_L);
+		cusparseDestroyMatDescr(descr_U);
+		cusparseDestroy(cusparseHandle);
+		cublasDestroy(cublasHandle);
+		cudaHostUnregister(vals);
+		cudaHostUnregister(cols);
+		cudaHostUnregister(rows);
+		cudaStreamDestroy(stream);
+		cudaFreeHost(x);
+	} // end finalize()
+
+
+	void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b){
+
+#if PRINT_TIMERS_BACKEND
+		double t1, t2;
+		t1 = second();
+#endif
+
+		// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
+		// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
+		cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
+		cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
+		cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
+		cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+		cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+		this->vals = vals;
+		this->cols = cols;
+		this->rows = rows;
+
+#if PRINT_TIMERS_BACKEND
+		t2 = second();
+		printf("copy_system_to_gpu(): %f s\n", t2-t1);
+#endif
+	} // end copy_system_to_gpu()
+
+
+	// don't copy rowpointers and colindices, they stay the same
+	void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b){
+
+#if PRINT_TIMERS_BACKEND
+		double t1, t2;
+		t1 = second();
+#endif
+
+		cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
+		cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
+
+#if PRINT_TIMERS_BACKEND
+		t2 = second();
+		printf("update_system_on_gpu(): %f s\n", t2-t1);
+#endif
+	} // end update_system_on_gpu()
+
+
+	void cusparseSolverBackend::reset_prec_on_gpu(){
+		cudaMemcpyAsync(d_mVals, d_bVals, nnz  * sizeof(double), cudaMemcpyDeviceToDevice, stream);
+	}
+
+
+	void cusparseSolverBackend::analyse_matrix(){
+
+		int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
+
+		cusparseCreateMatDescr(&descr_B);
+		cusparseCreateMatDescr(&descr_M);
+		cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
+		const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO;		// matrices from Flow are base0
+
+		cusparseSetMatIndexBase(descr_B, base_type);
+		cusparseSetMatIndexBase(descr_M, base_type);
+
+		cusparseCreateMatDescr(&descr_L);
+		cusparseSetMatIndexBase(descr_L, base_type);
+		cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
+		cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
+
+		cusparseCreateMatDescr(&descr_U);
+		cusparseSetMatIndexBase(descr_U, base_type);
+		cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
+		cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
+		cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
+		cudaCheckLastError("Could not initialize matrix descriptions");
+
+		cusparseCreateBsrilu02Info(&info_M);
+		cusparseCreateBsrsv2Info(&info_L);
+		cusparseCreateBsrsv2Info(&info_U);
+		cudaCheckLastError("Could not create analysis info");
+
+		cudaMemcpyAsync(d_bRows, rows, sizeof(int)*(Nb+1), cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_bCols, cols, sizeof(int)*nnz, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyAsync(d_bVals, vals, sizeof(double)*nnz, cudaMemcpyHostToDevice, stream);
+
+		cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
+			descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
+		cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+			descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
+		cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
+			descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
+		cudaCheckLastError();
+		d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
+		
+		cudaMalloc((void**)&d_buffer, d_bufferSize);
+
+		// analysis of ilu LU decomposition
+		cusparseDbsrilu02_analysis(cusparseHandle, order, \
+			Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
+			BLOCK_SIZE, info_M, policy, d_buffer);
+
+		int structural_zero;
+		cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+		if(CUSPARSE_STATUS_ZERO_PIVOT == status){
+			fprintf(stderr, "ERROR block U(%d,%d) is not invertible\n", structural_zero, structural_zero);
+			fprintf(stderr, "cusparse fails when a block has a 0.0 on its diagonal, these should be replaced in BdaBridge::checkZeroDiagonal()\n");
+			exit(1);
+		}
+
+		// analysis of ilu apply
+		cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+			Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
+			BLOCK_SIZE, info_L, policy, d_buffer);
+
+		cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
+			Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
+			BLOCK_SIZE, info_U, policy, d_buffer);
+		cudaCheckLastError("Could not analyse level information");
+
+	} // end analyse_matrix()
+
+	bool cusparseSolverBackend::create_preconditioner(){
+
+#if PRINT_TIMERS_BACKEND
+		double t1, t2;
+		t1 = second();
+#endif
+		d_mCols = d_bCols;
+		d_mRows = d_bRows;
+		cusparseDbsrilu02(cusparseHandle, order, \
+			Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
+			BLOCK_SIZE, info_M, policy, d_buffer);
+		int structural_zero;
+		cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
+		if(CUSPARSE_STATUS_ZERO_PIVOT == status){
+			fprintf(stderr, "WARNING block U(%d,%d) is not invertible\n", structural_zero, structural_zero);
+			fprintf(stderr, "cusparse fails when a block has a 0.0 on its diagonal, these should be replaced in BdaBridge::checkZeroDiagonal()\n");
+			return false;
+		}
+
+#if PRINT_TIMERS_BACKEND
+		cudaStreamSynchronize(stream);
+		t2 = second();
+		printf("Decomp time: %.6f s\n", t2-t1);
+#endif
+		return true;
+	} // end create_preconditioner()
+
+
+	// return true iff converged
+	bool cusparseSolverBackend::solve_system(BdaResult &res){
+		// actually solve
+		bool converged = gpu_pbicgstab(res);
+		cudaStreamSynchronize(stream);
+		cudaCheckLastError("Something went wrong during the GPU solve");
+		return converged;
+	} // end solve_system()
+
+
+	// copy result to host memory
+	double* cusparseSolverBackend::post_process(){
+
+#if PRINT_TIMERS_BACKEND
+		double t1, t2;
+		t1 = second();
+#endif
+
+		cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
+		cudaStreamSynchronize(stream);
+
+#if PRINT_TIMERS_BACKEND
+		t2 = second();
+		printf("Copy result back to CPU: %.6f s\n", t2-t1);
+#endif
+
+		return x;
+	} // end post_process()
+
+
+	bool cusparseSolverBackend::isInitialized(){
+		return initialized;
+	}
+
+}
+
+
--- a/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/cusparseSolverBackend.hpp
@@ -0,0 +1,98 @@
+/*
+  Copyright 2019 Big Data Accelerate
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_cuSPARSESOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_cuSPARSESOLVER_BACKEND_HEADER_INCLUDED
+
+
+#include "cublas_v2.h"
+#include "cusparse_v2.h"
+
+#include "opm/bda/BdaResult.hpp"
+
+namespace Opm
+{
+
+class cusparseSolverBackend{
+
+private:
+
+    int minit;
+    int maxit;
+    double tolerance;
+
+    cublasHandle_t cublasHandle;
+    cusparseHandle_t cusparseHandle;
+    cudaStream_t stream;
+    cusparseMatDescr_t descr_B, descr_M, descr_L, descr_U;
+    bsrilu02Info_t info_M;
+    bsrsv2Info_t info_L, info_U;
+    // b: bsr matrix, m: preconditioner
+    double *d_bVals, *d_mVals;
+    int *d_bCols, *d_mCols;
+    int *d_bRows, *d_mRows;
+    double *d_x, *d_b, *d_r, *d_rw, *d_p;
+    double *d_pw, *d_s, *d_t, *d_v;
+    double *vals;
+    int *cols, *rows;
+    double *x, *b;
+    void *d_buffer;
+    int N, Nb, nnz, nnzb;
+
+    int BLOCK_SIZE;
+
+    bool initialized = false;
+
+public:
+
+    cusparseSolverBackend(int maxit, double tolerance);
+
+    ~cusparseSolverBackend();
+
+    // return true iff converged
+    bool gpu_pbicgstab(BdaResult& res);
+
+    void initialize(int N, int nnz, int dim);
+
+    void finalize();
+
+    void copy_system_to_gpu(double *vals, int *rows, int *cols, double *f);
+
+    // don't copy rowpointers and colindices, they stay the same
+    void update_system_on_gpu(double *vals, double *f);
+
+    void reset_prec_on_gpu();
+
+    void analyse_matrix();
+
+    bool create_preconditioner();
+
+    // return true iff converged
+    bool solve_system(BdaResult &res);
+
+    double* post_process();
+
+    bool isInitialized();
+
+}; // end class cusparseSolverBackend
+
+}
+
+#endif
+