Merge pull request #2209 from Tongdongq/master

Added cusparseSolver; a GPU is required to use it
This commit is contained in:
Markus Blatt 2019-12-20 11:37:27 +01:00 committed by GitHub
commit b9062396be
11 changed files with 1122 additions and 0 deletions

CMakeLists.txt

@@ -12,10 +12,20 @@
###########################################################################
# Mandatory call to project
project(opm-simulators C CXX)
cmake_minimum_required (VERSION 2.8)
find_package(CUDA)
if(CUDA_FOUND)
include_directories(${CUDA_INCLUDE_DIRS})
enable_language(CUDA)
set(HAVE_CUDA 1)
endif()
option(SIBLING_SEARCH "Search for other modules in sibling directories?" ON)
set( USE_OPENMP_DEFAULT OFF ) # Use of OpenMP is considered experimental
option(BUILD_FLOW "Build the production oriented flow simulator?" ON)
@@ -286,3 +296,10 @@ endif()
if (OPM_ENABLE_PYTHON)
add_subdirectory(python)
endif()
# must link libraries after target 'flow' has been defined
if(CUDA_FOUND)
target_link_libraries( opmsimulators ${CUDA_cublas_LIBRARY} )
target_link_libraries( opmsimulators ${CUDA_cusparse_LIBRARY} )
endif()

CMakeLists_files.cmake

@@ -29,6 +29,7 @@ list (APPEND MAIN_SOURCE_FILES
opm/simulators/flow/MissingFeatures.cpp
opm/simulators/linalg/ExtractParallelGridInformationToISTL.cpp
opm/simulators/linalg/setupPropertyTree.cpp
opm/simulators/linalg/bda/BdaBridge.cpp
opm/simulators/timestepping/TimeStepControl.cpp
opm/simulators/timestepping/AdaptiveSimulatorTimer.cpp
opm/simulators/timestepping/SimulatorTimer.cpp
@@ -41,6 +42,10 @@ list (APPEND MAIN_SOURCE_FILES
opm/simulators/wells/VFPInjProperties.cpp
)
if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
endif()
# originally generated with the command:
# find tests -name '*.cpp' -a ! -wholename '*/not-unit/*' -printf '\t%p\n' | sort
list (APPEND TEST_SOURCE_FILES
@@ -129,6 +134,10 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/aquifers/AquiferFetkovich.hpp
opm/simulators/aquifers/BlackoilAquiferModel.hpp
opm/simulators/aquifers/BlackoilAquiferModel_impl.hpp
opm/simulators/linalg/bda/BdaBridge.hpp
opm/simulators/linalg/bda/BdaResult.hpp
opm/simulators/linalg/bda/cuda_header.hpp
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
opm/simulators/linalg/BlackoilAmg.hpp
opm/simulators/linalg/BlackoilAmgCpr.hpp
opm/simulators/linalg/amgcpr.hh

opm-simulators-prereqs.cmake

@@ -5,6 +5,7 @@ set (opm-simulators_CONFIG_VAR
HAVE_EWOMS
HAVE_MPI
HAVE_PETSC
HAVE_CUDA
HAVE_SUITESPARSE_UMFPACK_H
HAVE_DUNE_ISTL
DUNE_ISTL_VERSION_MAJOR

FlowLinearSolverParameters.hpp

@@ -67,6 +67,7 @@ NEW_PROP_TAG(CprMaxEllIter);
NEW_PROP_TAG(CprEllSolvetype);
NEW_PROP_TAG(CprReuseSetup);
NEW_PROP_TAG(LinearSolverConfigurationJsonFile);
NEW_PROP_TAG(UseGpu);
SET_SCALAR_PROP(FlowIstlSolverParams, LinearSolverReduction, 1e-2);
SET_SCALAR_PROP(FlowIstlSolverParams, IluRelaxation, 0.9);
@@ -92,6 +93,7 @@ SET_INT_PROP(FlowIstlSolverParams, CprMaxEllIter, 20);
SET_INT_PROP(FlowIstlSolverParams, CprEllSolvetype, 0);
SET_INT_PROP(FlowIstlSolverParams, CprReuseSetup, 0);
SET_STRING_PROP(FlowIstlSolverParams, LinearSolverConfigurationJsonFile, "none");
SET_BOOL_PROP(FlowIstlSolverParams, UseGpu, false);
@@ -163,6 +165,7 @@ namespace Opm
std::string system_strategy_;
bool scale_linear_system_;
std::string linear_solver_configuration_json_file_;
bool use_gpu_;
template <class TypeTag>
void init()
@@ -190,6 +193,7 @@ namespace Opm
cpr_ell_solvetype_ = EWOMS_GET_PARAM(TypeTag, int, CprEllSolvetype);
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
linear_solver_configuration_json_file_ = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile);
use_gpu_ = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
}
template <class TypeTag>
@@ -217,6 +221,7 @@
EWOMS_REGISTER_PARAM(TypeTag, int, CprEllSolvetype, "Solver type of elliptic pressure solve (0: bicgstab, 1: cg, 2: only amg preconditioner)");
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse Amg Setup");
EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolverConfigurationJsonFile, "Filename of JSON configuration for flexible linear solver system.");
EWOMS_REGISTER_PARAM(TypeTag, bool, UseGpu, "Use GPU cusparseSolver as the linear solver");
}
FlowLinearSolverParameters() { reset(); }
@@ -238,6 +243,7 @@
ilu_milu_ = MILU_VARIANT::ILU;
ilu_redblack_ = false;
ilu_reorder_sphere_ = true;
use_gpu_ = false;
}
};

ISTLSolverEbos.hpp

@@ -45,6 +45,8 @@
#include <opm/common/utility/platform_dependent/reenable_warnings.h>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
BEGIN_PROPERTIES
NEW_TYPE_TAG(FlowIstlSolver, INHERITS_FROM(FlowIstlSolverParams));
@@ -222,6 +224,10 @@ protected:
enum { pressureVarIndex = Indices::pressureSwitchIdx };
static const int numEq = Indices::numEq;
#if HAVE_CUDA
std::unique_ptr<BdaBridge> bdaBridge;
#endif
public:
typedef Dune::AssembledLinearOperator< Matrix, Vector, Vector > AssembledLinearOperatorType;
@@ -239,6 +245,22 @@ protected:
converged_(false)
{
parameters_.template init<TypeTag>();
#if HAVE_CUDA
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
const int maxit = EWOMS_GET_PARAM(TypeTag, int, LinearSolverMaxIter);
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
const bool matrix_add_well_contributions = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
if (use_gpu && !matrix_add_well_contributions) {
OPM_THROW(std::logic_error,"Cannot use GPU solver if the command line parameter --matrix-add-well-contributions is false, because the GPU solver performs a standard bicgstab");
}
bdaBridge.reset(new BdaBridge(use_gpu, linear_solver_verbosity, maxit, tolerance));
#else
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if (use_gpu) {
OPM_THROW(std::logic_error,"Cannot use GPU solver because CUDA was not found during compilation");
}
#endif
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
const auto& gridForConn = simulator_.vanguard().grid();
const auto wellsForConn = simulator_.vanguard().schedule().getWellsatEnd();
@@ -436,11 +458,30 @@ protected:
else
#endif
{
// tries to solve linear system
// solve_system() does nothing if Dune is selected
#if HAVE_CUDA
bdaBridge->solve_system(matrix_.get(), istlb, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);
} else {
// CPU fallback, or default case for Dune
const bool use_gpu = EWOMS_GET_PARAM(TypeTag, bool, UseGpu);
if (use_gpu) {
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
}
auto precond = constructPrecond(linearOperator, parallelInformation_arg);
solve(linearOperator, x, istlb, *sp, *precond, result);
} // end Dune call
#else
// Construct preconditioner.
auto precond = constructPrecond(linearOperator, parallelInformation_arg);
// Solve.
solve(linearOperator, x, istlb, *sp, *precond, result);
#endif
}
}
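
The hunk above tries cusparseSolver first and falls back to Dune's preconditioned solve when the GPU run does not converge (or when CUDA is compiled out). The following minimal, self-contained C++ sketch illustrates only that control flow; SolveResult, trySolveOnGpu() and solveWithDune() are placeholder names invented for illustration and are not part of opm-simulators.

#include <iostream>
#include <vector>

// Stand-in for the summary that Dune::InverseOperatorResult provides.
struct SolveResult { bool converged; int iterations; };

// Placeholder for bdaBridge->solve_system(); pretend the GPU solve failed.
SolveResult trySolveOnGpu(const std::vector<double>&, std::vector<double>&) {
    return SolveResult{false, 0};
}

// Placeholder for the existing Dune bicgstab/ILU path.
SolveResult solveWithDune(const std::vector<double>& b, std::vector<double>& x) {
    x = b; // not a real solve, just a stand-in
    return SolveResult{true, 1};
}

int main() {
    std::vector<double> b{1.0, 2.0, 3.0};
    std::vector<double> x(b.size(), 0.0);
    const bool use_gpu = true; // corresponds to --use-gpu=true

    SolveResult result{false, 0};
    if (use_gpu) {
        result = trySolveOnGpu(b, x); // bdaBridge->solve_system(...) + get_result(x)
    }
    if (!result.converged) {
        if (use_gpu) {
            std::cout << "cusparseSolver did not converge, falling back to Dune\n";
        }
        result = solveWithDune(b, x); // constructPrecond(...) + solve(...)
    }
    std::cout << "converged: " << result.converged
              << ", iterations: " << result.iterations << "\n";
}
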

opm/simulators/linalg/bda/BdaBridge.cpp

@@ -0,0 +1,240 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <memory>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/material/common/Unused.hpp>
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#define PRINT_TIMERS_BRIDGE 0
typedef Dune::InverseOperatorResult InverseOperatorResult;
namespace Opm
{
BdaBridge::BdaBridge(bool use_gpu_ OPM_UNUSED, int linear_solver_verbosity OPM_UNUSED, int maxit OPM_UNUSED, double tolerance OPM_UNUSED) : use_gpu(use_gpu_) {
#if HAVE_CUDA
if (use_gpu) {
backend.reset(new cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance));
}
#endif
}
#if HAVE_CUDA
template <class BridgeMatrix>
int checkZeroDiagonal(BridgeMatrix& mat) {
static std::vector<typename BridgeMatrix::size_type> diag_indices; // contains offsets of the diagonal nnzs
int numZeros = 0;
const int dim = 3; // might be replaced with mat[0][0].N() or BridgeMatrix::block_type::size()
const double zero_replace = 1e-15;
if (diag_indices.size() == 0) {
int N = mat.N();
diag_indices.reserve(N);
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
auto diag = r->find(r.index()); // diag is an iterator
assert(diag.index() == r.index());
for (int rr = 0; rr < dim; ++rr) {
auto& val = (*diag)[rr][rr]; // reference to easily change the value
if (val == 0.0) { // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
diag_indices.emplace_back(diag.offset());
}
}else{
for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
typename BridgeMatrix::size_type offset = diag_indices[r.index()];
auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
for (int rr = 0; rr < dim; ++rr) {
auto& val = diag_block[rr][rr];
if (val == 0.0) { // could be replaced by '< 1e-30' or similar
val = zero_replace;
++numZeros;
}
}
}
}
return numZeros;
}
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
// sparsity pattern should stay the same due to matrix-add-well-contributions
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
int sum_nnzs = 0;
// convert colIndices and rowPointers
if (h_rows.size() == 0) {
h_rows.emplace_back(0);
for (typename BridgeMatrix::const_iterator r = mat.begin(); r != mat.end(); ++r) {
int size_row = 0;
for (auto c = r->begin(); c != r->end(); ++c) {
h_cols.emplace_back(c.index());
size_row++;
}
sum_nnzs += size_row;
h_rows.emplace_back(sum_nnzs);
}
// h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
OPM_THROW(std::logic_error, "Sizes of rows do not sum to the number of nonzeroes in BdaBridge::getSparsityPattern()");
}
}
} // end getSparsityPattern()
#endif
template <class BridgeMatrix, class BridgeVector>
void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
{
#if HAVE_CUDA
if (use_gpu) {
BdaResult result;
result.converged = false;
static std::vector<int> h_rows;
static std::vector<int> h_cols;
const int dim = (*mat)[0][0].N();
const int N = mat->N()*dim;
const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
if (dim != 3) {
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
use_gpu = false;
}
if (h_rows.capacity() == 0) {
h_rows.reserve(N+1);
h_cols.reserve(nnz);
#if PRINT_TIMERS_BRIDGE
Dune::Timer t;
#endif
getSparsityPattern(*mat, h_rows, h_cols);
#if PRINT_TIMERS_BRIDGE
std::ostringstream out;
out << "getSparsityPattern() took: " << t.stop() << " s";
OpmLog::info(out.str());
#endif
}
#if PRINT_TIMERS_BRIDGE
Dune::Timer t_zeros;
int numZeros = checkZeroDiagonal(*mat);
std::ostringstream out;
out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
OpmLog::info(out.str());
#else
checkZeroDiagonal(*mat);
#endif
/////////////////////////
// actually solve
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
// assume that underlying data (nonzeroes) from mat (Dune::BCRSMatrix) are contiguous, if this is not the case, cusparseSolver is expected to perform undefined behaviour
cusparseSolverStatus status = backend->solve_system(N, nnz, dim, static_cast<double*>(&(((*mat)[0][0][0][0]))), h_rows.data(), h_cols.data(), static_cast<double*>(&(b[0][0])), result);
switch(status) {
case cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS:
//OpmLog::info("cusparseSolver converged");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED:
OpmLog::warning("cusparseSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
case cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED:
OpmLog::warning("cusparseSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
break;
default:
OpmLog::warning("cusparseSolver returned unknown status code");
}
res.iterations = result.iterations;
res.reduction = result.reduction;
res.converged = result.converged;
res.conv_rate = result.conv_rate;
res.elapsed = result.elapsed;
}else{
res.converged = false;
}
#endif // HAVE_CUDA
}
template <class BridgeVector>
void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) {
#if HAVE_CUDA
if (use_gpu) {
backend->post_process(static_cast<double*>(&(x[0][0])));
}
#endif
}
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > , \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 2, 2>, std::allocator<Opm::MatrixBlock<double, 2, 2> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > , \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 3, 3>, std::allocator<Opm::MatrixBlock<double, 3, 3> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::solve_system< \
Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > , \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BCRSMatrix<Opm::MatrixBlock<double, 4, 4>, std::allocator<Opm::MatrixBlock<double, 4, 4> > > *mat, \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &b, \
InverseOperatorResult &res);
template void BdaBridge::get_result< \
Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > > \
(Dune::BlockVector<Dune::FieldVector<double, 2>, std::allocator<Dune::FieldVector<double, 2> > > &x);
template void BdaBridge::get_result< \
Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > > \
(Dune::BlockVector<Dune::FieldVector<double, 3>, std::allocator<Dune::FieldVector<double, 3> > > &x);
template void BdaBridge::get_result< \
Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > > \
(Dune::BlockVector<Dune::FieldVector<double, 4>, std::allocator<Dune::FieldVector<double, 4> > > &x);
}
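
getSparsityPattern() above records, for every block row of the Dune::BCRSMatrix, the block column indices (h_cols) and the running block count (h_rows), i.e. the CSR-style index arrays that the cusparse BSR routines expect. The sketch below builds the same two arrays from a hand-written block pattern using only the standard library; blockColumnsPerRow is a toy stand-in for the Dune matrix, used purely for illustration.

#include <cassert>
#include <iostream>
#include <vector>

int main() {
    // Block sparsity pattern of a 3x3 block matrix:
    //   block row 0 has blocks in columns 0, 1
    //   block row 1 has blocks in columns 0, 1, 2
    //   block row 2 has blocks in columns 1, 2
    std::vector<std::vector<int>> blockColumnsPerRow = {{0, 1}, {0, 1, 2}, {1, 2}};

    std::vector<int> h_rows; // rowPointers, #blockrows + 1 entries
    std::vector<int> h_cols; // columnIndices, one entry per nonzero block

    int sum_nnzs = 0;
    h_rows.push_back(0);
    for (const auto& row : blockColumnsPerRow) {
        for (int col : row) {
            h_cols.push_back(col);
        }
        sum_nnzs += static_cast<int>(row.size());
        h_rows.push_back(sum_nnzs); // cumulative number of blocks so far
    }

    assert(h_rows.back() == static_cast<int>(h_cols.size()));

    std::cout << "h_rows:"; // 0 2 5 7
    for (int r : h_rows) std::cout << ' ' << r;
    std::cout << "\nh_cols:"; // 0 1 0 1 2 1 2
    for (int c : h_cols) std::cout << ' ' << c;
    std::cout << '\n';
}
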

opm/simulators/linalg/bda/BdaBridge.hpp

@@ -0,0 +1,73 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef BDABRIDGE_HEADER_INCLUDED
#define BDABRIDGE_HEADER_INCLUDED
#include <config.h>
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
#include "dune/istl/bcrsmatrix.hh"
#include <opm/simulators/linalg/matrixblock.hh>
#if HAVE_CUDA
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#endif
namespace Opm
{
typedef Dune::InverseOperatorResult InverseOperatorResult;
/// BdaBridge acts as interface between opm-simulators and the cusparseSolver
/// if CUDA was not found during CMake configuration, the function bodies of this class are empty
class BdaBridge
{
private:
#if HAVE_CUDA
std::unique_ptr<cusparseSolverBackend> backend;
#endif
bool use_gpu;
public:
/// Construct a BdaBridge
/// \param[in] use_gpu true iff the cusparseSolver is used, is passed via command-line: '--use-gpu=[true|false]'
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
BdaBridge(bool use_gpu, int linear_solver_verbosity, int maxit, double tolerance);
/// Solve linear system, A*x = b
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
/// \param[in] b vector b, should be of type Dune::BlockVector
/// \param[in] result summary of solver result
template <class BridgeMatrix, class BridgeVector>
void solve_system(BridgeMatrix *mat, BridgeVector &b, InverseOperatorResult &result);
/// Get the resulting x vector
/// \param[inout] x vector x, should be of type Dune::BlockVector
template <class BridgeVector>
void get_result(BridgeVector &x);
}; // end class BdaBridge
}
#endif

opm/simulators/linalg/bda/BdaResult.hpp

@@ -0,0 +1,44 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef BDARESULT_HEADER_INCLUDED
#define BDARESULT_HEADER_INCLUDED
namespace Opm
{
/// This class is based on InverseOperatorResult struct from dune/istl/solver.hh
/// It is needed to prevent a compile error in basearray.hh; the nvcc compiler might not support all features used there
class BdaResult
{
public:
int iterations = 0; // number of iterations
double reduction = 0.0; // reduction of norm, norm_start / norm_final
bool converged = false; // true iff the linear solver reached the desired norm within maxit iterations
double conv_rate = 0.0; // average reduction of norm per iteration, usually calculated with 'static_cast<double>(pow(res.reduction,1.0/it));'
double elapsed = 0.0; // time in seconds to run the linear solver
// Dune 2.6 has a member 'double condition_estimate = -1' in InverseOperatorResult
}; // end class BdaResult
}
#endif
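
conv_rate above is the geometric mean of the per-iteration residual reduction, computed from the overall reduction and the iteration count exactly as the comment suggests. A short worked example:

#include <cmath>
#include <iostream>

int main() {
    // Suppose the residual norm dropped by a factor of 1e-3 over 25 iterations.
    const double reduction = 1e-3;
    const double iterations = 25.0;
    const double conv_rate = std::pow(reduction, 1.0 / iterations);
    std::cout << "average reduction per iteration: " << conv_rate << "\n"; // ~0.759
}
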

opm/simulators/linalg/bda/cuda_header.hpp

@@ -0,0 +1,41 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef CUDA_HEADER_HEADER_INCLUDED
#define CUDA_HEADER_HEADER_INCLUDED
#include <iostream>
/// Runtime error checking of CUDA functions
/// Usage:
/// cudaMalloc(...);
/// cudaCheckLastError("Error could not allocate memory");
///
#define cudaCheckLastError(msg) __cudaCheckError( __FILE__, __LINE__, #msg )
inline void __cudaCheckError(const char *file, const int line, const char *msg){
cudaError err = cudaGetLastError();
if (cudaSuccess != err){
std::cerr << "cudaCheckError() failed at " << file << ":" << line << ": " << cudaGetErrorString(err) << std::endl;
std::cerr << "BDA error message: " << msg << std::endl;
exit(1);
}
}
#endif
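
cudaCheckLastError() wraps cudaGetLastError() and reports the file and line of the failing call. The host-only sketch below reproduces the same __FILE__/__LINE__ reporting pattern with an ordinary status code instead of the CUDA runtime, so it compiles and runs without a GPU; Status and checkedOperation() are invented for illustration.

#include <cstdlib>
#include <iostream>

// Hypothetical status code returned by some fallible operation.
enum class Status { Success, OutOfMemory };

Status checkedOperation(bool fail) { return fail ? Status::OutOfMemory : Status::Success; }

#define checkLastError(status, msg) checkError((status), __FILE__, __LINE__, (msg))

inline void checkError(Status status, const char* file, int line, const char* msg) {
    if (status != Status::Success) {
        std::cerr << "checkError() failed at " << file << ":" << line << std::endl;
        std::cerr << "error message: " << msg << std::endl;
        std::exit(1);
    }
}

int main() {
    checkLastError(checkedOperation(false), "Could not allocate memory");
    std::cout << "first call succeeded" << std::endl;
    checkLastError(checkedOperation(true), "Could not allocate memory"); // reports and exits here
}
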

opm/simulators/linalg/bda/cusparseSolverBackend.cu

@@ -0,0 +1,499 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __NVCC__
#error "Cannot compile for cusparse: NVIDIA compiler not found"
#endif
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <iostream>
#include <sys/time.h>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/cuda_header.hpp>
#include "cublas_v2.h"
#include "cusparse_v2.h"
// For more information about cusparse, check https://docs.nvidia.com/cuda/cusparse/index.html
namespace Opm
{
const cusparseSolvePolicy_t policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
const cusparseOperation_t operation = CUSPARSE_OPERATION_NON_TRANSPOSE;
const cusparseDirection_t order = CUSPARSE_DIRECTION_ROW;
double second(void) {
struct timeval tv;
gettimeofday(&tv, nullptr);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
cusparseSolverBackend::cusparseSolverBackend(int verbosity_, int maxit_, double tolerance_) : verbosity(verbosity_), maxit(maxit_), tolerance(tolerance_), minit(0) {
}
cusparseSolverBackend::~cusparseSolverBackend() {
finalize();
}
void cusparseSolverBackend::gpu_pbicgstab(BdaResult& res) {
double t_total1, t_total2;
int n = N;
double rho = 1.0, rhop;
double alpha, nalpha, beta;
double omega, nomega, tmp1, tmp2;
double norm, norm_0;
double zero = 0.0;
double one = 1.0;
double mone = -1.0;
float it;
t_total1 = second();
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_x, &zero, d_r);
cublasDscal(cublasHandle, n, &mone, d_r, 1);
cublasDaxpy(cublasHandle, n, &one, d_b, 1, d_r, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_rw, 1);
cublasDcopy(cublasHandle, n, d_r, 1, d_p, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm_0);
if (verbosity > 1) {
std::ostringstream out;
out << std::scientific << "cusparseSolver initial norm: " << norm_0;
OpmLog::info(out.str());
}
for (it = 0.5; it < maxit; it+=0.5) {
rhop = rho;
cublasDdot(cublasHandle, n, d_rw, 1, d_r, 1, &rho);
if (it > 1) {
beta = (rho/rhop) * (alpha/omega);
nomega = -omega;
cublasDaxpy(cublasHandle, n, &nomega, d_v, 1, d_p, 1);
cublasDscal(cublasHandle, n, &beta, d_p, 1);
cublasDaxpy(cublasHandle, n, &one, d_r, 1, d_p, 1);
}
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_p, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_pw, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, \
&one, descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_pw, &zero, d_v);
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
alpha = rho / tmp1;
nalpha = -alpha;
cublasDaxpy(cublasHandle, n, &nalpha, d_v, 1, d_r, 1);
cublasDaxpy(cublasHandle, n, &alpha, d_pw, 1, d_x, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if (norm < tolerance * norm_0 && it > minit) {
break;
}
it += 0.5;
// apply ilu0
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_L, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_L, d_r, d_t, policy, d_buffer);
cusparseDbsrsv2_solve(cusparseHandle, order, \
operation, Nb, nnzb, &one, \
descr_U, d_mVals, d_mRows, d_mCols, BLOCK_SIZE, info_U, d_t, d_s, policy, d_buffer);
// spmv
cusparseDbsrmv(cusparseHandle, order, \
operation, Nb, Nb, nnzb, &one, descr_M, \
d_bVals, d_bRows, d_bCols, BLOCK_SIZE, d_s, &zero, d_t);
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);
cublasDdot(cublasHandle, n, d_t, 1, d_t, 1, &tmp2);
omega = tmp1 / tmp2;
nomega = -omega;
cublasDaxpy(cublasHandle, n, &omega, d_s, 1, d_x, 1);
cublasDaxpy(cublasHandle, n, &nomega, d_t, 1, d_r, 1);
cublasDnrm2(cublasHandle, n, d_r, 1, &norm);
if (norm < tolerance * norm_0 && it > minit) {
break;
}
if (verbosity > 1) {
std::ostringstream out;
out << "it: " << it << std::scientific << ", norm: " << norm;
OpmLog::info(out.str());
}
}
t_total2 = second();
res.iterations = std::min(it, (float)maxit);
res.reduction = norm/norm_0;
res.conv_rate = static_cast<double>(pow(res.reduction,1.0/it));
res.elapsed = t_total2 - t_total1;
res.converged = (it != (maxit + 0.5));
if (verbosity > 0) {
std::ostringstream out;
out << "=== converged: " << res.converged << ", conv_rate: " << res.conv_rate << ", time: " << res.elapsed << \
", time per iteration: " << res.elapsed/it << ", iterations: " << it;
OpmLog::info(out.str());
}
}
void cusparseSolverBackend::initialize(int N, int nnz, int dim) {
this->N = N;
this->nnz = nnz;
this->BLOCK_SIZE = dim;
this->nnzb = nnz/BLOCK_SIZE/BLOCK_SIZE;
Nb = (N + dim - 1) / dim;
std::ostringstream out;
out << "Initializing GPU, matrix size: " << N << " rows, nnz: " << nnzb << " blocks";
OpmLog::info(out.str());
out.str("");
out.clear();
out << "Minit: " << minit << ", maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(out.str());
int deviceID = 0;
cudaSetDevice(deviceID);
cudaCheckLastError("Could not get device");
struct cudaDeviceProp props;
cudaGetDeviceProperties(&props, deviceID);
cudaCheckLastError("Could not get device properties");
out.str("");
out.clear();
out << "Name GPU: " << props.name << ", Compute Capability: " << props.major << "." << props.minor;
OpmLog::info(out.str());
cudaStreamCreate(&stream);
cudaCheckLastError("Could not create stream");
cublasCreate(&cublasHandle);
cudaCheckLastError("Could not create cublasHandle");
cusparseCreate(&cusparseHandle);
cudaCheckLastError("Could not create cusparseHandle");
cudaMalloc((void**)&d_x, sizeof(double) * N);
cudaMalloc((void**)&d_b, sizeof(double) * N);
cudaMalloc((void**)&d_r, sizeof(double) * N);
cudaMalloc((void**)&d_rw,sizeof(double) * N);
cudaMalloc((void**)&d_p, sizeof(double) * N);
cudaMalloc((void**)&d_pw,sizeof(double) * N);
cudaMalloc((void**)&d_s, sizeof(double) * N);
cudaMalloc((void**)&d_t, sizeof(double) * N);
cudaMalloc((void**)&d_v, sizeof(double) * N);
cudaMalloc((void**)&d_bVals, sizeof(double) * nnz);
cudaMalloc((void**)&d_bCols, sizeof(int) * nnz);
cudaMalloc((void**)&d_bRows, sizeof(int) * (Nb+1));
cudaMalloc((void**)&d_mVals, sizeof(double) * nnz);
cudaCheckLastError("Could not allocate enough memory on GPU");
cublasSetStream(cublasHandle, stream);
cudaCheckLastError("Could not set stream to cublas");
cusparseSetStream(cusparseHandle, stream);
cudaCheckLastError("Could not set stream to cusparse");
cudaMallocHost((void**)&x, sizeof(double) * N);
cudaCheckLastError("Could not allocate pinned host memory");
initialized = true;
} // end initialize()
void cusparseSolverBackend::finalize() {
cudaFree(d_x);
cudaFree(d_b);
cudaFree(d_r);
cudaFree(d_rw);
cudaFree(d_p);
cudaFree(d_pw);
cudaFree(d_s);
cudaFree(d_t);
cudaFree(d_v);
cudaFree(d_mVals);
cudaFree(d_bVals);
cudaFree(d_bCols);
cudaFree(d_bRows);
cudaFree(d_buffer);
cusparseDestroyBsrilu02Info(info_M);
cusparseDestroyBsrsv2Info(info_L);
cusparseDestroyBsrsv2Info(info_U);
cusparseDestroyMatDescr(descr_B);
cusparseDestroyMatDescr(descr_M);
cusparseDestroyMatDescr(descr_L);
cusparseDestroyMatDescr(descr_U);
cusparseDestroy(cusparseHandle);
cublasDestroy(cublasHandle);
cudaHostUnregister(vals);
cudaHostUnregister(cols);
cudaHostUnregister(rows);
cudaStreamDestroy(stream);
cudaFreeHost(x);
} // end finalize()
void cusparseSolverBackend::copy_system_to_gpu(double *vals, int *rows, int *cols, double *b) {
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
// information cudaHostRegister: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1ge8d5c17670f16ac4fc8fcb4181cb490c
// possible flags for cudaHostRegister: cudaHostRegisterDefault, cudaHostRegisterPortable, cudaHostRegisterMapped, cudaHostRegisterIoMemory
cudaHostRegister(vals, nnz * sizeof(double), cudaHostRegisterDefault);
cudaHostRegister(cols, nnz * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(rows, (Nb+1) * sizeof(int), cudaHostRegisterDefault);
cudaHostRegister(b, N * sizeof(double), cudaHostRegisterDefault);
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bCols, cols, nnz * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_bRows, rows, (Nb+1) * sizeof(int), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
this->vals = vals;
this->cols = cols;
this->rows = rows;
if (verbosity > 2) {
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::copy_system_to_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
void cusparseSolverBackend::update_system_on_gpu(double *vals, double *b) {
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
cudaMemcpyAsync(d_bVals, vals, nnz * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_b, b, N * sizeof(double), cudaMemcpyHostToDevice, stream);
cudaMemsetAsync(d_x, 0, sizeof(double) * N, stream);
if (verbosity > 2) {
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::update_system_on_gpu(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end update_system_on_gpu()
void cusparseSolverBackend::reset_prec_on_gpu() {
cudaMemcpyAsync(d_mVals, d_bVals, nnz * sizeof(double), cudaMemcpyDeviceToDevice, stream);
}
bool cusparseSolverBackend::analyse_matrix() {
int d_bufferSize_M, d_bufferSize_L, d_bufferSize_U, d_bufferSize;
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
cusparseCreateMatDescr(&descr_B);
cusparseCreateMatDescr(&descr_M);
cusparseSetMatType(descr_B, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatType(descr_M, CUSPARSE_MATRIX_TYPE_GENERAL);
const cusparseIndexBase_t base_type = CUSPARSE_INDEX_BASE_ZERO; // matrices from Flow are base0
cusparseSetMatIndexBase(descr_B, base_type);
cusparseSetMatIndexBase(descr_M, base_type);
cusparseCreateMatDescr(&descr_L);
cusparseSetMatIndexBase(descr_L, base_type);
cusparseSetMatType(descr_L, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_L, CUSPARSE_FILL_MODE_LOWER);
cusparseSetMatDiagType(descr_L, CUSPARSE_DIAG_TYPE_UNIT);
cusparseCreateMatDescr(&descr_U);
cusparseSetMatIndexBase(descr_U, base_type);
cusparseSetMatType(descr_U, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatFillMode(descr_U, CUSPARSE_FILL_MODE_UPPER);
cusparseSetMatDiagType(descr_U, CUSPARSE_DIAG_TYPE_NON_UNIT);
cudaCheckLastError("Could not initialize matrix descriptions");
cusparseCreateBsrilu02Info(&info_M);
cusparseCreateBsrsv2Info(&info_L);
cusparseCreateBsrsv2Info(&info_U);
cudaCheckLastError("Could not create analysis info");
cusparseDbsrilu02_bufferSize(cusparseHandle, order, Nb, nnzb,
descr_M, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_M, &d_bufferSize_M);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_L, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_L, &d_bufferSize_L);
cusparseDbsrsv2_bufferSize(cusparseHandle, order, operation, Nb, nnzb,
descr_U, d_bVals, d_bRows, d_bCols, BLOCK_SIZE, info_U, &d_bufferSize_U);
cudaCheckLastError();
d_bufferSize = std::max(d_bufferSize_M, std::max(d_bufferSize_L, d_bufferSize_U));
cudaMalloc((void**)&d_buffer, d_bufferSize);
// analysis of ilu LU decomposition
cusparseDbsrilu02_analysis(cusparseHandle, order, \
Nb, nnzb, descr_B, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
return false;
}
// analysis of ilu apply
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_L, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_L, policy, d_buffer);
cusparseDbsrsv2_analysis(cusparseHandle, order, operation, \
Nb, nnzb, descr_U, d_bVals, d_bRows, d_bCols, \
BLOCK_SIZE, info_U, policy, d_buffer);
cudaCheckLastError("Could not analyse level information");
if (verbosity > 2) {
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::analyse_matrix(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end analyse_matrix()
bool cusparseSolverBackend::create_preconditioner() {
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
d_mCols = d_bCols;
d_mRows = d_bRows;
cusparseDbsrilu02(cusparseHandle, order, \
Nb, nnzb, descr_M, d_mVals, d_mRows, d_mCols, \
BLOCK_SIZE, info_M, policy, d_buffer);
int structural_zero;
// cusparseXbsrilu02_zeroPivot() calls cudaDeviceSynchronize()
cusparseStatus_t status = cusparseXbsrilu02_zeroPivot(cusparseHandle, info_M, &structural_zero);
if (CUSPARSE_STATUS_ZERO_PIVOT == status) {
return false;
}
if (verbosity > 2) {
cudaStreamSynchronize(stream);
t2 = second();
std::ostringstream out;
out << "cusparseSolver::create_preconditioner(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
return true;
} // end create_preconditioner()
void cusparseSolverBackend::solve_system(BdaResult &res) {
// actually solve
gpu_pbicgstab(res);
cudaStreamSynchronize(stream);
cudaCheckLastError("Something went wrong during the GPU solve");
} // end solve_system()
// copy result to host memory
// caller must be sure that x is a valid array
void cusparseSolverBackend::post_process(double *x) {
if (!initialized) {
cudaHostRegister(x, N * sizeof(double), cudaHostRegisterDefault);
}
double t1, t2;
if (verbosity > 2) {
t1 = second();
}
cudaMemcpyAsync(x, d_x, N * sizeof(double), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
if (verbosity > 2) {
t2 = second();
std::ostringstream out;
out << "cusparseSolver::post_process(): " << t2-t1 << " s";
OpmLog::info(out.str());
}
} // end post_process()
typedef cusparseSolverBackend::cusparseSolverStatus cusparseSolverStatus;
cusparseSolverStatus cusparseSolverBackend::solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res) {
if (initialized == false) {
initialize(N, nnz, dim);
copy_system_to_gpu(vals, rows, cols, b);
}else{
update_system_on_gpu(vals, b);
}
if (analysis_done == false) {
if (!analyse_matrix()) {
return cusparseSolverStatus::CUSPARSE_SOLVER_ANALYSIS_FAILED;
}
analysis_done = true;
}
reset_prec_on_gpu();
if (create_preconditioner()) {
solve_system(res);
}else{
return cusparseSolverStatus::CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
return cusparseSolverStatus::CUSPARSE_SOLVER_SUCCESS;
}
}
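
gpu_pbicgstab() above is a right-preconditioned BiCGSTAB assembled from cublas/cusparse calls, with ILU0 as the preconditioner and iterations counted in halves, as Dune does. The sketch below shows the same algorithmic structure in plain C++ on a small dense, diagonally dominant system, with a Jacobi (diagonal) preconditioner standing in for ILU0; the matrix, right-hand side and tolerance are toy values chosen for illustration, and nothing here runs on a GPU.

#include <cmath>
#include <iostream>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;

Vec matvec(const Mat& A, const Vec& x) {
    Vec y(x.size(), 0.0);
    for (size_t i = 0; i < A.size(); ++i)
        for (size_t j = 0; j < x.size(); ++j)
            y[i] += A[i][j] * x[j];
    return y;
}

double dot(const Vec& a, const Vec& b) {
    double s = 0.0;
    for (size_t i = 0; i < a.size(); ++i) s += a[i] * b[i];
    return s;
}

double norm(const Vec& a) { return std::sqrt(dot(a, a)); }

int main() {
    // Small diagonally dominant test system A*x = b.
    Mat A = {{4, 1, 0}, {1, 5, 2}, {0, 2, 6}};
    Vec b = {1, 2, 3};
    const int n = 3, maxit = 100;
    const double tolerance = 1e-8;

    // Jacobi preconditioner: M^{-1} v = v / diag(A), standing in for ILU0.
    auto precond = [&](const Vec& v) {
        Vec y(n);
        for (int i = 0; i < n; ++i) y[i] = v[i] / A[i][i];
        return y;
    };

    Vec x(n, 0.0);
    Vec r = b;                 // r = b - A*x with x = 0
    Vec rw = r, p = r;
    Vec v(n, 0.0);
    double rho = 1.0, alpha = 1.0, omega = 1.0;
    double norm_0 = norm(r), nrm = norm_0;
    float it;

    for (it = 0.5f; it < maxit; it += 0.5f) {  // half-iterations, as in gpu_pbicgstab()
        double rhop = rho;
        rho = dot(rw, r);
        if (it > 1) {
            double beta = (rho / rhop) * (alpha / omega);
            for (int i = 0; i < n; ++i) p[i] = r[i] + beta * (p[i] - omega * v[i]);
        }
        Vec pw = precond(p);                   // apply preconditioner
        v = matvec(A, pw);                     // spmv
        alpha = rho / dot(rw, v);
        for (int i = 0; i < n; ++i) { r[i] -= alpha * v[i]; x[i] += alpha * pw[i]; }
        nrm = norm(r);
        if (nrm < tolerance * norm_0) break;

        it += 0.5f;
        Vec s = precond(r);                    // apply preconditioner
        Vec t = matvec(A, s);                  // spmv
        omega = dot(t, r) / dot(t, t);
        for (int i = 0; i < n; ++i) { x[i] += omega * s[i]; r[i] -= omega * t[i]; }
        nrm = norm(r);
        if (nrm < tolerance * norm_0) break;
    }

    std::cout << "iterations: " << it << ", reduction: " << nrm / norm_0
              << ", x = " << x[0] << ' ' << x[1] << ' ' << x[2] << '\n';
}
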

opm/simulators/linalg/bda/cusparseSolverBackend.hpp

@@ -0,0 +1,151 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUSPARSESOLVER_BACKEND_HEADER_INCLUDED
#define OPM_CUSPARSESOLVER_BACKEND_HEADER_INCLUDED
#include "cublas_v2.h"
#include "cusparse_v2.h"
#include "opm/simulators/linalg/bda/BdaResult.hpp"
namespace Opm
{
/// This class implements a cusparse-based ilu0-bicgstab solver on GPU
class cusparseSolverBackend{
private:
int minit;
int maxit;
double tolerance;
cublasHandle_t cublasHandle;
cusparseHandle_t cusparseHandle;
cudaStream_t stream;
cusparseMatDescr_t descr_B, descr_M, descr_L, descr_U;
bsrilu02Info_t info_M;
bsrsv2Info_t info_L, info_U;
// b: bsr matrix, m: preconditioner
double *d_bVals, *d_mVals;
int *d_bCols, *d_mCols;
int *d_bRows, *d_mRows;
double *d_x, *d_b, *d_r, *d_rw, *d_p;
double *d_pw, *d_s, *d_t, *d_v;
double *vals;
int *cols, *rows;
double *x, *b;
void *d_buffer;
int N, Nb, nnz, nnzb;
int BLOCK_SIZE;
bool initialized = false;
bool analysis_done = false;
// verbosity
// 0: print nothing during solves, only when initializing
// 1: print number of iterations and final norm
// 2: also print norm each iteration
// 3: also print timings of different backend functions
int verbosity = 0;
/// Solve linear system using ilu0-bicgstab
/// \param[inout] res summary of solver result
void gpu_pbicgstab(BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
void initialize(int N, int nnz, int dim);
/// Clean memory
void finalize();
/// Copy linear system to GPU
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
void copy_system_to_gpu(double *vals, int *rows, int *cols, double *b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector, contains N values
void update_system_on_gpu(double *vals, double *b);
/// Reset preconditioner on GPU, ilu0-decomposition is done inplace by cusparse
void reset_prec_on_gpu();
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
bool analyse_matrix();
/// Perform ilu0-decomposition
/// \return true iff decomposition was successful
bool create_preconditioner();
/// Solve linear system
/// \param[inout] res summary of solver result
void solve_system(BdaResult &res);
public:
enum class cusparseSolverStatus {
CUSPARSE_SOLVER_SUCCESS,
CUSPARSE_SOLVER_ANALYSIS_FAILED,
CUSPARSE_SOLVER_CREATE_PRECONDITIONER_FAILED,
CUSPARSE_SOLVER_UNKNOWN_ERROR
};
/// Construct a cusparseSolver
/// \param[in] linear_solver_verbosity verbosity of cusparseSolver
/// \param[in] maxit maximum number of iterations for cusparseSolver
/// \param[in] tolerance required relative tolerance for cusparseSolver
cusparseSolverBackend(int linear_solver_verbosity, int maxit, double tolerance);
/// Destroy a cusparseSolver, and free memory
~cusparseSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
/// \param[inout] res summary of solver result
/// \return status code
cusparseSolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, BdaResult &res);
/// Post processing after linear solve, now only copies resulting x vector back
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void post_process(double *x);
}; // end class cusparseSolverBackend
}
#endif
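
The public solve_system() keeps GPU state between calls: the first call initializes the device, copies the full system and (once) analyses the sparsity pattern, while later calls only update the matrix values and right-hand side; the ILU0 preconditioner is recreated on every call. A minimal sketch of that call sequence, where Backend and its printouts are placeholders rather than the real class:

#include <iostream>

// Placeholder mirroring the booleans cusparseSolverBackend keeps between calls.
struct Backend {
    bool initialized = false;
    bool analysis_done = false;

    void solve() {
        if (!initialized) {
            std::cout << "initialize() + copy_system_to_gpu()\n";
            initialized = true;
        } else {
            std::cout << "update_system_on_gpu()\n";  // values and b only
        }
        if (!analysis_done) {
            std::cout << "analyse_matrix()\n";        // sparsity pattern is reused afterwards
            analysis_done = true;
        }
        std::cout << "reset_prec_on_gpu() + create_preconditioner() + solve_system()\n";
    }
};

int main() {
    Backend backend;
    backend.solve(); // first linear solve: full setup
    backend.solve(); // later solves: value update only
}
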