From ad1d8624264b4538ce48383afcca28f30682e77c Mon Sep 17 00:00:00 2001
From: Arne Morten Kvarving <arne.morten.kvarving@sintef.no>
Date: Tue, 16 Apr 2024 12:28:49 +0200
Subject: [PATCH] BdaBridge: template Scalar type

---
 opm/simulators/linalg/bda/BdaBridge.cpp | 142 +++++++++++++-----------
 opm/simulators/linalg/bda/BdaBridge.hpp |  32 ++++--
 2 files changed, 98 insertions(+), 76 deletions(-)
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index ae24fc807..b20071433 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -52,28 +52,29 @@
 
 typedef Dune::InverseOperatorResult InverseOperatorResult;
 
-namespace Opm
-{
+namespace Opm {
 
-    using Opm::Accelerator::BdaResult;
-    using Opm::Accelerator::BdaSolver;
-    using Opm::Accelerator::SolverStatus;
+using Accelerator::BdaResult;
+using Accelerator::BdaSolver;
+using Accelerator::SolverStatus;
 
-template <class BridgeMatrix, class BridgeVector, int block_size>
-BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_,
-                                                             int linear_solver_verbosity,
-                                                             [[maybe_unused]] int maxit,
-                                                             [[maybe_unused]] double tolerance,
-                                                             [[maybe_unused]] unsigned int platformID,
-                                                             [[maybe_unused]] unsigned int deviceID,
-                                                             [[maybe_unused]] bool opencl_ilu_parallel,
-                                                             [[maybe_unused]] std::string linsolver)
-: verbosity(linear_solver_verbosity), accelerator_mode(accelerator_mode_)
+template<class BridgeMatrix, class BridgeVector, int block_size>
+BdaBridge<BridgeMatrix, BridgeVector, block_size>::
+BdaBridge(std::string accelerator_mode_,
+          int linear_solver_verbosity,
+          [[maybe_unused]] int maxit,
+          [[maybe_unused]] Scalar tolerance,
+          [[maybe_unused]] unsigned int platformID,
+          [[maybe_unused]] unsigned int deviceID,
+          [[maybe_unused]] bool opencl_ilu_parallel,
+          [[maybe_unused]] std::string linsolver)
+    : verbosity(linear_solver_verbosity)
+    , accelerator_mode(accelerator_mode_)
 {
     if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
         use_gpu = true;
-        using CU = Accelerator::cusparseSolverBackend<double,block_size>;
+        using CU = Accelerator::cusparseSolverBackend<Scalar,block_size>;
         backend = std::make_unique<CU>(linear_solver_verbosity, maxit, tolerance, deviceID);
 #else
         OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
@@ -81,7 +82,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
         use_gpu = true;
-        using OCL = Accelerator::openclSolverBackend<double,block_size>;
+        using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
         backend = std::make_unique<OCL>(linear_solver_verbosity,
                                         maxit,
                                         tolerance,
@@ -95,7 +96,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("amgcl") == 0) {
 #if HAVE_AMGCL
         use_gpu = true; // should be replaced by a 'use_bridge' boolean
-        using AMGCL = Accelerator::amgclSolverBackend<double,block_size>;
+        using AMGCL = Accelerator::amgclSolverBackend<Scalar,block_size>;
         backend = std::make_unique<AMGCL>(linear_solver_verbosity, maxit,
                                           tolerance, platformID, deviceID);
 #else
@@ -104,7 +105,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("rocalution") == 0) {
 #if HAVE_ROCALUTION
         use_gpu = true; // should be replaced by a 'use_bridge' boolean
-        using ROCA = Accelerator::rocalutionSolverBackend<double,block_size>;
+        using ROCA = Accelerator::rocalutionSolverBackend<Scalar,block_size>;
         backend = std::make_unique<ROCA>(linear_solver_verbosity, maxit, tolerance);
 #else
         OPM_THROW(std::logic_error, "Error rocalutionSolver was chosen, but rocalution was not found by CMake");
@@ -112,7 +113,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("rocsparse") == 0) {
 #if HAVE_ROCSPARSE
         use_gpu = true; // should be replaced by a 'use_bridge' boolean
-        using ROCS = Accelerator::rocsparseSolverBackend<double,block_size>;
+        using ROCS = Accelerator::rocsparseSolverBackend<Scalar,block_size>;
         backend = std::make_unique<ROCS>(linear_solver_verbosity, maxit,
                                          tolerance, platformID, deviceID);
 #else
@@ -125,13 +126,14 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     }
 }
 
-
-
 template <class BridgeMatrix>
-int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::size_type>& diag_indices) {
+int replaceZeroDiagonal(BridgeMatrix& mat,
+                    std::vector<typename BridgeMatrix::size_type>& diag_indices)
+{
+    using Scalar = typename BridgeMatrix::field_type;
     int numZeros = 0;
     const int dim = mat[0][0].N();                    // might be replaced with BridgeMatrix::block_type::size()
-    const double zero_replace = 1e-15;
+    const Scalar zero_replace = 1e-15;
     if (diag_indices.empty()) {
         int Nb = mat.N();
         diag_indices.reserve(Nb);
@@ -147,7 +149,7 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
             }
             diag_indices.emplace_back(diag.offset());
         }
-    }else{
+    } else {
         for (typename BridgeMatrix::iterator r = mat.begin(); r != mat.end(); ++r) {
             typename BridgeMatrix::size_type offset = diag_indices[r.index()];
             auto& diag_block = r->getptr()[offset]; // diag_block is a reference to MatrixBlock, located on column r of row r
@@ -164,13 +166,15 @@ int replaceZeroDiagonal(BridgeMatrix& mat, std::vector<typename BridgeMatrix::si
     return numZeros;
 }
 
-
 // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
 // sparsity pattern should stay the same
 // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
 template <class BridgeMatrix, class BridgeVector, int block_size>
-void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
-
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
+copySparsityPatternFromISTL(const BridgeMatrix& mat,
+                            std::vector<int>& h_rows,
+                            std::vector<int>& h_cols)
+{
     h_rows.clear();
     h_cols.clear();
 
@@ -185,17 +189,19 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::copySparsityPatternFromI
 
     // h_rows and h_cols could be changed to 'unsigned int', but cusparse expects 'int'
     if (static_cast<unsigned int>(h_rows[mat.N()]) != mat.nonzeroes()) {
-        OPM_THROW(std::logic_error, "Error size of rows do not sum to number of nonzeroes in BdaBridge::copySparsityPatternFromISTL()");
+        OPM_THROW(std::logic_error,
+                  "Error size of rows do not sum to number of nonzeroes "
+                  "in BdaBridge::copySparsityPatternFromISTL()");
     }
 }
 
-
 // check if the nnz values of the matrix are in contiguous memory
 // this is done by checking if the distance between the last value of the last block of row 0 and
 // the first value of the first row of row 1 is equal to 1
 // if the matrix only has 1 row, it is always contiguous
 template <class BridgeMatrix>
-void checkMemoryContiguous(const BridgeMatrix& mat) {
+void checkMemoryContiguous(const BridgeMatrix& mat)
+{
     auto block_size = mat[0][0].N();
     auto row = mat.begin();
     auto last_of_row0 = row->begin();
@@ -212,14 +218,13 @@ void checkMemoryContiguous(const BridgeMatrix& mat) {
     }
 }
 
-
 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
 solve_system(BridgeMatrix* bridgeMat,
              BridgeMatrix* jacMat,
              int numJacobiBlocks,
              BridgeVector& b,
-             WellContributions<double>& wellContribs,
+             WellContributions<Scalar>& wellContribs,
              InverseOperatorResult& res)
 {
     if (use_gpu) {
@@ -235,14 +240,14 @@ solve_system(BridgeMatrix* bridgeMat,
             return;
         }
 
-        using Mat = Accelerator::BlockedMatrix<double>;
+        using Mat = Accelerator::BlockedMatrix<Scalar>;
         if (!matrix) {
             h_rows.reserve(Nb+1);
             h_cols.reserve(nnzb);
             copySparsityPatternFromISTL(*bridgeMat, h_rows, h_cols);
             checkMemoryContiguous(*bridgeMat);
             matrix = std::make_unique<Mat>(Nb, nnzb, block_size,
-                                           static_cast<double*>(&(((*bridgeMat)[0][0][0][0]))),
+                                           static_cast<Scalar*>(&(((*bridgeMat)[0][0][0][0]))),
                                            h_cols.data(),
                                            h_rows.data());
         }
@@ -251,12 +256,14 @@ solve_system(BridgeMatrix* bridgeMat,
         int numZeros = replaceZeroDiagonal(*bridgeMat, diagIndices);
         if (verbosity >= 2) {
             std::ostringstream out;
-            out << "Checking zeros took: " << t_zeros.stop() << " s, found " << numZeros << " zeros";
+            out << "Checking zeros took: " << t_zeros.stop() << " s, found "
+                << numZeros << " zeros";
             OpmLog::info(out.str());
         }
 
         if (numJacobiBlocks >= 2) {
-            const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes() : h_jacRows.back();
+            const int jacNnzb = (h_jacRows.empty()) ? jacMat->nonzeroes()
+                                                    : h_jacRows.back();
 
             if (!jacMatrix) {
                 h_jacRows.reserve(Nb+1);
@@ -264,7 +271,7 @@ solve_system(BridgeMatrix* bridgeMat,
                 copySparsityPatternFromISTL(*jacMat, h_jacRows, h_jacCols);
                 checkMemoryContiguous(*jacMat);
                 jacMatrix = std::make_unique<Mat>(Nb, jacNnzb, block_size,
-                                                  static_cast<double*>(&(((*jacMat)[0][0][0][0]))),
+                                                  static_cast<Scalar*>(&(((*jacMat)[0][0][0][0]))),
                                                   h_jacCols.data(),
                                                   h_jacRows.data());
             }
@@ -273,7 +280,8 @@ solve_system(BridgeMatrix* bridgeMat,
             int jacNumZeros = replaceZeroDiagonal(*jacMat, jacDiagIndices);
             if (verbosity >= 2) {
                 std::ostringstream out;
-                out << "Checking zeros for jacMat took: " << t_zeros2.stop() << " s, found " << jacNumZeros << " zeros";
+                out << "Checking zeros for jacMat took: " << t_zeros2.stop()
+                    << " s, found " << jacNumZeros << " zeros";
                 OpmLog::info(out.str());
             }
         }
@@ -281,17 +289,23 @@ solve_system(BridgeMatrix* bridgeMat,
         /////////////////////////
         // actually solve
         // assume that underlying data (nonzeroes) from b (Dune::BlockVector) are contiguous, if this is not the case, the chosen BdaSolver is expected to perform undefined behaviour
-        SolverStatus status = backend->solve_system(matrix, static_cast<double*>(&(b[0][0])), jacMatrix, wellContribs, result);
+        SolverStatus status = backend->solve_system(matrix,
+                                                    static_cast<Scalar*>(&(b[0][0])),
+                                                    jacMatrix, wellContribs, result);
 
-        switch(status) {
+        switch (status) {
         case SolverStatus::BDA_SOLVER_SUCCESS:
             //OpmLog::info("BdaSolver converged");
             break;
         case SolverStatus::BDA_SOLVER_ANALYSIS_FAILED:
-            OpmLog::warning("BdaSolver could not analyse level information of matrix, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+            OpmLog::warning("BdaSolver could not analyse level information of matrix, "
+                            "perhaps there is still a 0.0 on the diagonal of a "
+                            "block on the diagonal");
             break;
         case SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED:
-            OpmLog::warning("BdaSolver could not create preconditioner, perhaps there is still a 0.0 on the diagonal of a block on the diagonal");
+            OpmLog::warning("BdaSolver could not create preconditioner, "
+                            "perhaps there is still a 0.0 on the diagonal "
+                            "of a block on the diagonal");
             break;
         default:
             OpmLog::warning("BdaSolver returned unknown status code");
@@ -307,24 +321,25 @@ solve_system(BridgeMatrix* bridgeMat,
     }
 }
 
-
 template <class BridgeMatrix, class BridgeVector, int block_size>
-void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result([[maybe_unused]] BridgeVector& x) {
+void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
+get_result([[maybe_unused]] BridgeVector& x)
+{
     if (use_gpu) {
-        backend->get_result(static_cast<double*>(&(x[0][0])));
+        backend->get_result(static_cast<Scalar*>(&(x[0][0])));
     }
 }
 
 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::
-initWellContributions([[maybe_unused]] WellContributions<double>& wellContribs,
+initWellContributions([[maybe_unused]] WellContributions<Scalar>& wellContribs,
                       [[maybe_unused]] unsigned N)
 {
     if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
-        using OCL = Accelerator::openclSolverBackend<double,block_size>;
+        using OCL = Accelerator::openclSolverBackend<Scalar,block_size>;
         const auto openclBackend = static_cast<const OCL*>(backend.get());
-        using WCOCL = WellContributionsOCL<double>;
+        using WCOCL = WellContributionsOCL<Scalar>;
         static_cast<WCOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(),
                                                        openclBackend->queue.get());
 #else
@@ -335,23 +350,20 @@ initWellContributions([[maybe_unused]] WellContributions<double>& wellContribs,
 }
 
 // the tests use Dune::FieldMatrix, Flow uses Opm::MatrixBlock
-#define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                           \
-template class BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,   \
-Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                               \
-n>;                                                                                                                            \
-                                                                                                                               \
-template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<double, n, n>, std::allocator<Dune::FieldMatrix<double, n, n> > >, \
-Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                               \
-n>;
+#define INSTANTIATE_BDA_FUNCTIONS(T,n)                                     \
+    template class BdaBridge<Dune::BCRSMatrix<MatrixBlock<T,n,n>>,         \
+                   Dune::BlockVector<Dune::FieldVector<T,n>>,n>;           \
+    template class BdaBridge<Dune::BCRSMatrix<Dune::FieldMatrix<T,n,n>>,   \
+                             Dune::BlockVector<Dune::FieldVector<T,n>>,n>;
 
+#define INSTANTIATE_TYPE(T)        \
+    INSTANTIATE_BDA_FUNCTIONS(T,1) \
+    INSTANTIATE_BDA_FUNCTIONS(T,2) \
+    INSTANTIATE_BDA_FUNCTIONS(T,3) \
+    INSTANTIATE_BDA_FUNCTIONS(T,4) \
+    INSTANTIATE_BDA_FUNCTIONS(T,5) \
+    INSTANTIATE_BDA_FUNCTIONS(T,6)
 
-INSTANTIATE_BDA_FUNCTIONS(1);
-INSTANTIATE_BDA_FUNCTIONS(2);
-INSTANTIATE_BDA_FUNCTIONS(3);
-INSTANTIATE_BDA_FUNCTIONS(4);
-INSTANTIATE_BDA_FUNCTIONS(5);
-INSTANTIATE_BDA_FUNCTIONS(6);
-
-#undef INSTANTIATE_BDA_FUNCTIONS
+INSTANTIATE_TYPE(double)
 
 } // namespace Opm
diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp
index baeb1897a..3ebdf0dcf 100644
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -36,12 +36,13 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 class BdaBridge
 {
 private:
+    using Scalar = typename BridgeVector::field_type;
     int verbosity = 0;
     bool use_gpu = false;
     std::string accelerator_mode;
-    std::unique_ptr<Accelerator::BdaSolver<double,block_size>> backend;
-    std::shared_ptr<Accelerator::BlockedMatrix<double>> matrix;  // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
-    std::shared_ptr<Accelerator::BlockedMatrix<double>> jacMatrix;  // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::unique_ptr<Accelerator::BdaSolver<Scalar,block_size>> backend;
+    std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> matrix;  // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
+    std::shared_ptr<Accelerator::BlockedMatrix<Scalar>> jacMatrix;  // 'stores' preconditioner matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes
     std::vector<int> h_rows, h_cols;  // store the sparsity pattern of the matrix
     std::vector<int> h_jacRows, h_jacCols;  // store the sparsity pattern of the jacMatrix
     std::vector<typename BridgeMatrix::size_type> diagIndices;   // contains offsets of the diagonal blocks wrt start of the row, used for replaceZeroDiagonal()
@@ -57,8 +58,14 @@ public:
     /// \param[in] deviceID                   the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
     /// \param[in] opencl_ilu_parallel        whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling
     /// \param[in] linsolver                  indicating the preconditioner, equal to the --linear-solver cmdline argument
-    BdaBridge(std::string accelerator_mode, int linear_solver_verbosity, int maxit, double tolerance,
-        unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver);
+    BdaBridge(std::string accelerator_mode,
+              int linear_solver_verbosity,
+              int maxit,
+              Scalar tolerance,
+              unsigned int platformID,
+              unsigned int deviceID,
+              bool opencl_ilu_parallel,
+              std::string linsolver);
 
 
     /// Solve linear system, A*x = b
@@ -73,7 +80,7 @@ public:
                       BridgeMatrix* jacMat,
                       int numJacobiBlocks,
                       BridgeVector& b,
-                      WellContributions<double>& wellContribs,
+                      WellContributions<Scalar>& wellContribs,
                       InverseOperatorResult &result);
 
     /// Get the resulting x vector
@@ -82,7 +89,8 @@ public:
 
     /// Return whether the BdaBridge will use the GPU or not
     /// return whether the BdaBridge will use the GPU or not
-    bool getUseGpu(){
+    bool getUseGpu()
+    {
         return use_gpu;
     }
 
@@ -90,19 +98,21 @@ public:
     /// \param[in] mat       input matrix, probably BCRSMatrix
     /// \param[out] h_rows   rowpointers
     /// \param[out] h_cols   columnindices
-    static void copySparsityPatternFromISTL(const BridgeMatrix& mat, std::vector<int>& h_rows, std::vector<int>& h_cols);
+    static void copySparsityPatternFromISTL(const BridgeMatrix& mat,
+                                            std::vector<int>& h_rows,
+                                            std::vector<int>& h_cols);
 
     /// Initialize the WellContributions object with opencl context and queue
     /// those must be set before calling BlackOilWellModel::getWellContributions() in ISTL
     /// \param[in] wellContribs   container to hold all WellContributions
     /// \param[in] N              number of rows in scalar vector that wellContribs will be applied on
-    void initWellContributions(WellContributions<double>& wellContribs, unsigned N);
+    void initWellContributions(WellContributions<Scalar>& wellContribs, unsigned N);
 
     /// Return the selected accelerator mode, this is input via the command-line
-    std::string getAccleratorName(){
+    std::string getAccleratorName()
+    {
         return accelerator_mode;
     }
-
 }; // end class BdaBridge
 
 }