From a49aaf53d25a393868800be50f2ac0ec497dcfa5 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Tue, 28 Mar 2023 16:31:50 +0200
Subject: [PATCH 01/14] Add new files rocsparseWellContributions With HIP
 stdwell_apply kernel MultisegmentWells are done on CPU via UMFPack

---
 CMakeLists_files.cmake                        |   3 +
 .../linalg/bda/rocsparseWellContributions.cpp | 212 ++++++++++++++++++
 .../linalg/bda/rocsparseWellContributions.hpp |  55 +++++
 3 files changed, 270 insertions(+)
 create mode 100644 opm/simulators/linalg/bda/rocsparseWellContributions.cpp
 create mode 100644 opm/simulators/linalg/bda/rocsparseWellContributions.hpp

diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 077290e28..3202fe8ab 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -195,6 +195,7 @@ if(CUDA_FOUND)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/set_device.hpp)
 
 endif()
+
 if(USE_BDA_BRIDGE)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp
                                  opm/simulators/linalg/bda/WellContributions.cpp
@@ -219,6 +220,7 @@ if(USE_BDA_BRIDGE)
   endif()
   if(rocsparse_FOUND AND rocblas_FOUND)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseWellContributions.cpp)
   endif()
   if(amgcl_FOUND)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
@@ -473,6 +475,7 @@ list (APPEND PUBLIC_HEADER_FILES
   opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
   opm/simulators/linalg/bda/rocalutionSolverBackend.hpp
   opm/simulators/linalg/bda/rocsparseSolverBackend.hpp
+  opm/simulators/linalg/bda/rocsparseWellContributions.hpp
   opm/simulators/linalg/bda/WellContributions.hpp
   opm/simulators/linalg/amgcpr.hh
   opm/simulators/linalg/twolevelmethodcpr.hh
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
new file mode 100644
index 000000000..b04131061
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -0,0 +1,212 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h> // CMake
+
+#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <hip/hip_runtime.h>
+
+
+#define HIP_CHECK(stat)                               \
+    {                                                 \
+        if(stat != hipSuccess)                        \
+        {                                             \
+            OPM_THROW(std::logic_error, "HIP error"); \
+        }                                             \
+    }
+
+namespace Opm
+{
+
+/// HIP kernel to apply the standard wellcontributions
+__global__ void stdwell_apply(
+            const double *Cnnzs,
+            const double *Dnnzs,
+            const double *Bnnzs,
+            const unsigned *Ccols,
+            const unsigned *Bcols,
+            const double *x,
+            double *y,
+            const unsigned dim,
+            const unsigned dim_wells,
+            const unsigned *val_pointers)
+{
+    unsigned wgId = blockIdx.x;
+    unsigned wiId = threadIdx.x;
+    unsigned valSize = val_pointers[wgId + 1] - val_pointers[wgId];
+    unsigned valsPerBlock = dim*dim_wells;
+    unsigned numActiveWorkItems = (blockDim.x/valsPerBlock)*valsPerBlock;
+    unsigned numBlocksPerWarp = blockDim.x/valsPerBlock;
+    unsigned c = wiId % dim;
+    unsigned r = (wiId/dim) % dim_wells;
+    double temp;
+
+    extern __shared__ double localSum[];
+    double *z1 = localSum + gridDim.x;
+    double *z2 = z1 + dim_wells;
+
+    localSum[wiId] = 0;
+    if(wiId < numActiveWorkItems){
+        unsigned b = wiId/valsPerBlock + val_pointers[wgId];
+        while(b < valSize + val_pointers[wgId]){
+            int colIdx = Bcols[b];
+            localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
+            b += numBlocksPerWarp;
+        }
+
+        // merge all blocks in this workgroup into 1 block
+        // if numBlocksPerWarp >= 3, should use loop
+        // block 1:     block 2:
+        //  0  1  2     12 13 14
+        //  3  4  5     15 16 17
+        //  6  7  8     18 19 20
+        //  9 10 11     21 22 23
+        // workitem i will hold the sum of workitems i and i + valsPerBlock
+        if(wiId < valsPerBlock){
+            for (unsigned i = 1; i < numBlocksPerWarp; ++i) {
+                localSum[wiId] += localSum[wiId + i*valsPerBlock];
+	    }
+        }
+
+        if(c == 0 && wiId < valsPerBlock){
+            for(unsigned i = dim - 1; i > 0; --i){
+                localSum[wiId] += localSum[wiId + i];
+            }
+            z1[r] = localSum[wiId];
+        }
+    }
+
+    __syncthreads();
+
+    if(wiId < dim_wells){
+        temp = 0.0;
+        for(unsigned i = 0; i < dim_wells; ++i){
+            temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
+        }
+        z2[wiId] = temp;
+    }
+
+    __syncthreads();
+
+    if(wiId < dim*valSize){
+        temp = 0.0;
+        unsigned bb = wiId/dim + val_pointers[wgId];
+        for (unsigned j = 0; j < dim_wells; ++j){
+            temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
+        }
+
+        int colIdx = Ccols[bb];
+        y[colIdx*dim + c] -= temp;
+    }
+}
+
+
+void WellContributionsRocsparse::apply_stdwells(double *d_x, double *d_y){
+    unsigned gridDim = num_std_wells;
+    unsigned blockDim = 32;
+    unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
+    // dim3(N) will create a vector {N, 1, 1}
+    stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(
+        d_Cnnzs_hip, d_Dnnzs_hip, d_Bnnzs_hip, d_Ccols_hip, d_Bcols_hip,
+        d_x, d_y, dim, dim_wells, d_val_pointers_hip
+    );
+    HIP_CHECK(hipStreamSynchronize(stream));
+}
+
+void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){
+    if (h_x.empty()) {
+        h_x.resize(N);
+        h_y.resize(N);
+    }
+
+    HIP_CHECK(hipMemcpyAsync(h_x.data(), d_x, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
+    HIP_CHECK(hipMemcpyAsync(h_y.data(), d_y, sizeof(double) * N, hipMemcpyDeviceToHost, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+
+    // actually apply MultisegmentWells
+    for (auto& well : multisegments) {
+        well->apply(h_x.data(), h_y.data());
+    }
+
+    // copy vector y from CPU to GPU
+    HIP_CHECK(hipMemcpyAsync(d_y, h_y.data(), sizeof(double) * N, hipMemcpyHostToDevice, stream));
+    HIP_CHECK(hipStreamSynchronize(stream));
+}
+
+void WellContributionsRocsparse::apply(double *d_x, double *d_y){
+    if(num_std_wells > 0){
+        apply_stdwells(d_x, d_y);
+    }
+
+    if(num_ms_wells > 0){
+        apply_mswells(d_x, d_y);
+    }
+}
+
+void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
+                                        int* colIndices,
+                                        double* values,
+                                        unsigned int val_size)
+{
+    if (!allocated) {
+        OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
+    }
+
+    switch (type) {
+    case MatrixType::C:
+        HIP_CHECK(hipMemcpyAsync(d_Cnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Cnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Ccols_hip + num_blocks_so_far, colIndices, sizeof(d_Ccols_hip) * val_size, hipMemcpyHostToDevice, stream));
+        break;
+
+    case MatrixType::D:
+        HIP_CHECK(hipMemcpyAsync(d_Dnnzs_hip + num_std_wells_so_far * dim_wells * dim_wells, values, sizeof(d_Dnnzs_hip) * dim_wells * dim_wells, hipMemcpyHostToDevice, stream));
+        break;
+
+    case MatrixType::B:
+        HIP_CHECK(hipMemcpyAsync(d_Bnnzs_hip + num_blocks_so_far * dim * dim_wells, values, sizeof(d_Bnnzs_hip) * val_size * dim * dim_wells, hipMemcpyHostToDevice, stream));
+        HIP_CHECK(hipMemcpyAsync(d_Bcols_hip + num_blocks_so_far, colIndices, sizeof(d_Bcols_hip) * val_size, hipMemcpyHostToDevice, stream));
+
+        val_pointers[num_std_wells_so_far] = num_blocks_so_far;
+        if (num_std_wells_so_far == num_std_wells - 1) {
+            val_pointers[num_std_wells] = num_blocks;
+            HIP_CHECK(hipMemcpyAsync(d_val_pointers_hip, val_pointers.data(), sizeof(d_val_pointers_hip) * (num_std_wells + 1), hipMemcpyHostToDevice, stream));
+        }
+        break;
+
+    default:
+        OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributionsRocsparse::addMatrix()");
+    }
+    HIP_CHECK(hipStreamSynchronize(stream));
+}
+
+void WellContributionsRocsparse::APIalloc()
+{
+    HIP_CHECK(hipMalloc((void**)&d_Cnnzs_hip, sizeof(d_Cnnzs_hip) * num_blocks * dim * dim_wells));
+    HIP_CHECK(hipMalloc((void**)&d_Dnnzs_hip, sizeof(d_Dnnzs_hip) * num_std_wells * dim_wells * dim_wells));
+    HIP_CHECK(hipMalloc((void**)&d_Bnnzs_hip, sizeof(d_Bnnzs_hip) * num_blocks * dim * dim_wells));
+    HIP_CHECK(hipMalloc((void**)&d_Ccols_hip, sizeof(d_Ccols_hip) * num_blocks));
+    HIP_CHECK(hipMalloc((void**)&d_Bcols_hip, sizeof(d_Bcols_hip) * num_blocks));
+    HIP_CHECK(hipMalloc((void**)&d_val_pointers_hip, sizeof(d_val_pointers_hip) * (num_std_wells + 1)));
+}
+
+} //namespace Opm
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.hpp b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
new file mode 100644
index 000000000..e9ca6a686
--- /dev/null
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
@@ -0,0 +1,55 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef WELLCONTRIBUTIONS_ROCSPARSE_HEADER_INCLUDED
+#define WELLCONTRIBUTIONS_ROCSPARSE_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/WellContributions.hpp>
+
+
+#include <vector>
+
+
+namespace Opm
+{
+
+class WellContributionsRocsparse : public WellContributions
+{
+public:
+    void apply_stdwells(double *d_x, double *d_y);
+    void apply_mswells(double *d_x, double *d_y);
+    void apply(double *d_x, double *d_y);
+
+protected:
+    /// Allocate memory for the StandardWells
+    void APIalloc() override;
+
+    void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
+
+    double *d_Cnnzs_hip, *d_Dnnzs_hip, *d_Bnnzs_hip;
+    unsigned *d_Ccols_hip, *d_Bcols_hip;
+    unsigned *d_val_pointers_hip;
+
+    std::vector<double> h_x;
+    std::vector<double> h_y;
+};
+
+} //namespace Opm
+
+#endif

From 9bef10a018de32b962152e7548296bceddbd7012 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Fri, 14 Apr 2023 09:35:15 +0200
Subject: [PATCH 02/14] Set hipStream for rocsparseWellcontributions

---
 opm/simulators/linalg/bda/rocsparseSolverBackend.cpp     | 6 ++++++
 opm/simulators/linalg/bda/rocsparseWellContributions.cpp | 4 ++++
 opm/simulators/linalg/bda/rocsparseWellContributions.hpp | 5 +++++
 3 files changed, 15 insertions(+)

diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index 14392564d..52db594ad 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -153,6 +153,12 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
 
     Timer t_total, t_prec(false), t_spmv(false), t_rest(false);
 
+    // set stream here, the WellContributions object is destroyed every linear solve
+    // the number of wells can change every linear solve
+    if(wellContribs.getNumWells() > 0){
+        static_cast<WellContributionsRocsparse&>(wellContribs).setStream(stream);
+    }
+
 // HIP_VERSION is defined as (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
 #if HIP_VERSION >= 50400000
     ROCSPARSE_CHECK(rocsparse_dbsrmv_ex(handle, dir, operation,
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
index b04131061..504f66609 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -163,6 +163,10 @@ void WellContributionsRocsparse::apply(double *d_x, double *d_y){
     }
 }
 
+void WellContributionsRocsparse::setStream(hipStream_t stream_){
+    stream = stream_;
+}
+
 void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
                                         int* colIndices,
                                         double* values,
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.hpp b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
index e9ca6a686..b6e6dc81d 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
@@ -22,6 +22,7 @@
 
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 
+#include <hip/hip_runtime_api.h>
 
 #include <vector>
 
@@ -31,10 +32,14 @@ namespace Opm
 
 class WellContributionsRocsparse : public WellContributions
 {
+private:
+    hipStream_t stream;
+
 public:
     void apply_stdwells(double *d_x, double *d_y);
     void apply_mswells(double *d_x, double *d_y);
     void apply(double *d_x, double *d_y);
+    void setStream(hipStream_t stream);
 
 protected:
     /// Allocate memory for the StandardWells

From 50ccea0880bf2d2b64896533b5a35013e5fbe06f Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Fri, 14 Apr 2023 09:35:44 +0200
Subject: [PATCH 03/14] Allow rocsparseSolver to choose deviceID

---
 opm/simulators/linalg/bda/rocsparseSolverBackend.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index 52db594ad..d85fc4063 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -96,11 +96,12 @@ using Dune::Timer;
 
 template <unsigned int block_size>
 rocsparseSolverBackend<block_size>::rocsparseSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
-    hipDevice_t device;
-    if(hipDeviceGet(&device, deviceID) != hipSuccess)
-    {
-        OPM_THROW(std::logic_error, "HIP Error: could not get device");
+    int numDevices = 0;
+    HIP_CHECK(hipGetDeviceCount(&numDevices));
+    if (static_cast<int>(deviceID) >= numDevices) {
+        OPM_THROW(std::runtime_error, "Error chosen too high HIP device ID");
     }
+    HIP_CHECK(hipSetDevice(deviceID));
 
     ROCSPARSE_CHECK(rocsparse_create_handle(&handle));
     ROCBLAS_CHECK(rocblas_create_handle(&blas_handle));

From cb20d041c3c1d172e03887a8a7c2b94eac08aecb Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Tue, 28 Mar 2023 16:31:50 +0200
Subject: [PATCH 04/14] Allow WellContributionsRocsparse to be used

---
 .../linalg/bda/WellContributions.cpp          | 11 ++++++++-
 .../linalg/bda/rocsparseSolverBackend.cpp     | 23 ++++++++++++++++---
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp
index 4e0675b40..f60b1d17a 100644
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@@ -32,6 +32,10 @@
 #include <opm/simulators/linalg/bda/cuda/cuWellContributions.hpp>
 #endif
 
+#ifdef HAVE_ROCSPARSE
+#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
+#endif
+
 namespace Opm
 {
 
@@ -54,9 +58,14 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
     }
     else if(accelerator_mode.compare("rocsparse") == 0){
         if (!useWellConn) {
-            OPM_THROW(std::logic_error, "Error rocsparse requires --matrix-add-well-contributions=true");
+#if HAVE_ROCSPARSE
+            return std::make_unique<WellContributionsRocsparse>();
+#else
+        OPM_THROW(std::runtime_error, "Cannot initialize well contributions: rocsparse is not enabled");
+#endif
         }
         return std::make_unique<WellContributions>();
+
     }
     else if(accelerator_mode.compare("amgcl") == 0){
         if (!useWellConn) {
diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index d85fc4063..635bde61c 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -37,6 +37,7 @@
 #undef HAVE_CUDA
 
 #include <opm/simulators/linalg/bda/rocsparseSolverBackend.hpp>
+#include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
 
 #include <opm/simulators/linalg/bda/BdaResult.hpp>
 
@@ -152,7 +153,7 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
     double one  = 1.0;
     double mone = -1.0;
 
-    Timer t_total, t_prec(false), t_spmv(false), t_rest(false);
+    Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
 
     // set stream here, the WellContributions object is destroyed every linear solve
     // the number of wells can change every linear solve
@@ -232,10 +233,18 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
         if (verbosity >= 3) {
             HIP_CHECK(hipStreamSynchronize(stream));
             t_spmv.stop();
-            t_rest.start();
+            t_well.start();
         }
 
         // apply wellContributions
+        if(wellContribs.getNumWells() > 0){
+            static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_pw, d_v);
+        }
+        if (verbosity >= 3) {
+            HIP_CHECK(hipStreamSynchronize(stream));
+            t_well.stop();
+            t_rest.start();
+        }
 
         ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_rw, 1, d_v, 1, &tmp1));
         alpha = rho / tmp1;
@@ -285,10 +294,18 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
         if(verbosity >= 3){
             HIP_CHECK(hipStreamSynchronize(stream));
             t_spmv.stop();
-            t_rest.start();
+            t_well.start();
         }
 
         // apply wellContributions
+        if(wellContribs.getNumWells() > 0){
+            static_cast<WellContributionsRocsparse&>(wellContribs).apply(d_s, d_t);
+        }
+        if (verbosity >= 3) {
+            HIP_CHECK(hipStreamSynchronize(stream));
+            t_well.stop();
+            t_rest.start();
+        }
 
         ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_r, 1, &tmp1));
         ROCBLAS_CHECK(rocblas_ddot(blas_handle, N, d_t, 1, d_t, 1, &tmp2));

From 684397129bffb4dddfa96372fd2d29b1ff85caf6 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Tue, 28 Mar 2023 16:31:50 +0200
Subject: [PATCH 05/14] Make sure rocsparse can get wellcontributions

---
 opm/simulators/linalg/ISTLSolverEbos.cpp | 166 +++++++++++++++++++++++
 1 file changed, 166 insertions(+)

diff --git a/opm/simulators/linalg/ISTLSolverEbos.cpp b/opm/simulators/linalg/ISTLSolverEbos.cpp
index 241df3fb0..9f8f64e46 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.cpp
@@ -203,6 +203,172 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
     }
 }
 
+//Razvan<<<<<<< HEAD
+//Razvan=======
+#if COMPILE_BDA_BRIDGE
+template<class Matrix, class Vector>
+BdaSolverInfo<Matrix,Vector>::
+BdaSolverInfo(const std::string& accelerator_mode,
+              const int linear_solver_verbosity,
+              const int maxit,
+              const double tolerance,
+              const int platformID,
+              const int deviceID,
+              const bool opencl_ilu_parallel,
+              const std::string& linsolver)
+    : bridge_(std::make_unique<Bridge>(accelerator_mode,
+                                       linear_solver_verbosity, maxit,
+                                       tolerance, platformID, deviceID,
+                                       opencl_ilu_parallel, linsolver))
+    , accelerator_mode_(accelerator_mode)
+{}
+
+template<class Matrix, class Vector>
+BdaSolverInfo<Matrix,Vector>::~BdaSolverInfo() = default;
+
+template<class Matrix, class Vector>
+template<class Grid>
+void BdaSolverInfo<Matrix,Vector>::
+prepare(const Grid& grid,
+        const Dune::CartesianIndexMapper<Grid>& cartMapper,
+        const std::vector<Well>& wellsForConn,
+        const std::vector<int>& cellPartition,
+        const size_t nonzeroes,
+        const bool useWellConn)
+{
+    if (numJacobiBlocks_ > 1) {
+      detail::setWellConnections(grid, cartMapper, wellsForConn,
+                                 useWellConn,
+                                 wellConnectionsGraph_,
+                                 numJacobiBlocks_);
+      this->blockJacobiAdjacency(grid, cellPartition, nonzeroes);
+    }
+}
+
+template<class Matrix, class Vector>
+bool BdaSolverInfo<Matrix,Vector>::
+apply(Vector& rhs,
+      const bool useWellConn,
+      WellContribFunc getContribs,
+      const int rank,
+      Matrix& matrix,
+      Vector& x,
+      Dune::InverseOperatorResult& result)
+{
+    bool use_gpu = bridge_->getUseGpu();
+    if (use_gpu) {
+        auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
+        bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
+
+        // the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_ROCSPARSE
+        if (!useWellConn) {
+            getContribs(*wellContribs);
+        }
+#endif
+
+        if (numJacobiBlocks_ > 1) {
+            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
+            // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
+            bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
+                                  numJacobiBlocks_, rhs, *wellContribs, result);
+        }
+        else
+            bridge_->solve_system(&matrix, &matrix,
+                                  numJacobiBlocks_, rhs, *wellContribs, result);
+        if (result.converged) {
+            // get result vector x from non-Dune backend, iff solve was successful
+            bridge_->get_result(x);
+            return true;
+        } else {
+            // warn about CPU fallback
+            // BdaBridge might have disabled its BdaSolver for this simulation due to some error
+            // in that case the BdaBridge is disabled and flexibleSolver is always used
+            // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
+            if (rank == 0) {
+                OpmLog::warning(bridge_->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
+            }
+        }
+    }
+
+    return false;
+}
+
+template<class Matrix, class Vector>
+bool BdaSolverInfo<Matrix,Vector>::
+gpuActive()
+{
+    return bridge_->getUseGpu();
+}
+
+template<class Matrix, class Vector>
+template<class Grid>
+void BdaSolverInfo<Matrix,Vector>::
+blockJacobiAdjacency(const Grid& grid,
+                     const std::vector<int>& cell_part,
+                     size_t nonzeroes)
+{
+    using size_type = typename Matrix::size_type;
+    using Iter = typename Matrix::CreateIterator;
+    size_type numCells = grid.size(0);
+    blockJacobiForGPUILU0_ = std::make_unique<Matrix>(numCells, numCells,
+                                                      nonzeroes, Matrix::row_wise);
+
+    const auto& lid = grid.localIdSet();
+    const auto& gridView = grid.leafGridView();
+    auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
+
+    //Loop over cells
+    for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
+    {
+        const auto& elem = *elemIt;
+        size_type idx = lid.id(elem);
+        row.insert(idx);
+
+        // Add well non-zero connections
+        for (const auto wc : wellConnectionsGraph_[idx]) {
+            row.insert(wc);
+        }
+
+        int locPart = cell_part[idx];
+
+        //Add neighbor if it is on the same part
+        auto isend = gridView.iend(elem);
+        for (auto is = gridView.ibegin(elem); is!=isend; ++is)
+        {
+            //check if face has neighbor
+            if (is->neighbor())
+            {
+                size_type nid = lid.id(is->outside());
+                int nabPart = cell_part[nid];
+                if (locPart == nabPart) {
+                    row.insert(nid);
+                }
+            }
+        }
+    }
+}
+
+template<class Matrix, class Vector>
+void BdaSolverInfo<Matrix,Vector>::
+copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
+{
+    auto rbegin = blockJac.begin();
+    auto rend = blockJac.end();
+    auto outerRow = mat.begin();
+    for (auto row = rbegin; row != rend; ++row, ++outerRow) {
+        auto outerCol = (*outerRow).begin();
+        for (auto col = (*row).begin(); col != (*row).end(); ++col) {
+            // outerRow is guaranteed to have all column entries that row has!
+            while(outerCol.index() < col.index()) ++outerCol;
+            assert(outerCol.index() == col.index());
+            *col = *outerCol; // copy nonzero block
+        }
+    }
+}
+#endif // COMPILE_BDA_BRIDGE
+
+//Razvan>>>>>>> 1a32e4cc1 (Make sure rocsparse can get wellcontributions)
 template<int Dim>
 using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
 template<int Dim>

From ef6be5859efdddc00d8a823b5ccce51bbf409b51 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Fri, 14 Apr 2023 12:59:07 +0200
Subject: [PATCH 06/14] Restore old HAVE_CUDA value

---
 .../linalg/bda/rocsparseWellContributions.cpp    | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
index 504f66609..22619dfb5 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -19,11 +19,27 @@
 
 #include <config.h> // CMake
 
+// MultisegmentWellContribution includes the cuda runtime if found by CMake
+// this leads to inclusion of both amd_hip_vector_types.h and vector_types.h
+// which both define vector types like uchar2, short3 and double4.
+// Restore the value (if defined) afterwards.
+#ifdef HAVE_CUDA
+#define HIP_HAVE_CUDA_DEFINED HAVE_CUDA
+#endif
+
+#undef HAVE_CUDA
+
 #include <opm/simulators/linalg/bda/rocsparseWellContributions.hpp>
 
+#ifdef HIP_HAVE_CUDA_DEFINED
+#define HAVE_CUDA HIP_HAVE_CUDA_DEFINED
+#undef HIP_HAVE_CUDA_DEFINED
+#endif
+
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 
+
 #include <hip/hip_runtime.h>
 
 

From abce3a897c1f2080122826139f416c3e19da84cb Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Fri, 14 Apr 2023 13:00:29 +0200
Subject: [PATCH 07/14] Only compile HIP kernel with hipcc

---
 opm/simulators/linalg/bda/rocsparseWellContributions.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
index 22619dfb5..4e448ff94 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -54,6 +54,7 @@
 namespace Opm
 {
 
+#ifdef __HIP__
 /// HIP kernel to apply the standard wellcontributions
 __global__ void stdwell_apply(
             const double *Cnnzs,
@@ -135,9 +136,12 @@ __global__ void stdwell_apply(
         y[colIdx*dim + c] -= temp;
     }
 }
+#endif
 
 
-void WellContributionsRocsparse::apply_stdwells(double *d_x, double *d_y){
+void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
+                                                [[maybe_unused]] double *d_y){
+#ifdef __HIP__
     unsigned gridDim = num_std_wells;
     unsigned blockDim = 32;
     unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
@@ -147,6 +151,9 @@ void WellContributionsRocsparse::apply_stdwells(double *d_x, double *d_y){
         d_x, d_y, dim, dim_wells, d_val_pointers_hip
     );
     HIP_CHECK(hipStreamSynchronize(stream));
+#else
+    OPM_THROW(std::logic_error, "Error separate wellcontributions for rocsparse only supported when compiling with hipcc");
+#endif
 }
 
 void WellContributionsRocsparse::apply_mswells(double *d_x, double *d_y){

From b2fea287bc38718b06effa71dce4d8a135345548 Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Fri, 14 Apr 2023 13:01:44 +0200
Subject: [PATCH 08/14] Print time to apply wells for verbosity>=3

---
 opm/simulators/linalg/bda/rocsparseSolverBackend.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
index 635bde61c..56351725d 100644
--- a/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/rocsparseSolverBackend.cpp
@@ -347,6 +347,7 @@ void rocsparseSolverBackend<block_size>::gpu_pbicgstab([[maybe_unused]] WellCont
         std::ostringstream out;
         out << "rocsparseSolver::prec_apply:  " << t_prec.elapsed() << " s\n";
         out << "rocsparseSolver::spmv:        " << t_spmv.elapsed() << " s\n";
+        out << "rocsparseSolver::well:        " << t_well.elapsed() << " s\n";
         out << "rocsparseSolver::rest:        " << t_rest.elapsed() << " s\n";
         out << "rocsparseSolver::total_solve: " << res.elapsed << " s\n";
         OpmLog::info(out.str());

From ca210612e1fa685d88dc761b78c2c2bd5cc7b00f Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvan.nane@bigdataccelerate.com>
Date: Fri, 30 Jun 2023 14:40:45 +0200
Subject: [PATCH 09/14] Fix hipErrorNoBinaryForGpu when hip_FOUND

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e16ffc284..bcd523838 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -684,6 +684,10 @@ if(USE_BDA_BRIDGE)
   if(VexCL_FOUND)
     target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
   endif()
+
+  if(hip_FOUND)
+    target_link_libraries( opmsimulators PUBLIC hip::device )
+  endif()
 endif()
 
 if(Damaris_FOUND)

From 177a46366d82bc36365e435a71341c15611507dc Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvan.nane@bigdataccelerate.com>
Date: Fri, 30 Jun 2023 14:44:08 +0200
Subject: [PATCH 10/14] Add <rocsparse> to accelerator-mode option in doc

---
 opm/simulators/linalg/FlowLinearSolverParameters.hpp | 2 +-
 opm/simulators/linalg/bda/BdaBridge.cpp              | 2 +-
 opm/simulators/linalg/bda/BdaBridge.hpp              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/opm/simulators/linalg/FlowLinearSolverParameters.hpp b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
index 4e896b759..b24ad935c 100644
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@@ -311,7 +311,7 @@ namespace Opm
             EWOMS_REGISTER_PARAM(TypeTag, bool, LinearSolverPrintJsonDefinition, "Write the JSON definition of the linear solver setup to the DBG file.");
             EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate, 4: recreated every CprReuseInterval");
             EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseInterval, "Reuse preconditioner interval. Used when CprReuseSetup is set to 4, then the preconditioner will be fully recreated instead of reused every N linear solve, where N is this parameter.");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Choose a linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]'");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Choose a linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution|rocsparse]'");
             EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
             EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
             EWOMS_REGISTER_PARAM(TypeTag, bool, OpenclIluParallel, "Parallelize ILU decomposition and application on GPU");
diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp
index 1ddb3216c..5434e22fe 100644
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@@ -108,7 +108,7 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string acceler
     } else if (accelerator_mode.compare("none") == 0) {
         use_gpu = false;
     } else {
-        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]");
+        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution|rocsparse]");
     }
 }
 
diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp
index 8001b9552..03d5904c3 100644
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@@ -49,7 +49,7 @@ private:
 
 public:
     /// Construct a BdaBridge
-    /// \param[in] accelerator_mode           to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]'
+    /// \param[in] accelerator_mode           to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution|rocsparse]'
     /// \param[in] linear_solver_verbosity    verbosity of BdaSolver
     /// \param[in] maxit                      maximum number of iterations for BdaSolver
     /// \param[in] tolerance                  required relative tolerance for BdaSolver

From e4abc12a05222e62376f539877385fe4672a458c Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvan.nane@bigdataccelerate.com>
Date: Tue, 26 Sep 2023 06:17:18 +0200
Subject: [PATCH 11/14] Adapt rocsparse separate wells PR to changes made to
 ISTLSolverEbos

---
 CMakeLists.txt                                |   4 -
 opm/simulators/linalg/ISTLSolverEbos.cpp      | 166 ------------------
 opm/simulators/linalg/ISTLSolverEbosBda.cpp   |   4 +-
 .../linalg/bda/rocsparseWellContributions.cpp |   2 +-
 4 files changed, 3 insertions(+), 173 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bcd523838..e16ffc284 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -684,10 +684,6 @@ if(USE_BDA_BRIDGE)
   if(VexCL_FOUND)
     target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL )
   endif()
-
-  if(hip_FOUND)
-    target_link_libraries( opmsimulators PUBLIC hip::device )
-  endif()
 endif()
 
 if(Damaris_FOUND)
diff --git a/opm/simulators/linalg/ISTLSolverEbos.cpp b/opm/simulators/linalg/ISTLSolverEbos.cpp
index 9f8f64e46..241df3fb0 100644
--- a/opm/simulators/linalg/ISTLSolverEbos.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.cpp
@@ -203,172 +203,6 @@ void FlexibleSolverInfo<Matrix,Vector,Comm>::create(const Matrix& matrix,
     }
 }
 
-//Razvan<<<<<<< HEAD
-//Razvan=======
-#if COMPILE_BDA_BRIDGE
-template<class Matrix, class Vector>
-BdaSolverInfo<Matrix,Vector>::
-BdaSolverInfo(const std::string& accelerator_mode,
-              const int linear_solver_verbosity,
-              const int maxit,
-              const double tolerance,
-              const int platformID,
-              const int deviceID,
-              const bool opencl_ilu_parallel,
-              const std::string& linsolver)
-    : bridge_(std::make_unique<Bridge>(accelerator_mode,
-                                       linear_solver_verbosity, maxit,
-                                       tolerance, platformID, deviceID,
-                                       opencl_ilu_parallel, linsolver))
-    , accelerator_mode_(accelerator_mode)
-{}
-
-template<class Matrix, class Vector>
-BdaSolverInfo<Matrix,Vector>::~BdaSolverInfo() = default;
-
-template<class Matrix, class Vector>
-template<class Grid>
-void BdaSolverInfo<Matrix,Vector>::
-prepare(const Grid& grid,
-        const Dune::CartesianIndexMapper<Grid>& cartMapper,
-        const std::vector<Well>& wellsForConn,
-        const std::vector<int>& cellPartition,
-        const size_t nonzeroes,
-        const bool useWellConn)
-{
-    if (numJacobiBlocks_ > 1) {
-      detail::setWellConnections(grid, cartMapper, wellsForConn,
-                                 useWellConn,
-                                 wellConnectionsGraph_,
-                                 numJacobiBlocks_);
-      this->blockJacobiAdjacency(grid, cellPartition, nonzeroes);
-    }
-}
-
-template<class Matrix, class Vector>
-bool BdaSolverInfo<Matrix,Vector>::
-apply(Vector& rhs,
-      const bool useWellConn,
-      WellContribFunc getContribs,
-      const int rank,
-      Matrix& matrix,
-      Vector& x,
-      Dune::InverseOperatorResult& result)
-{
-    bool use_gpu = bridge_->getUseGpu();
-    if (use_gpu) {
-        auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
-        bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
-
-        // the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
-#if HAVE_CUDA || HAVE_OPENCL || HAVE_ROCSPARSE
-        if (!useWellConn) {
-            getContribs(*wellContribs);
-        }
-#endif
-
-        if (numJacobiBlocks_ > 1) {
-            this->copyMatToBlockJac(matrix, *blockJacobiForGPUILU0_);
-            // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
-            bridge_->solve_system(&matrix, blockJacobiForGPUILU0_.get(),
-                                  numJacobiBlocks_, rhs, *wellContribs, result);
-        }
-        else
-            bridge_->solve_system(&matrix, &matrix,
-                                  numJacobiBlocks_, rhs, *wellContribs, result);
-        if (result.converged) {
-            // get result vector x from non-Dune backend, iff solve was successful
-            bridge_->get_result(x);
-            return true;
-        } else {
-            // warn about CPU fallback
-            // BdaBridge might have disabled its BdaSolver for this simulation due to some error
-            // in that case the BdaBridge is disabled and flexibleSolver is always used
-            // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
-            if (rank == 0) {
-                OpmLog::warning(bridge_->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
-            }
-        }
-    }
-
-    return false;
-}
-
-template<class Matrix, class Vector>
-bool BdaSolverInfo<Matrix,Vector>::
-gpuActive()
-{
-    return bridge_->getUseGpu();
-}
-
-template<class Matrix, class Vector>
-template<class Grid>
-void BdaSolverInfo<Matrix,Vector>::
-blockJacobiAdjacency(const Grid& grid,
-                     const std::vector<int>& cell_part,
-                     size_t nonzeroes)
-{
-    using size_type = typename Matrix::size_type;
-    using Iter = typename Matrix::CreateIterator;
-    size_type numCells = grid.size(0);
-    blockJacobiForGPUILU0_ = std::make_unique<Matrix>(numCells, numCells,
-                                                      nonzeroes, Matrix::row_wise);
-
-    const auto& lid = grid.localIdSet();
-    const auto& gridView = grid.leafGridView();
-    auto elemIt = gridView.template begin<0>(); // should never overrun, since blockJacobiForGPUILU0_ is initialized with numCells rows
-
-    //Loop over cells
-    for (Iter row = blockJacobiForGPUILU0_->createbegin(); row != blockJacobiForGPUILU0_->createend(); ++elemIt, ++row)
-    {
-        const auto& elem = *elemIt;
-        size_type idx = lid.id(elem);
-        row.insert(idx);
-
-        // Add well non-zero connections
-        for (const auto wc : wellConnectionsGraph_[idx]) {
-            row.insert(wc);
-        }
-
-        int locPart = cell_part[idx];
-
-        //Add neighbor if it is on the same part
-        auto isend = gridView.iend(elem);
-        for (auto is = gridView.ibegin(elem); is!=isend; ++is)
-        {
-            //check if face has neighbor
-            if (is->neighbor())
-            {
-                size_type nid = lid.id(is->outside());
-                int nabPart = cell_part[nid];
-                if (locPart == nabPart) {
-                    row.insert(nid);
-                }
-            }
-        }
-    }
-}
-
-template<class Matrix, class Vector>
-void BdaSolverInfo<Matrix,Vector>::
-copyMatToBlockJac(const Matrix& mat, Matrix& blockJac)
-{
-    auto rbegin = blockJac.begin();
-    auto rend = blockJac.end();
-    auto outerRow = mat.begin();
-    for (auto row = rbegin; row != rend; ++row, ++outerRow) {
-        auto outerCol = (*outerRow).begin();
-        for (auto col = (*row).begin(); col != (*row).end(); ++col) {
-            // outerRow is guaranteed to have all column entries that row has!
-            while(outerCol.index() < col.index()) ++outerCol;
-            assert(outerCol.index() == col.index());
-            *col = *outerCol; // copy nonzero block
-        }
-    }
-}
-#endif // COMPILE_BDA_BRIDGE
-
-//Razvan>>>>>>> 1a32e4cc1 (Make sure rocsparse can get wellcontributions)
 template<int Dim>
 using BM = Dune::BCRSMatrix<MatrixBlock<double,Dim,Dim>>;
 template<int Dim>
diff --git a/opm/simulators/linalg/ISTLSolverEbosBda.cpp b/opm/simulators/linalg/ISTLSolverEbosBda.cpp
index c6f462a4f..e1261d17a 100644
--- a/opm/simulators/linalg/ISTLSolverEbosBda.cpp
+++ b/opm/simulators/linalg/ISTLSolverEbosBda.cpp
@@ -100,8 +100,8 @@ apply(Vector& rhs,
         auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn);
         bridge_->initWellContributions(*wellContribs, x.N() * x[0].N());
 
-        // the WellContributions can only be applied separately with CUDA or OpenCL, not with amgcl or rocalution
-#if HAVE_CUDA || HAVE_OPENCL
+         // the WellContributions can only be applied separately with CUDA, OpenCL or rocsparse, not with amgcl or rocalution
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_ROCSPARSE
         if (!useWellConn) {
             getContribs(*wellContribs);
         }
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
index 4e448ff94..f8dc0ae32 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -143,7 +143,7 @@ void WellContributionsRocsparse::apply_stdwells([[maybe_unused]] double *d_x,
                                                 [[maybe_unused]] double *d_y){
 #ifdef __HIP__
     unsigned gridDim = num_std_wells;
-    unsigned blockDim = 32;
+    unsigned blockDim = 64;
     unsigned shared_mem_size = (blockDim + 2 * dim_wells) * sizeof(double); // shared memory for localSum, z1 and z2
     // dim3(N) will create a vector {N, 1, 1}
     stdwell_apply<<<dim3(gridDim), dim3(blockDim), shared_mem_size, stream>>>(

From dcc3bd70f01b5ff12fec54bb4f172d51e01b79c9 Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvan.nane@bigdataccelerate.com>
Date: Fri, 29 Sep 2023 13:22:07 +0200
Subject: [PATCH 12/14] Reinclude changes lost in adaptation to
 ISTLSolverEbosBda

---
 opm/simulators/linalg/bda/rocsparseWellContributions.cpp | 8 ++++----
 opm/simulators/linalg/bda/rocsparseWellContributions.hpp | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
index f8dc0ae32..07b46d7d2 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.cpp
@@ -1,5 +1,5 @@
 /*
-  Copyright 2020 Equinor ASA
+  Copyright 2023 Equinor ASA
 
   This file is part of the Open Porous Media project (OPM).
 
@@ -191,9 +191,9 @@ void WellContributionsRocsparse::setStream(hipStream_t stream_){
 }
 
 void WellContributionsRocsparse::APIaddMatrix(MatrixType type,
-                                        int* colIndices,
-                                        double* values,
-                                        unsigned int val_size)
+                                              int* colIndices,
+                                              double* values,
+                                              unsigned int val_size)
 {
     if (!allocated) {
         OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
diff --git a/opm/simulators/linalg/bda/rocsparseWellContributions.hpp b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
index b6e6dc81d..bfa7e888a 100644
--- a/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
+++ b/opm/simulators/linalg/bda/rocsparseWellContributions.hpp
@@ -1,5 +1,5 @@
 /*
-  Copyright 2020 Equinor ASA
+  Copyright 2023 Equinor ASA
 
   This file is part of the Open Porous Media project (OPM).
 

From 7673118f143d57f832c051258878bf702dd0c6ce Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvnane@yahoo.com>
Date: Sun, 8 Oct 2023 08:57:37 +0200
Subject: [PATCH 13/14] CUDA and HIP not allowed at the same time

---
 CMakeLists.txt         | 7 +++++++
 CMakeLists_files.cmake | 9 ++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e16ffc284..3312c08b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -312,6 +312,13 @@ macro (files_hook)
     set(HDF5_FOUND OFF)
     unset(HAVE_HDF5)
   endif()
+  if(HAVE_ROCSPARSE AND HAVE_CUDA)
+    message(WARNING "WARNING! Using CUDA and HIP at the same time is not allowed. Please choose only one of them. Disabling both...\n")
+    set(CUDA_FOUND OFF)
+    set(rocsparse_FOUND OFF)
+    unset(HAVE_CUDA)
+    unset(HAVE_ROCSPARSE)
+  endif()
   # read the list of components from this file (in the project directory);
   # it should set various lists with the names of the files to include
   # include needs to be here to make reset HDF5_FOUND available in
diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake
index 3202fe8ab..ffedbce48 100644
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -151,11 +151,6 @@ if (Damaris_FOUND AND MPI_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/utils/initDamarisXmlFile.cpp)
 endif()
 if(CUDA_FOUND)
-  if(USE_BDA_BRIDGE)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
-    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cuWellContributions.cu)
-  endif()
-
   # CUISTL SOURCE
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.cpp)
@@ -222,6 +217,10 @@ if(USE_BDA_BRIDGE)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseSolverBackend.cpp)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/rocsparseWellContributions.cpp)
   endif()
+  if(CUDA_FOUND)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.cu)
+    list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuda/cuWellContributions.cu)
+  endif()
   if(amgcl_FOUND)
     list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp)
     if(CUDA_FOUND)

From 7e8528b43a8ba0a1094e20f287ec909a9624474f Mon Sep 17 00:00:00 2001
From: Razvan Nane <razvnane@yahoo.com>
Date: Wed, 11 Oct 2023 13:29:40 +0200
Subject: [PATCH 14/14] Disable only CUDA when ROCm also selected

---
 CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3312c08b2..5c0418d3d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -313,11 +313,9 @@ macro (files_hook)
     unset(HAVE_HDF5)
   endif()
   if(HAVE_ROCSPARSE AND HAVE_CUDA)
-    message(WARNING "WARNING! Using CUDA and HIP at the same time is not allowed. Please choose only one of them. Disabling both...\n")
+    message(WARNING "WARNING! Using CUDA and ROCm at the same time is not allowed. Please choose only one of them by setting CMAKE_DISABLE_FIND_PACKAGE_<rocsparse|CUDA>=<ON|OFF>. Disabling CUDA...\n")
     set(CUDA_FOUND OFF)
-    set(rocsparse_FOUND OFF)
     unset(HAVE_CUDA)
-    unset(HAVE_ROCSPARSE)
   endif()
   # read the list of components from this file (in the project directory);
   # it should set various lists with the names of the files to include