Combine new diagonalMV kernels into one using

template arguments.
Use more consistent naming conventions.
Change printing function of CuVector from output
stream overload to toDebugString().
This commit is contained in:
Tobias Meyer Andersen
2023-10-12 16:51:34 +02:00
parent c809819cbd
commit 533c5a1a3e
10 changed files with 358 additions and 286 deletions

View File

@@ -45,33 +45,34 @@ namespace Opm::cuistl
template <class M, class X, class Y, int l>
CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
: cpuMatrix(A)
, relaxation_factor(w)
, cuMatrix(CuSparseMatrix<field_type>::fromMatrix(detail::makeMatrixWithNonzeroDiagonal(A)))
, cuVec_diagInvFlattened(cuMatrix.N() * cuMatrix.blockSize() * cuMatrix.blockSize())
: m_cpuMatrix(A)
, m_relaxationFactor(w)
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(A))
, m_diagInvFlattened(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
{
// Some sanity check
OPM_ERROR_IF(A.N() != cuMatrix.N(),
fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", cuMatrix.N(), A.N()));
OPM_ERROR_IF(
A[0][0].N() != cuMatrix.blockSize(),
fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.", cuMatrix.blockSize(), A[0][0].N()));
OPM_ERROR_IF(A.N() * A[0][0].N() != cuMatrix.dim(),
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
m_gpuMatrix.blockSize(),
A[0][0].N()));
OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
cuMatrix.dim(),
m_gpuMatrix.dim(),
A.N() * A[0][0].N()));
OPM_ERROR_IF(A.nonzeroes() != cuMatrix.nonzeroes(),
OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
cuMatrix.nonzeroes(),
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
// Compute the inverted diagonal of A and store it in a vector format in cuVec_diagInvFlattened
detail::invertDiagonalAndFlatten(cuMatrix.getNonZeroValues().data(),
cuMatrix.getRowIndices().data(),
cuMatrix.getColumnIndices().data(),
detail::to_int(cuMatrix.N()),
detail::to_int(cuMatrix.blockSize()),
cuVec_diagInvFlattened.data());
// Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),
m_gpuMatrix.blockSize(),
m_diagInvFlattened.data());
}
template <class M, class X, class Y, int l>
@@ -89,10 +90,10 @@ CuJac<M, X, Y, l>::apply(X& v, const Y& d)
// Compute the MV product where the matrix is diagonal and therefore stored as a vector.
// The product is thus computed as a hadamard product.
detail::weightedDiagMV(cuVec_diagInvFlattened.data(),
detail::to_size_t(cuMatrix.N()),
detail::to_size_t(cuMatrix.blockSize()),
relaxation_factor,
detail::weightedDiagMV(m_diagInvFlattened.data(),
m_gpuMatrix.N(),
m_gpuMatrix.blockSize(),
m_relaxationFactor,
d.data(),
v.data());
}
@@ -114,13 +115,13 @@ template <class M, class X, class Y, int l>
void
CuJac<M, X, Y, l>::update()
{
cuMatrix.updateNonzeroValues(detail::makeMatrixWithNonzeroDiagonal(cpuMatrix));
detail::invertDiagonalAndFlatten(cuMatrix.getNonZeroValues().data(),
cuMatrix.getRowIndices().data(),
cuMatrix.getColumnIndices().data(),
detail::to_int(cuMatrix.N()),
detail::to_int(cuMatrix.blockSize()),
cuVec_diagInvFlattened.data());
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),
m_gpuMatrix.blockSize(),
m_diagInvFlattened.data());
}
} // namespace Opm::cuistl

View File

@@ -30,9 +30,9 @@
namespace Opm::cuistl
{
//! \brief Sequential Jacobi preconditioner on the GPU.
//! \brief Jacobi preconditioner on the GPU.
//!
//! \note This is not expected to be a fast preconditioner.
//! \note This is a fast but weak preconditioner
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
@@ -96,13 +96,13 @@ public:
private:
//! \brief Reference to the underlying matrix
const M& cpuMatrix;
const M& m_cpuMatrix;
//! \brief The relaxation factor to use.
const field_type relaxation_factor;
const field_type m_relaxationFactor;
//! \brief The A matrix stored on the gpu
CuSparseMatrix<field_type> cuMatrix;
CuSparseMatrix<field_type> m_gpuMatrix;
//! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
CuVector<field_type> cuVec_diagInvFlattened;
CuVector<field_type> m_diagInvFlattened;
};
} // end namespace Opm::cuistl

View File

@@ -26,6 +26,7 @@
#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <vector>
#include <string>
namespace Opm::cuistl
@@ -361,6 +362,18 @@ public:
*/
void setZeroAtIndexSet(const CuVector<int>& indexSet);
// Slow method that creates a string representation of a CuVector for debug purposes
std::string toDebugString()
{
std::vector<T> v = asStdVector();
std::string res = "";
for (int i = 0; i < v.size(); i++){
res += std::to_string(v[i]) + " ";
}
res += std::to_string(v[v.size()-1]);
return res;
}
private:
T* m_dataOnDevice = nullptr;
@@ -375,16 +388,5 @@ private:
void assertHasElements() const;
};
//! \brief Writes a space-separated representation of \p arg to \p os for debug purposes.
//!
//! \note This is a slow operation: it copies the entire vector from the GPU to the host
//! before printing. A trailing space and a newline (with flush) are emitted.
//! \param os the output stream to write to
//! \param arg the GPU vector to print
//! \return \p os, to allow chaining
template <class T>
std::ostream& operator<<(std::ostream &os, const CuVector<T> &arg)
{
    std::vector<T> v = arg.asStdVector();
    // Use size_t for the index to avoid a signed/unsigned comparison with v.size().
    for (size_t i = 0; i < v.size(); i++){
        os << v[i] << " ";
    }
    os << std::endl;
    return os;
}
} // namespace Opm::cuistl
#endif

View File

@@ -23,96 +23,111 @@ namespace Opm::cuistl::detail
{
namespace
{
template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize1(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
{
// TODO: combine the three following kernels into one using template arguments so that
// no compile time optimization is lost while making the code more compact
template <class T>
__global__ void cuInvertDiagonalAndFlattenBlocksize1(
T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 1;
if (thrIndex < numberOfRows){
size_t nnzIdx = rowIndices[thrIndex];
size_t nnzIdxLim = rowIndices[thrIndex+1];
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) {
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
// zero element
while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
++nnzIdx;
}
// pDiagBlock points to the start of where the diagonal block is stored
T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
// pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
pVecBlock[0] = 1.0/(pDiagBlock[0]);
vecBlock[0] = 1.0 / (diagBlock[0]);
}
}
template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize2(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
template <class T>
__global__ void cuInvertDiagonalAndFlattenBlocksize2(
T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 2;
if (thrIndex < numberOfRows){
size_t nnzIdx = rowIndices[thrIndex];
size_t nnzIdxLim = rowIndices[thrIndex+1];
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) {
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
// zero element
while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
++nnzIdx;
}
// pDiagBlock points to the start of where the diagonal block is stored
T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
// pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
// based on Dune implementation
T det_inv = 1.0/(pDiagBlock[0]*pDiagBlock[3] - pDiagBlock[1]*pDiagBlock[2]);
pVecBlock[0] = pDiagBlock[3] * det_inv;
pVecBlock[1] = - pDiagBlock[1] * det_inv;
pVecBlock[2] = - pDiagBlock[2] * det_inv;
pVecBlock[3] = pDiagBlock[0] * det_inv;
T detInv = 1.0 / (diagBlock[0] * diagBlock[3] - diagBlock[1] * diagBlock[2]);
vecBlock[0] = diagBlock[3] * detInv;
vecBlock[1] = -diagBlock[1] * detInv;
vecBlock[2] = -diagBlock[2] * detInv;
vecBlock[3] = diagBlock[0] * detInv;
}
}
template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize3(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
template <class T>
__global__ void cuInvertDiagonalAndFlattenBlocksize3(
T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 3;
if (thrIndex < numberOfRows){
size_t nnzIdx = rowIndices[thrIndex];
size_t nnzIdxLim = rowIndices[thrIndex+1];
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) {
// this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
// zero element
while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
++nnzIdx;
}
// pDiagBlock points to the start of where the diagonal block is stored
T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
// pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
// based on Dune implementation
T t4 = pDiagBlock[0] * pDiagBlock[4];
T t6 = pDiagBlock[0] * pDiagBlock[5];
T t8 = pDiagBlock[1] * pDiagBlock[3];
T t10 = pDiagBlock[2] * pDiagBlock[3];
T t12 = pDiagBlock[1] * pDiagBlock[6];
T t14 = pDiagBlock[2] * pDiagBlock[6];
T t4 = diagBlock[0] * diagBlock[4];
T t6 = diagBlock[0] * diagBlock[5];
T t8 = diagBlock[1] * diagBlock[3];
T t10 = diagBlock[2] * diagBlock[3];
T t12 = diagBlock[1] * diagBlock[6];
T t14 = diagBlock[2] * diagBlock[6];
T t17 = 1.0/(t4*pDiagBlock[8]-t6*pDiagBlock[7]-t8*pDiagBlock[8]+
t10*pDiagBlock[7]+t12*pDiagBlock[5]-t14*pDiagBlock[4]); // t17 is 1/determinant
T t17 = 1.0
/ (t4 * diagBlock[8] - t6 * diagBlock[7] - t8 * diagBlock[8] + t10 * diagBlock[7] + t12 * diagBlock[5]
- t14 * diagBlock[4]); // t17 is 1/determinant
pVecBlock[0] = (pDiagBlock[4] * pDiagBlock[8] - pDiagBlock[5] * pDiagBlock[7])*t17;
pVecBlock[1] = -(pDiagBlock[1] * pDiagBlock[8] - pDiagBlock[2] * pDiagBlock[7])*t17;
pVecBlock[2] = (pDiagBlock[1] * pDiagBlock[5] - pDiagBlock[2] * pDiagBlock[4])*t17;
pVecBlock[3] = -(pDiagBlock[3] * pDiagBlock[8] - pDiagBlock[5] * pDiagBlock[6])*t17;
pVecBlock[4] = (pDiagBlock[0] * pDiagBlock[8] - t14) * t17;
pVecBlock[5] = -(t6-t10) * t17;
pVecBlock[6] = (pDiagBlock[3] * pDiagBlock[7] - pDiagBlock[4] * pDiagBlock[6]) * t17;
pVecBlock[7] = -(pDiagBlock[0] * pDiagBlock[7] - t12) * t17;
pVecBlock[8] = (t4-t8) * t17;
vecBlock[0] = (diagBlock[4] * diagBlock[8] - diagBlock[5] * diagBlock[7]) * t17;
vecBlock[1] = -(diagBlock[1] * diagBlock[8] - diagBlock[2] * diagBlock[7]) * t17;
vecBlock[2] = (diagBlock[1] * diagBlock[5] - diagBlock[2] * diagBlock[4]) * t17;
vecBlock[3] = -(diagBlock[3] * diagBlock[8] - diagBlock[5] * diagBlock[6]) * t17;
vecBlock[4] = (diagBlock[0] * diagBlock[8] - t14) * t17;
vecBlock[5] = -(t6 - t10) * t17;
vecBlock[6] = (diagBlock[3] * diagBlock[7] - diagBlock[4] * diagBlock[6]) * t17;
vecBlock[7] = -(diagBlock[0] * diagBlock[7] - t12) * t17;
vecBlock[8] = (t4 - t8) * t17;
}
}
@@ -130,21 +145,25 @@ namespace
template <class T>
void
invertDiagonalAndFlatten(T* d_mat, int rowIndices[], int colIndices[], size_t numberOfRows, size_t blocksize, T* d_vec)
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec)
{
switch (blocksize) {
case 1:
cuInvertDiagonalAndFlattenBlocksize1<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
cuInvertDiagonalAndFlattenBlocksize1<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
mat, rowIndices, colIndices, numberOfRows, vec);
break;
case 2:
cuInvertDiagonalAndFlattenBlocksize2<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
cuInvertDiagonalAndFlattenBlocksize2<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
mat, rowIndices, colIndices, numberOfRows, vec);
break;
case 3:
cuInvertDiagonalAndFlattenBlocksize3<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
cuInvertDiagonalAndFlattenBlocksize3<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
mat, rowIndices, colIndices, numberOfRows, vec);
break;
default:
// TODO: Figure out what is why it did not produce an error or any output in the output stream or the DBG file when I forced this case to execute
OPM_THROW(std::invalid_argument, "Inverting diagonal is not defined for blocksize>3");
// TODO: Figure out what is why it did not produce an error or any output in the output stream or the DBG file
// when I forced this case to execute
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
break;
}
}

View File

@@ -25,15 +25,15 @@ namespace Opm::cuistl::detail
/**
* @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in vec
* @param d_mat pointer to GPU memory containing nonzerovalues of the sparse matrix
* @param mat pointer to GPU memory containing nonzerovalues of the sparse matrix
* @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
* @param numberOfRows Integer describing the number of rows in the matrix
* @param blocksize Integer describing the sidelength of a block in the sparse matrix
* @param d_vec Pointer to the vector where the inverse of the diagonal matrix should be stored
* @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
*/
template <class T>
void invertDiagonalAndFlatten(T* d_mat, int rowIndices[], int colIndices[], size_t numberOfRows, size_t blocksize, T* d_vec);
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec);
} // namespace Opm::cuistl::detail
#endif

View File

@@ -47,57 +47,24 @@ namespace
}
}
template <class T>
__global__ void weightedDiagMVBlocksize1(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
template <class T, int blocksize>
__global__ void weightedDiagMV(
const T* squareBlockVector, const size_t numberOfElements, const T w, const T* globalSrcVec, T* globalDstVec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 1;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
const T* localBlock = (squareBlockVector + (blocksize * blocksize * globalIndex));
const T* localSrcVec = (globalSrcVec + (blocksize * globalIndex));
T* localDstVec = (globalDstVec + (blocksize * globalIndex));
T v0 = srcVec[0];
dstVec[0] = (pMat[0] * v0) * w;
}
}
template <class T>
__global__ void weightedDiagMVBlocksize2(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 2;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = srcVec[0], v1 = srcVec[1];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1) * w;
dstVec[1] = (pMat[2] * v0 + pMat[3] * v1) * w;
}
}
template <class T>
__global__ void weightedDiagMVBlocksize3(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 3;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = srcVec[0], v1 = srcVec[1], v2 = srcVec[2];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2) * w;
dstVec[1] = (pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2) * w;
dstVec[2] = (pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2) * w;
for (int i = 0; i < blocksize; ++i) {
T rowResult = 0.0;
for (int j = 0; j < blocksize; ++j) {
rowResult += localBlock[i * blocksize + j] * localSrcVec[j];
}
localDstVec[i] = rowResult * w;
}
}
}
@@ -168,33 +135,33 @@ template int innerProductAtIndices(const int*, const int*, int* buffer, size_t,
template <class T>
void
weightedDiagMV(T* squareBlockVector,
weightedDiagMV(const T* squareBlockVector,
const size_t numberOfElements,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec)
T relaxationFactor,
const T* srcVec,
T* dstVec)
{
switch (blocksize) {
case 1:
weightedDiagMVBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
break;
case 2:
weightedDiagMVBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
break;
case 3:
weightedDiagMVBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
break;
default:
OPM_THROW(std::invalid_argument, "blockvector hadamard product not defined for blocksize>3");
OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
break;
}
}
template void weightedDiagMV(double*, const size_t, const size_t, double, const double*, double*);
template void weightedDiagMV(float*, const size_t, const size_t, float, const float*, float*);
template void weightedDiagMV(const double*, const size_t, const size_t, double, const double*, double*);
template void weightedDiagMV(const float*, const size_t, const size_t, float, const float*, float*);
} // namespace Opm::cuistl::detail

View File

@@ -57,22 +57,22 @@ T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t nu
/**
* @brief Compute the weighted matrix vector product where the matrix is diagonal, the diagonal is a vector, meaning we
* compute the hadamard product.
* compute the Hadamard product.
* @param squareBlockVector A CuVector whose elements are NxN matrix blocks
* @param numberOfRows The number of rows in the vector
* @param blocksize The sidelength of the square block elements in the vector
* @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
* @param dst_vec A pointer to the data of the CuVector we store the result in
* @param[out] dst_vec A pointer to the data of the CuVector we store the result in
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
* diagonal of the matrix and use this product.
*/
template <class T>
void weightedDiagMV(T* squareBlockVector,
void weightedDiagMV(const T* squareBlockVector,
const size_t numberOfRows,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec);
T relaxationFactor,
const T* srcVec,
T* dstVec);
} // namespace Opm::cuistl::detail
#endif

View File

@@ -25,9 +25,9 @@
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
using NumericTypes = boost::mpl::list<double, float>;
@@ -66,35 +66,57 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
}
}
B[0][0][0][0]=1.0;
B[0][0][0][1]=2.0;
B[0][0][0][2]=3.0;
B[0][0][1][0]=5.0;
B[0][0][1][1]=2.0;
B[0][0][1][2]=3.0;
B[0][0][2][0]=2.0;
B[0][0][2][1]=1.0;
B[0][0][2][2]=1.0;
B[0][0][0][0] = 1.0;
B[0][0][0][1] = 2.0;
B[0][0][0][2] = 3.0;
B[0][0][1][0] = 5.0;
B[0][0][1][1] = 2.0;
B[0][0][1][2] = 3.0;
B[0][0][2][0] = 2.0;
B[0][0][2][1] = 1.0;
B[0][0][2][2] = 1.0;
B[0][1][0][0]=1.0;
B[0][1][1][1]=1.0;
B[0][1][2][2]=1.0;
B[0][1][0][0] = 1.0;
B[0][1][1][1] = 1.0;
B[0][1][2][2] = 1.0;
B[1][1][0][0]=-1.0;
B[1][1][1][1]=-1.0;
B[1][1][2][2]=-1.0;
B[1][1][0][0] = -1.0;
B[1][1][1][1] = -1.0;
B[1][1][2][2] = -1.0;
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(Opm::cuistl::detail::makeMatrixWithNonzeroDiagonal(B));
Opm::cuistl::CuVector<T> d_invDiag(blocksize*blocksize*N);
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, blocksize, d_invDiag.data());
Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(),
m.getRowIndices().data(),
m.getColumnIndices().data(),
N,
blocksize,
dInvDiag.data());
std::vector<T> expected_inv_diag{-1.0/4.0,1.0/4.0,0.0,1.0/4.0,-5.0/4.0,3.0,1.0/4.0,3.0/4.0,-2.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0};
std::vector<T> computed_inv_diag = d_invDiag.asStdVector();
std::vector<T> expectedInvDiag {-1.0 / 4.0,
1.0 / 4.0,
0.0,
1.0 / 4.0,
-5.0 / 4.0,
3.0,
1.0 / 4.0,
3.0 / 4.0,
-2.0,
-1.0,
0.0,
0.0,
0.0,
-1.0,
0.0,
0.0,
0.0,
-1.0};
std::vector<T> computedInvDiag = dInvDiag.asStdVector();
BOOST_REQUIRE_EQUAL(expected_inv_diag.size(), computed_inv_diag.size());
for (size_t i = 0; i < expected_inv_diag.size(); ++i){
BOOST_CHECK_CLOSE(expected_inv_diag[i], computed_inv_diag[i], 1e-7);
BOOST_REQUIRE_EQUAL(expectedInvDiag.size(), computedInvDiag.size());
for (size_t i = 0; i < expectedInvDiag.size(); ++i) {
BOOST_CHECK_CLOSE(expectedInvDiag[i], computedInvDiag[i], 1e-7);
}
}
@@ -118,7 +140,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
| |-1/2 1| |
| |
| | -1 0| |
| | 0 -1| |W
| | 0 -1| |
*/
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
@@ -129,27 +151,32 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
}
}
B[0][0][0][0]=1.0;
B[0][0][0][1]=2.0;
B[0][0][1][0]=1.0/2.0;
B[0][0][1][1]=2.0;
B[0][0][0][0] = 1.0;
B[0][0][0][1] = 2.0;
B[0][0][1][0] = 1.0 / 2.0;
B[0][0][1][1] = 2.0;
B[0][1][0][0]=1.0;
B[0][1][1][1]=1.0;
B[0][1][0][0] = 1.0;
B[0][1][1][1] = 1.0;
B[1][1][0][0]=-1.0;
B[1][1][1][1]=-1.0;
B[1][1][0][0] = -1.0;
B[1][1][1][1] = -1.0;
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(Opm::cuistl::detail::makeMatrixWithNonzeroDiagonal(B));
Opm::cuistl::CuVector<T> d_invDiag(blocksize*blocksize*N);
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, blocksize, d_invDiag.data());
Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(),
m.getRowIndices().data(),
m.getColumnIndices().data(),
N,
blocksize,
dInvDiag.data());
std::vector<T> expected_inv_diag{2.0,-2.0,-1.0/2.0,1.0,-1.0,0.0,0.0,-1.0};
std::vector<T> computed_inv_diag = d_invDiag.asStdVector();
std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};
std::vector<T> computedInvDiag = dInvDiag.asStdVector();
BOOST_REQUIRE_EQUAL(expected_inv_diag.size(), computed_inv_diag.size());
for (size_t i = 0; i < expected_inv_diag.size(); ++i){
BOOST_CHECK_CLOSE(expected_inv_diag[i], computed_inv_diag[i], 1e-7);
BOOST_REQUIRE_EQUAL(expectedInvDiag.size(), computedInvDiag.size());
for (size_t i = 0; i < expectedInvDiag.size(); ++i) {
BOOST_CHECK_CLOSE(expectedInvDiag[i], computedInvDiag[i], 1e-7);
}
}

View File

@@ -25,9 +25,9 @@
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
using NumericTypes = boost::mpl::list<double, float>;
@@ -44,21 +44,22 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
const size_t N = 1;
const T weight = 1.0;
std::vector<T> h_blockVector({1.0,2.0,3.0,5.0,2.0,3.0,2.0,1.0,2.0});
std::vector<T> h_vecVector({3.0,2.0,1.0});
std::vector<T> h_dstVector({0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 5.0, 2.0, 3.0, 2.0, 1.0, 2.0});
std::vector<T> hostVecVector({3.0, 2.0, 1.0});
std::vector<T> hostDstVector({0, 0, 0});
Opm::cuistl::CuVector<T> deviceBlockVector(hostBlockVector);
Opm::cuistl::CuVector<T> deviceVecVector(hostVecVector);
Opm::cuistl::CuVector<T> deviceDstVector(hostDstVector);
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
Opm::cuistl::detail::weightedDiagMV(
deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());
std::vector<T> expected_vec{10.0,22.0,10.0};
std::vector<T> computed_vec = d_dstVector.asStdVector();
std::vector<T> expectedVec {10.0, 22.0, 10.0};
std::vector<T> computedVec = deviceDstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){
BOOST_CHECK_CLOSE(expected_vec[i], computed_vec[i], 1e-7);
BOOST_REQUIRE_EQUAL(expectedVec.size(), computedVec.size());
for (size_t i = 0; i < expectedVec.size(); i++) {
BOOST_CHECK_CLOSE(expectedVec[i], computedVec[i], 1e-7);
}
}
@@ -77,20 +78,21 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
const size_t N = 2;
const T weight = 0.5;
std::vector<T> h_blockVector({1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0});
std::vector<T> h_vecVector({1.0,3.0,2.0,4.0});
std::vector<T> h_dstVector({0, 0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
std::vector<T> hostBlockVector({1.0, 2.0, 3.0, 4.0, 4.0, 3.0, 2.0, 1.0});
std::vector<T> hostVecVector({1.0, 3.0, 2.0, 4.0});
std::vector<T> hostDstVector({0, 0, 0, 0});
Opm::cuistl::CuVector<T> deviceBlockVector(hostBlockVector);
Opm::cuistl::CuVector<T> deviceVecVector(hostVecVector);
Opm::cuistl::CuVector<T> deviceDstVector(hostDstVector);
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
Opm::cuistl::detail::weightedDiagMV(
deviceBlockVector.data(), N, blocksize, weight, deviceVecVector.data(), deviceDstVector.data());
std::vector<T> expected_vec{3.5,7.5,10.0,4.0};
std::vector<T> computed_vec = d_dstVector.asStdVector();
std::vector<T> expectedVec {3.5, 7.5, 10.0, 4.0};
std::vector<T> computedVec = deviceDstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){
BOOST_CHECK_CLOSE(expected_vec[i], computed_vec[i], 1e-7);
BOOST_REQUIRE_EQUAL(expectedVec.size(), computedVec.size());
for (size_t i = 0; i < expectedVec.size(); i++) {
BOOST_CHECK_CLOSE(expectedVec[i], computedVec[i], 1e-7);
}
}

View File

@@ -26,10 +26,10 @@
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <iostream>
@@ -37,22 +37,22 @@ using NumericTypes = boost::mpl::list<double, float>;
BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
{
/*
Test data to validate jacobi preconditioner, expected result is x_1
| |3 1| | 1 0| | |1| | | |2| | | | 1| |
| |2 1| | 0 1| | |2| | | |1| | | | 0| |
A = | | x_0 = | | b = | | x_1 = | |
| |0 0| |-1 0| | |1| | | |3| | | | -1| |
| |0 0| | 0 -1| | |1| | | |4| | | |-1.5| |
*/
/*
Test data to validate jacobi preconditioner, expected result is x_1, and relaxation factor is 0.5
| |3 1| | 1 0| | | |2| | | | 0.5| |
| |2 1| | 0 1| | | |1| | | |-0.5| |
A = | | d = | | v = | |
| |0 0| |-1 0| | | |3| | | |-1.5| |
| |0 0| | 0 -1| | | |4| | | |-2.0| |
*/
const int N = 2;
const int blocksize = 2;
const int nonZeroes = 3;
using M = Dune::FieldMatrix<T, blocksize, blocksize>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
using cujac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
using CuJac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {
row.insert(row.index());
@@ -61,37 +61,91 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
}
}
B[0][0][0][0]=3.0;
B[0][0][0][1]=1.0;
B[0][0][1][0]=2.0;
B[0][0][1][1]=1.0;
B[0][0][0][0] = 3.0;
B[0][0][0][1] = 1.0;
B[0][0][1][0] = 2.0;
B[0][0][1][1] = 1.0;
B[0][1][0][0]=1.0;
B[0][1][1][1]=1.0;
B[0][1][0][0] = 1.0;
B[0][1][1][1] = 1.0;
B[1][1][0][0]=-1.0;
B[1][1][1][1]=-1.0;
B[1][1][0][0] = -1.0;
B[1][1][1][1] = -1.0;
auto CuJac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, cujac>(std::make_shared<cujac>(B, 0.5));
Vector h_dune_x(2);
h_dune_x[0][0] = 1.0;
h_dune_x[0][1] = 2.0;
h_dune_x[1][0] = 1.0;
h_dune_x[1][1] = 1.0;
auto cujac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuJac>(std::make_shared<CuJac>(B, 0.5));
Vector h_dune_b(2);
h_dune_b[0][0] = 2.0;
h_dune_b[0][1] = 1.0;
h_dune_b[1][0] = 3.0;
h_dune_b[1][1] = 4.0;
const T expected_ans[2][2] = {{0.0, 5.0/2.0},{-3.0/2.0, -2.0}};
Vector vVector(2);
Vector dVector(2);
dVector[0][0] = 2.0;
dVector[0][1] = 1.0;
dVector[1][0] = 3.0;
dVector[1][1] = 4.0;
CuJac.apply(h_dune_x, h_dune_b);
BOOST_CHECK_CLOSE(h_dune_x[0][0], expected_ans[0][0], 1e-7);
BOOST_CHECK_CLOSE(h_dune_x[0][1], expected_ans[0][1], 1e-7);
BOOST_CHECK_CLOSE(h_dune_x[1][0], expected_ans[1][0], 1e-7);
BOOST_CHECK_CLOSE(h_dune_x[1][1], expected_ans[1][1], 1e-7);
BOOST_CHECK(true);
const T expectedAns[2][2] = {{1.0 / 2.0, -1.0 / 2.0}, {-3.0 / 2.0, -2.0}};
cujac.apply(vVector, dVector);
BOOST_CHECK_CLOSE(vVector[0][0], expectedAns[0][0], 1e-7);
BOOST_CHECK_CLOSE(vVector[0][1], expectedAns[0][1], 1e-7);
BOOST_CHECK_CLOSE(vVector[1][0], expectedAns[1][0], 1e-7);
BOOST_CHECK_CLOSE(vVector[1][1], expectedAns[1][1], 1e-7);
}
BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize1, T, NumericTypes)
{
/*
Test data to validate the Jacobi preconditioner with scalar (blocksize 1) entries.
One application computes v = w * D^{-1} * d with relaxation factor w = 0.5 and
D the diagonal of A, i.e. D = diag(3, 1, -1, -1):
| 3 1 1 0| |2| | 1/3|
| 2 1 0 1| |1| | 1/2|
A = | 0 0 -1 0| d = |3| v = |-3/2|
| 0 0 0 -1| |4| | -2|
*/
const int N = 4;
const int blocksize = 1;
const int nonZeroes = 8;
using M = Dune::FieldMatrix<T, blocksize, blocksize>;
using SpMatrix = Dune::BCRSMatrix<M>;
using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
using CuJac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
// Create the sparsity pattern row by row: every row has a diagonal entry,
// and rows 0 and 1 additionally get the off-diagonal entries shown above.
SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
for (auto row = B.createbegin(); row != B.createend(); ++row) {
row.insert(row.index());
if (row.index() == 0) {
row.insert(row.index() + 1);
row.insert(row.index() + 2);
}
if (row.index() == 1) {
row.insert(row.index() - 1);
row.insert(row.index() + 2);
}
}
// Fill in the nonzero values of A as depicted in the comment above.
B[0][0][0][0] = 3.0;
B[0][1][0][0] = 1.0;
B[0][2][0][0] = 1.0;
B[1][0][0][0] = 2.0;
B[1][1][0][0] = 1.0;
B[1][3][0][0] = 1.0;
B[2][2][0][0] = -1.0;
B[3][3][0][0] = -1.0;
// Wrap the GPU Jacobi preconditioner (relaxation factor 0.5) so it can be
// applied to Dune (host) vectors.
auto cujac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, CuJac>(std::make_shared<CuJac>(B, 0.5));
Vector vVector(4);
Vector dVector(4);
dVector[0] = 2.0;
dVector[1] = 1.0;
dVector[2] = 3.0;
dVector[3] = 4.0;
// Expected result of v = 0.5 * D^{-1} * d, see the comment at the top.
const T expectedAns[4] = {1.0 / 3.0, 1.0 / 2.0, -3.0 / 2.0, -2.0};
cujac.apply(vVector, dVector);
BOOST_CHECK_CLOSE(vVector[0], expectedAns[0], 1e-7);
BOOST_CHECK_CLOSE(vVector[1], expectedAns[1], 1e-7);
BOOST_CHECK_CLOSE(vVector[2], expectedAns[2], 1e-7);
BOOST_CHECK_CLOSE(vVector[3], expectedAns[3], 1e-7);
}