Add jacobi preconditioner that runs on the GPU.

Implement calls to cuBlas, cuSparse and implement necessary CUDA kernels to perform a single iteration of the jacobi preconditioner. Add tests that verify new kernels and the preconditioner in its totality. The preconditioner is verified on 2x2 and 3x3 blocks, which as of now are the only supported sizes. 1x1 are not supported because cuSparse does not support it.
2025-02-25 18:55:30 -06:00 · 2023-08-29 11:34:57 +02:00
parent 3af34e4ae6
commit 1e4b0e97ee
13 changed files with 982 additions and 9 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@@ -158,15 +158,18 @@ if(CUDA_FOUND)

  # CUISTL SOURCE
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuJac.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSeqILU0.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/set_device.cpp)

  # CUISTL HEADERS
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp)
+  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp)
  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp)
@@ -283,18 +286,21 @@ if(CUDA_FOUND)
  if(USE_BDA_BRIDGE)
    list(APPEND TEST_SOURCE_FILES tests/test_cusparseSolver.cpp)
  endif()
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_safe_call.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_converttofloatadapter.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_handle.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_safe_call.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_safe_call.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuda_safe_call.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuda_check_last_error.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cublas_handle.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_handle.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuvector.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparsematrix.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_safe_conversion.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuseqilu0.cpp)
-  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_converttofloatadapter.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cujac.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuowneroverlapcopy.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuseqilu0.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparse_handle.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuSparse_matrix_operations.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cusparsematrix.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuvector.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_cuVector_operations.cpp)
+  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_safe_conversion.cpp)
  list(APPEND TEST_SOURCE_FILES tests/cuistl/test_solver_adapter.cpp)


--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@@ -47,6 +47,7 @@
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/cuistl/CuJac.hpp>

 #endif

@@ -239,6 +240,17 @@ struct StandardPreconditioners
            auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
            return wrapped;
        });
+
+        F::addCreator("CUJac", [](const O& op, const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+            const double w = prm.get<double>("relaxation", 1.0);
+            using field_type = typename V::field_type;
+            using CuJac = typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            auto cuJac = std::make_shared<CuJac>(op.getmat(), w);
+
+            auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuJac>>(cuJac);
+            auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            return wrapped;
+        });
 #endif
    }

@@ -445,6 +457,13 @@ struct StandardPreconditioners<Operator,Dune::Amg::SequentialInformation>
            return converted;

        });
+
+        F::addCreator("CUJac", [](const O& op, const P& prm, const std::function<V()>&, std::size_t) {
+            const double w = prm.get<double>("relaxation", 1.0);
+            using field_type = typename V::field_type;
+            using CUJac = typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(std::make_shared<CUJac>(op.getmat(), w));
+        });
 #endif
    }
 };
--- a/opm/simulators/linalg/cuistl/CuJac.cpp
+++ b/opm/simulators/linalg/cuistl/CuJac.cpp
@@ -0,0 +1,185 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cusparse.h>
+#include <dune/common/fmatrix.hh>
+#include <dune/common/fvector.hh>
+#include <dune/istl/bcrsmatrix.hh>
+#include <dune/istl/bvector.hh>
+#include <fmt/core.h>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/CuJac.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
+#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
+#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/matrixblock.hh>
+
+namespace Opm::cuistl
+{
+
+template <class M, class X, class Y, int l>
+CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
+    : cpuMatrix(A)
+    , relaxation_factor(w)
+    , cuMatrix(CuSparseMatrix<field_type>::fromMatrix(detail::makeMatrixWithNonzeroDiagonal(A)))
+    , cuVec_diagInvFlattened(cuMatrix.N() * cuMatrix.blockSize() * cuMatrix.blockSize())
+    , cuVec_resultBuffer(cuMatrix.N() * cuMatrix.blockSize())
+    , cuMatrixDescription(detail::createMatrixDescription())
+    , cuSparseHandle(detail::CuSparseHandle::getInstance())
+    , cuBlasHandle(detail::CuBlasHandle::getInstance())
+{
+    // Some sanity check
+    OPM_ERROR_IF(A.N() != cuMatrix.N(),
+                 fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", cuMatrix.N(), A.N()));
+    OPM_ERROR_IF(
+        A[0][0].N() != cuMatrix.blockSize(),
+        fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.", cuMatrix.blockSize(), A[0][0].N()));
+    OPM_ERROR_IF(A.N() * A[0][0].N() != cuMatrix.dim(),
+                 fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
+                             cuMatrix.dim(),
+                             A.N() * A[0][0].N()));
+    OPM_ERROR_IF(A.nonzeroes() != cuMatrix.nonzeroes(),
+                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
+                             cuMatrix.nonzeroes(),
+                             A.nonzeroes()));
+
+    // Compute the inverted diagonal of A and store it in a vector format in cuVec_diagInvFlattened
+    detail::invertDiagonalAndFlatten(cuMatrix.getNonZeroValues().data(),
+                                     cuMatrix.getRowIndices().data(),
+                                     cuMatrix.getColumnIndices().data(),
+                                     detail::to_int(cuMatrix.N()),
+                                     detail::to_int(cuMatrix.blockSize()),
+                                     cuVec_diagInvFlattened.data());
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
+{
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::apply(X& x, const Y& b)
+{
+    // x_{n+1} = x_n + w * (D^-1 * (b - Ax_n) )
+    // cusparseDbsrmv computes -Ax_n + b
+    // cuda kernel computes D^-1 * (b- Ax_n), call this rhs
+    // use an axpy to compute x + w*rhs
+
+    const field_type one = 1.0, neg_one = -1.0;
+
+    const auto numberOfRows = detail::to_int(cuMatrix.N());
+    const auto numberOfColumns = detail::to_int(cuMatrix.N());
+    const auto numberOfNonzeroBlocks = detail::to_int(cuMatrix.nonzeroes());
+    const auto blockSize = detail::to_int(cuMatrix.blockSize());
+
+    auto nonZeroValues = cuMatrix.getNonZeroValues().data();
+    auto rowIndices = cuMatrix.getRowIndices().data();
+    auto columnIndices = cuMatrix.getColumnIndices().data();
+
+    // TODO: avoid making this copy, currently forced to because parameter is a const
+    cuVec_resultBuffer = b;
+
+    // bsrmv computes b - Ax
+    OPM_CUSPARSE_SAFE_CALL(detail::cusparseBsrmv(cuSparseHandle.get(),
+                                                 detail::CUSPARSE_MATRIX_ORDER,
+                                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
+                                                 numberOfRows,
+                                                 numberOfColumns,
+                                                 numberOfNonzeroBlocks,
+                                                 &neg_one,
+                                                 cuMatrixDescription->get(),
+                                                 nonZeroValues,
+                                                 rowIndices,
+                                                 columnIndices,
+                                                 blockSize,
+                                                 x.data(),
+                                                 &one,
+                                                 cuVec_resultBuffer.data()));
+
+    // multiply D^-1 with (b-Ax)
+    detail::blockVectorMultiplicationAtAllIndices(cuVec_diagInvFlattened.data(),
+                                                  detail::to_size_t(numberOfRows),
+                                                  detail::to_size_t(blockSize),
+                                                  cuVec_resultBuffer.data());
+
+    // Finish adding w*rhs to x
+    OPM_CUBLAS_SAFE_CALL(detail::cublasAxpy(
+        cuBlasHandle.get(), numberOfRows * blockSize, &relaxation_factor, cuVec_resultBuffer.data(), 1, x.data(), 1));
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::post([[maybe_unused]] X& x)
+{
+}
+
+template <class M, class X, class Y, int l>
+Dune::SolverCategory::Category
+CuJac<M, X, Y, l>::category() const
+{
+    return Dune::SolverCategory::sequential;
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::update()
+{
+    cuMatrix.updateNonzeroValues(detail::makeMatrixWithNonzeroDiagonal(cpuMatrix));
+    detail::invertDiagonalAndFlatten(cuMatrix.getNonZeroValues().data(),
+                                     cuMatrix.getRowIndices().data(),
+                                     cuMatrix.getColumnIndices().data(),
+                                     detail::to_int(cuMatrix.N()),
+                                     detail::to_int(cuMatrix.blockSize()),
+                                     cuVec_diagInvFlattened.data());
+}
+
+} // namespace Opm::cuistl
+#define INSTANTIATE_CUJAC_DUNE(realtype, blockdim)                                                                     \
+    template class ::Opm::cuistl::CuJac<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,             \
+                                        ::Opm::cuistl::CuVector<realtype>,                                             \
+                                        ::Opm::cuistl::CuVector<realtype>>;                                            \
+    template class ::Opm::cuistl::CuJac<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,              \
+                                        ::Opm::cuistl::CuVector<realtype>,                                             \
+                                        ::Opm::cuistl::CuVector<realtype>>
+
+INSTANTIATE_CUJAC_DUNE(double, 1);
+INSTANTIATE_CUJAC_DUNE(double, 2);
+INSTANTIATE_CUJAC_DUNE(double, 3);
+INSTANTIATE_CUJAC_DUNE(double, 4);
+INSTANTIATE_CUJAC_DUNE(double, 5);
+INSTANTIATE_CUJAC_DUNE(double, 6);
+
+INSTANTIATE_CUJAC_DUNE(float, 1);
+INSTANTIATE_CUJAC_DUNE(float, 2);
+INSTANTIATE_CUJAC_DUNE(float, 3);
+INSTANTIATE_CUJAC_DUNE(float, 4);
+INSTANTIATE_CUJAC_DUNE(float, 5);
+INSTANTIATE_CUJAC_DUNE(float, 6);
--- a/opm/simulators/linalg/cuistl/CuJac.hpp
+++ b/opm/simulators/linalg/cuistl/CuJac.hpp
@@ -0,0 +1,119 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OPM_CUJAC_HPP
+#define OPM_CUJAC_HPP
+
+#include <dune/istl/preconditioner.hh>
+#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
+#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
+#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
+#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
+#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
+
+
+
+namespace Opm::cuistl
+{
+//! \brief Sequential Jacobi preconditioner on the GPU through the CuSparse library.
+//!
+//! This implementation calls the CuSparse functions, which in turn essentially
+//! does a level decomposition to get some parallelism.
+//!
+//! \note This is not expected to be a fast preconditioner.
+//!
+//! \tparam M The matrix type to operate on
+//! \tparam X Type of the update
+//! \tparam Y Type of the defect
+//! \tparam l Ignored. Just there to have the same number of template arguments
+//!    as other preconditioners.
+//!
+//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
+//! arguments in case of future additions.
+template <class M, class X, class Y, int l = 1>
+class CuJac : public Dune::PreconditionerWithUpdate<X, Y>
+{
+public:
+    //! \brief The matrix type the preconditioner is for.
+    using matrix_type = typename std::remove_const<M>::type ;
+    //! \brief The domain type of the preconditioner.
+    using domain_type = X;
+    //! \brief The range type of the preconditioner.
+    using range_type = Y;
+    //! \brief The field type of the preconditioner.
+    using field_type = typename X::field_type;
+
+    //! \brief Constructor.
+    //!
+    //!  Constructor gets all parameters to operate the prec.
+    //! \param A The matrix to operate on.
+    //! \param w The relaxation factor.
+    //!
+    CuJac(const M& A, field_type w);
+
+    //! \brief Prepare the preconditioner.
+    //! \note Does nothing at the time being.
+    virtual void pre(X& x, Y& b) override;
+
+    //! \brief Apply the preconditoner.
+    virtual void apply(X& v, const Y& d) override;
+
+    //! \brief Post processing
+    //! \note Does nothing at the moment
+    virtual void post(X& x) override;
+
+    //! Category of the preconditioner (see SolverCategory::Category)
+    virtual Dune::SolverCategory::Category category() const override;
+
+    //! \brief Updates the matrix data.
+    virtual void update() override;
+
+
+    //! \returns false
+    static constexpr bool shouldCallPre()
+    {
+        return false;
+    }
+
+    //! \returns false
+    static constexpr bool shouldCallPost()
+    {
+        return false;
+    }
+
+
+private:
+    //! \brief Reference to the underlying matrix
+    const M& cpuMatrix;
+    //! \brief The relaxation factor to use.
+    field_type relaxation_factor;
+
+    CuSparseMatrix<field_type> cuMatrix;
+    
+    CuVector<field_type> cuVec_diagInvFlattened; // the diagonal of cuMatrix inverted, and then flattened to fit in a vector
+    CuVector<field_type> cuVec_resultBuffer;
+
+    detail::CuSparseMatrixDescriptionPtr cuMatrixDescription;
+    detail::CuSparseHandle& cuSparseHandle;
+    detail::CuBlasHandle& cuBlasHandle;
+    
+    size_t findBufferSize();
+};
+} // end namespace Opm::cuistl
+
+#endif
--- a/opm/simulators/linalg/cuistl/CuVector.hpp
+++ b/opm/simulators/linalg/cuistl/CuVector.hpp
@@ -374,5 +374,17 @@ private:

    void assertHasElements() const;
 };
+
+template <class T>
+std::ostream& operator<<(std::ostream &os, const CuVector<T> &arg)
+{
+    std::vector<T> v = arg.asStdVector();
+    for (int i = 0; i < v.size(); i++){
+        os << v[i] << " ";
+    }
+    os << std::endl;
+    return os;
+}
+
 } // namespace Opm::cuistl
 #endif
--- a/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
+++ b/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
@@ -80,6 +80,10 @@ public:
            m_inputBuffer.reset(new CuVector<field_type>(v.dim()));
            m_outputBuffer.reset(new CuVector<field_type>(v.dim()));
        }
+        //TODO: to implement CuJac I added the moving of data from host to the output buffer
+        // This has performance impact for other preconditiners, so figure out how to avoid
+        m_outputBuffer->copyFromHost(v);
+
        m_inputBuffer->copyFromHost(d);
        m_underlyingPreconditioner->apply(*m_outputBuffer, *m_inputBuffer);
        m_outputBuffer->copyToHost(v);
--- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
@@ -0,0 +1,155 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <stdexcept>
+namespace Opm::cuistl::detail
+{
+
+namespace
+{
+    template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize1(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
+    {
+        const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 1;
+
+        if (thrIndex < numberOfRows){
+            size_t nnzIdx = rowIndices[thrIndex];
+            size_t nnzIdxLim = rowIndices[thrIndex+1]; 
+
+            // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
+            for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) { 
+                ++nnzIdx;
+            }
+
+            // pDiagBlock points to the start of where the diagonal block is stored
+            T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
+            // pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
+            T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
+
+            pVecBlock[0] =   1.0/(pDiagBlock[0]);
+        }
+    }
+
+    template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize2(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
+    {
+        const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 2;
+
+        if (thrIndex < numberOfRows){
+            size_t nnzIdx = rowIndices[thrIndex];
+            size_t nnzIdxLim = rowIndices[thrIndex+1];
+
+            // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
+            for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) {
+                ++nnzIdx;
+            }
+
+            // pDiagBlock points to the start of where the diagonal block is stored
+            T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
+            // pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
+            T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
+
+            // based on Dune implementation
+            T det_inv = 1.0/(pDiagBlock[0]*pDiagBlock[3] - pDiagBlock[1]*pDiagBlock[2]);
+            pVecBlock[0] =   pDiagBlock[3] * det_inv;
+            pVecBlock[1] = - pDiagBlock[1] * det_inv;
+            pVecBlock[2] = - pDiagBlock[2] * det_inv;
+            pVecBlock[3] =   pDiagBlock[0] * det_inv;
+        }
+    }
+    template <class T> __global__ void cuInvertDiagonalAndFlattenBlocksize3(T* matNonZeroValues, int rowIndices[], int colIndices[], size_t numberOfRows, T* vec)
+    {
+        const auto thrIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 3;
+
+        if (thrIndex < numberOfRows){
+            size_t nnzIdx = rowIndices[thrIndex];
+            size_t nnzIdxLim = rowIndices[thrIndex+1];
+
+            // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a zero element
+            for (;colIndices[nnzIdx] != thrIndex && nnzIdx <= nnzIdxLim;) {
+                ++nnzIdx;
+            }
+
+            // pDiagBlock points to the start of where the diagonal block is stored
+            T* pDiagBlock = (matNonZeroValues+(blocksize*blocksize*nnzIdx));
+            // pVecBlock points to the start of the block element in the vector where the inverse of the diagonal block element should be stored
+            T* pVecBlock = (vec + (blocksize*blocksize*thrIndex));
+
+            // based on Dune implementation
+            T t4  = pDiagBlock[0] * pDiagBlock[4];
+            T t6  = pDiagBlock[0] * pDiagBlock[5];
+            T t8  = pDiagBlock[1] * pDiagBlock[3];
+            T t10 = pDiagBlock[2] * pDiagBlock[3];
+            T t12 = pDiagBlock[1] * pDiagBlock[6];
+            T t14 = pDiagBlock[2] * pDiagBlock[6];
+
+            T t17 = 1.0/(t4*pDiagBlock[8]-t6*pDiagBlock[7]-t8*pDiagBlock[8]+
+                    t10*pDiagBlock[7]+t12*pDiagBlock[5]-t14*pDiagBlock[4]); // t17 is 1/determinant
+
+            pVecBlock[0] =  (pDiagBlock[4] * pDiagBlock[8] - pDiagBlock[5] * pDiagBlock[7])*t17;
+            pVecBlock[1] = -(pDiagBlock[1] * pDiagBlock[8] - pDiagBlock[2] * pDiagBlock[7])*t17;
+            pVecBlock[2] =  (pDiagBlock[1] * pDiagBlock[5] - pDiagBlock[2] * pDiagBlock[4])*t17;
+            pVecBlock[3] = -(pDiagBlock[3] * pDiagBlock[8] - pDiagBlock[5] * pDiagBlock[6])*t17;
+            pVecBlock[4] =  (pDiagBlock[0] * pDiagBlock[8] - t14) * t17;
+            pVecBlock[5] = -(t6-t10) * t17;
+            pVecBlock[6] =  (pDiagBlock[3] * pDiagBlock[7] - pDiagBlock[4] * pDiagBlock[6]) * t17;
+            pVecBlock[7] = -(pDiagBlock[0] * pDiagBlock[7] - t12) * t17;
+            pVecBlock[8] =  (t4-t8) * t17;
+        }
+    }
+
+    constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfRows)
+    {
+        return 1024;
+    }
+
+    inline size_t getBlocks(size_t numberOfRows)
+    {
+        const auto threads = getThreads(numberOfRows);
+        return (numberOfRows + threads - 1) / threads;
+    }
+} // namespace
+
+template <class T>
+void
+invertDiagonalAndFlatten(T* d_mat, int rowIndices[], int colIndices[], size_t numberOfRows, size_t blocksize, T* d_vec)
+{
+    switch (blocksize) {
+    case 1:
+        cuInvertDiagonalAndFlattenBlocksize1<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
+        break;
+    case 2:
+        cuInvertDiagonalAndFlattenBlocksize2<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
+        break;
+    case 3:
+        cuInvertDiagonalAndFlattenBlocksize3<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(d_mat, rowIndices, colIndices, numberOfRows, d_vec);
+        break;
+    default:
+        // TODO: Figure out what is why it did not produce an error or any output in the output stream or the DBG file when I forced this case to execute
+        OPM_THROW(std::invalid_argument, "Inverting diagonal is not defined for blocksize>3");
+        break;
+    }
+}
+
+template void invertDiagonalAndFlatten(double*, int*, int*, size_t, size_t, double*);
+template void invertDiagonalAndFlatten(float*, int*, int*, size_t, size_t, float*);
+
+} // namespace Opm::cuistl::detail
--- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp
+++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp
@@ -0,0 +1,39 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifndef OPM_CUISTL_CUSPARSE_MATRIX_OPERATIONS_HPP
+#define OPM_CUISTL_CUSPARSE_MATRIX_OPERATIONS_HPP
+#include <cstddef>
+#include <vector>
+namespace Opm::cuistl::detail
+{
+
+/**
+ * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in d_vec
+ * @param d_mat pointer to GPU memory containing nonzerovalues of the sparse matrix
+ * @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
+ * @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
+ * @param numberOfRows Integer describing the number of rows in the matrix
+ * @param blocksize Integer describing the sidelength of a block in the sparse matrix
+ * @param d_vec Pointer to the vector where the inverse of the diagonal matrix should be stored
+ */
+template <class T>
+void invertDiagonalAndFlatten(T* d_mat, int rowIndices[], int colIndices[], size_t numberOfRows, size_t blocksize, T* d_vec);
+
+} // namespace Opm::cuistl::detail
+#endif
--- a/opm/simulators/linalg/cuistl/detail/vector_operations.cu
+++ b/opm/simulators/linalg/cuistl/detail/vector_operations.cu
@@ -16,8 +16,10 @@
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
+#include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
 // TODO: [perf] Get rid of thrust.
+#include <stdexcept>
 #include <thrust/device_ptr.h>
 #include <thrust/reduce.h>
 namespace Opm::cuistl::detail
@@ -45,6 +47,54 @@ namespace
        }
    }

+    template <class T>
+    __global__ void elementWiseMultiplyMVKernelBlocksize1(T* squareBlockVector, const size_t numberOfElements, T* vec)
+    {
+        const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 1;
+
+        if (globalIndex < numberOfElements) {
+            T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
+            T* pVec = (vec + (blocksize * globalIndex));
+
+            T v0 = pVec[0];
+            pVec[0] = pMat[0] * v0;
+        }
+    }
+
+    template <class T>
+    __global__ void elementWiseMultiplyMVKernelBlocksize2(T* squareBlockVector, const size_t numberOfElements, T* vec)
+    {
+        const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 2;
+
+        if (globalIndex < numberOfElements) {
+            T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
+            T* pVec = (vec + (blocksize * globalIndex));
+
+            T v0 = pVec[0], v1 = pVec[1];
+            pVec[0] = pMat[0] * v0 + pMat[1] * v1;
+            pVec[1] = pMat[2] * v0 + pMat[3] * v1;
+        }
+    }
+
+    template <class T>
+    __global__ void elementWiseMultiplyMVKernelBlocksize3(T* squareBlockVector, const size_t numberOfElements, T* vec)
+    {
+        const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
+        const int blocksize = 3;
+
+        if (globalIndex < numberOfElements) {
+            T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
+            T* pVec = (vec + (blocksize * globalIndex));
+
+            T v0 = pVec[0], v1 = pVec[1], v2 = pVec[2];
+            pVec[0] = pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2;
+            pVec[1] = pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2;
+            pVec[2] = pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2;
+        }
+    }
+
    template <class T>
    __global__ void
    elementWiseMultiplyKernel(const T* a, const T* b, T* buffer, size_t numberOfElements, const int* indices)
@@ -109,4 +159,34 @@ template double innerProductAtIndices(const double*, const double*, double* buff
 template float innerProductAtIndices(const float*, const float*, float* buffer, size_t, const int*);
 template int innerProductAtIndices(const int*, const int*, int* buffer, size_t, const int*);

-} // namespace Opm::cuistl::impl
+
+template <class T>
+void
+blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
+                                      const size_t numberOfElements,
+                                      const size_t blocksize,
+                                      T* vec)
+{
+    switch (blocksize) {
+    case 1:
+        elementWiseMultiplyMVKernelBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
+            squareBlockVector, numberOfElements, vec);
+        break;
+    case 2:
+        elementWiseMultiplyMVKernelBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
+            squareBlockVector, numberOfElements, vec);
+        break;
+    case 3:
+        elementWiseMultiplyMVKernelBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
+            squareBlockVector, numberOfElements, vec);
+        break;
+    default:
+        OPM_THROW(std::invalid_argument, "blockvector hadamard product not defined for blocksize>3");
+        break;
+    }
+}
+
+template void blockVectorMultiplicationAtAllIndices(double*, const size_t, const size_t, double*);
+template void blockVectorMultiplicationAtAllIndices(float*, const size_t, const size_t, float*);
+
+} // namespace Opm::cuistl::detail
--- a/opm/simulators/linalg/cuistl/detail/vector_operations.hpp
+++ b/opm/simulators/linalg/cuistl/detail/vector_operations.hpp
@@ -54,5 +54,17 @@ void setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indice
 */
 template <class T>
 T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
+
+/**
+ * @brief Compute the hadamard product of two vectors element by element, where the first vector containts square blocks, while the second vector containts vectors
+ * @param squareBlockVector A CuVector whose elements are NxN matrix blocks
+ * @param numberOfRows The number of rows in the vector
+ * @param blocksize The sidelength of the square block elements in the vector
+ * @param vec A pointer to the data of the CuVector we multiply the blockvector with, the result is stored in vec
+ * 
+ * @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the diagonal of the matrix and use this product.
+ */
+template <class T>
+void blockVectorMultiplicationAtAllIndices(T* squareBlockVector, const size_t numberOfRows, const size_t blocksize, T* vec);
 } // namespace Opm::cuistl::detail
 #endif
--- a/tests/cuistl/test_cuSparse_matrix_operations.cpp
+++ b/tests/cuistl/test_cuSparse_matrix_operations.cpp
@@ -0,0 +1,155 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <config.h>
+
+#define BOOST_TEST_MODULE TestCuSparseMatrixOperations
+#include <boost/mpl/list.hpp>
+#include <boost/test/unit_test.hpp>
+#include <cuda_runtime.h>
+#include <dune/istl/bcrsmatrix.hh>
+#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
+
+using NumericTypes = boost::mpl::list<double, float>;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, NumericTypes)
+{
+    const size_t blocksize = 3;
+    const size_t N = 2;
+    const int nonZeroes = 3;
+    using M = Dune::FieldMatrix<T, blocksize, blocksize>;
+    using SpMatrix = Dune::BCRSMatrix<M>;
+    /*
+        create this sparse matrix
+        | |1 2 3| | 1  0  0| |
+        | |5 2 3| | 0  1  0| |
+        | |2 1 1| | 0  0  1| |
+        |                    |
+        | |0 0 0| |-1  0  0| |
+        | |0 0 0| | 0 -1  0| |
+        | |0 0 0| | 0  0 -1| |
+
+        The diagonal elements inverted, and put in a vector should look like this
+        | |-1/4  1/4  0| |
+        | | 1/4 -4/5  3| |
+        | | 1/4  3/4 -2| |
+        |                |
+        | |  -1    0  0| |
+        | |   0   -1  0| |
+        | |   0    0 -1| |
+    */
+
+    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
+    for (auto row = B.createbegin(); row != B.createend(); ++row) {
+        row.insert(row.index());
+        if (row.index() == 0) {
+            row.insert(row.index() + 1);
+        }
+    }
+
+    B[0][0][0][0]=1.0;
+    B[0][0][0][1]=2.0;
+    B[0][0][0][2]=3.0;
+    B[0][0][1][0]=5.0;
+    B[0][0][1][1]=2.0;
+    B[0][0][1][2]=3.0;
+    B[0][0][2][0]=2.0;
+    B[0][0][2][1]=1.0;
+    B[0][0][2][2]=1.0;
+
+    B[0][1][0][0]=1.0;
+    B[0][1][1][1]=1.0;
+    B[0][1][2][2]=1.0;
+
+    B[1][1][0][0]=-1.0;
+    B[1][1][1][1]=-1.0;
+    B[1][1][2][2]=-1.0;
+
+    Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(Opm::cuistl::detail::makeMatrixWithNonzeroDiagonal(B));
+    Opm::cuistl::CuVector<T> d_invDiag(blocksize*blocksize*N);
+
+    Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, blocksize, d_invDiag.data());
+
+    std::vector<T> expected_inv_diag{-1.0/4.0,1.0/4.0,0.0,1.0/4.0,-5.0/4.0,3.0,1.0/4.0,3.0/4.0,-2.0,-1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,-1.0};
+    std::vector<T> computed_inv_diag = d_invDiag.asStdVector();
+
+    BOOST_REQUIRE_EQUAL(expected_inv_diag.size(), computed_inv_diag.size());
+    for (size_t i = 0; i < expected_inv_diag.size(); ++i){
+        BOOST_CHECK_CLOSE(expected_inv_diag[i], computed_inv_diag[i], 1e-7);
+    }
+}
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, NumericTypes)
+{
+    const size_t blocksize = 2;
+    const size_t N = 2;
+    const int nonZeroes = 3;
+    using M = Dune::FieldMatrix<T, blocksize, blocksize>;
+    using SpMatrix = Dune::BCRSMatrix<M>;
+    /*
+        create this sparse matrix
+        | |  1 2| | 1  0| |
+        | |1/2 2| | 0  1| |
+        |                 |
+        | |  0 0| |-1  0| |
+        | |  0 0| | 0 -1| |
+
+        The diagonal elements inverted, and put in a vector should look like this
+        | |   2 - 2| |
+        | |-1/2   1| |
+        |            |
+        | |  -1   0| |
+        | |   0  -1| |W
+    */
+
+    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
+    for (auto row = B.createbegin(); row != B.createend(); ++row) {
+        row.insert(row.index());
+        if (row.index() == 0) {
+            row.insert(row.index() + 1);
+        }
+    }
+
+    B[0][0][0][0]=1.0;
+    B[0][0][0][1]=2.0;
+    B[0][0][1][0]=1.0/2.0;
+    B[0][0][1][1]=2.0;
+
+    B[0][1][0][0]=1.0;
+    B[0][1][1][1]=1.0;
+
+    B[1][1][0][0]=-1.0;
+    B[1][1][1][1]=-1.0;
+
+    Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(Opm::cuistl::detail::makeMatrixWithNonzeroDiagonal(B));
+    Opm::cuistl::CuVector<T> d_invDiag(blocksize*blocksize*N);
+
+    Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, blocksize, d_invDiag.data());
+
+    std::vector<T> expected_inv_diag{2.0,-2.0,-1.0/2.0,1.0,-1.0,0.0,0.0,-1.0};
+    std::vector<T> computed_inv_diag = d_invDiag.asStdVector();
+
+    BOOST_REQUIRE_EQUAL(expected_inv_diag.size(), computed_inv_diag.size());
+    for (size_t i = 0; i < expected_inv_diag.size(); ++i){
+        BOOST_CHECK_CLOSE(expected_inv_diag[i], computed_inv_diag[i], 1e-7);
+    }
+}
--- a/tests/cuistl/test_cuVector_operations.cpp
+++ b/tests/cuistl/test_cuVector_operations.cpp
@@ -0,0 +1,90 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <config.h>
+
+#define BOOST_TEST_MODULE TestCuVectorOperations
+#include <boost/mpl/list.hpp>
+#include <boost/test/unit_test.hpp>
+#include <cuda_runtime.h>
+#include <dune/istl/bcrsmatrix.hh>
+#include <opm/simulators/linalg/cuistl/CuJac.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
+
+using NumericTypes = boost::mpl::list<double, float>;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVectorVector, T, NumericTypes)
+{
+    /*
+        Example in the test for multiplying by element a blockvector with a vector of vectors
+        | |1 2 3| |   | |3| |   | |10| |
+        | |5 2 3| | X | |2| | = | |22| |
+        | |2 1 1| |   | |1| |   | |10| |
+    */
+
+    const size_t blocksize = 3;
+    const size_t N = 1;
+
+    std::vector<T> h_blockVector({1.0,2.0,3.0,5.0,2.0,3.0,2.0,1.0,2.0});
+    std::vector<T> h_vecVector({3.0,2.0,1.0});
+    Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
+    Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
+
+    Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
+
+    std::vector<T> expected_vec{10.0,22.0,10.0};
+    std::vector<T> computed_vec = d_vecVector.asStdVector();
+
+    BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
+    for (size_t i = 0; i < expected_vec.size(); i++){
+        BOOST_CHECK_CLOSE(expected_vec[i], computed_vec[i], 1e-7);
+    }
+}
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVectorVector, T, NumericTypes)
+{
+    /*
+        Example in the test for multiplying by element a blockvector with a vector of vectors
+        | |1 2| |   | |1| |   | | 7| |
+        | |3 4| | X | |3| | = | |15| |
+        |       |   |     |   |      |
+        | |4 3| |   | |2| |   | |20| |
+        | |2 1| |   | |4| |   | | 8| |
+    */
+
+    const size_t blocksize = 2;
+    const size_t N = 2;
+
+    std::vector<T> h_blockVector({1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0});
+    std::vector<T> h_vecVector({1.0,3.0,2.0,4.0});
+    Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
+    Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
+
+    Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
+
+    std::vector<T> expected_vec{7.0,15.0,20.0,8.0};
+    std::vector<T> computed_vec = d_vecVector.asStdVector();
+
+    BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
+    for (size_t i = 0; i < expected_vec.size(); i++){
+        BOOST_CHECK_CLOSE(expected_vec[i], computed_vec[i], 1e-7);
+    }
+}
--- a/tests/cuistl/test_cujac.cpp
+++ b/tests/cuistl/test_cujac.cpp
@@ -0,0 +1,97 @@
+/*
+  Copyright 2022-2023 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+#include <config.h>
+
+#define BOOST_TEST_MODULE TestCuJac
+#include <boost/mpl/list.hpp>
+#include <boost/test/unit_test.hpp>
+#include <cuda_runtime.h>
+#include <dune/istl/bcrsmatrix.hh>
+#include <opm/simulators/linalg/cuistl/CuJac.hpp>
+#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
+
+#include <iostream>
+
+using NumericTypes = boost::mpl::list<double, float>;
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
+{
+     /*
+        Test data to validate jacobi preconditioner, expected result is x_1
+            | |3 1|  | 1  0|       | |1| |     | |2| |       | |   1| |
+            | |2 1|  | 0  1|       | |2| |     | |1| |       | |   0| |
+        A = |              | x_0 = |     | b = |     | x_1 = |        |
+            | |0 0|  |-1  0|       | |1| |     | |3| |       | |  -1| |
+            | |0 0|  | 0 -1|       | |1| |     | |4| |       | |-1.5| |
+    */
+    const int N = 2;
+    const int blocksize = 2;
+    const int nonZeroes = 3;
+    using M = Dune::FieldMatrix<T, blocksize, blocksize>;
+    using SpMatrix = Dune::BCRSMatrix<M>;
+    using Vector = Dune::BlockVector<Dune::FieldVector<T, blocksize>>;
+    using cujac = Opm::cuistl::CuJac<SpMatrix, Opm::cuistl::CuVector<T>, Opm::cuistl::CuVector<T>>;
+    
+    SpMatrix B(N, N, nonZeroes, SpMatrix::row_wise);
+    for (auto row = B.createbegin(); row != B.createend(); ++row) {
+        row.insert(row.index());
+        if (row.index() == 0) {
+            row.insert(row.index() + 1);
+        }
+    }
+
+    B[0][0][0][0]=3.0;
+    B[0][0][0][1]=1.0;
+    B[0][0][1][0]=2.0;
+    B[0][0][1][1]=1.0;
+
+    B[0][1][0][0]=1.0;
+    B[0][1][1][1]=1.0;
+
+    B[1][1][0][0]=-1.0;
+    B[1][1][1][1]=-1.0;
+
+    auto CuJac = Opm::cuistl::PreconditionerAdapter<Vector, Vector, cujac>(std::make_shared<cujac>(B, 0.5));
+    
+    Vector h_dune_x(2);
+    h_dune_x[0][0] = 1.0;
+    h_dune_x[0][1] = 2.0;
+    h_dune_x[1][0] = 1.0;
+    h_dune_x[1][1] = 1.0;
+
+    Vector h_dune_b(2);
+    h_dune_b[0][0] = 2.0;
+    h_dune_b[0][1] = 1.0;
+    h_dune_b[1][0] = 3.0;
+    h_dune_b[1][1] = 4.0;
+    
+    const T expected_ans[2][2] = {{1.0, 0.0},{-1.0,-3.0/2.0}};
+
+    CuJac.apply(h_dune_x, h_dune_b);
+    BOOST_CHECK_CLOSE(h_dune_x[0][0], expected_ans[0][0], 1e-7);
+    BOOST_CHECK_CLOSE(h_dune_x[0][1], expected_ans[0][1], 1e-7);
+    BOOST_CHECK_CLOSE(h_dune_x[1][0], expected_ans[1][0], 1e-7);
+    BOOST_CHECK_CLOSE(h_dune_x[1][1], expected_ans[1][1], 1e-7);
+    BOOST_CHECK(true);
+}