mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Add an OPM implementation of ILU0
Improve file structure in cuistl, run clang-format
This commit is contained in:
parent ad595fed5e
commit 7a30aaa46e
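
Reviewer note, not part of the original commit message: judging from the factory registrations in this diff, the new GPU ILU0 preconditioner is exposed under the key "OPMCUILU0" and reads two boolean options, "split_matrix" and "tune_gpu_kernels", both defaulting to true. Below is a minimal sketch of how the class added by this commit might be constructed directly; the helper name makeOpmCuILU0 and the Matrix/Vector template parameters are illustrative placeholders and are not names introduced by the diff.

```cpp
// Sketch only, based on the constructor signature and the factory creators in this commit.
#include <memory>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>

template <class Matrix, class Vector>
auto makeOpmCuILU0(const Matrix& matrix, bool splitMatrix = true, bool tuneGpuKernels = true)
{
    using field_type = typename Vector::field_type;
    using GpuILU0 = Opm::cuistl::OpmCuILU0<Matrix,
                                           Opm::cuistl::CuVector<field_type>,
                                           Opm::cuistl::CuVector<field_type>>;
    // Construct the GPU ILU0 from the CPU-side Dune matrix, as the factory does with op.getmat().
    auto ilu0 = std::make_shared<GpuILU0>(matrix, splitMatrix, tuneGpuKernels);
    // Adapt it so it can be applied to CPU-side Dune vectors, mirroring the creators below.
    return std::make_shared<Opm::cuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>>(ilu0);
}
```

The factory lambdas added below do essentially the same thing; the MPI path additionally wraps the adapter in a CuBlockPreconditioner.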
@@ -205,16 +205,21 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)

   # HEADERS
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
@@ -223,7 +228,11 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
@@ -238,6 +247,8 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/vector_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/has_function.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditioner_should_call_post_pre.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSeqILU0.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp)
@@ -24,6 +24,7 @@
 #if USE_HIP
 #include <opm/simulators/linalg/hipistl/CuBlockPreconditioner.hpp>
 #include <opm/simulators/linalg/hipistl/CuDILU.hpp>
+#include <opm/simulators/linalg/hipistl/OpmCuILU0.hpp>
 #include <opm/simulators/linalg/hipistl/CuJac.hpp>
 #include <opm/simulators/linalg/hipistl/CuSeqILU0.hpp>
 #include <opm/simulators/linalg/hipistl/PreconditionerAdapter.hpp>
@@ -32,6 +33,7 @@
 #else
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
+#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>
 #include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
@@ -349,7 +349,6 @@ struct StandardPreconditioners {
         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            // const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
             using field_type = typename V::field_type;
             using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
             auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
@@ -358,6 +357,18 @@ struct StandardPreconditioners {
             auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
             return wrapped;
         });
+
+        F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+            using field_type = typename V::field_type;
+            using OpmCuILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+            auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
+
+            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
+            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            return wrapped;
+        });
 #endif
     }

@@ -605,6 +616,15 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
                 std::make_shared<CUJac>(op.getmat(), w));
         });

+        F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+            using field_type = typename V::field_type;
+            using CUILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+
+            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
+        });
+
         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
@@ -16,110 +16,23 @@
     You should have received a copy of the GNU General Public License
     along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <chrono>
+#include <config.h>
 #include <dune/common/fmatrix.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
+#include <limits>
 #include <opm/common/ErrorMacros.hpp>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/simulators/linalg/GraphColoring.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
-#include <vector>
-#include <config.h>
-#include <chrono>
-#include <limits>
 #include <tuple>
-
-namespace
-{
-std::vector<int>
-createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
-{
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
-    int globCnt = 0;
-    for (auto row : levelSets) {
-        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
-                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
-            res[globCnt++] = static_cast<int>(col);
-        }
-    }
-    return res;
-}
-
-std::vector<int>
-createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
-{
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
-    int globCnt = 0;
-    for (auto row : levelSets) {
-        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
-                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
-            res[col] = globCnt++;
-        }
-    }
-    return res;
-}
-
-template <class M, class field_type, class GPUM>
-void
-createReorderedMatrix(const M& naturalMatrix,
-                      std::vector<int>& reorderedToNatural,
-                      std::unique_ptr<GPUM>& reorderedGpuMat)
-{
-    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
-    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
-        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
-        // For elements in A
-        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
-            dstRowIt.insert(elem.index());
-        }
-    }
-
-    reorderedGpuMat.reset(new auto (GPUM::fromMatrix(reorderedMatrix, true)));
-}
-
-template <class M, class field_type, class GPUM>
-void
-extractLowerAndUpperMatrices(const M& naturalMatrix,
-                             std::vector<int>& reorderedToNatural,
-                             std::unique_ptr<GPUM>& lower,
-                             std::unique_ptr<GPUM>& upper)
-{
-    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
-
-    M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
-    M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
-
-    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
-         lowerIt != reorderedLower.createend();
-         ++lowerIt, ++upperIt) {
-
-        auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
-
-        for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
-            if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
-                lowerIt.insert(elem.index());
-            } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
-                upperIt.insert(elem.index());
-            }
-        }
-    }
-
-    lower.reset(new auto (GPUM::fromMatrix(reorderedLower, true)));
-    upper.reset(new auto (GPUM::fromMatrix(reorderedUpper, true)));
-    return;
-}
-
-} // NAMESPACE
-
 namespace Opm::cuistl
 {

@@ -127,8 +40,8 @@ template <class M, class X, class Y, int l>
 CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
-    , m_reorderedToNatural(createReorderedToNatural(m_levelSets))
-    , m_naturalToReordered(createNaturalToReordered(m_levelSets))
+    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
+    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
     , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
     , m_gpuNaturalToReorder(m_naturalToReordered)
     , m_gpuReorderToNatural(m_reorderedToNatural)
@@ -154,12 +67,12 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
                              m_gpuMatrix.nonzeroes(),
                              A.nonzeroes()));
     if (m_splitMatrix) {
-        m_gpuMatrixReorderedDiag.reset(new auto(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N())));
-        extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(
-            m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper);
+        m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
+        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
+            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
+                                                                                              m_reorderedToNatural);
     } else {
-        createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
-            m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
+        m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
+            m_cpuMatrix, m_reorderedToNatural);
     }
     computeDiagAndMoveReorderedData();

@@ -188,7 +101,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(
+            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -200,7 +113,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 v.data(),
                 m_applyThreadBlockSize);
         } else {
-            detail::computeLowerSolveLevelSet<field_type, blocksize_>(
+            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
@@ -221,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(
+            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -232,7 +145,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 v.data(),
                 m_applyThreadBlockSize);
         } else {
-            detail::computeUpperSolveLevelSet<field_type, blocksize_>(
+            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
@@ -304,7 +217,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::computeDiluDiagonalSplit<field_type, blocksize_>(
+            detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -319,7 +232,8 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                 m_gpuDInv.data(),
                 m_updateThreadBlockSize);
         } else {
-            detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
+            detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
                 m_gpuReorderToNatural.data(),
@@ -360,7 +274,8 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
             std::ignore = cudaDeviceSynchronize();
             auto afterUpdate = std::chrono::high_resolution_clock::now();
             if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
+                long long durationInMicroSec
+                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
                 if (durationInMicroSec < bestUpdateTime) {
                     bestUpdateTime = durationInMicroSec;
                     bestUpdateBlockSize = thrBlockSize;
@@ -373,7 +288,8 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
             std::ignore = cudaDeviceSynchronize();
             auto afterApply = std::chrono::high_resolution_clock::now();
             if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
+                long long durationInMicroSec
+                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
                 if (durationInMicroSec < bestApplyTime) {
                     bestApplyTime = durationInMicroSec;
                     bestApplyBlockSize = thrBlockSize;
@@ -19,17 +19,11 @@
 #ifndef OPM_CUDILU_HPP
 #define OPM_CUDILU_HPP

-#include <dune/istl/preconditioner.hh>
+#include <memory>
 #include <opm/grid/utility/SparseTable.hpp>
-#include <opm/simulators/linalg/GraphColoring.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
-#include <optional>
 #include <vector>
-#include <memory>


@@ -16,27 +16,13 @@
     You should have received a copy of the GNU General Public License
     along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cusparse.h>
 #include <dune/common/fmatrix.hh>
-#include <dune/common/fvector.hh>
 #include <dune/istl/bcrsmatrix.hh>
-#include <dune/istl/bvector.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
 #include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>

@@ -67,11 +53,7 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
                              A.nonzeroes()));

     // Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
-    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
-                                                                                m_gpuMatrix.getRowIndices().data(),
-                                                                                m_gpuMatrix.getColumnIndices().data(),
-                                                                                m_gpuMatrix.N(),
-                                                                                m_diagInvFlattened.data());
+    invertDiagonalAndFlatten();
 }

 template <class M, class X, class Y, int l>
@@ -111,7 +93,15 @@ void
 CuJac<M, X, Y, l>::update()
 {
     m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
-    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
+    invertDiagonalAndFlatten();
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
+{
+    detail::JAC::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(
+        m_gpuMatrix.getNonZeroValues().data(),
         m_gpuMatrix.getRowIndices().data(),
         m_gpuMatrix.getColumnIndices().data(),
         m_gpuMatrix.N(),
@@ -103,6 +103,8 @@ private:
     CuSparseMatrix<field_type> m_gpuMatrix;
     //! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
     CuVector<field_type> m_diagInvFlattened;
+
+    void invertDiagonalAndFlatten();
 };
 } // end namespace Opm::cuistl

opm/simulators/linalg/cuistl/OpmCuILU0.cpp (new file, 318 lines)
@@ -0,0 +1,318 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <tuple>
namespace Opm::cuistl
{

template <class M, class X, class Y, int l>
OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
    , m_gpuMatrixReorderedLower(nullptr)
    , m_gpuMatrixReorderedUpper(nullptr)
    , m_gpuNaturalToReorder(m_naturalToReordered)
    , m_gpuReorderToNatural(m_reorderedToNatural)
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
    , m_splitMatrix(splitMatrix)
    , m_tuneThreadBlockSizes(tuneKernels)
{
    // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
    // Some sanity check
    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
                 fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
    OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
                 fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.blockSize(),
                             A[0][0].N()));
    OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
                 fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.dim(),
                             A.N() * A[0][0].N()));
    OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
    if (m_splitMatrix) {
        m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                              m_reorderedToNatural);
    } else {
        m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
            m_cpuMatrix, m_reorderedToNatural);
    }
    LUFactorizeAndMoveData();

#ifdef USE_HIP
    if (m_tuneThreadBlockSizes) {
        tuneThreadBlockSizes();
    }
#endif
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
    OPM_TIMEBLOCK(prec_apply);
    {
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
            if (m_splitMatrix) {
                detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
                    m_gpuReorderToNatural.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    d.data(),
                    v.data(),
                    m_applyThreadBlockSize);
            } else {
                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                         m_gpuReorderedLU->getRowIndices().data(),
                                                                         m_gpuReorderedLU->getColumnIndices().data(),
                                                                         m_gpuReorderToNatural.data(),
                                                                         levelStartIdx,
                                                                         numOfRowsInLevel,
                                                                         d.data(),
                                                                         v.data(),
                                                                         m_applyThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }

        levelStartIdx = m_cpuMatrix.N();
        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
            levelStartIdx -= numOfRowsInLevel;
            if (m_splitMatrix) {
                detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                    m_gpuReorderToNatural.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    m_gpuMatrixReorderedDiag.value().data(),
                    v.data(),
                    m_applyThreadBlockSize);
            } else {
                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                         m_gpuReorderedLU->getRowIndices().data(),
                                                                         m_gpuReorderedLU->getColumnIndices().data(),
                                                                         m_gpuReorderToNatural.data(),
                                                                         levelStartIdx,
                                                                         numOfRowsInLevel,
                                                                         v.data(),
                                                                         m_applyThreadBlockSize);
            }
        }
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}

template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
OpmCuILU0<M, X, Y, l>::category() const
{
    return Dune::SolverCategory::sequential;
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::update()
{
    OPM_TIMEBLOCK(prec_update);
    {
        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
        LUFactorizeAndMoveData();
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
{
    OPM_TIMEBLOCK(prec_update);
    {
        if (m_splitMatrix) {
            detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
                m_gpuMatrix.getNonZeroValues().data(),
                m_gpuMatrix.getRowIndices().data(),
                m_gpuMatrix.getColumnIndices().data(),
                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                m_gpuMatrixReorderedLower->getRowIndices().data(),
                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                m_gpuMatrixReorderedUpper->getRowIndices().data(),
                m_gpuMatrixReorderedDiag.value().data(),
                m_gpuNaturalToReorder.data(),
                m_gpuMatrixReorderedLower->N(),
                m_updateThreadBlockSize);
        } else {
            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                   m_gpuMatrix.getRowIndices().data(),
                                                                   m_gpuReorderedLU->getNonZeroValues().data(),
                                                                   m_gpuReorderedLU->getRowIndices().data(),
                                                                   m_gpuNaturalToReorder.data(),
                                                                   m_gpuReorderedLU->N(),
                                                                   m_updateThreadBlockSize);
        }
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();

            if (m_splitMatrix) {
                detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                    m_gpuMatrixReorderedDiag.value().data(),
                    m_gpuReorderToNatural.data(),
                    m_gpuNaturalToReorder.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    m_updateThreadBlockSize);

            } else {
                detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                      m_gpuReorderedLU->getRowIndices().data(),
                                                                      m_gpuReorderedLU->getColumnIndices().data(),
                                                                      m_gpuNaturalToReorder.data(),
                                                                      m_gpuReorderToNatural.data(),
                                                                      numOfRowsInLevel,
                                                                      levelStartIdx,
                                                                      m_updateThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
{
    // TODO generalize this tuning process in a function separate of the class
    long long bestApplyTime = std::numeric_limits<long long>::max();
    long long bestUpdateTime = std::numeric_limits<long long>::max();
    int bestApplyBlockSize = -1;
    int bestUpdateBlockSize = -1;
    int interval = 64;

    // temporary buffers for the apply
    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;

    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
        // sometimes the first kernel launch kan be slower, so take the time twice
        for (int i = 0; i < 2; ++i) {

            auto beforeUpdate = std::chrono::high_resolution_clock::now();
            m_updateThreadBlockSize = thrBlockSize;
            update();
            std::ignore = cudaDeviceSynchronize();
            auto afterUpdate = std::chrono::high_resolution_clock::now();
            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
                long long durationInMicroSec
                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
                if (durationInMicroSec < bestUpdateTime) {
                    bestUpdateTime = durationInMicroSec;
                    bestUpdateBlockSize = thrBlockSize;
                }
            }

            auto beforeApply = std::chrono::high_resolution_clock::now();
            m_applyThreadBlockSize = thrBlockSize;
            apply(tmpV, tmpD);
            std::ignore = cudaDeviceSynchronize();
            auto afterApply = std::chrono::high_resolution_clock::now();
            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
                long long durationInMicroSec
                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
                if (durationInMicroSec < bestApplyTime) {
                    bestApplyTime = durationInMicroSec;
                    bestApplyBlockSize = thrBlockSize;
                }
            }
        }
    }

    m_applyThreadBlockSize = bestApplyBlockSize;
    m_updateThreadBlockSize = bestUpdateBlockSize;
}

} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                                                    \
    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,        \
                                            ::Opm::cuistl::CuVector<realtype>,                                         \
                                            ::Opm::cuistl::CuVector<realtype>>;                                        \
    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,         \
                                            ::Opm::cuistl::CuVector<realtype>,                                         \
                                            ::Opm::cuistl::CuVector<realtype>>

INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);

INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);
opm/simulators/linalg/cuistl/OpmCuILU0.hpp (new file, 139 lines)
@@ -0,0 +1,139 @@
/*
  Copyright 2022-2023 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUILU0_OPM_Impl_HPP
#define OPM_CUILU0_OPM_Impl_HPP

#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <optional>
#include <type_traits>
#include <vector>


namespace Opm::cuistl
{
//! \brief ILU0 preconditioner on the GPU.
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
//! \tparam Y Type of the defect
//! \tparam l Ignored. Just there to have the same number of template arguments
//!    as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
    //! \brief The matrix type the preconditioner is for.
    using matrix_type = typename std::remove_const<M>::type;
    //! \brief The domain type of the preconditioner.
    using domain_type = X;
    //! \brief The range type of the preconditioner.
    using range_type = Y;
    //! \brief The field type of the preconditioner.
    using field_type = typename X::field_type;
    //! \brief The GPU matrix type
    using CuMat = CuSparseMatrix<field_type>;

    //! \brief Constructor.
    //!
    //! Constructor gets all parameters to operate the prec.
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
    explicit OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
    void pre(X& x, Y& b) override;

    //! \brief Apply the preconditoner.
    void apply(X& v, const Y& d) override;

    //! \brief Post processing
    //! \note Does nothing at the moment
    void post(X& x) override;

    //! Category of the preconditioner (see SolverCategory::Category)
    Dune::SolverCategory::Category category() const override;

    //! \brief Updates the matrix data.
    void update() final;

    //! \brief Compute LU factorization, and update the data of the reordered matrix
    void LUFactorizeAndMoveData();

    //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
    void tuneThreadBlockSizes();

    //! \returns false
    static constexpr bool shouldCallPre()
    {
        return false;
    }

    //! \returns false
    static constexpr bool shouldCallPost()
    {
        return false;
    }


private:
    //! \brief Reference to the underlying matrix
    const M& m_cpuMatrix;
    //! \brief size_t describing the dimensions of the square block elements
    static constexpr const size_t blocksize_ = matrix_type::block_type::cols;
    //! \brief SparseTable storing each row by level
    Opm::SparseTable<size_t> m_levelSets;
    //! \brief converts from index in reordered structure to index natural ordered structure
    std::vector<int> m_reorderedToNatural;
    //! \brief converts from index in natural ordered structure to index reordered strucutre
    std::vector<int> m_naturalToReordered;
    //! \brief The A matrix stored on the gpu, and its reordred version
    CuMat m_gpuMatrix;
    std::unique_ptr<CuMat> m_gpuReorderedLU;
    //! \brief If matrix splitting is enabled, then we store the lower and upper part separately
    std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
    std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
    //! \brief If matrix splitting is enabled, we also store the diagonal separately
    std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
    //! row conversion from natural to reordered matrix indices stored on the GPU
    CuVector<int> m_gpuNaturalToReorder;
    //! row conversion from reordered to natural matrix indices stored on the GPU
    CuVector<int> m_gpuReorderToNatural;
    //! \brief Stores the inverted diagonal that we use in ILU0
    CuVector<field_type> m_gpuDInv;
    //! \brief Bool storing whether or not we should store matrices in a split format
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
    bool m_tuneThreadBlockSizes;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
    int m_applyThreadBlockSize = -1;
    int m_updateThreadBlockSize = -1;
};
} // end namespace Opm::cuistl

#endif
@ -0,0 +1,110 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2024 SINTEF AS
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_COLORING_AND_REORDERING_UTILS_HPP
#define OPM_COLORING_AND_REORDERING_UTILS_HPP

#include <fmt/core.h>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <tuple>
#include <vector>
/*
This file contains a collection of utility functions used in the GPU implementation of ILU and DILU.
The functions deal with creating the mappings between reordered and natural indices, as well as
extracting sparsity structures from Dune matrices and creating cusparse matrix indices.
*/
namespace Opm::cuistl::detail
{
inline std::vector<int>
createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[globCnt++] = static_cast<int>(col);
        }
    }
    return res;
}

inline std::vector<int>
createNaturalToReordered(const Opm::SparseTable<size_t>& levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[col] = globCnt++;
        }
    }
    return res;
}

template <class M, class field_type, class GPUM>
inline std::unique_ptr<GPUM>
createReorderedMatrix(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
        // For elements in A
        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
            dstRowIt.insert(elem.index());
        }
    }
    return std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedMatrix, true)));
}

template <class M, class field_type, class GPUM>
inline std::tuple<std::unique_ptr<GPUM>, std::unique_ptr<GPUM>>
extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;

    M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
    M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);

    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
         lowerIt != reorderedLower.createend();
         ++lowerIt, ++upperIt) {

        auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];

        for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
            if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
                lowerIt.insert(elem.index());
            } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
                upperIt.insert(elem.index());
            }
        }
    }

    return {std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedLower, true))),
            std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedUpper, true)))};
}
} // namespace Opm::cuistl::detail

#endif
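A minimal host-side sketch of how the two mapping tables relate; the level sets {1, 2} and {0} below are a made-up example, not data from this commit:

// Hypothetical example: for level sets {1, 2} and {0} on a 3-row matrix,
// createReorderedToNatural would give {1, 2, 0} (reordered row r holds natural row reorderedToNatural[r])
// and createNaturalToReordered would give {2, 0, 1} (natural row n is stored at position naturalToReordered[n]).
#include <cassert>
#include <vector>

int main()
{
    const std::vector<int> reorderedToNatural {1, 2, 0};
    const std::vector<int> naturalToReordered {2, 0, 1};
    for (int r = 0; r < 3; ++r) {
        // The two tables are inverse permutations of each other.
        assert(naturalToReordered[reorderedToNatural[r]] == r);
    }
    return 0;
}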
@ -16,425 +16,17 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail
{
namespace
{

// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
    if (blocksize == 1) {
        dstBlock[0] = 1.0 / (srcBlock[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
        dstBlock[0] = srcBlock[3] * detInv;
        dstBlock[1] = -srcBlock[1] * detInv;
        dstBlock[2] = -srcBlock[2] * detInv;
        dstBlock[3] = srcBlock[0] * detInv;
    } else if (blocksize == 3) {
        // based on Dune implementation
        T t4 = srcBlock[0] * srcBlock[4];
        T t6 = srcBlock[0] * srcBlock[5];
        T t8 = srcBlock[1] * srcBlock[3];
        T t10 = srcBlock[2] * srcBlock[3];
        T t12 = srcBlock[1] * srcBlock[6];
        T t14 = srcBlock[2] * srcBlock[6];

        T t17 = 1.0
            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
               - t14 * srcBlock[4]); // t17 is 1/determinant

        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
        dstBlock[5] = -(t6 - t10) * t17;
        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
        dstBlock[8] = (t4 - t8) * t17;
    }
}

// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockInPlace(T* __restrict__ block)
{
    if (blocksize == 1) {
        block[0] = 1.0 / (block[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);

        T temp = block[0];
        block[0] = block[3] * detInv;
        block[1] = -block[1] * detInv;
        block[2] = -block[2] * detInv;
        block[3] = temp * detInv;
    } else if (blocksize == 3) {
        T t4 = block[0] * block[4];
        T t6 = block[0] * block[5];
        T t8 = block[1] * block[3];
        T t10 = block[2] * block[3];
        T t12 = block[1] * block[6];
        T t14 = block[2] * block[6];

        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
        T t17 = T(1.0) / det;

        T matrix01 = block[1];
        T matrix00 = block[0];
        T matrix10 = block[3];
        T matrix11 = block[4];

        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
        block[4] = (matrix00 * block[8] - t14) * t17;
        block[5] = -(t6 - t10) * t17;
        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
        block[7] = -(matrix00 * block[7] - t12) * t17;
        block[8] = (t4 - t8) * t17;
    }
}

enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
    for (int i = 0; i < blocksize; ++i) {
        if (OP == MVType::SET) {
            c[i] = 0;
        }

        for (int j = 0; j < blocksize; ++j) {
            if (OP == MVType::SET || OP == MVType::PLUS) {
                c[i] += A[i * blocksize + j] * b[j];
            } else if (OP == MVType::MINUS) {
                c[i] -= A[i * blocksize + j] * b[j];
            }
        }
    }
}

template <class T, int blocksize>
__device__ __forceinline__ void mv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void umv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void mmv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}

// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst)
{

    T tmp[blocksize * blocksize] = {0};
    // tmp = A*B
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    // dst = tmp*C
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    const auto row = blockDim.x * blockIdx.x + threadIdx.x;

    if (row < numberOfRows) {
        size_t nnzIdx = rowIndices[row];
        size_t nnzIdxLim = rowIndices[row + 1];

        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
        // zero element
        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
            ++nnzIdx;
        }

        // diagBlock points to the start of where the diagonal block is stored
        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
        // element should be stored
        T* vecBlock = &vec[blocksize * blocksize * row];

        invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
    }
}

template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSet(T* mat,
                                            int* rowIndices,
                                            int* colIndices,
                                            int* indexConversion,
                                            int startIdx,
                                            int rowsInLevelSet,
                                            const T* dInv,
                                            const T* d,
                                            T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
                                                 int* rowIndices,
                                                 int* colIndices,
                                                 int* indexConversion,
                                                 int startIdx,
                                                 int rowsInLevelSet,
                                                 const T* dInv,
                                                 const T* d,
                                                 T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        // TODO: remove the first condition in the for loop
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}


template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSet(T* mat,
                                            int* rowIndices,
                                            int* colIndices,
                                            int* indexConversion,
                                            int startIdx,
                                            int rowsInLevelSet,
                                            const T* dInv,
                                            T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
                                                 int* rowIndices,
                                                 int* colIndices,
                                                 int* indexConversion,
                                                 int startIdx,
                                                 int rowsInLevelSet,
                                                 const T* dInv,
                                                 T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
                                      int* rowIndices,
                                      int* colIndices,
                                      int* reorderedToNatural,
                                      int* naturalToReordered,
                                      const int startIdx,
                                      int rowsInLevelSet,
                                      T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t nnzIdx = rowIndices[reorderedRowIdx];

        int diagIdx = nnzIdx;
        while (colIndices[diagIdx] != naturalRowIdx) {
            ++diagIdx;
        }

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = naturalToReordered[colIndices[block]];
            // find element with indices swapped
            // Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range
            // we binary search over
            int left = rowIndices[col];
            int right = rowIndices[col + 1] - 1;
            int mid = left + (right - left) / 2; // overflow-safe average;

            while (left <= right) {
                const int col = colIndices[mid];

                if (col == naturalRowIdx) {
                    break;
                } else if (col < naturalRowIdx) {
                    left = mid + 1;
                } else {
                    right = mid - 1;
                }
                mid = left + (right - left) / 2; // overflow-safe average
            }

            const int symOpposite = mid;

            mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &mat[symOpposite * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
                                           int* lowerRowIndices,
                                           int* lowerColIndices,
                                           T* reorderedUpperMat,
                                           int* upperRowIndices,
                                           int* upperColIndices,
                                           T* diagonal,
                                           int* reorderedToNatural,
                                           int* naturalToReordered,
                                           const int startIdx,
                                           int rowsInLevelSet,
                                           T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
        const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
            const int col = naturalToReordered[lowerColIndices[block]];

            int symOppositeIdx = upperRowIndices[col];
            for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
                if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
                    break;
                }
            }

            const int symOppositeBlock = symOppositeIdx;

            mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuMoveDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows)
@ -505,195 +97,21 @@ namespace
        }
    }
}

// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
    if (suggestedThrBlockSize != -1) {
        return suggestedThrBlockSize;
    }
    // Use cuda API to maximize occupancy, otherwise we just pick a thread block size if it is not tuned
#if USE_HIP
    return 512;
#else
    int blockSize = 1024;
    int tmpGridSize;
    std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
    return blockSize;
#endif
}

inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}

} // namespace

template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
        int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
        cuInvertDiagonalAndFlatten<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
computeLowerSolveLevelSet(T* reorderedMat,
                          int* rowIndices,
                          int* colIndices,
                          int* indexConversion,
                          int startIdx,
                          int rowsInLevelSet,
                          const T* dInv,
                          const T* d,
                          T* v,
                          int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeLowerSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}


template <class T, int blocksize>
void
computeLowerSolveLevelSetSplit(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               const T* d,
                               T* v,
                               int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
computeUpperSolveLevelSet(T* reorderedMat,
                          int* rowIndices,
                          int* colIndices,
                          int* indexConversion,
                          int startIdx,
                          int rowsInLevelSet,
                          const T* dInv,
                          T* v,
                          int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeUpperSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeUpperSolveLevelSetSplit(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               T* v,
                               int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
                    int* rowIndices,
                    int* colIndices,
                    int* reorderedToNatural,
                    int* naturalToReordered,
                    const int startIdx,
                    int rowsInLevelSet,
                    T* dInv,
                    int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonal<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                                                 rowIndices,
                                                 colIndices,
                                                 reorderedToNatural,
                                                 naturalToReordered,
                                                 startIdx,
                                                 rowsInLevelSet,
                                                 dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
                         int* lowerRowIndices,
                         int* lowerColIndices,
                         T* reorderedUpperMat,
                         int* upperRowIndices,
                         int* upperColIndices,
                         T* diagonal,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         const int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
                                                                                     reorderedUpperMat,
                                                                                     upperRowIndices,
                                                                                     upperColIndices,
                                                                                     diagonal,
                                                                                     reorderedToNatural,
                                                                                     naturalToReordered,
                                                                                     startIdx,
                                                                                     rowsInLevelSet,
                                                                                     dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
copyMatDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows,
    int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}

template <class T, int blocksize>
void
copyMatDataToReordered(T* srcMatrix,
                       int* srcRowIndices,
                       T* dstMatrix,
                       int* dstRowIndices,
                       int* naturalToReordered,
                       size_t numberOfRows,
                       int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}
@ -712,8 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                            size_t numberOfRows,
                            int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
                                                                                 srcRowIndices,
                                                                                 srcColumnIndices,
@ -727,16 +146,8 @@ copyMatDataToReorderedSplit(T* srcMatrix,
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);                                \
    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int);                       \
    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \
    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);            \
    template void computeDiluDiagonalSplit<T, blocksize>(                                                            \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int);                                    \
    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);        \
    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);   \
    template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -22,189 +22,6 @@
#include <vector>
namespace Opm::cuistl::detail
{

/**
 * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in d_vec
 * @param mat pointer to GPU memory containing nonzerovalues of the sparse matrix
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param numberOfRows Integer describing the number of rows in the matrix
 * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
 */
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void computeLowerSolveLevelSet(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               const T* d,
                               T* v,
                               int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
 * such that rows in the same level sets are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedUpperMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void computeLowerSolveLevelSetSplit(T* reorderedUpperMat,
                                    int* rowIndices,
                                    int* colIndices,
                                    int* indexConversion,
                                    int startIdx,
                                    int rowsInLevelSet,
                                    const T* dInv,
                                    const T* d,
                                    T* v,
                                    int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void computeUpperSolveLevelSet(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               T* v,
                               int threadBlockSize);
template <class T, int blocksize>

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
 * such that rows in the same level sets are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
void computeUpperSolveLevelSetSplit(T* reorderedUpperMat,
                                    int* rowIndices,
                                    int* colIndices,
                                    int* indexConversion,
                                    int startIdx,
                                    int rowsInLevelSet,
                                    const T* dInv,
                                    T* v,
                                    int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
 * index in the natural ordered matrix
 * @param naturalToReordered Integer array containing mapping an index in the natural ordered matrix to its
 * corresponding index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
                         int* rowIndices,
                         int* colIndices,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int threadBlockSize);
template <class T, int blocksize>

/**
 * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
 * matrix. The matrix reordered such that rows in the same level sets are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with bsr format
 * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with bsr format
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the strictly upper triangular sparse
 * matrix. The matrix reordered such that rows in the same level sets are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with bsr format
 * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with bsr format
 * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
 * index in the natural ordered matrix
 * @param diagonal The diagonal elements of the reordered matrix
 * @param naturalToReordered Integer array containing mapping an index in the natural ordered matrix to its
 * corresponding index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
                              T* reorderedUpperMat,
                              int* upperRowIndices,
                              int* upperColIndices,
                              T* diagonal,
                              int* reorderedToNatural,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
                              T* dInv,
                              int threadBlockSize);

/**
 * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
 * @param srcMatrix The source matrix we will copy data from
@ -216,26 +33,40 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
 * @param numberOfRows The number of rows in the matrices
 */
template <class T, int blocksize>
void copyMatDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);
void copyMatDataToReordered(T* srcMatrix,
                            int* srcRowIndices,
                            T* dstMatrix,
                            int* dstRowIndices,
                            int* naturalToReordered,
                            size_t numberOfRows,
                            int threadBlockSize);

/**
 * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
 * @param srcMatrix The source matrix we will copy data from
 * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
 * @param [out] dstLowerMatrix The destination of entries that originate from the strictly lower triangular matrix
 * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant with
 * bsr format
 * @param [out] dstUpperMatrix The destination of entries that originate from the strictly upper triangular matrix
 * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant with
 * bsr format
 * @param [out] dstDiag The destination buffer for the diagonal part of the matrix
 * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
 * @param numberOfRows The number of rows in the matrices
 */
template <class T, int blocksize>
void copyMatDataToReorderedSplit(
    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);
void copyMatDataToReorderedSplit(T* srcMatrix,
                                 int* srcRowIndices,
                                 int* srcColumnIndices,
                                 T* dstLowerMatrix,
                                 int* dstLowerRowIndices,
                                 T* dstUpperMatrix,
                                 int* dstUpperRowIndices,
                                 T* dstDiag,
                                 int* naturalToReordered,
                                 size_t numberOfRows,
                                 int threadBlockSize);

} // namespace Opm::cuistl::detail
#endif
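As a rough CPU analogue of the reordering copy declared above (a sketch that assumes BSR-style row pointers with blocksize 1; copyRowsToReordered and its arguments are hypothetical names, not part of the commit):

// Hypothetical CPU sketch: each source row's values are copied to the row position
// given by the naturalToReordered permutation, mirroring what the GPU copy does.
#include <cstddef>
#include <vector>

void copyRowsToReordered(const std::vector<double>& srcValues,
                         const std::vector<int>& srcRowIndices,
                         std::vector<double>& dstValues,
                         const std::vector<int>& dstRowIndices,
                         const std::vector<int>& naturalToReordered,
                         std::size_t numberOfRows)
{
    for (std::size_t srcRow = 0; srcRow < numberOfRows; ++srcRow) {
        const int dstRow = naturalToReordered[srcRow];
        const int rowLength = srcRowIndices[srcRow + 1] - srcRowIndices[srcRow];
        for (int i = 0; i < rowLength; ++i) {
            dstValues[dstRowIndices[dstRow] + i] = srcValues[srcRowIndices[srcRow] + i];
        }
    }
}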
234 opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp Normal file
@ -0,0 +1,234 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP

#include <config.h>
#include <cuda_runtime.h>

/*
This file provides inlineable functions intended for CUDA kernels operating on block matrix elements.
The functions provide various matrix operations that are used by the preconditioners.
*/

namespace
{
// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
    if (blocksize == 1) {
        dstBlock[0] = 1.0 / (srcBlock[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
        dstBlock[0] = srcBlock[3] * detInv;
        dstBlock[1] = -srcBlock[1] * detInv;
        dstBlock[2] = -srcBlock[2] * detInv;
        dstBlock[3] = srcBlock[0] * detInv;
    } else if (blocksize == 3) {
        // based on Dune implementation
        T t4 = srcBlock[0] * srcBlock[4];
        T t6 = srcBlock[0] * srcBlock[5];
        T t8 = srcBlock[1] * srcBlock[3];
        T t10 = srcBlock[2] * srcBlock[3];
        T t12 = srcBlock[1] * srcBlock[6];
        T t14 = srcBlock[2] * srcBlock[6];

        T t17 = 1.0
            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
               - t14 * srcBlock[4]); // t17 is 1/determinant

        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
        dstBlock[5] = -(t6 - t10) * t17;
        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
        dstBlock[8] = (t4 - t8) * t17;
    }
}

// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockInPlace(T* __restrict__ block)
{
    if (blocksize == 1) {
        block[0] = 1.0 / (block[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);

        T temp = block[0];
        block[0] = block[3] * detInv;
        block[1] = -block[1] * detInv;
        block[2] = -block[2] * detInv;
        block[3] = temp * detInv;
    } else if (blocksize == 3) {
        T t4 = block[0] * block[4];
        T t6 = block[0] * block[5];
        T t8 = block[1] * block[3];
        T t10 = block[2] * block[3];
        T t12 = block[1] * block[6];
        T t14 = block[2] * block[6];

        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
        T t17 = T(1.0) / det;

        T matrix01 = block[1];
        T matrix00 = block[0];
        T matrix10 = block[3];
        T matrix11 = block[4];

        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
        block[4] = (matrix00 * block[8] - t14) * t17;
        block[5] = -(t6 - t10) * t17;
        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
        block[7] = -(matrix00 * block[7] - t12) * t17;
        block[8] = (t4 - t8) * t17;
    }
}

enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void
matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
    for (int i = 0; i < blocksize; ++i) {
        if (OP == MVType::SET) {
            c[i] = 0;
        }

        for (int j = 0; j < blocksize; ++j) {
            if (OP == MVType::SET || OP == MVType::PLUS) {
                c[i] += A[i * blocksize + j] * b[j];
            } else if (OP == MVType::MINUS) {
                c[i] -= A[i * blocksize + j] * b[j];
            }
        }
    }
}

template <class T, int blocksize>
__device__ __forceinline__ void
mv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void
umv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void
mmv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}

// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(T* A, T* B, T* C, T* dst)
{

    T tmp[blocksize * blocksize] = {0};
    // tmp = A*B
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    // dst = tmp*C
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
            }
        }
    }
}

// C = A*B, assumes the three buffers do not overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmNoOverlap(T* A, T* B, T* C)
{
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }
}

// C = A*B, buffers may overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmOverlap(T* A, T* B, T* C)
{
    T tmp[blocksize * blocksize] = {0};
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    for (int i = 0; i < blocksize; ++i) {
        for (int j = 0; j < blocksize; ++j) {
            C[i * blocksize + j] = tmp[i * blocksize + j];
        }
    }
}

// A -= B
template <class T, int blocksize>
__device__ __forceinline__ void
matrixSubtraction(T* A, T* B)
{

    for (int i = 0; i < blocksize; ++i) {
        for (int j = 0; j < blocksize; ++j) {
            A[i * blocksize + j] -= B[i * blocksize + j];
        }
    }
}
} // namespace

#endif
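A small host-side check of the explicit 2x2 inverse formula used by invBlockInPlace and invBlockOutOfPlace above (a standalone sketch with made-up numbers, not part of the commit):

// Host-side sanity check of the 2x2 block inverse: inv([a b; c d]) = 1/(ad - bc) * [d -b; -c a].
#include <cassert>
#include <cmath>

int main()
{
    const double src[4] = {4.0, 7.0, 2.0, 6.0}; // row-major [a b; c d]
    const double detInv = 1.0 / (src[0] * src[3] - src[1] * src[2]);
    const double dst[4] = {src[3] * detInv, -src[1] * detInv, -src[2] * detInv, src[0] * detInv};

    // The first row of A * inv(A) should be (1, 0).
    const double prod00 = src[0] * dst[0] + src[1] * dst[2];
    const double prod01 = src[0] * dst[1] + src[1] * dst[3];
    assert(std::abs(prod00 - 1.0) < 1e-12);
    assert(std::abs(prod01) < 1e-12);
    return 0;
}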
66
opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp
Normal file
66
opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp
Normal file
@ -0,0 +1,66 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPU_THREAD_UTILS_HPP
#define OPM_GPU_THREAD_UTILS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>

/*
    This file provides some logic for handling how to choose the correct thread-block size
*/
namespace Opm::cuistl::detail
{
constexpr inline size_t
getThreads([[maybe_unused]] size_t numberOfRows)
{
    return 1024;
}

inline size_t
getBlocks(size_t numberOfRows)
{
    const auto threads = getThreads(numberOfRows);
    return (numberOfRows + threads - 1) / threads;
}

// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int
getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
    if (suggestedThrBlockSize != -1) {
        return suggestedThrBlockSize;
    }
    int blockSize;
    int tmpGridSize;
    OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
    return blockSize;
}

inline int
getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}

} // namespace Opm::cuistl::detail

#endif
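The kernel wrappers in the two .cu files below all size their launches with these helpers. A minimal sketch of the pattern (the kernel dummyKernel and the function launchDummy are hypothetical, used only for illustration):

#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>

// Hypothetical kernel used only to illustrate the launch-sizing helpers.
__global__ void dummyKernel(double* x, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0;
    }
}

void launchDummy(double* x, int n, int suggestedThrBlockSize = -1)
{
    // Ask the occupancy API (or the caller) for a block size, then derive
    // the grid size so that every row gets one thread.
    const int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(dummyKernel, suggestedThrBlockSize);
    const int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(n, threadBlockSize);
    dummyKernel<<<nThreadBlocks, threadBlockSize>>>(x, n);
}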
@ -0,0 +1,437 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail::DILU
{
namespace
{

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* dInv,
                                     const T* d,
                                     T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          const T* d,
                                          T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        // TODO: remove the first condition in the for loop
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}


template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* dInv,
                                     T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
                                      int* rowIndices,
                                      int* colIndices,
                                      int* reorderedToNatural,
                                      int* naturalToReordered,
                                      const int startIdx,
                                      int rowsInLevelSet,
                                      T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t nnzIdx = rowIndices[reorderedRowIdx];

        int diagIdx = nnzIdx;
        while (colIndices[diagIdx] != naturalRowIdx) {
            ++diagIdx;
        }

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = naturalToReordered[colIndices[block]];
            // find element with indices swapped
            // Binary search over blocks in the right row, [rowIndices[col], rowIndices[col+1]-1] defines the range
            // we binary search over
            int left = rowIndices[col];
            int right = rowIndices[col + 1] - 1;
            int mid;

            while (left <= right) {
                mid = left + (right - left) / 2; // overflow-safe average
                const int col = colIndices[mid];

                if (col == naturalRowIdx) {
                    break;
                } else if (col < naturalRowIdx) {
                    left = mid + 1;
                } else {
                    right = mid - 1;
                }
            }

            const int symOpposite = mid;

            mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &mat[symOpposite * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
                                           int* lowerRowIndices,
                                           int* lowerColIndices,
                                           T* reorderedUpperMat,
                                           int* upperRowIndices,
                                           int* upperColIndices,
                                           T* diagonal,
                                           int* reorderedToNatural,
                                           int* naturalToReordered,
                                           const int startIdx,
                                           int rowsInLevelSet,
                                           T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
        const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
            const int col = naturalToReordered[lowerColIndices[block]];

            int symOppositeIdx = upperRowIndices[col];
            for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
                if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
                    break;
                }
            }

            const int symOppositeBlock = symOppositeIdx;

            mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}
} // namespace

// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* dInv,
                   const T* d,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}


template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        const T* d,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* dInv,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
                    int* rowIndices,
                    int* colIndices,
                    int* reorderedToNatural,
                    int* naturalToReordered,
                    const int startIdx,
                    int rowsInLevelSet,
                    T* dInv,
                    int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonal<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                                                                                rowIndices,
                                                                                colIndices,
                                                                                reorderedToNatural,
                                                                                naturalToReordered,
                                                                                startIdx,
                                                                                rowsInLevelSet,
                                                                                dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
                         int* lowerRowIndices,
                         int* lowerColIndices,
                         T* reorderedUpperMat,
                         int* upperRowIndices,
                         int* upperColIndices,
                         T* diagonal,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         const int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
                                                                                     reorderedUpperMat,
                                                                                     upperRowIndices,
                                                                                     upperColIndices,
                                                                                     diagonal,
                                                                                     reorderedToNatural,
                                                                                     naturalToReordered,
                                                                                     startIdx,
                                                                                     rowsInLevelSet,
                                                                                     dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);            \
    template void computeDiluDiagonalSplit<T, blocksize>(                                                             \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int);                                    \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);               \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);     \
    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);          \
    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::DILU
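Not part of the patch: the wrappers above are launched once per level set. A rough host-side sketch of how a caller (such as the CuDILU preconditioner) could drive them, with a lower sweep followed by an upper sweep; the applyDilu function and the levelSetSizes vector are assumptions made only for illustration, and all pointers are device pointers laid out as described in DILUKernels.hpp.

#include <vector>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>

// Sketch: apply the DILU preconditioner by sweeping the level sets forward
// (lower solve) and then backward (upper solve).
template <class T, int blocksize>
void applyDilu(T* reorderedMat, int* rowIndices, int* colIndices, int* reorderedToNatural,
               const std::vector<int>& levelSetSizes, const T* dInv, const T* d, T* v)
{
    int startIdx = 0;
    for (const int rowsInLevelSet : levelSetSizes) {
        // Rows inside one level set have no dependencies on each other.
        Opm::cuistl::detail::DILU::solveLowerLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, startIdx, rowsInLevelSet, dInv, d, v, -1);
        startIdx += rowsInLevelSet;
    }
    for (auto it = levelSetSizes.rbegin(); it != levelSetSizes.rend(); ++it) {
        startIdx -= *it;
        Opm::cuistl::detail::DILU::solveUpperLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, startIdx, *it, dInv, v, -1);
    }
}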
@ -0,0 +1,202 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_DILU_KERNELS_HPP
#define OPM_DILU_KERNELS_HPP

#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

namespace Opm::cuistl::detail::DILU
{

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        const T* d,
                        T* v,
                        int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedUpperMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             const T* d,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
                         int* rowIndices,
                         int* colIndices,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedLowerMat pointer to GPU memory containing nonzero values of the strictly lower triangular sparse
 * matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with bsr format
 * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with bsr format
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the strictly upper triangular sparse
 * matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with bsr format
 * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with bsr format
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param diagonal The diagonal elements of the reordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
                              T* reorderedUpperMat,
                              int* upperRowIndices,
                              int* upperColIndices,
                              T* diagonal,
                              int* reorderedToNatural,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
                              T* dInv,
                              int threadBlockSize);

} // namespace Opm::cuistl::detail::DILU
#endif
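The *Split variants above assume the reordered matrix has been split into a strictly lower part, the diagonal blocks, and a strictly upper part, each with its own BSR-style index arrays. A small illustration of that layout (the concrete matrix and the identity reordering are assumptions made only for this example, not data from the patch):

// Natural-order matrix (blocksize = 1):
//     [ 4  1  0 ]
//     [ 2  5  0 ]
//     [ 0  3  6 ]
double lowerValues[]     = {2.0, 3.0};      // strictly lower blocks, row by row
int    lowerRowIndices[] = {0, 0, 1, 2};    // row i owns blocks [lowerRowIndices[i], lowerRowIndices[i + 1])
int    lowerColIndices[] = {0, 1};
double diagonal[]        = {4.0, 5.0, 6.0}; // one blocksize*blocksize block per row
double upperValues[]     = {1.0};           // strictly upper blocks, row by row
int    upperRowIndices[] = {0, 1, 1, 1};
int    upperColIndices[] = {1};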
@ -0,0 +1,476 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <stdexcept>

/*
    The LU factorization and apply step is written based on the Dune implementations
*/

namespace Opm::cuistl::detail::ILU0
{
namespace
{
template <class T, int blocksize>
__global__ void cuLUFactorization(T* srcMatrix,
                                  int* srcRowIndices,
                                  int* srcColumnIndices,
                                  int* naturalToReordered,
                                  int* reorderedToNatual,
                                  size_t rowsInLevelSet,
                                  int startIdx)
{
    auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int scalarsInBlock = blocksize * blocksize;

    if (reorderedIdx < rowsInLevelSet + startIdx) {
        int naturalIdx = reorderedToNatual[reorderedIdx];
        // for each element under the diagonal
        int endOfRowI = srcRowIndices[reorderedIdx + 1];
        int ij;
        for (ij = srcRowIndices[reorderedIdx];
             ij < srcRowIndices[reorderedIdx + 1] && srcColumnIndices[ij] < naturalIdx;
             ++ij) {
            // consider the block we are looking at to be A_ij
            // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
            // already inverted. Find the A_jj block
            int jj; // index in bcsr of the A_jj element
            int j = naturalToReordered[srcColumnIndices[ij]];
            int startOfRowJ = srcRowIndices[j];
            int endOfRowJ = srcRowIndices[j + 1];
            for (int blockInRowJ = startOfRowJ; blockInRowJ < endOfRowJ; ++blockInRowJ) {
                if (srcColumnIndices[blockInRowJ] == srcColumnIndices[ij]) {
                    jj = blockInRowJ;
                    break;
                }
            }

            // the DUNE implementation is inplace, and I need to take care to make sure
            // this out of place implementation is equivalent
            mmOverlap<T, blocksize>(
                &srcMatrix[ij * scalarsInBlock], &srcMatrix[jj * scalarsInBlock], &srcMatrix[ij * scalarsInBlock]);

            // we have now accounted for an element under the diagonal and the diagonal element above it
            // now iterate over the blocks that align in the column
            int jk = jj + 1;
            int ik = ij + 1;
            // this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
            while (ik != endOfRowI && jk != endOfRowJ) {
                if (srcColumnIndices[ik] == srcColumnIndices[jk]) {
                    // A_jk = A_ij * A_jk
                    T tmp[scalarsInBlock] = {0};
                    mmNoOverlap<T, blocksize>(
                        &srcMatrix[ij * scalarsInBlock], &srcMatrix[jk * scalarsInBlock], tmp);
                    matrixSubtraction<T, blocksize>(&srcMatrix[ik * scalarsInBlock], tmp);
                    ++ik;
                    ++jk;
                } else {
                    if (srcColumnIndices[ik] < srcColumnIndices[jk]) {
                        ++ik;
                    } else {
                        ++jk;
                    }
                }
            }
        }
        invBlockInPlace<T, blocksize>(&srcMatrix[ij * scalarsInBlock]);
    }
}

// An index in the BCSR matrix can be in one of three states: under, on, or above the diagonal
enum class POSITION_TYPE { UNDER_DIAG, ON_DIAG, ABOVE_DIAG };
// this handles incrementation of an index on a row that is in a lower/upper split datastructure.
// The point is that if we are in the lower or upper part we increment as usual.
// if we reach the diagonal, we update the state; the index is not needed.
// when we increment from the diagonal, we set it to be the first block in the upper part of that row
__device__ __forceinline__ void
incrementAcrossSplitStructure(int& idx, POSITION_TYPE& currentState, int lastLowerIdx, int firstUpperIdx)
{
    if (currentState == POSITION_TYPE::UNDER_DIAG) {
        if (idx != lastLowerIdx - 1) { // we should increment to be on the diagonal
            ++idx;
        } else {
            currentState = POSITION_TYPE::ON_DIAG;
        }
    } else if (currentState == POSITION_TYPE::ON_DIAG) {
        idx = firstUpperIdx;
        currentState = POSITION_TYPE::ABOVE_DIAG;
    } else { // currentState = POSITION_TYPE::ABOVE_DIAG
        ++idx;
    }
}

template <class T, int blocksize>
__global__ void cuLUFactorizationSplit(T* reorderedLowerMat,
                                       int* lowerRowIndices,
                                       int* lowerColIndices,
                                       T* reorderedUpperMat,
                                       int* upperRowIndices,
                                       int* upperColIndices,
                                       T* diagonal,
                                       int* reorderedToNatural,
                                       int* naturalToReordered,
                                       const int startIdx,
                                       int rowsInLevelSet)
{
    auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int scalarsInBlock = blocksize * blocksize;

    if (reorderedIdx < rowsInLevelSet + startIdx) {
        // if (reorderedIdx < rowsInLevelSet){
        int naturalIdx = reorderedToNatural[reorderedIdx];
        // for each element under the diagonal
        int endOfRowILower = lowerRowIndices[reorderedIdx + 1];
        int endOfRowIUpper = upperRowIndices[reorderedIdx + 1];
        int startOfRowILower = lowerRowIndices[reorderedIdx];
        int startOfRowIUpper = upperRowIndices[reorderedIdx];
        for (int ij = startOfRowILower; ij < endOfRowILower; ++ij) {

            // consider the block we are looking at to be A_ij
            // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
            // already inverted. Find the A_jj block
            int j = naturalToReordered[lowerColIndices[ij]];
            int endOfRowJ = upperRowIndices[j + 1];
            int startOfRowJ = upperRowIndices[j];

            // the DUNE implementation is inplace, and I need to take care to make sure
            // this out of place implementation is equivalent
            mmOverlap<T, blocksize>(&reorderedLowerMat[ij * scalarsInBlock],
                                    &diagonal[j * scalarsInBlock],
                                    &reorderedLowerMat[ij * scalarsInBlock]);

            // we have now accounted for an element under the diagonal and the diagonal element above it
            // now iterate over the blocks that align in the column
            int jk = startOfRowJ;
            POSITION_TYPE ikState = POSITION_TYPE::UNDER_DIAG; // it is guaranteed that the ij element we consider
                                                               // is under the diagonal
            int ik = ij;
            incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
            // this code is NOT gpu friendly, thread divergence will probably slow this down significantly...

            // checking for ik != endOfRowUpper is not sufficient alone because
            // the same block index might be found in the lower part of the matrix
            while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {

                T* ikBlockPtr;
                int ikColumn;
                if (ikState == POSITION_TYPE::UNDER_DIAG) {
                    ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock];
                    ikColumn = lowerColIndices[ik];
                } else if (ikState == POSITION_TYPE::ON_DIAG) {
                    ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock];
                    ikColumn = naturalIdx;
                } else { // ikState == POSITION_TYPE::ABOVE_DIAG
                    ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock];
                    ikColumn = upperColIndices[ik];
                }

                if (ikColumn == upperColIndices[jk]) {
                    // A_jk = A_ij * A_jk
                    T tmp[scalarsInBlock] = {0};

                    mmNoOverlap<T, blocksize>(
                        &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp);
                    matrixSubtraction<T, blocksize>(ikBlockPtr, tmp);
                    incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
                    ++jk;
                } else {
                    if (ikColumn < upperColIndices[jk]) {
                        incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
                    } else {
                        ++jk;
                    }
                }
            }
        }
        invBlockInPlace<T, blocksize>(&diagonal[reorderedIdx * scalarsInBlock]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* d,
                                     T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedIdx];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        int block = nnzIdx;
        for (; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }
        for (int i = 0; i < blocksize; ++i) {
            v[naturalRowIdx * blocksize + i] = rhs[i];
        }
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(
    T* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = v[naturalRowIdx * blocksize + i];
        }

        int block = nnzIdxLim - 1;
        for (; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&mat[block * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* d,
                                          T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedIdx];
        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }
        for (int i = 0; i < blocksize; ++i) {
            v[naturalRowIdx * blocksize + i] = rhs[i];
        }
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedIdx];
        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = v[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}
} // namespace

template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* d,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}

template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* d,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
LUFactorization(T* srcMatrix,
                int* srcRowIndices,
                int* srcColumnIndices,
                int* naturalToReordered,
                int* reorderedToNatual,
                size_t rowsInLevelSet,
                int startIdx,
                int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorization<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
}

template <class T, int blocksize>
void
LUFactorizationSplit(T* reorderedLowerMat,
                     int* lowerRowIndices,
                     int* lowerColIndices,
                     T* reorderedUpperMat,
                     int* upperRowIndices,
                     int* upperColIndices,
                     T* diagonal,
                     int* reorderedToNatural,
                     int* naturalToReordered,
                     const int startIdx,
                     int rowsInLevelSet,
                     int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                             lowerRowIndices,
                                                                             lowerColIndices,
                                                                             reorderedUpperMat,
                                                                             upperRowIndices,
                                                                             upperColIndices,
                                                                             diagonal,
                                                                             reorderedToNatural,
                                                                             naturalToReordered,
                                                                             startIdx,
                                                                             rowsInLevelSet);
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int);                         \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);               \
    template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int);                       \
    template void LUFactorizationSplit<T, blocksize>(                                                                 \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int);                                        \
    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);          \
    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::ILU0
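Not part of the patch: a small host-side analogue of incrementAcrossSplitStructure, showing the column order in which cuLUFactorizationSplit visits the blocks of one row stored in the lower/diagonal/upper split. The concrete row contents and the main function are made up for illustration.

#include <cstdio>

// Host-side copy of the state machine: walk a row's lower blocks, then the
// diagonal block, then the upper blocks, in ascending column order.
enum class PositionType { UnderDiag, OnDiag, AboveDiag };

void increment(int& idx, PositionType& state, int lastLowerIdx, int firstUpperIdx)
{
    if (state == PositionType::UnderDiag) {
        if (idx != lastLowerIdx - 1) {
            ++idx;
        } else {
            state = PositionType::OnDiag;
        }
    } else if (state == PositionType::OnDiag) {
        idx = firstUpperIdx;
        state = PositionType::AboveDiag;
    } else {
        ++idx;
    }
}

int main()
{
    // Example row (natural index 2) with lower blocks in columns {0, 1} and
    // upper blocks in columns {3, 5}.
    const int lowerCols[] = {0, 1};
    const int upperCols[] = {3, 5};
    const int lastLowerIdx = 2;  // one-past-the-end of this row in the lower matrix
    const int firstUpperIdx = 0; // start of this row in the upper matrix
    const int endUpperIdx = 2;   // one-past-the-end of this row in the upper matrix

    int idx = 0;
    PositionType state = PositionType::UnderDiag;
    while (!(state == PositionType::AboveDiag && idx == endUpperIdx)) {
        if (state == PositionType::UnderDiag) {
            std::printf("lower col %d\n", lowerCols[idx]);
        } else if (state == PositionType::OnDiag) {
            std::printf("diagonal col 2\n");
        } else {
            std::printf("upper col %d\n", upperCols[idx]);
        }
        increment(idx, state, lastLowerIdx, firstUpperIdx);
    }
    // Prints the blocks of the row in ascending column order: 0, 1, 2, 3, 5.
}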
@ -0,0 +1,193 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2024 SINTEF AS
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef OPM_ILU0_KERNELS_HPP
|
||||||
|
#define OPM_ILU0_KERNELS_HPP
|
||||||
|
#include <cstddef>
|
||||||
|
#include <vector>
|
||||||
|
namespace Opm::cuistl::detail::ILU0
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Perform a upper solve on certain rows in a matrix that can safely be computed in parallel
|
||||||
|
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
|
||||||
|
* that rows in the same level sets are contiguous
|
||||||
|
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
|
||||||
|
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
|
||||||
|
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
|
||||||
|
* in the natural ordered matrix
|
||||||
|
* @param startIdx Index of the first row of the matrix to be solve
|
||||||
|
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
|
||||||
|
* function
|
||||||
|
* @param [out] v Will store the results of the upper solve
|
||||||
|
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
|
||||||
|
*/
|
||||||
|
template <class T, int blocksize>
|
||||||
|
void solveUpperLevelSet(T* reorderedMat,
|
||||||
|
int* rowIndices,
|
||||||
|
int* colIndices,
|
||||||
|
int* indexConversion,
|
||||||
|
int startIdx,
|
||||||
|
int rowsInLevelSet,
|
||||||
|
T* v,
|
||||||
|
int threadBlockSize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
|
||||||
|
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
|
||||||
|
* that rows in the same level sets are contiguous
|
||||||
|
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
|
||||||
|
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
|
||||||
|
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
|
||||||
|
* in the natural ordered matrix
|
||||||
|
* @param startIdx Index of the first row of the matrix to be solve
|
||||||
|
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
|
||||||
|
* function
|
||||||
|
* @param d Stores the defect
|
||||||
|
* @param [out] v Will store the results of the lower solve
|
||||||
|
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
|
||||||
|
*/
|
||||||
|
template <class T, int blocksize>
|
||||||
|
void solveLowerLevelSet(T* reorderedMat,
|
||||||
|
int* rowIndices,
|
||||||
|
int* colIndices,
|
||||||
|
int* indexConversion,
|
||||||
|
int startIdx,
|
||||||
|
int rowsInLevelSet,
|
||||||
|
const T* d,
|
||||||
|
T* v,
|
||||||
|
int threadBlockSize);
|
||||||
|
|
||||||
|
/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal elements of the LU factorization. Stores the inverse of the diagonal entries
 * @param [out] v Will store the result of the upper solve. To begin with it should store the output from the lower
 * solve
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param d Stores the defect
 * @param [out] v Will store the result of the lower solve
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedLowerMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* d,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Computes the ILU factorization of the input BCSR matrix, which is stored in a reordered way. The diagonal
 * elements store the inverse of the diagonal entries
 * @param [out] reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param columnIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param startIdx Index of the first row of the matrix to be solved
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void LUFactorization(T* reorderedMat,
                     int* rowIndices,
                     int* columnIndices,
                     int* naturalToReordered,
                     int* reorderedToNatural,
                     size_t rowsInLevelSet,
                     int startIdx,
                     int threadBlockSize);

/**
 * @brief Computes the ILU0 factorization in-place of a BCSR matrix stored in a split format (lower, diagonal and upper
 * triangular part)
 * @param [out] reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular
 * sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the BSR format
 * @param lowerColIndices Pointer to vector on GPU containing column indices of the lower matrix compliant with the BSR format
 * @param [out] reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular
 * sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the BSR format
 * @param upperColIndices Pointer to vector on GPU containing column indices of the upper matrix compliant with the BSR format
 * @param [out] diagonal The diagonal elements of the ILU0 factorization, inverted
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void LUFactorizationSplit(T* reorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
                          T* reorderedUpperMat,
                          int* upperRowIndices,
                          int* upperColIndices,
                          T* diagonal,
                          int* reorderedToNatural,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
                          int threadBlockSize);

} // namespace Opm::cuistl::detail::ILU0

#endif
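The split kernels above are the building blocks of a preconditioner apply: one lower solve per level set in ascending order, followed by one upper solve per level set in descending order using the inverted diagonal. A rough host-side sketch of that orchestration is shown below; the GpuIlu0Sketch struct and applyIlu0Sketch function are illustrative names only, not part of the OPM API, and the real driver lives in OpmCuILU0.cpp.

#include <vector>
// OPM headers (ILU0Kernels.hpp) are assumed to be available.

// Sketch only: a hypothetical bundle of device pointers for the reordered
// lower/upper factors, the inverted diagonal and the level-set layout.
struct GpuIlu0Sketch {
    double* lowerVals;  int* lowerRows;  int* lowerCols;  // strictly lower factor (reordered)
    double* upperVals;  int* upperRows;  int* upperCols;  // strictly upper factor (reordered)
    double* dInv;                                         // inverted diagonal blocks
    int* reorderedToNatural;                              // index conversion used by the kernels
    std::vector<int> levelSetSizes;                       // rows per level set, in solve order
};

template <int blocksize>
void applyIlu0Sketch(const GpuIlu0Sketch& p, const double* d, double* v)
{
    // Forward sweep: rows within a level set are independent, so each call
    // solves one whole level set in parallel on the GPU.
    int startIdx = 0;
    for (int rows : p.levelSetSizes) {
        Opm::cuistl::detail::ILU0::solveLowerLevelSetSplit<double, blocksize>(
            p.lowerVals, p.lowerRows, p.lowerCols, p.reorderedToNatural, startIdx, rows, d, v, -1);
        startIdx += rows;
    }
    // Backward sweep: walk the level sets in reverse and apply the inverted diagonal.
    for (auto it = p.levelSetSizes.rbegin(); it != p.levelSetSizes.rend(); ++it) {
        startIdx -= *it;
        Opm::cuistl::detail::ILU0::solveUpperLevelSetSplit<double, blocksize>(
            p.upperVals, p.upperRows, p.upperCols, p.reorderedToNatural, startIdx, *it, p.dInv, v, -1);
    }
}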
@ -0,0 +1,88 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail::JAC
{
namespace
{
    template <class T, int blocksize>
    __global__ void
    cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
    {
        const auto row = blockDim.x * blockIdx.x + threadIdx.x;

        if (row < numberOfRows) {
            size_t nnzIdx = rowIndices[row];
            size_t nnzIdxLim = rowIndices[row + 1];

            // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
            // zero element
            while (nnzIdx < nnzIdxLim && colIndices[nnzIdx] != row) {
                ++nnzIdx;
            }

            // diagBlock points to the start of where the diagonal block is stored
            T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
            // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
            // element should be stored
            T* vecBlock = &vec[blocksize * blocksize * row];

            invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
        }
    }
} // namespace

template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    if (blocksize <= 3) {
        int threadBlockSize
            = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
        cuInvertDiagonalAndFlatten<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                      \
    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);

} // namespace Opm::cuistl::detail::JAC
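The kernel above delegates the per-block work to invBlockOutOfPlace from deviceBlockOperations.hpp, which is not part of this diff. For a 2x2 block the operation it has to perform amounts to the usual closed-form inverse written to a separate output buffer; the following standalone device function is an illustration of that computation only, not the OPM implementation:

// Illustration only: closed-form inverse of a 2x2 block, written "out of place"
// to a separate output buffer, as cuInvertDiagonalAndFlatten requires.
template <class T>
__device__ __host__ void invert2x2Sketch(const T* a, T* inv)
{
    // a = [a0 a1; a2 a3] stored row-major; the determinant must be nonzero.
    const T det = a[0] * a[3] - a[1] * a[2];
    const T idet = T(1.0) / det;
    inv[0] =  a[3] * idet;
    inv[1] = -a[1] * idet;
    inv[2] = -a[2] * idet;
    inv[3] =  a[0] * idet;
}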
@ -0,0 +1,38 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_JAC_KERNELS_HPP
#define OPM_JAC_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::JAC
{

/**
 * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in vec
 * @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param numberOfRows Integer describing the number of rows in the matrix
 * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
 */
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);

} // namespace Opm::cuistl::detail::JAC
#endif
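A typical call of this function appears in the updated unit test later in this commit; the pattern is roughly the following fragment, assuming B is a Dune::BCRSMatrix with N block rows of 3x3 blocks:

// Upload the matrix, then flatten and invert its diagonal blocks into dInvDiag.
auto m = Opm::cuistl::CuSparseMatrix<double>::fromMatrix(B);
Opm::cuistl::CuVector<double> dInvDiag(3 * 3 * N);

Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<double, 3>(
    m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());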
@ -16,14 +16,15 @@
 You should have received a copy of the GNU General Public License
 along with OPM. If not, see <http://www.gnu.org/licenses/>.
 */
+#include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
 #include <stdexcept>
-#include <config.h>
 
 namespace Opm::cuistl::detail
 {
@ -83,20 +84,8 @@ namespace
 }
 }
 
-constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements)
-{
-return 1024;
-}
-
-inline size_t getBlocks(size_t numberOfElements)
-{
-const auto threads = getThreads(numberOfElements);
-return (numberOfElements + threads - 1) / threads;
-}
-
 template <class T>
-__global__ void
-prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
+__global__ void prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
 {
 const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
 
@ -105,8 +94,7 @@ namespace
 }
 }
 template <class T>
-__global__ void
-syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
+__global__ void syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
 {
 const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
 
@ -116,33 +104,13 @@ namespace
 }
 } // namespace
 
-// Kernel here is the function object of the cuda kernel
-template <class Kernel>
-inline int getCudaRecomendedThreadBlockSize(Kernel k)
-{
-#if USE_HIP
-return 512;
-#else
-int blockSize;
-int tmpGridSize;
-std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
-return blockSize;
-#endif
-}
-
-inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
-{
-return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
-}
-
 template <class T>
 void
 setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-setVectorValueKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceData, numberOfElements, value);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+setVectorValueKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, value);
 }
 
 template void setVectorValue(double*, size_t, const double&);
@ -153,10 +121,9 @@ template <class T>
 void
 setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-setZeroAtIndexSetKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceData, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+setZeroAtIndexSetKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, indices);
 }
 template void setZeroAtIndexSet(double*, size_t, const int*);
 template void setZeroAtIndexSet(float*, size_t, const int*);
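With the file-local getThreads/getBlocks helpers gone, every wrapper in this file now follows the same launch pattern built on the shared helpers from gpuThreadUtils.hpp; getNumberOfBlocks(n, t) is the ceiling division (n + t - 1) / t that the removed getBlocks helper also used. As a sketch (someKernel is a placeholder for any of the kernels in this file):

// Pick a block size suggested by the occupancy API (or a fixed 512 on HIP),
// then cover all elements with ceil(numberOfElements / threadBlockSize) blocks.
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(someKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
someKernel<<<nThreadBlocks, threadBlockSize>>>(/* kernel arguments */);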
@ -164,12 +131,16 @@ template void setZeroAtIndexSet(int*, size_t, const int*);
 
 template <class T>
 T
-innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
+innerProductAtIndices(cublasHandle_t cublasHandle,
+                      const T* deviceA,
+                      const T* deviceB,
+                      T* buffer,
+                      size_t numberOfElements,
+                      const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-elementWiseMultiplyKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceA, deviceB, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);
 
 // TODO: [perf] Get rid of the allocation here.
 CuVector<T> oneVector(numberOfElements);
@ -184,11 +155,12 @@ template float innerProductAtIndices(cublasHandle_t, const float*, const float*,
 template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int* buffer, size_t, const int*);
 
 template <class T>
-void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
+void
+prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+prepareSendBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
 OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
 }
 template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
@ -196,11 +168,12 @@ template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberO
 template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
 
 template <class T>
-void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
+void
+syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+syncFromRecvBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
 // cudaDeviceSynchronize(); // Not needed, I guess...
 }
 template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
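prepareSendBuf and syncFromRecvBuf pack and unpack the values exchanged over MPI: the first gathers the entries selected by indices into a contiguous buffer, the second scatters received values back into the full vector. A CPU-side sketch of that index mapping, inferred from the names and signatures rather than copied from the kernels:

// Host-side illustration of the gather/scatter the two GPU wrappers perform.
template <class T>
void prepareSendBufHost(const T* a, T* buffer, std::size_t n, const int* indices)
{
    for (std::size_t i = 0; i < n; ++i)
        buffer[i] = a[indices[i]]; // gather into the contiguous send buffer
}

template <class T>
void syncFromRecvBufHost(T* a, const T* buffer, std::size_t n, const int* indices)
{
    for (std::size_t i = 0; i < n; ++i)
        a[indices[i]] = buffer[i]; // scatter received values back into place
}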
@ -217,30 +190,24 @@ weightedDiagMV(const T* squareBlockVector,
 T* dstVec)
 {
 switch (blocksize) {
-case 1:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
-case 2:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
-case 3:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
+case 1: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 1>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
+case 2: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 2>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
+case 3: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 3>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
 default:
 OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
 break;
@ -28,6 +28,7 @@
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
 #include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
 
 using NumericTypes = boost::mpl::list<double, float>;
 
@ -87,7 +88,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
 Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
 Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
 
-Opm::cuistl::detail::invertDiagonalAndFlatten<T, 3>(
+Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
 m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
 
 std::vector<T> expectedInvDiag {-1.0 / 4.0,
@ -161,7 +162,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
 Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
 Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
 
-Opm::cuistl::detail::invertDiagonalAndFlatten<T, 2>(
+Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
 m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
 
 std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};