Add an OPM implementation of ILU0

improve file structure in cuistl
run clang-format
Tobias Meyer Andersen 2024-06-19 13:14:59 +02:00
parent ad595fed5e
commit 7a30aaa46e
22 changed files with 2489 additions and 1043 deletions
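
For context on the new preconditioner: ILU0 computes an incomplete LU factorization that keeps exactly the sparsity pattern of A, and applying it amounts to one sparse forward and one sparse backward substitution. The sketch below is a minimal serial, scalar CSR version of the factorization, assuming sorted column indices and a present diagonal; it only illustrates the algorithm, not the blocked, level-set GPU implementation added in this commit.

#include <vector>

// In-place scalar ILU(0) on a CSR matrix: L (unit lower) and U overwrite vals,
// and only entries already present in the sparsity pattern are ever updated.
void serialIlu0(const std::vector<int>& rowPtr, const std::vector<int>& colIdx, std::vector<double>& vals)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    for (int i = 0; i < n; ++i) {
        for (int kk = rowPtr[i]; kk < rowPtr[i + 1]; ++kk) {
            const int k = colIdx[kk];
            if (k >= i) {
                break; // only eliminate against earlier rows (columns assumed sorted)
            }
            // find the diagonal of row k (assumed to exist in the pattern)
            double diag = 0.0;
            for (int dd = rowPtr[k]; dd < rowPtr[k + 1]; ++dd) {
                if (colIdx[dd] == k) {
                    diag = vals[dd];
                    break;
                }
            }
            vals[kk] /= diag; // this entry becomes L(i,k)
            // update the rest of row i, but only where A already has an entry
            for (int jj = kk + 1; jj < rowPtr[i + 1]; ++jj) {
                for (int uu = rowPtr[k]; uu < rowPtr[k + 1]; ++uu) {
                    if (colIdx[uu] == colIdx[jj]) {
                        vals[jj] -= vals[kk] * vals[uu]; // A(i,j) -= L(i,k) * U(k,j)
                        break;
                    }
                }
            }
        }
    }
}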

View File

@ -205,16 +205,21 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)
# HEADERS
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
@ -223,7 +228,11 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
@ -238,6 +247,8 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/vector_operations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/has_function.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditioner_should_call_post_pre.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSeqILU0.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp)

View File

@ -24,6 +24,7 @@
#if USE_HIP
#include <opm/simulators/linalg/hipistl/CuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/hipistl/CuDILU.hpp>
#include <opm/simulators/linalg/hipistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/hipistl/CuJac.hpp>
#include <opm/simulators/linalg/hipistl/CuSeqILU0.hpp>
#include <opm/simulators/linalg/hipistl/PreconditionerAdapter.hpp>
@ -32,6 +33,7 @@
#else
#include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>

View File

@ -349,7 +349,6 @@ struct StandardPreconditioners {
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
// const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
@ -358,6 +357,18 @@ struct StandardPreconditioners {
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using OpmCuILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
#endif
}
@ -605,6 +616,15 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
std::make_shared<CUJac>(op.getmat(), w));
});
F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using CUILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
});
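The sequential creator just registered builds, in effect, the following (an illustrative sketch, not part of this diff; the matrix A and the type aliases are placeholders for whatever the simulator assembles):

using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
using Vector = Dune::BlockVector<Dune::FieldVector<double, 3>>;
using GpuVector = Opm::cuistl::CuVector<double>;
using GpuILU0 = Opm::cuistl::OpmCuILU0<Matrix, GpuVector, GpuVector>;

auto cuilu0 = std::make_shared<GpuILU0>(A, /*splitMatrix=*/true, /*tuneKernels=*/true);
auto prec = std::make_shared<Opm::cuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>>(cuilu0);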
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);

View File

@ -16,110 +16,23 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <vector>
#include <config.h>
#include <chrono>
#include <limits>
#include <tuple>
namespace
{
std::vector<int>
createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[globCnt++] = static_cast<int>(col);
}
}
return res;
}
std::vector<int>
createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[col] = globCnt++;
}
}
return res;
}
template <class M, class field_type, class GPUM>
void
createReorderedMatrix(const M& naturalMatrix,
std::vector<int>& reorderedToNatural,
std::unique_ptr<GPUM>& reorderedGpuMat)
{
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
// For elements in A
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
dstRowIt.insert(elem.index());
}
}
reorderedGpuMat.reset(new auto (GPUM::fromMatrix(reorderedMatrix, true)));
}
template <class M, class field_type, class GPUM>
void
extractLowerAndUpperMatrices(const M& naturalMatrix,
std::vector<int>& reorderedToNatural,
std::unique_ptr<GPUM>& lower,
std::unique_ptr<GPUM>& upper)
{
const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
lowerIt != reorderedLower.createend();
++lowerIt, ++upperIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
lowerIt.insert(elem.index());
} else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
upperIt.insert(elem.index());
}
}
}
lower.reset(new auto (GPUM::fromMatrix(reorderedLower, true)));
upper.reset(new auto (GPUM::fromMatrix(reorderedUpper, true)));
return;
}
} // NAMESPACE
namespace Opm::cuistl
{
@ -127,8 +40,8 @@ template <class M, class X, class Y, int l>
CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(createReorderedToNatural(m_levelSets))
, m_naturalToReordered(createNaturalToReordered(m_levelSets))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
, m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
, m_gpuNaturalToReorder(m_naturalToReordered)
, m_gpuReorderToNatural(m_reorderedToNatural)
@ -154,19 +67,19 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag.reset(new auto(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N())));
extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper);
} else {
createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
computeDiagAndMoveReorderedData();
// HIP does not currently support automatically picking thread block sizes as well as CUDA does,
// so we only do our own manual tuning when tuning is requested and HIP is used
#ifdef USE_HIP
if (m_tuneThreadBlockSizes){
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
@ -188,7 +101,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(
detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -200,7 +113,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
v.data(),
m_applyThreadBlockSize);
} else {
detail::computeLowerSolveLevelSet<field_type, blocksize_>(
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
@ -221,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(
detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@ -232,7 +145,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
v.data(),
m_applyThreadBlockSize);
} else {
detail::computeUpperSolveLevelSet<field_type, blocksize_>(
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
@ -304,7 +217,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::computeDiluDiagonalSplit<field_type, blocksize_>(
detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -319,7 +232,8 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
m_gpuDInv.data(),
m_updateThreadBlockSize);
} else {
detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
@ -345,23 +259,24 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
int bestUpdateBlockSize = -1;
int interval = 64;
//temporary buffers for the apply
// temporary buffers for the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i){
for (int i = 0; i < 2; ++i) {
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime){
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
@ -372,9 +287,10 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime){
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}

View File

@ -19,17 +19,11 @@
#ifndef OPM_CUDILU_HPP
#define OPM_CUDILU_HPP
#include <dune/istl/preconditioner.hh>
#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
#include <optional>
#include <vector>
#include <memory>

View File

@ -16,27 +16,13 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include <dune/common/fmatrix.hh>
#include <dune/common/fvector.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/bvector.hh>
#include <fmt/core.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
@ -67,11 +53,7 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
A.nonzeroes()));
// Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),
m_diagInvFlattened.data());
invertDiagonalAndFlatten();
}
template <class M, class X, class Y, int l>
@ -111,7 +93,15 @@ void
CuJac<M, X, Y, l>::update()
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
invertDiagonalAndFlatten();
}
template <class M, class X, class Y, int l>
void
CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
{
detail::JAC::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),

View File

@ -103,6 +103,8 @@ private:
CuSparseMatrix<field_type> m_gpuMatrix;
//! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
CuVector<field_type> m_diagInvFlattened;
void invertDiagonalAndFlatten();
};
} // end namespace Opm::cuistl
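
For reference, the flattened inverted diagonal computed by invertDiagonalAndFlatten() is what a block-Jacobi apply consumes: each block row is updated as v_i = w * D_ii^{-1} * d_i. Below is a serial sketch with row-major flattened blocks (illustrative only, not the CUDA kernel).

#include <cstddef>
#include <vector>

// Serial block-Jacobi apply using a flattened, already inverted block diagonal:
// v_i = w * Dinv_i * d_i for every block row i, with Dinv stored row-major per block.
template <int blocksize>
void blockJacobiApply(const std::vector<double>& diagInvFlattened,
                      const std::vector<double>& d,
                      double w,
                      std::vector<double>& v)
{
    const std::size_t numRows = d.size() / blocksize;
    for (std::size_t row = 0; row < numRows; ++row) {
        const double* Dinv = &diagInvFlattened[row * blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            double sum = 0.0;
            for (int j = 0; j < blocksize; ++j) {
                sum += Dinv[i * blocksize + j] * d[row * blocksize + j];
            }
            v[row * blocksize + i] = w * sum;
        }
    }
}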

View File

@ -0,0 +1,318 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <tuple>
namespace Opm::cuistl
{
template <class M, class X, class Y, int l>
OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
, m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
, m_gpuMatrixReorderedLower(nullptr)
, m_gpuMatrixReorderedUpper(nullptr)
, m_gpuNaturalToReorder(m_naturalToReordered)
, m_gpuReorderToNatural(m_reorderedToNatural)
, m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
, m_splitMatrix(splitMatrix)
, m_tuneThreadBlockSizes(tuneKernels)
{
// TODO: Should in some way verify that this matrix is symmetric; only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
m_gpuMatrix.blockSize(),
A[0][0].N()));
OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
m_gpuMatrix.dim(),
A.N() * A[0][0].N()));
OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
} else {
m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
LUFactorizeAndMoveData();
#ifdef USE_HIP
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
levelStartIdx = m_cpuMatrix.N();
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
v.data(),
m_applyThreadBlockSize);
}
}
}
}
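The apply() above issues one kernel launch per level set: all rows within a level depend only on rows from earlier levels (for the lower solve) or later levels (for the upper solve), so they can be processed with one GPU thread per row. A serial scalar sketch of the forward solve Lz = d in the same traversal order (illustrative; the real code works on blocks and on the split, reordered GPU matrices):

#include <cstddef>
#include <vector>

// Serial sketch of the level-set forward solve Lz = d: the lower factor is unit
// triangular, so each row only needs the already computed entries of z.
void lowerSolveByLevelSets(const std::vector<std::vector<int>>& levelSets, // rows grouped by level (only the sizes are used here)
                           const std::vector<int>& reorderedToNatural,
                           const std::vector<int>& rowPtr, // reordered strictly-lower factor, scalar CSR
                           const std::vector<int>& colIdx, // natural column indices
                           const std::vector<double>& vals,
                           const std::vector<double>& d,
                           std::vector<double>& z)
{
    int reorderedRow = 0;
    for (const auto& level : levelSets) {
        // rows inside a level only depend on rows from earlier levels,
        // which is what allows one GPU thread per row within each launch
        for (std::size_t i = 0; i < level.size(); ++i, ++reorderedRow) {
            const int naturalRow = reorderedToNatural[reorderedRow];
            double rhs = d[naturalRow];
            for (int block = rowPtr[reorderedRow]; block < rowPtr[reorderedRow + 1]; ++block) {
                rhs -= vals[block] * z[colIdx[block]];
            }
            z[naturalRow] = rhs; // unit diagonal: no division needed
        }
    }
}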
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}
template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
OpmCuILU0<M, X, Y, l>::category() const
{
return Dune::SolverCategory::sequential;
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData();
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
{
OPM_TIMEBLOCK(prec_update);
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
m_updateThreadBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderedLU->N(),
m_updateThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_updateThreadBlockSize);
} else {
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderToNatural.data(),
numOfRowsInLevel,
levelStartIdx,
m_updateThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
{
// TODO: generalize this tuning process into a function separate from the class
long long bestApplyTime = std::numeric_limits<long long>::max();
long long bestUpdateTime = std::numeric_limits<long long>::max();
int bestApplyBlockSize = -1;
int bestUpdateBlockSize = -1;
int interval = 64;
// temporary buffers for the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i) {
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
}
auto beforeApply = std::chrono::high_resolution_clock::now();
m_applyThreadBlockSize = thrBlockSize;
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}
}
}
}
m_applyThreadBlockSize = bestApplyBlockSize;
m_updateThreadBlockSize = bestUpdateBlockSize;
}
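Regarding the TODO above, one way the timing loop could be factored out of the class is a generic helper that times an arbitrary launch callback. This is only a sketch of a possible refactoring, not code from this commit:

#include <chrono>
#include <cuda_runtime.h>
#include <limits>
#include <tuple>

// Returns the block size (multiples of `interval` up to `maxBlockSize`) giving the
// fastest valid launch of `runKernel`; returns -1 if no launch succeeded.
template <class F>
int tuneBlockSize(F&& runKernel, int maxBlockSize = 1024, int interval = 64)
{
    long long bestTime = std::numeric_limits<long long>::max();
    int bestBlockSize = -1;
    for (int blockSize = interval; blockSize <= maxBlockSize; blockSize += interval) {
        for (int i = 0; i < 2; ++i) { // the first launch can be slower, so time twice
            const auto start = std::chrono::high_resolution_clock::now();
            runKernel(blockSize);
            std::ignore = cudaDeviceSynchronize();
            const auto stop = std::chrono::high_resolution_clock::now();
            if (cudaGetLastError() == cudaSuccess) { // kernel launch was valid
                const auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
                if (us < bestTime) {
                    bestTime = us;
                    bestBlockSize = blockSize;
                }
            }
        }
    }
    return bestBlockSize;
}

With such a helper, tuneThreadBlockSizes() would reduce to two calls, one wrapping update() and one wrapping apply().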
} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>; \
template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>
INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);
INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);

View File

@ -0,0 +1,139 @@
/*
Copyright 2022-2023 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUILU0_OPM_Impl_HPP
#define OPM_CUILU0_OPM_Impl_HPP
#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <optional>
#include <type_traits>
#include <vector>
namespace Opm::cuistl
{
//! \brief ILU0 preconditioner on the GPU.
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
//! \tparam Y Type of the defect
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
//! \brief The matrix type the preconditioner is for.
using matrix_type = typename std::remove_const<M>::type;
//! \brief The domain type of the preconditioner.
using domain_type = X;
//! \brief The range type of the preconditioner.
using range_type = Y;
//! \brief The field type of the preconditioner.
using field_type = typename X::field_type;
//! \brief The GPU matrix type
using CuMat = CuSparseMatrix<field_type>;
//! \brief Constructor.
//!
//! Constructor gets all parameters to operate the preconditioner.
//! \param A The matrix to operate on.
//! \param splitMatrix If true, the lower and upper triangular parts and the diagonal are stored separately.
//! \param tuneKernels If true, the thread block sizes of the GPU kernels are tuned experimentally.
//!
explicit OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels);
//! \brief Prepare the preconditioner.
//! \note Does nothing at the time being.
void pre(X& x, Y& b) override;
//! \brief Apply the preconditioner.
void apply(X& v, const Y& d) override;
//! \brief Post processing
//! \note Does nothing at the moment
void post(X& x) override;
//! Category of the preconditioner (see SolverCategory::Category)
Dune::SolverCategory::Category category() const override;
//! \brief Updates the matrix data.
void update() final;
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData();
//! \brief Function that will experimentally tune the thread block sizes of the important CUDA kernels
void tuneThreadBlockSizes();
//! \returns false
static constexpr bool shouldCallPre()
{
return false;
}
//! \returns false
static constexpr bool shouldCallPost()
{
return false;
}
private:
//! \brief Reference to the underlying matrix
const M& m_cpuMatrix;
//! \brief size_t describing the dimensions of the square block elements
static constexpr const size_t blocksize_ = matrix_type::block_type::cols;
//! \brief SparseTable storing each row by level
Opm::SparseTable<size_t> m_levelSets;
//! \brief converts from an index in the reordered structure to an index in the naturally ordered structure
std::vector<int> m_reorderedToNatural;
//! \brief converts from an index in the naturally ordered structure to an index in the reordered structure
std::vector<int> m_naturalToReordered;
//! \brief The A matrix stored on the GPU, and its reordered version
CuMat m_gpuMatrix;
std::unique_ptr<CuMat> m_gpuReorderedLU;
//! \brief If matrix splitting is enabled, then we store the lower and upper part separately
std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
//! \brief If matrix splitting is enabled, we also store the diagonal separately
std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
//! row conversion from natural to reordered matrix indices stored on the GPU
CuVector<int> m_gpuNaturalToReorder;
//! row conversion from reordered to natural matrix indices stored on the GPU
CuVector<int> m_gpuReorderToNatural;
//! \brief Stores the inverted diagonal that we use in ILU0
CuVector<field_type> m_gpuDInv;
//! \brief Bool storing whether or not we should store matrices in a split format
bool m_splitMatrix;
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
bool m_tuneThreadBlockSizes;
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
//! The default value of -1 indicates that we have not calibrated and selected a value yet
int m_applyThreadBlockSize = -1;
int m_updateThreadBlockSize = -1;
};
} // end namespace Opm::cuistl
#endif
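
A short usage sketch of the interface declared above, assuming an assembled BCRS matrix A (all names below are placeholders): apply() performs one ILU0 solve on the GPU, and update() re-uploads the matrix values and redoes the factorization after the CPU matrix changes.

using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
using GpuVector = Opm::cuistl::CuVector<double>;

Opm::cuistl::OpmCuILU0<Matrix, GpuVector, GpuVector> prec(A, /*splitMatrix=*/true, /*tuneKernels=*/false);

GpuVector v(A.N() * Matrix::block_type::cols);
GpuVector d(A.N() * Matrix::block_type::cols);
d = 1.0;

prec.apply(v, d); // v = (LU)^{-1} d, computed level set by level set on the GPU
// ... the entries of A change on the CPU (same sparsity pattern) ...
prec.update();    // push the new values to the GPU and redo the LU factorization
prec.apply(v, d);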

View File

@ -0,0 +1,110 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_COLORING_AND_REORDERING_UTILS_HPP
#define OPM_COLORING_AND_REORDERING_UTILS_HPP
#include <fmt/core.h>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <tuple>
#include <vector>
/*
This file contains a collection of utility functions used in the GPU implementations of ILU and DILU.
The functions deal with creating the mappings between reordered and natural indices, as well as
extracting sparsity structures from Dune matrices and creating CuSparseMatrix indices.
A small worked example of the reordering mappings follows this header.
*/
namespace Opm::cuistl::detail
{
inline std::vector<int>
createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[globCnt++] = static_cast<int>(col);
}
}
return res;
}
inline std::vector<int>
createNaturalToReordered(const Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[col] = globCnt++;
}
}
return res;
}
template <class M, class field_type, class GPUM>
inline std::unique_ptr<GPUM>
createReorderedMatrix(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
// For elements in A
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
dstRowIt.insert(elem.index());
}
}
return std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedMatrix, true)));
}
template <class M, class field_type, class GPUM>
inline std::tuple<std::unique_ptr<GPUM>, std::unique_ptr<GPUM>>
extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
lowerIt != reorderedLower.createend();
++lowerIt, ++upperIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
lowerIt.insert(elem.index());
} else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
upperIt.insert(elem.index());
}
}
}
return {std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedLower, true))),
std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedUpper, true)))};
}
} // namespace Opm::cuistl::detail
#endif
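
A small worked example of the two mappings defined above, assuming the coloring produced the level sets {1, 2} and {0, 3} (illustrative numbers only): walking the levels row by row visits the natural rows in the order 1, 2, 0, 3.

#include <cassert>
#include <vector>

// The two mappings are inverse permutations of each other:
// reorderedToNatural[r] is the natural row processed at reordered position r,
// naturalToReordered[n] is the reordered position of natural row n.
void reorderingExample()
{
    const std::vector<int> reorderedToNatural {1, 2, 0, 3};
    const std::vector<int> naturalToReordered {2, 0, 1, 3};
    for (int reordered = 0; reordered < 4; ++reordered) {
        assert(naturalToReordered[reorderedToNatural[reordered]] == reordered);
    }
}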

View File

@ -16,425 +16,17 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <stdexcept>
#include <config.h>
namespace Opm::cuistl::detail
{
namespace
{
// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
if (blocksize == 1) {
dstBlock[0] = 1.0 / (srcBlock[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
dstBlock[0] = srcBlock[3] * detInv;
dstBlock[1] = -srcBlock[1] * detInv;
dstBlock[2] = -srcBlock[2] * detInv;
dstBlock[3] = srcBlock[0] * detInv;
} else if (blocksize == 3) {
// based on Dune implementation
T t4 = srcBlock[0] * srcBlock[4];
T t6 = srcBlock[0] * srcBlock[5];
T t8 = srcBlock[1] * srcBlock[3];
T t10 = srcBlock[2] * srcBlock[3];
T t12 = srcBlock[1] * srcBlock[6];
T t14 = srcBlock[2] * srcBlock[6];
T t17 = 1.0
/ (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
- t14 * srcBlock[4]); // t17 is 1/determinant
dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
dstBlock[5] = -(t6 - t10) * t17;
dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
dstBlock[8] = (t4 - t8) * t17;
}
}
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockInPlace(T* __restrict__ block)
{
if (blocksize == 1) {
block[0] = 1.0 / (block[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
T temp = block[0];
block[0] = block[3] * detInv;
block[1] = -block[1] * detInv;
block[2] = -block[2] * detInv;
block[3] = temp * detInv;
} else if (blocksize == 3) {
T t4 = block[0] * block[4];
T t6 = block[0] * block[5];
T t8 = block[1] * block[3];
T t10 = block[2] * block[3];
T t12 = block[1] * block[6];
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];
T matrix10 = block[3];
T matrix11 = block[4];
block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
block[4] = (matrix00 * block[8] - t14) * t17;
block[5] = -(t6 - t10) * t17;
block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
block[7] = -(matrix00 * block[7] - t12) * t17;
block[8] = (t4 - t8) * t17;
}
}
enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
for (int i = 0; i < blocksize; ++i) {
if (OP == MVType::SET) {
c[i] = 0;
}
for (int j = 0; j < blocksize; ++j) {
if (OP == MVType::SET || OP == MVType::PLUS) {
c[i] += A[i * blocksize + j] * b[j];
} else if (OP == MVType::MINUS) {
c[i] -= A[i * blocksize + j] * b[j];
}
}
}
}
template <class T, int blocksize>
__device__ __forceinline__ void mv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void umv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void mmv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}
// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst)
{
T tmp[blocksize * blocksize] = {0};
// tmp = A*B
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
// dst = tmp*C
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// the loop below also checks that we stay within the row limits, which only matters if the diagonal has a
// zero element
while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
++nnzIdx;
}
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
}
}
template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t nnzIdx = rowIndices[reorderedRowIdx];
int diagIdx = nnzIdx;
while (colIndices[diagIdx] != naturalRowIdx) {
++diagIdx;
}
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = naturalToReordered[colIndices[block]];
// find element with indices swapped
// Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range
// we binary search over
int left = rowIndices[col];
int right = rowIndices[col + 1] - 1;
int mid = left + (right - left) / 2; // overflow-safe average;
while (left <= right) {
const int col = colIndices[mid];
if (col == naturalRowIdx) {
break;
} else if (col < naturalRowIdx) {
left = mid + 1;
} else {
right = mid - 1;
}
mid = left + (right - left) / 2; // overflow-safe average
}
const int symOpposite = mid;
mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&mat[symOpposite * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
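For reference while this kernel moves into detail/preconditionerKernels/DILUKernels: it evaluates the usual DILU diagonal recurrence, Dinv_i = (A_ii - sum over lower-triangular neighbours j of A_ij * Dinv_j * A_ji)^{-1}, processed level set by level set. A scalar serial sketch of the same recurrence (dense indexing only to keep the example short):

#include <cstddef>
#include <vector>

// Scalar DILU diagonal: dInv[i] = 1 / (A(i,i) - sum_{j < i, A(i,j) != 0} A(i,j) * dInv[j] * A(j,i)).
std::vector<double> diluDiagonal(const std::vector<std::vector<double>>& A)
{
    const std::size_t n = A.size();
    std::vector<double> dInv(n);
    for (std::size_t i = 0; i < n; ++i) {
        double pivot = A[i][i];
        for (std::size_t j = 0; j < i; ++j) {
            if (A[i][j] != 0.0) { // only entries present in the sparsity pattern contribute
                pivot -= A[i][j] * dInv[j] * A[j][i];
            }
        }
        dInv[i] = 1.0 / pivot;
    }
    return dInv;
}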
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
const int col = naturalToReordered[lowerColIndices[block]];
int symOppositeIdx = upperRowIndices[col];
for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
break;
}
}
const int symOppositeBlock = symOppositeIdx;
mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void cuMoveDataToReordered(
T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows)
@ -505,195 +97,21 @@ namespace
}
}
}
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize=-1)
{
if (suggestedThrBlockSize != -1){
return suggestedThrBlockSize;
}
// If no block size was suggested (i.e. not tuned), pick a fixed size on HIP, or use the CUDA occupancy API to maximize occupancy
#if USE_HIP
return 512;
#else
int blockSize = 1024;
int tmpGridSize;
std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
return blockSize;
#endif
}
inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
} // namespace
template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
cuInvertDiagonalAndFlatten<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
computeLowerSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeLowerSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
template <class T, int blocksize>
void
computeLowerSolveLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
computeUpperSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeUpperSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeUpperSolveLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonal<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
rowIndices,
colIndices,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
copyMatDataToReordered(T* srcMatrix,
int* srcRowIndices,
T* dstMatrix,
int* dstRowIndices,
int* naturalToReordered,
size_t numberOfRows,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}
@ -712,8 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix,
size_t numberOfRows,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
srcRowIndices,
srcColumnIndices,
@ -727,16 +146,8 @@ copyMatDataToReorderedSplit(T* srcMatrix,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int); \
template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \
template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);

View File

@ -22,189 +22,6 @@
#include <vector>
namespace Opm::cuistl::detail
{
/**
* @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in vec
* @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param numberOfRows Integer describing the number of rows in the matrix
* @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
*/
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void computeLowerSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedUpperMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void computeLowerSolveLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void computeUpperSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
void computeUpperSolveLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the bsr format
* @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with the bsr format
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the bsr format
* @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param diagonal The diagonal elements of the reordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
void computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
/**
* @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
* @param srcMatrix The source matrix we will copy data from
@ -216,26 +33,40 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
* @param numberOfRows The number of rows in the matrices
*/
template <class T, int blocksize>
void copyMatDataToReordered(T* srcMatrix,
int* srcRowIndices,
T* dstMatrix,
int* dstRowIndices,
int* naturalToReordered,
size_t numberOfRows,
int threadBlockSize);
/**
* @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
* @param srcMatrix The source matrix we will copy data from
* @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with the bsr format
* @param [out] dstLowerMatrix The destination of entries that originate from the strictly lower triangular matrix
* @param dstLowerRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant
* with the bsr format
* @param [out] dstUpperMatrix The destination of entries that originate from the strictly upper triangular matrix
* @param dstUpperRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant
* with the bsr format
* @param [out] dstDiag The destination buffer for the diagonal part of the matrix
* @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
* @param numberOfRows The number of rows in the matrices
*/
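// Example (illustrative values only): for a natural row r with block columns {2, 5, r, 9}, the blocks in columns 2 and 5
// are copied into dstLowerMatrix, the column-r block into dstDiag, and the column-9 block into dstUpperMatrix, with the
// destination row given by naturalToReordered[r].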
template <class T, int blocksize>
void copyMatDataToReorderedSplit(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
T* dstLowerMatrix,
int* dstLowerRowIndices,
T* dstUpperMatrix,
int* dstUpperRowIndices,
T* dstDiag,
int* naturalToReordered,
size_t numberOfRows,
int threadBlockSize);
} // namespace Opm::cuistl::detail
#endif

View File

@ -0,0 +1,234 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#include <config.h>
#include <cuda_runtime.h>
/*
This file provides inlineable functions intended for CUDA kernels operating on block matrix elements.
The functions provide various matrix operations that are used by the preconditioners.
*/
namespace
{
// TODO: figure out if this can be generalized effectively; this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
if (blocksize == 1) {
dstBlock[0] = 1.0 / (srcBlock[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
dstBlock[0] = srcBlock[3] * detInv;
dstBlock[1] = -srcBlock[1] * detInv;
dstBlock[2] = -srcBlock[2] * detInv;
dstBlock[3] = srcBlock[0] * detInv;
} else if (blocksize == 3) {
// based on Dune implementation
T t4 = srcBlock[0] * srcBlock[4];
T t6 = srcBlock[0] * srcBlock[5];
T t8 = srcBlock[1] * srcBlock[3];
T t10 = srcBlock[2] * srcBlock[3];
T t12 = srcBlock[1] * srcBlock[6];
T t14 = srcBlock[2] * srcBlock[6];
T t17 = 1.0
/ (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
- t14 * srcBlock[4]); // t17 is 1/determinant
dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
dstBlock[5] = -(t6 - t10) * t17;
dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
dstBlock[8] = (t4 - t8) * t17;
}
}
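// For reference, the 2x2 branch above is the standard adjugate formula: for a row-major block [a b; c d] with
// determinant det = a*d - b*c, the inverse is (1/det) * [d -b; -c a]. The 3x3 branch expands the corresponding
// cofactor formula, with t17 holding 1/determinant.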
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockInPlace(T* __restrict__ block)
{
if (blocksize == 1) {
block[0] = 1.0 / (block[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
T temp = block[0];
block[0] = block[3] * detInv;
block[1] = -block[1] * detInv;
block[2] = -block[2] * detInv;
block[3] = temp * detInv;
} else if (blocksize == 3) {
T t4 = block[0] * block[4];
T t6 = block[0] * block[5];
T t8 = block[1] * block[3];
T t10 = block[2] * block[3];
T t12 = block[1] * block[6];
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];
T matrix10 = block[3];
T matrix11 = block[4];
block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
block[4] = (matrix00 * block[8] - t14) * t17;
block[5] = -(t6 - t10) * t17;
block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
block[7] = -(matrix00 * block[7] - t12) * t17;
block[8] = (t4 - t8) * t17;
}
}
enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void
matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
for (int i = 0; i < blocksize; ++i) {
if (OP == MVType::SET) {
c[i] = 0;
}
for (int j = 0; j < blocksize; ++j) {
if (OP == MVType::SET || OP == MVType::PLUS) {
c[i] += A[i * blocksize + j] * b[j];
} else if (OP == MVType::MINUS) {
c[i] -= A[i * blocksize + j] * b[j];
}
}
}
}
template <class T, int blocksize>
__device__ __forceinline__ void
mv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void
umv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void
mmv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}
// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(T* A, T* B, T* C, T* dst)
{
T tmp[blocksize * blocksize] = {0};
// tmp = A*B
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
// dst = tmp*C
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
}
}
}
}
// C = A*B, assumes the three buffers do not overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmNoOverlap(T* A, T* B, T* C)
{
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
}
// C = A*B, buffers may overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmOverlap(T* A, T* B, T* C)
{
T tmp[blocksize * blocksize] = {0};
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
C[i * blocksize + j] = tmp[i * blocksize + j];
}
}
}
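// The temporary above is what makes updates of the form A = A * B safe when the output aliases an input,
// e.g. scaling an off-diagonal block in place by an already-inverted diagonal block during the LU factorization.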
// A -= B
template <class T, int blocksize>
__device__ __forceinline__ void
matrixSubtraction(T* A, T* B)
{
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
A[i * blocksize + j] -= B[i * blocksize + j];
}
}
}
} // namespace
#endif

View File

@ -0,0 +1,66 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPU_THREAD_UTILS_HPP
#define OPM_GPU_THREAD_UTILS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
/*
This file provides some logic for handling how to choose the correct thread-block size
*/
namespace Opm::cuistl::detail
{
constexpr inline size_t
getThreads([[maybe_unused]] size_t numberOfRows)
{
return 1024;
}
inline size_t
getBlocks(size_t numberOfRows)
{
const auto threads = getThreads(numberOfRows);
return (numberOfRows + threads - 1) / threads;
}
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int
getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
if (suggestedThrBlockSize != -1) {
return suggestedThrBlockSize;
}
int blockSize;
int tmpGridSize;
OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
return blockSize;
}
inline int
getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
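// A minimal usage sketch (illustrative only; myKernel, numberOfRows and suggestedThrBlockSize are placeholders):
//
//   int threadBlockSize = getCudaRecomendedThreadBlockSize(myKernel<double, 3>, suggestedThrBlockSize);
//   int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
//   myKernel<double, 3><<<nThreadBlocks, threadBlockSize>>>(/* kernel arguments */);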
} // namespace Opm::cuistl::detail
#endif

View File

@ -0,0 +1,437 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <stdexcept>
namespace Opm::cuistl::detail::DILU
{
namespace
{
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
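// The two kernels below assemble the DILU diagonal. Loosely following the standard DILU construction, for each row i
// they compute D_i = (A_ii - sum over j < i with A_ij nonzero of A_ij * D_j^-1 * A_ji)^-1, where the rows j belong to
// earlier level sets and their inverted diagonal blocks are already stored in dInv.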
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t nnzIdx = rowIndices[reorderedRowIdx];
int diagIdx = nnzIdx;
while (colIndices[diagIdx] != naturalRowIdx) {
++diagIdx;
}
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = naturalToReordered[colIndices[block]];
// find element with indices swapped
// Binary search over the blocks in the right row; [rowIndices[col], rowIndices[col + 1] - 1] defines the range
// we binary search over
int left = rowIndices[col];
int right = rowIndices[col + 1] - 1;
int mid;
while (left <= right) {
mid = left + (right - left) / 2; // overflow-safe average
const int col = colIndices[mid];
if (col == naturalRowIdx) {
break;
} else if (col < naturalRowIdx) {
left = mid + 1;
} else {
right = mid - 1;
}
}
const int symOpposite = mid;
mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&mat[symOpposite * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
const int col = naturalToReordered[lowerColIndices[block]];
int symOppositeIdx = upperRowIndices[col];
for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
break;
}
}
const int symOppositeBlock = symOppositeIdx;
mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
} // namespace
// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonal<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
rowIndices,
colIndices,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::DILU

View File

@ -0,0 +1,202 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_DILU_KERNELS_HPP
#define OPM_DILU_KERNELS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
namespace Opm::cuistl::detail::DILU
{
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedUpperMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the bsr format
* @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with the bsr format
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the bsr format
* @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param diagonal The diagonal elements of the reordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
void computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
} // namespace Opm::cuistl::detail::DILU
#endif

View File

@ -0,0 +1,476 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <stdexcept>
/*
The LU factorization and apply steps are written based on the Dune implementations
*/
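/*
Roughly, the factorization kernels below perform the blocked ILU0 update row by row within each level set:
for every block A_ij with j < i, set A_ij = A_ij * A_jj^-1 (the diagonal block of the earlier row j is already
stored in inverted form), then for every column k > j present in both row i and row j, update
A_ik = A_ik - A_ij * A_jk; finally the diagonal block A_ii is inverted in place.
*/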
namespace Opm::cuistl::detail::ILU0
{
namespace
{
template <class T, int blocksize>
__global__ void cuLUFactorization(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
int* naturalToReordered,
int* reorderedToNatual,
size_t rowsInLevelSet,
int startIdx)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
constexpr int scalarsInBlock = blocksize * blocksize;
if (reorderedIdx < rowsInLevelSet + startIdx) {
int naturalIdx = reorderedToNatual[reorderedIdx];
// for each element under the diagonal
int endOfRowI = srcRowIndices[reorderedIdx + 1];
int ij;
for (ij = srcRowIndices[reorderedIdx];
ij < srcRowIndices[reorderedIdx + 1] && srcColumnIndices[ij] < naturalIdx;
++ij) {
// consider the block we are looking at to be A_ij
// we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
// already inverted. Find the A_jj block
int jj; // index in bcsr of the A_jj element
int j = naturalToReordered[srcColumnIndices[ij]];
int startOfRowJ = srcRowIndices[j];
int endOfRowJ = srcRowIndices[j + 1];
for (int blockInRowJ = startOfRowJ; blockInRowJ < endOfRowJ; ++blockInRowJ) {
if (srcColumnIndices[blockInRowJ] == srcColumnIndices[ij]) {
jj = blockInRowJ;
break;
}
}
// the DUNE implementation is inplace, and I need to take care to make sure
// this out of place implementation is equivalent
mmOverlap<T, blocksize>(
&srcMatrix[ij * scalarsInBlock], &srcMatrix[jj * scalarsInBlock], &srcMatrix[ij * scalarsInBlock]);
// we have now accounted for an element under the diagonal and the diagonal element above it
// now iterate over the blocks in row i and row j that share a column index
int jk = jj + 1;
int ik = ij + 1;
// this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
while (ik != endOfRowI && jk != endOfRowJ) {
if (srcColumnIndices[ik] == srcColumnIndices[jk]) {
// A_jk = A_ij * A_jk
T tmp[scalarsInBlock] = {0};
mmNoOverlap<T, blocksize>(
&srcMatrix[ij * scalarsInBlock], &srcMatrix[jk * scalarsInBlock], tmp);
matrixSubtraction<T, blocksize>(&srcMatrix[ik * scalarsInBlock], tmp);
++ik;
++jk;
} else {
if (srcColumnIndices[ik] < srcColumnIndices[jk]) {
++ik;
} else {
++jk;
}
}
}
}
invBlockInPlace<T, blocksize>(&srcMatrix[ij * scalarsInBlock]);
}
}
// An index in the BCSR matrix can be in one of three states: below, on, or above the diagonal
enum class POSITION_TYPE { UNDER_DIAG, ON_DIAG, ABOVE_DIAG };
// This handles incrementing an index on a row that is stored in a lower/upper split data structure.
// If we are in the lower or upper part we increment as usual.
// If we reach the diagonal, we only update the state; the diagonal blocks are stored separately, so no index into them is needed.
// When we increment past the diagonal, we set the index to the first block in the upper part of that row.
__device__ __forceinline__ void
incrementAcrossSplitStructure(int& idx, POSITION_TYPE& currentState, int lastLowerIdx, int firstUpperIdx)
{
if (currentState == POSITION_TYPE::UNDER_DIAG) {
if (idx != lastLowerIdx - 1) { // we should increment to be on the diagonal
++idx;
} else {
currentState = POSITION_TYPE::ON_DIAG;
}
} else if (currentState == POSITION_TYPE::ON_DIAG) {
idx = firstUpperIdx;
currentState = POSITION_TYPE::ABOVE_DIAG;
} else { // currentState = POSITION_TYPE::ABOVE_DIAG
++idx;
}
}
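// Example traversal (illustrative indices): for a row with lower blocks at indices 4..6 (lastLowerIdx == 7) and an
// upper part starting at firstUpperIdx == 12, starting from (idx == 4, UNDER_DIAG) successive calls give 5, 6, then
// a switch to ON_DIAG (the diagonal lives in a separate buffer), then 12 with ABOVE_DIAG, then 13, 14, ...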
template <class T, int blocksize>
__global__ void cuLUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
constexpr int scalarsInBlock = blocksize * blocksize;
if (reorderedIdx < rowsInLevelSet + startIdx) {
int naturalIdx = reorderedToNatural[reorderedIdx];
// for each element under the diagonal
int endOfRowILower = lowerRowIndices[reorderedIdx + 1];
int endOfRowIUpper = upperRowIndices[reorderedIdx + 1];
int startOfRowILower = lowerRowIndices[reorderedIdx];
int startOfRowIUpper = upperRowIndices[reorderedIdx];
for (int ij = startOfRowILower; ij < endOfRowILower; ++ij) {
// consider the block we are looking at to be A_ij
// we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
// already inverted. Find the A_jj block
int j = naturalToReordered[lowerColIndices[ij]];
int endOfRowJ = upperRowIndices[j + 1];
int startOfRowJ = upperRowIndices[j];
// the DUNE implementation is inplace, and I need to take care to make sure
// this out of place implementation is equivalent
mmOverlap<T, blocksize>(&reorderedLowerMat[ij * scalarsInBlock],
&diagonal[j * scalarsInBlock],
&reorderedLowerMat[ij * scalarsInBlock]);
// we have now accounted for an element under the diagonal and the diagonal element above it
// now iterate over the blocks in row i and row j that share a column index
int jk = startOfRowJ;
POSITION_TYPE ikState = POSITION_TYPE::UNDER_DIAG; // it is guaranteed that the ij element we consider
// is under the diagonal
int ik = ij;
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
// this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
// checking for ik != endOfRowIUpper is not sufficient alone because
// the same block index might be found in the lower part of the matrix
while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {
T* ikBlockPtr;
int ikColumn;
if (ikState == POSITION_TYPE::UNDER_DIAG) {
ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock];
ikColumn = lowerColIndices[ik];
} else if (ikState == POSITION_TYPE::ON_DIAG) {
ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock];
ikColumn = naturalIdx;
} else { // ikState == POSITION_TYPE::ABOVE_DIAG
ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock];
ikColumn = upperColIndices[ik];
}
if (ikColumn == upperColIndices[jk]) {
// A_jk = A_ij * A_jk
T tmp[scalarsInBlock] = {0};
mmNoOverlap<T, blocksize>(
&reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp);
matrixSubtraction<T, blocksize>(ikBlockPtr, tmp);
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
++jk;
} else {
if (ikColumn < upperColIndices[jk]) {
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
} else {
++jk;
}
}
}
}
invBlockInPlace<T, blocksize>(&diagonal[reorderedIdx * scalarsInBlock]);
}
}
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
int block = nnzIdx;
for (; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
for (int i = 0; i < blocksize; ++i) {
v[naturalRowIdx * blocksize + i] = rhs[i];
}
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(
T* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = v[naturalRowIdx * blocksize + i];
}
int block = nnzIdxLim - 1;
for (; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&mat[block * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
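// In the two apply kernels above, the lower solve is a forward substitution with an implicit unit diagonal
// (the loop stops before the diagonal block), while the upper solve is a backward substitution that ends by
// multiplying with the diagonal block, which the factorization kernels have already stored in inverted form.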
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
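// the matrix is strictly lower triangular, so every block in this row belongs to the
// forward substitution: rhs -= L_ij * v_j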
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
for (int i = 0; i < blocksize; ++i) {
v[naturalRowIdx * blocksize + i] = rhs[i];
}
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = v[naturalRowIdx * blocksize + i];
}
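// the matrix is strictly upper triangular, so subtract every block's contribution and
// finish by applying the inverted diagonal block stored in dInv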
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
} // namespace
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
LUFactorization(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
int* naturalToReordered,
int* reorderedToNatural,
size_t rowsInLevelSet,
int startIdx,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorization<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatural, rowsInLevelSet, startIdx);
}
template <class T, int blocksize>
void
LUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet);
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \
template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::ILU0

View File

@ -0,0 +1,193 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ILU0_KERNELS_HPP
#define OPM_ILU0_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::ILU0
{
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] v Will store the result of the upper solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param d Stores the defect
* @param [out] v Will store the result of the lower solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
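// The two level-set solves above are typically chained by the calling preconditioner: the lower
// solve walks the level sets in order, the upper solve in reverse order. A minimal sketch,
// assuming a caller-provided list of level-set sizes (levelSetSizes, reorderedMat, d and v are
// hypothetical placeholders, not part of this interface):
//
//   int levelStart = 0;
//   for (const int rowsInLevel : levelSetSizes) {
//       // rows inside one level set are independent and are solved in parallel on the GPU
//       solveLowerLevelSet<double, 3>(reorderedMat, rowIndices, colIndices, indexConversion,
//                                     levelStart, rowsInLevel, d, v, -1);
//       levelStart += rowsInLevel;
//   }
//   for (auto it = levelSetSizes.rbegin(); it != levelSetSizes.rend(); ++it) {
//       levelStart -= *it;
//       solveUpperLevelSet<double, 3>(reorderedMat, rowIndices, colIndices, indexConversion,
//                                     levelStart, *it, v, -1);
//   }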
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The inverted diagonal blocks of the ILU0 factorization
* @param [out] v Will store the result of the upper solve. On entry it should contain the result of the lower
* solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param d Stores the defect
* @param [out] v Will store the result of the lower solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedLowerMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 factorization of the input BCSR matrix, which is stored in a reordered way. The diagonal
* blocks store the inverse of the original diagonal blocks
* @param [out] reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param columnIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
* index in the reordered matrix
* @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows factorized in parallel by
* this function
* @param startIdx Index of the first row of the level set to be factorized
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void LUFactorization(T* reorderedMat,
int* rowIndices,
int* columnIndices,
int* naturalToReordered,
int* reorderedToNatural,
size_t rowsInLevelSet,
int startIdx,
int threadBlockSize);
/**
* @brief Computes the ILU0 factorization in-place of a BCSR matrix stored in a split format (lower, diagonal and upper
* triangular part)
* @param [out] reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular
* sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the BSR
* format
* @param lowerColIndices Pointer to vector on GPU containing column indices of the lower matrix compliant with the BSR
* format
* @param [out] reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular
* sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the BSR
* format
* @param upperColIndices Pointer to vector on GPU containing column indices of the upper matrix compliant with the BSR
* format
* @param [out] diagonal The inverted diagonal blocks of the ILU0 factorization
* @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
* index in the reordered matrix
* @param startIdx Index of the first row of the level set to be factorized
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows factorized in parallel by
* this function
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void LUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
int threadBlockSize);
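// A minimal sketch of how the split factorization is expected to be driven, one level set at a
// time in ascending order (illustrative only; levelSetSizes and the device pointers are
// hypothetical placeholders owned by the calling preconditioner):
//
//   int levelStart = 0;
//   for (const int rowsInLevel : levelSetSizes) {
//       LUFactorizationSplit<double, 3>(lowerMat, lowerRowIndices, lowerColIndices,
//                                       upperMat, upperRowIndices, upperColIndices,
//                                       diagonal, reorderedToNatural, naturalToReordered,
//                                       levelStart, rowsInLevel, -1);
//       levelStart += rowsInLevel;
//   }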
} // namespace Opm::cuistl::detail::ILU0
#endif

View File

@ -0,0 +1,88 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <stdexcept>
namespace Opm::cuistl::detail::JAC
{
namespace
{
template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// search along the row for the diagonal block; check the bound first so that we never read
// past the end of the row if the structural diagonal entry is missing
while (nnzIdx < nnzIdxLim && colIndices[nnzIdx] != row) {
++nnzIdx;
}
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
}
}
} // namespace
template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
if (blocksize <= 3) {
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuInvertDiagonalAndFlatten<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::JAC

View File

@ -0,0 +1,38 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_JAC_KERNELS_HPP
#define OPM_JAC_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::JAC
{
/**
* @brief Computes the inverse of each diagonal block of the matrix and stores the flattened result in vec
* @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param numberOfRows Integer describing the number of rows in the matrix
* @param[out] vec Pointer to the vector where the inverted diagonal blocks should be stored
*/
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);
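// A minimal usage sketch, mirroring how the unit tests exercise this function (B is a
// caller-provided block-sparse matrix and N its number of block rows; both are placeholders):
//
//   auto m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
//   Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
//   Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, blocksize>(
//       m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());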
} // namespace Opm::cuistl::detail::JAC
#endif

View File

@ -16,14 +16,15 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <stdexcept>
#include <config.h>
namespace Opm::cuistl::detail
{
@ -83,20 +84,8 @@ namespace
}
}
constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements)
{
return 1024;
}
inline size_t getBlocks(size_t numberOfElements)
{
const auto threads = getThreads(numberOfElements);
return (numberOfElements + threads - 1) / threads;
}
template <class T>
__global__ void
prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
__global__ void prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
@ -105,8 +94,7 @@ namespace
}
}
template <class T>
__global__ void
syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
__global__ void syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
@ -116,33 +104,13 @@ namespace
}
} // namespace
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k)
{
#if USE_HIP
return 512;
#else
int blockSize;
int tmpGridSize;
std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
return blockSize;
#endif
}
inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
template <class T>
void
setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
setVectorValueKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceData, numberOfElements, value);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
setVectorValueKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, value);
}
template void setVectorValue(double*, size_t, const double&);
@ -153,10 +121,9 @@ template <class T>
void
setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
setZeroAtIndexSetKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceData, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
setZeroAtIndexSetKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, indices);
}
template void setZeroAtIndexSet(double*, size_t, const int*);
template void setZeroAtIndexSet(float*, size_t, const int*);
@ -164,12 +131,16 @@ template void setZeroAtIndexSet(int*, size_t, const int*);
template <class T>
T
innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
innerProductAtIndices(cublasHandle_t cublasHandle,
const T* deviceA,
const T* deviceB,
T* buffer,
size_t numberOfElements,
const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
elementWiseMultiplyKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceA, deviceB, buffer, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);
// TODO: [perf] Get rid of the allocation here.
CuVector<T> oneVector(numberOfElements);
@ -184,11 +155,12 @@ template float innerProductAtIndices(cublasHandle_t, const float*, const float*,
template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int* buffer, size_t, const int*);
template <class T>
void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
void
prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
prepareSendBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
}
template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
@ -196,12 +168,13 @@ template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberO
template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
template <class T>
void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
void
syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
//cudaDeviceSynchronize(); // Not needed, I guess...
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
syncFromRecvBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
// cudaDeviceSynchronize(); // Not needed, I guess...
}
template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
template void syncFromRecvBuf(float* deviceA, float* buffer, size_t numberOfElements, const int* indices);
@ -217,30 +190,24 @@ weightedDiagMV(const T* squareBlockVector,
T* dstVec)
{
switch (blocksize) {
case 1:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 2:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 3:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 1: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 1>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
case 2: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 2>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
case 3: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 3>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
default:
OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
break;

View File

@ -28,6 +28,7 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
using NumericTypes = boost::mpl::list<double, float>;
@ -87,7 +88,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten<T, 3>(
Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
std::vector<T> expectedInvDiag {-1.0 / 4.0,
@ -161,7 +162,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten<T, 2>(
Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};