Use CUDA graphs in GPU DILU and ILU0 apply

Expected to give a performance gain on current
NVIDIA cards and no significant change on AMD cards
for cases of reasonable size.

Uses the new GPU convenience objects.
Tobias Meyer Andersen 2024-12-05 10:47:05 +01:00
parent 2babeb848d
commit 66fa971079
12 changed files with 417 additions and 149 deletions
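
For orientation, here is a minimal sketch (not part of the diff) of the capture/instantiate/launch pattern the apply() methods below use; enqueueAllKernels is a hypothetical stand-in for the preconditioner's sequence of small kernel launches, and error checking is omitted.

#include <cuda_runtime.h>

void enqueueAllKernels(cudaStream_t stream); // placeholder for the many small kernel launches

void applyViaGraph(cudaStream_t stream, cudaGraph_t& graph, cudaGraphExec_t& exec, bool& captured)
{
    if (!captured) {
        // Record the whole kernel sequence once on the dedicated stream
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
        enqueueAllKernels(stream);
        cudaStreamEndCapture(stream, &graph);
        cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
        captured = true;
    }
    // Replay the recorded sequence with a single launch, avoiding per-kernel launch overhead
    cudaGraphLaunch(exec, stream);
}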

View File

@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}
template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const

View File

@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>
namespace Opm::gpuistl

View File

@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>
namespace Opm::gpuistl
{
@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
}
}
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
}
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}
@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
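The apply() and update() hunks above fence the dedicated stream against the main (default) stream with two events. A minimal stand-alone sketch of that fencing, assuming raw CUDA handles rather than the GPUStream/GPUEvent wrappers (launchWorkOn is a placeholder):

#include <cuda_runtime.h>

void launchWorkOn(cudaStream_t stream); // placeholder for the preconditioner kernels

void fencedAgainstDefaultStream(cudaStream_t worker, cudaEvent_t before, cudaEvent_t after)
{
    // Worker stream starts only once the default stream has reached this point
    cudaEventRecord(before, 0);
    cudaStreamWaitEvent(worker, before, 0);

    launchWorkOn(worker);

    // Default stream resumes only after the worker stream has finished this work
    cudaEventRecord(after, worker);
    cudaStreamWaitEvent(0, after, 0);
}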

View File

@ -24,8 +24,10 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <vector>
#include <map>
#include <utility>
namespace Opm::gpuistl
@ -82,8 +84,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -153,6 +158,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the DILU operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl
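A note on the new members: a captured graph replays the exact device addresses it saw during capture, so the cache is keyed on the (v.data(), d.data()) pointer pair and a new graph is captured whenever a new pair appears. A small illustrative sketch of that lookup (not part of the diff), assuming field_type is double:

#include <cuda_runtime.h>
#include <map>
#include <utility>

using Key = std::pair<double*, const double*>;   // (output, input) device pointers
std::map<Key, cudaGraphExec_t> g_execCache;      // hypothetical stand-in for m_executableGraphs

bool replayIfCached(double* v, const double* d, cudaStream_t stream)
{
    auto it = g_execCache.find(Key{v, d});
    if (it == g_execCache.end()) {
        return false;                            // caller captures and instantiates a new graph
    }
    cudaGraphLaunch(it->second, stream);         // safe: same addresses as at capture time
    return true;
}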

View File

@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}
template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}
template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const
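One caveat for the new stream-taking overload: cudaMemcpyAsync only overlaps with other work when the host buffer is page-locked; from pageable memory the runtime stages the copy and it behaves much like a synchronous one. A minimal sketch with a pinned host buffer (illustrative, not part of the diff):

#include <cuda_runtime.h>

void uploadAsync(double* devicePtr, size_t n, cudaStream_t stream)
{
    double* hostPinned = nullptr;
    cudaMallocHost(reinterpret_cast<void**>(&hostPinned), n * sizeof(double)); // page-locked allocation enables true async copies
    // ... fill hostPinned on the CPU ...
    cudaMemcpyAsync(devicePtr, hostPinned, n * sizeof(double), cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);  // the buffer must stay valid until the copy completes
    cudaFreeHost(hostPinned);
}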

View File

@ -203,6 +203,7 @@ public:
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
/**
* @brief copyFromHost copies numberOfElements to the CPU memory dataPointer

View File

@ -37,6 +37,7 @@
#include <string>
#include <tuple>
#include <utility>
namespace Opm::gpuistl
{
@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
// TODO: Should in some way verify that this matrix is symmetric, only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
}
LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiagFloat.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
levelStartIdx,
numOfRowsInLevel,
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
}
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
}
template <class M, class X, class Y, int l>
@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
{
OPM_TIMEBLOCK(prec_update);
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
reorderAndSplitMatrix(moveThreadBlockSize);
LUFactorizeMatrix(factorizationThreadBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
m_gpuReorderedLU->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

View File

@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <optional>
#include <type_traits>
@ -84,8 +85,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void LUFactorizeMatrix(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -152,6 +156,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_ILU0FactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the ILU0 operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

View File

@ -85,10 +85,12 @@ namespace
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -137,10 +139,12 @@ namespace
LinearSolverScalar rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -211,8 +215,8 @@ namespace
}
}
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
// TOOD: The important part is to only cast after that is fully computed
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
// results. TODO: The important part is to only cast after that is fully computed
template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
__global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -239,7 +243,8 @@ namespace
InputScalar dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
dInvTmp[i * blocksize + j]
= srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
@ -257,21 +262,26 @@ namespace
if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
// TODO: think long and hard about whether this performs only the wanted memory transfers
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedLowerMat[block * blocksize * blocksize],
&dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
&dstUpperMat[symOppositeBlock * blocksize * blocksize]);
}
mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<InputScalar, blocksize>(dInvTmp);
moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
moveBlock<blocksize, InputScalar, OutputScalar>(
dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
}
}
}
@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>( \
T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, \
int*, \
int*, \
int*, \
int, \
int, \
const DiagonalScalar*, \
const LinearSolverScalar*, \
LinearSolverScalar*, \
int, \
cudaStream_t);
// TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
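The recurring change in these wrappers is the fourth launch-configuration argument: a kernel launched without it goes to the default stream and would not be recorded by a capture running on m_stream. A tiny illustrative sketch of a stream-targeted launch (not part of the diff):

#include <cuda_runtime.h>

__global__ void addOne(double* x)
{
    if (threadIdx.x == 0) {
        x[0] += 1.0;
    }
}

void launchOnStream(double* x, cudaStream_t stream)
{
    // <<<grid, block, sharedMemBytes, stream>>> -- the stream argument routes the launch
    addOne<<<1, 32, 0, stream>>>(x);
}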

View File

@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector

View File

@ -215,9 +215,9 @@ namespace
// as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
// and sometimes also the diagonal.
if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
// if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
// if not then we just use the double diagonal that is already now stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
// if we want to store the entire matrix as a float, then we must also move the diagonal block from
// double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
}
@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
INSTANTIATE_KERNEL_WRAPPERS(T, 6);
INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, float>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int); \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
\
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int); \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int);
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);

View File

@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal