Merge pull request #5852 from multitalentloes/cudagraph_experiment

Improve gpuistl using cudaGraphs
Kjetil Olsen Lye 2025-02-12 15:01:20 +01:00 committed by GitHub
commit b99b606575
12 changed files with 417 additions and 149 deletions

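The core of this change: the first time a preconditioner's apply() is called for a given pair of output/input device pointers, the whole sequence of small kernel launches is recorded into a CUDA graph on a dedicated stream, instantiated, and cached; later calls replay the cached executable graph with a single launch. A minimal, self-contained sketch of that pattern follows (smallKernel, CHECK and applyWithGraph are illustrative placeholders, not OPM code; the cudaGraphInstantiate call mirrors the five-argument form used in the diff below):

#include <cuda_runtime.h>
#include <cstdio>
#include <map>
#include <utility>

#define CHECK(call)                                                              \
    do {                                                                         \
        const cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                               \
            std::printf("CUDA error: %s\n", cudaGetErrorString(err_));           \
        }                                                                        \
    } while (false)

__global__ void smallKernel(double* v, const double* d, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        v[i] += d[i]; // stand-in for one level-set solve step
    }
}

using GraphKey = std::pair<double*, const double*>;

// Capture the many small launches once per (v, d) pointer pair, then replay them
// as a single graph launch on every later call.
void applyWithGraph(double* v, const double* d, int n, cudaStream_t stream,
                    std::map<GraphKey, cudaGraphExec_t>& cache)
{
    const GraphKey key{v, d};
    auto it = cache.find(key);
    if (it == cache.end()) {
        cudaGraph_t graph;
        CHECK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
        for (int level = 0; level < 8; ++level) { // stands in for the per-level-set launches
            smallKernel<<<(n + 255) / 256, 256, 0, stream>>>(v, d, n);
        }
        CHECK(cudaStreamEndCapture(stream, &graph));
        cudaGraphExec_t exec;
        CHECK(cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0));
        CHECK(cudaGraphDestroy(graph)); // the executable graph keeps what it needs
        it = cache.emplace(key, exec).first;
    }
    CHECK(cudaGraphLaunch(it->second, stream));
}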
View File

@ -188,6 +188,7 @@ GpuBuffer<T>::copyFromHost(const std::vector<T>& data)
{
copyFromHost(data.data(), data.size());
}
template <class T>
void
GpuBuffer<T>::copyToHost(std::vector<T>& data) const

View File

@ -27,6 +27,7 @@
#include <opm/simulators/linalg/gpuistl/GpuView.hpp>
#include <vector>
#include <string>
#include <cuda_runtime.h>
namespace Opm::gpuistl

View File

@ -37,6 +37,7 @@
#include <functional>
#include <utility>
#include <string>
namespace Opm::gpuistl
{
@ -93,7 +94,8 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int
}
}
computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -110,10 +112,31 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
}
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
@ -135,7 +158,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInvFloat->data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
@ -147,7 +171,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
@ -159,7 +184,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
@ -172,7 +198,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
m_gpuDInv.data(),
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -193,7 +220,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInvFloat->data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
@ -204,7 +232,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
} else {
detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@ -215,7 +244,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
@ -227,7 +257,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
numOfRowsInLevel,
m_gpuDInv.data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -252,7 +283,9 @@ GpuDILU<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
computeDiagonal(m_DILUFactorizationThreadBlockSize);
}
}
@ -260,13 +293,22 @@ template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationBlockSize)
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
computeDiagAndMoveReorderedData(moveThreadBlockSize, factorizationBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(moveThreadBlockSize);
computeDiagonal(factorizationBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationBlockSize)
GpuDILU<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -290,7 +332,12 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
m_gpuMatrixReordered->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
GpuDILU<M, X, Y, l>::computeDiagonal(int factorizationBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

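The apply() and update() hunks above fence the preconditioner's private stream against the legacy default stream with a pair of events: record m_before on stream 0, make m_stream wait on it, do the work, then record m_after on m_stream and make stream 0 wait on that. A standalone sketch of the same fencing (doWork and fencedLaunch are placeholders; the PR keeps the events as long-lived members rather than creating them per call):

#include <cuda_runtime.h>

__global__ void doWork(float* x, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0f;
    }
}

// Run doWork on a private stream while keeping it ordered with respect to
// everything already queued on the legacy default stream (stream 0).
void fencedLaunch(float* x, int n, cudaStream_t worker)
{
    cudaEvent_t before;
    cudaEvent_t after;
    cudaEventCreate(&before);
    cudaEventCreate(&after);

    // the worker stream must not start before stream 0 has reached this point
    cudaEventRecord(before, 0);
    cudaStreamWaitEvent(worker, before, 0);

    doWork<<<(n + 127) / 128, 128, 0, worker>>>(x, n);

    // stream 0 must not continue until the worker stream has finished this work
    cudaEventRecord(after, worker);
    cudaStreamWaitEvent(0, after, 0);

    cudaEventDestroy(before);
    cudaEventDestroy(after);
}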
View File

@ -24,8 +24,10 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <vector>
#include <map>
#include <utility>
namespace Opm::gpuistl
@ -82,8 +84,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute the diagonal of the DILU, and update the data of the reordered matrix
void computeDiagAndMoveReorderedData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void computeDiagonal(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -153,6 +158,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_DILUFactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the DILU operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

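GPUStream, GPUEvent, GPUGraph and GPUGraphExec come from the newly included gpu_resources.hpp. Judging only from the .get() calls in the hunks above, they act as RAII owners of the raw CUDA handles; the actual implementation is not shown in this diff, but an equivalent wrapper could look roughly like this (an assumption, not the OPM code):

#include <cuda_runtime.h>

// Assumed shape of an RAII stream owner: create on construction, destroy on
// destruction, hand out the raw handle (by reference) through get().
class GPUStreamSketch
{
public:
    GPUStreamSketch() { cudaStreamCreate(&m_stream); }
    ~GPUStreamSketch() { cudaStreamDestroy(m_stream); }
    GPUStreamSketch(const GPUStreamSketch&) = delete;
    GPUStreamSketch& operator=(const GPUStreamSketch&) = delete;

    cudaStream_t& get() { return m_stream; }

private:
    cudaStream_t m_stream{};
};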
View File

@ -266,6 +266,19 @@ GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements)
OPM_GPU_SAFE_CALL(cudaMemcpy(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice));
}
template <class T>
void
GpuVector<T>::copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream)
{
if (numberOfElements > dim()) {
OPM_THROW(std::runtime_error,
fmt::format("Requesting to copy too many elements. Vector has {} elements, while {} was requested.",
dim(),
numberOfElements));
}
OPM_GPU_SAFE_CALL(cudaMemcpyAsync(data(), dataPointer, numberOfElements * sizeof(T), cudaMemcpyHostToDevice, stream));
}
template <class T>
void
GpuVector<T>::copyToHost(T* dataPointer, size_t numberOfElements) const

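The new copyFromHost overload enqueues the host-to-device transfer on the caller's stream with cudaMemcpyAsync instead of the blocking cudaMemcpy, so it can be ordered (and captured) together with the kernels that consume the data. A hedged usage sketch (asyncUpload is a placeholder); note, as a general CUDA caveat rather than something stated in this diff, that the copy only overlaps with other GPU work when the host buffer is pinned via cudaMallocHost or cudaHostRegister:

#include <cuda_runtime.h>
#include <vector>

// Enqueue a host-to-device copy on 'stream'; it executes after work already
// queued on that stream and before anything enqueued on it later.
void asyncUpload(double* devicePtr, const std::vector<double>& host, cudaStream_t stream)
{
    cudaMemcpyAsync(devicePtr,
                    host.data(),
                    host.size() * sizeof(double),
                    cudaMemcpyHostToDevice,
                    stream);
    // The data is only guaranteed to be on the device once the stream has been
    // synchronized or later work on the same stream has observed it.
}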
View File

@ -203,6 +203,7 @@ public:
* @note assumes that this vector has numberOfElements elements
*/
void copyFromHost(const T* dataPointer, size_t numberOfElements);
void copyFromHost(const T* dataPointer, size_t numberOfElements, cudaStream_t stream);
/**
* @brief copyFromHost copies numberOfElements to the CPU memory dataPointer

View File

@ -37,6 +37,7 @@
#include <string>
#include <tuple>
#include <utility>
namespace Opm::gpuistl
{
@ -58,7 +59,6 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
, m_tuneThreadBlockSizes(tuneKernels)
, m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
{
// TODO: Should in some way verify that this matrix is symmetric, only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@ -98,7 +98,8 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
}
}
LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
@ -117,7 +118,29 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
const auto ptrs = std::make_pair(v.data(), d.data());
auto it = m_apply_graphs.find(ptrs);
if (it == m_apply_graphs.end()) {
OPM_GPU_SAFE_CALL(cudaStreamBeginCapture(m_stream.get(), cudaStreamCaptureModeGlobal));
// The apply function consists of many small function calls, each of which launches a kernel
apply(v, d, m_lowerSolveThreadBlockSize, m_upperSolveThreadBlockSize);
OPM_GPU_SAFE_CALL(cudaStreamEndCapture(m_stream.get(), &m_apply_graphs[ptrs].get()));
OPM_GPU_SAFE_CALL(cudaGraphInstantiate(&m_executableGraphs[ptrs].get(), m_apply_graphs[ptrs].get(), nullptr, nullptr, 0));
}
OPM_GPU_SAFE_CALL(cudaGraphLaunch(m_executableGraphs[ptrs].get(), 0));
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
@ -142,7 +165,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
@ -154,7 +178,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -165,7 +190,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
d.data(),
v.data(),
lowerSolveThreadBlockSize);
lowerSolveThreadBlockSize,
m_stream.get());
}
levelStartIdx += numOfRowsInLevel;
}
@ -185,7 +211,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiagFloat.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
@ -197,7 +224,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
else{
detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
@ -209,7 +237,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
@ -219,7 +248,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
levelStartIdx,
numOfRowsInLevel,
v.data(),
upperSolveThreadBlockSize);
upperSolveThreadBlockSize,
m_stream.get());
}
}
}
@ -241,10 +271,9 @@ template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
update(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
}
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix); // send updated matrix to the gpu
reorderAndSplitMatrix(m_moveThreadBlockSize);
LUFactorizeMatrix(m_ILU0FactorizationThreadBlockSize);
}
template <class M, class X, class Y, int l>
@ -253,13 +282,23 @@ OpmGpuILU0<M, X, Y, l>::update(int moveThreadBlockSize, int factorizationThreadB
{
OPM_TIMEBLOCK(prec_update);
{
// ensure that this stream only starts doing work when main stream is completed up to this point
OPM_GPU_SAFE_CALL(cudaEventRecord(m_before.get(), 0));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(m_stream.get(), m_before.get(), 0));
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData(moveThreadBlockSize, factorizationThreadBlockSize);
reorderAndSplitMatrix(moveThreadBlockSize);
LUFactorizeMatrix(factorizationThreadBlockSize);
// ensure that main stream only continues after this stream is completed
OPM_GPU_SAFE_CALL(cudaEventRecord(m_after.get(), m_stream.get()));
OPM_GPU_SAFE_CALL(cudaStreamWaitEvent(0, m_after.get(), 0));
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize)
OpmGpuILU0<M, X, Y, l>::reorderAndSplitMatrix(int moveThreadBlockSize)
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
@ -283,6 +322,12 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
m_gpuReorderedLU->N(),
moveThreadBlockSize);
}
}
template <class M, class X, class Y, int l>
void
OpmGpuILU0<M, X, Y, l>::LUFactorizeMatrix(int factorizationThreadBlockSize)
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();

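Every CUDA runtime call in these hunks is wrapped in OPM_GPU_SAFE_CALL. Its definition is not part of this diff; the usual shape of such a macro is to check the returned cudaError_t and fail loudly, roughly as in the hypothetical stand-in below (an assumption, not the actual OPM macro):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Hypothetical stand-in: evaluate a CUDA runtime call and throw on failure.
#define GPU_SAFE_CALL_SKETCH(call)                                               \
    do {                                                                          \
        const cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                                \
            throw std::runtime_error(std::string("CUDA call failed: ")            \
                                     + cudaGetErrorString(err_));                 \
        }                                                                         \
    } while (false)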
View File

@ -24,6 +24,7 @@
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
#include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
#include <opm/simulators/linalg/gpuistl/gpu_resources.hpp>
#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
#include <optional>
#include <type_traits>
@ -84,8 +85,11 @@ public:
//! \brief Updates the matrix data.
void update() final;
//! \brief perform matrix splitting and reordering
void reorderAndSplitMatrix(int moveThreadBlockSize);
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData(int moveThreadBlockSize, int factorizationThreadBlockSize);
void LUFactorizeMatrix(int factorizationThreadBlockSize);
//! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
void tuneThreadBlockSizes();
@ -152,6 +156,16 @@ private:
int m_lowerSolveThreadBlockSize = -1;
int m_moveThreadBlockSize = -1;
int m_ILU0FactorizationThreadBlockSize = -1;
// Graphs for Apply
std::map<std::pair<field_type*, const field_type*>, GPUGraph> m_apply_graphs;
std::map<std::pair<field_type*, const field_type*>, GPUGraphExec> m_executableGraphs;
// Stream for the ILU0 operations on the GPU
GPUStream m_stream{};
// Events for synchronization with main stream
GPUEvent m_before{};
GPUEvent m_after{};
};
} // end namespace Opm::gpuistl

View File

@ -85,10 +85,12 @@ namespace
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -137,10 +139,12 @@ namespace
LinearSolverScalar rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
@ -211,8 +215,8 @@ namespace
}
}
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
// TOOD: The important part is to only cast after that is fully computed
// TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate
// results. TODO: The important part is to only cast after that is fully computed
template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
__global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
@ -239,7 +243,8 @@ namespace
InputScalar dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
dInvTmp[i * blocksize + j]
= srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
@ -257,21 +262,26 @@ namespace
if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
// TODO: think long and hard about whether this performs only the wanted memory transfers
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedLowerMat[block * blocksize * blocksize],
&dstLowerMat[block * blocksize * blocksize]);
moveBlock<blocksize, InputScalar, OutputScalar>(
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
&dstUpperMat[symOppositeBlock * blocksize * blocksize]);
}
mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
&dInv[col * blocksize * blocksize],
&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<InputScalar, blocksize>(dInvTmp);
moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
moveBlock<blocksize, InputScalar, OutputScalar>(
dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
}
}
}
@ -289,12 +299,13 @@ solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
@ -310,13 +321,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
@ -329,12 +342,13 @@ solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
@ -348,13 +362,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -409,43 +425,134 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>
<<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
lowerRowIndices,
lowerColIndices,
srcReorderedUpperMat,
upperRowIndices,
upperColIndices,
srcDiagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv,
dstDiag,
dstLowerMat,
dstUpperMat);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// TODO: format
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
float*, \
float*, \
float*, \
int); \
template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const T*, \
int*, \
int*, \
const int, \
int, \
T*, \
double*, \
double*, \
double*, \
int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>( \
T*, int*, int*, int*, int, int, const T*, const T*, T*, int, cudaStream_t);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -460,19 +567,29 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar) \
template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int, cudaStream_t); \
template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>( \
MatrixScalar*, \
int*, \
int*, \
int*, \
int, \
int, \
const DiagonalScalar*, \
const LinearSolverScalar*, \
LinearSolverScalar*, \
int, \
cudaStream_t);
// TODO: be smarter about this... Surely this instantiates many more combinations than are actually needed
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);

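Every kernel wrapper in this file gained a trailing cudaStream_t parameter and forwards it as the fourth launch-configuration argument. That is what makes the graph capture in apply() possible: while a stream is being captured, the work has to be enqueued on that capturing stream to be recorded (and touching the legacy default stream during a global-mode capture is invalid), so the stream must be threaded down to every <<<...>>> launch. A minimal sketch of the launch form (axpy and launchAxpy are placeholders):

#include <cuda_runtime.h>

__global__ void axpy(double a, const double* x, double* y, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] += a * x[i];
    }
}

void launchAxpy(double a, const double* x, double* y, int n, cudaStream_t stream)
{
    const int threadBlockSize = 256;
    const int nThreadBlocks = (n + threadBlockSize - 1) / threadBlockSize;
    // <<<grid, block, dynamic shared memory, stream>>>: the fourth argument
    // selects the stream the kernel is enqueued on (0 bytes of shared memory here).
    axpy<<<nThreadBlocks, threadBlockSize, 0, stream>>>(a, x, y, n);
}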
View File

@ -54,7 +54,8 @@ void solveLowerLevelSet(T* reorderedMat,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -82,7 +83,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
const DiagonalScalar* dInv,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -108,7 +110,8 @@ void solveUpperLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -134,7 +137,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector

View File

@ -215,9 +215,9 @@ namespace
// as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
// and sometimes also the diagonal.
if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
// if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
// if not then we just use the double diagonal that is already now stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
// If we want to store the entire matrix as a float then we must also move the diagonal block from
// double to float; if not, we just use the double diagonal that is already stored in srcDiagonal
if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
&dstDiagonal[reorderedIdx * scalarsInBlock]);
}
@ -362,12 +362,13 @@ solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
@ -380,12 +381,13 @@ solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize
= ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
@ -399,13 +401,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
@ -418,13 +422,15 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int thrBlockSize)
int thrBlockSize,
cudaStream_t stream)
{
int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>
<<<nThreadBlocks, threadBlockSize, 0, stream>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
@ -484,28 +490,28 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int, cudaStream_t); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int, cudaStream_t); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int); \
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>( \
T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
INSTANTIATE_KERNEL_WRAPPERS(T, 6);
INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
@ -514,32 +520,32 @@ INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)
#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize) \
/* double preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, double, float>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveLowerLevelSetSplit<blocksize, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int); \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t); \
\
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, double>( \
double*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, double>( \
float*, int*, int*, int*, int, int, const double*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int); \
template void solveUpperLevelSetSplit<blocksize, float, float, double>( \
float*, int*, int*, int*, int, int, const double*, float*, int, cudaStream_t); \
/* double preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, double, float>( \
double*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float matrix, double compute preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int); \
template void solveUpperLevelSetSplit<blocksize, double, float, float>( \
float*, int*, int*, int*, int, int, const float*, double*, int, cudaStream_t); \
/* float preconditioner */ \
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int);
template void solveUpperLevelSetSplit<blocksize, float, float, float>( \
float*, int*, int*, int*, int, int, const float*, float*, int, cudaStream_t);
INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);

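The INSTANTIATE_* macros in this file exist because the kernel wrappers are function templates defined in a .cu translation unit: explicit instantiation emits the concrete (scalar type, block size, storage scheme) combinations so that other translation units can link against the declarations in the header. Stripped of the CUDA specifics, the mechanism is just explicit template instantiation (scaleVector is an illustrative placeholder):

// In the header: only the declaration is visible to other translation units.
template <class T>
void scaleVector(T* x, int n, T factor);

// In one .cu/.cpp file: the definition plus the instantiations we want emitted.
template <class T>
void scaleVector(T* x, int n, T factor)
{
    for (int i = 0; i < n; ++i) {
        x[i] *= factor;
    }
}

template void scaleVector<float>(float*, int, float);
template void scaleVector<double>(double*, int, double);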
View File

@ -46,7 +46,8 @@ void solveUpperLevelSet(T* reorderedMat,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -72,7 +73,8 @@ void solveLowerLevelSet(T* reorderedMat,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
@ -99,7 +101,8 @@ void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int rowsInLevelSet,
const DiagonalScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
@ -127,7 +130,8 @@ void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
int threadBlockSize,
cudaStream_t stream);
/**
* @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal