working MP for gpu DILU

2025-02-25 18:55:30 -06:00 · 2024-10-15 13:22:55 +02:00 · 2024-10-15 13:22:55 +02:00 · 3d67191d49
commit 3d67191d49
parent 1d49eadd15
5 changed files with 231 additions and 108 deletions
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@ -83,10 +83,13 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
    }

    if (m_storeFactorizationAsFloat) {
-        OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
+        if (!m_splitMatrix){
+            OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
+        }
        m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
        m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
        m_gpuMatrixReorderedDiagFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
+        m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
    }

    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
@ -120,17 +123,31 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        if (m_splitMatrix) {
-            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
-                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                m_gpuMatrixReorderedLower->getRowIndices().data(),
-                m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                m_gpuReorderToNatural.data(),
-                levelStartIdx,
-                numOfRowsInLevel,
-                m_gpuDInv.data(),
-                d.data(),
-                v.data(),
-                lowerSolveThreadBlockSize);
+            if (m_storeFactorizationAsFloat) {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInvFloat->data(),
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
+            } else {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
+            }
        } else {
            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
                m_gpuMatrixReordered->getNonZeroValues().data(),
@ -153,16 +170,29 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
        const int numOfRowsInLevel = m_levelSets[level].size();
        levelStartIdx -= numOfRowsInLevel;
        if (m_splitMatrix) {
-            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
-                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                m_gpuReorderToNatural.data(),
-                levelStartIdx,
-                numOfRowsInLevel,
-                m_gpuDInv.data(),
-                v.data(),
-                upperSolveThreadBlockSize);
+            if (m_storeFactorizationAsFloat){
+                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInvFloat->data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
+            } else {
+                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
+            }
        } else {
            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
                m_gpuMatrixReordered->getNonZeroValues().data(),
@ -241,20 +271,44 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        if (m_splitMatrix) {
-            detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
-                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                m_gpuMatrixReorderedLower->getRowIndices().data(),
-                m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                m_gpuMatrixReorderedDiag->data(),
-                m_gpuReorderToNatural.data(),
-                m_gpuNaturalToReorder.data(),
-                levelStartIdx,
-                numOfRowsInLevel,
-                m_gpuDInv.data(),
-                factorizationBlockSize);
+            if (m_storeFactorizationAsFloat) {
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, true>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag->data(),
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    m_gpuDInvFloat->data(),
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    factorizationBlockSize);
+            } else {
+                // TODO: should this be field type twice or field type then float in the template?
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, false>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag->data(),
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    factorizationBlockSize);
+            }
        } else {
            detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
                m_gpuMatrixReordered->getNonZeroValues().data(),
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@ -133,6 +133,7 @@ private:
    std::unique_ptr<FloatMat> m_gpuMatrixReorderedLowerFloat;
    std::unique_ptr<FloatMat> m_gpuMatrixReorderedUpperFloat;
    std::unique_ptr<FloatVec> m_gpuMatrixReorderedDiagFloat;
+    std::unique_ptr<FloatVec> m_gpuDInvFloat;
    //! row conversion from natural to reordered matrix indices stored on the GPU
    GpuVector<int> m_gpuNaturalToReorder;
    //! row conversion from reordered to natural matrix indices stored on the GPU
--- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
@ -159,7 +159,7 @@ mmv(const T* a, const T* b, T* c)
 // dst -= A*B*C
 template <class T, int blocksize>
 __device__ __forceinline__ void
-mmx2Subtraction(T* A, T* B, T* C, T* dst)
+mmx2Subtraction(const T* A, const T* B, const T* C, T* dst)
 {

    T tmp[blocksize * blocksize] = {0};
@ -256,6 +256,19 @@ mvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
    }
 }

+// TODO: consider merging with existing block operations
+// mixed precision general version of c += Ab
+template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
+__device__ __forceinline__ void
+umvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        for (int j = 0; j < blocksize; ++j) {
+            c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j]));
+        }
+    }
+}
+
 // TODO: consider merging with existing block operations
 // Mixed precision general version of c -= Ab
 template <int blocksize, class MatrixScalar, class VectorScalar, class ResultScalar, class ComputeScalar>
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@ -59,16 +59,16 @@ namespace
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuSolveLowerLevelSetSplit(T* mat,
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const T* dInv,
-                                              const T* d,
-                                              T* v)
+                                              const MatrixScalar* dInv,
+                                              const LinearSolverScalar* d,
+                                              LinearSolverScalar* v)
    {
        const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
        if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@ -77,7 +77,7 @@ namespace
            const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
            const int naturalRowIdx = indexConversion[reorderedRowIdx];

-            T rhs[blocksize];
+            LinearSolverScalar rhs[blocksize];
            for (int i = 0; i < blocksize; i++) {
                rhs[i] = d[naturalRowIdx * blocksize + i];
            }
@ -85,10 +85,10 @@ namespace
            // TODO: removce the first condition in the for loop
            for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                const int col = colIndices[block];
-                mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }

@ -118,15 +118,15 @@ namespace
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuSolveUpperLevelSetSplit(T* mat,
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const T* dInv,
-                                              T* v)
+                                              const MatrixScalar* dInv,
+                                              LinearSolverScalar* v)
    {
        const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
        if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@ -134,13 +134,13 @@ namespace
            const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
            const int naturalRowIdx = indexConversion[reorderedRowIdx];

-            T rhs[blocksize] = {0};
+            LinearSolverScalar rhs[blocksize] = {0};
            for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                const int col = colIndices[block];
-                umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+                umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }

@ -211,19 +211,24 @@ namespace
        }
    }

-    template <class T, int blocksize>
-    __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
+    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
+    // TOOD: The important part is to only cast after that is fully computed
+    template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+    __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                               int* lowerRowIndices,
                                               int* lowerColIndices,
-                                               T* reorderedUpperMat,
+                                               const InputScalar* srcReorderedUpperMat,
                                               int* upperRowIndices,
                                               int* upperColIndices,
-                                               T* diagonal,
+                                               const InputScalar* srcDiagonal,
                                               int* reorderedToNatural,
                                               int* naturalToReordered,
                                               const int startIdx,
                                               int rowsInLevelSet,
-                                               T* dInv)
+                                               InputScalar* dInv,
+                                               OutputScalar* dstDiag, // TODO: should this be diag or dInv?
+                                               OutputScalar* dstLowerMat,
+                                               OutputScalar* dstUpperMat)
    {
        const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
        if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@ -231,10 +236,10 @@ namespace
            const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
            const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

-            T dInvTmp[blocksize * blocksize];
+            InputScalar dInvTmp[blocksize * blocksize];
            for (int i = 0; i < blocksize; ++i) {
                for (int j = 0; j < blocksize; ++j) {
-                    dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
+                    dInvTmp[i * blocksize + j] = srcDiagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
                }
            }

@ -250,18 +255,28 @@ namespace

                const int symOppositeBlock = symOppositeIdx;

-                mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
+                if constexpr (copyResultToOtherMatrix) {
+                    // TODO: think long and hard about whether this performs only the wanted memory transfers
+                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
+                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
+                }
+
+                mmx2Subtraction<InputScalar, blocksize>(&srcReorderedLowerMat[block * blocksize * blocksize],
                                              &dInv[col * blocksize * blocksize],
-                                              &reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
+                                              &srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                              dInvTmp);
            }

-            invBlockInPlace<T, blocksize>(dInvTmp);
+            invBlockInPlace<InputScalar, blocksize>(dInvTmp);

-            for (int i = 0; i < blocksize; ++i) {
-                for (int j = 0; j < blocksize; ++j) {
-                    dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
-                }
+            // for (int i = 0; i < blocksize; ++i) {
+            //     for (int j = 0; j < blocksize; ++j) {
+            //         dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
+            //     }
+            // }
+            moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
+            if constexpr (copyResultToOtherMatrix) {
+                moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
            }
        }
    }
@ -289,23 +304,23 @@ solveLowerLevelSet(T* reorderedMat,
 }


-template <class T, int blocksize>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
 void
-solveLowerLevelSetSplit(T* reorderedMat,
+solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const T* dInv,
-                        const T* d,
-                        T* v,
+                        const MatrixScalar* dInv,
+                        const LinearSolverScalar* d,
+                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
+        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
@ -328,22 +343,22 @@ solveUpperLevelSet(T* reorderedMat,
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

-template <class T, int blocksize>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
 void
-solveUpperLevelSetSplit(T* reorderedMat,
+solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const T* dInv,
-                        T* v,
+                        const MatrixScalar* dInv,
+                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
+        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@ -376,51 +391,62 @@ computeDiluDiagonal(T* reorderedMat,
    }
 }

-template <class T, int blocksize>
+template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
 void
-computeDiluDiagonalSplit(T* reorderedLowerMat,
+computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                         int* lowerRowIndices,
                         int* lowerColIndices,
-                         T* reorderedUpperMat,
+                         const InputScalar* srcReorderedUpperMat,
                         int* upperRowIndices,
                         int* upperColIndices,
-                         T* diagonal,
+                         const InputScalar* srcDiagonal,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         const int startIdx,
                         int rowsInLevelSet,
-                         T* dInv,
+                         InputScalar* dInv,
+                         OutputScalar* dstDiag,
+                         OutputScalar* dstLowerMat,
+                         OutputScalar* dstUpperMat,
                         int thrBlockSize)
 {
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-            cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
+            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
+        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
-                                                                                     reorderedUpperMat,
+                                                                                     srcReorderedUpperMat,
                                                                                     upperRowIndices,
                                                                                     upperColIndices,
-                                                                                     diagonal,
+                                                                                     srcDiagonal,
                                                                                     reorderedToNatural,
                                                                                     naturalToReordered,
                                                                                     startIdx,
                                                                                     rowsInLevelSet,
-                                                                                     dInv);
+                                                                                     dInv,
+                                                                                     dstDiag,
+                                                                                     dstLowerMat,
+                                                                                     dstUpperMat);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
 }

+// TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                      \
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);              \
-    template void computeDiluDiagonalSplit<T, blocksize>(                                                              \
-        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, double, false>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, float, false>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, float, true>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, double, true>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int);                                      \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);                 \
-    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);       \
-    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);            \
-    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
+    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -434,4 +460,30 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
+
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar)                                 \
+    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>(                                \
+        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int);                     \
+    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>(                                \
+        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float);
+
 } // namespace Opm::gpuistl::detail::DILU
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
@ -71,16 +71,16 @@ void solveLowerLevelSet(T* reorderedMat,
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
-template <class T, int blocksize>
-void solveLowerLevelSetSplit(T* reorderedUpperMat,
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const T* dInv,
-                             const T* d,
-                             T* v,
+                             const MatrixScalar* dInv,
+                             const LinearSolverScalar* d,
+                             LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@ -124,15 +124,15 @@ void solveUpperLevelSet(T* reorderedMat,
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
-template <class T, int blocksize>
-void solveUpperLevelSetSplit(T* reorderedUpperMat,
+template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const T* dInv,
-                             T* v,
+                             const MatrixScalar* dInv,
+                             LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@ -161,7 +161,6 @@ void computeDiluDiagonal(T* reorderedMat,
                         int rowsInLevelSet,
                         T* dInv,
                         int threadBlockSize);
-template <class T, int blocksize>

 /**
 * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
@ -184,18 +183,22 @@ template <class T, int blocksize>
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
-void computeDiluDiagonalSplit(T* reorderedLowerMat,
+template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
-                              T* reorderedUpperMat,
+                              const InputScalar* srcReorderedUpperMat,
                              int* upperRowIndices,
                              int* upperColIndices,
-                              T* diagonal,
+                              const InputScalar* srcDiagonal,
                              int* reorderedToNatural,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
-                              T* dInv,
+                              InputScalar* dInv,
+                              OutputScalar* dstDiagonal,
+                              OutputScalar* dstLowerMat,
+                              OutputScalar* dstUpperMat,
                              int threadBlockSize);

 } // namespace Opm::gpuistl::detail::DILU