From 55c20dbddd7497aa6286a8420923c37f5cb94547 Mon Sep 17 00:00:00 2001 From: Tobias Meyer Andersen Date: Mon, 30 Sep 2024 12:03:01 +0200 Subject: [PATCH] Implement mixed precision GPU ILU0 --- .../linalg/PreconditionerFactory_impl.hpp | 8 +- .../linalg/gpuistl/GpuSparseMatrix.cpp | 15 ++ .../linalg/gpuistl/GpuSparseMatrix.hpp | 14 ++ opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp | 92 +++++++--- opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp | 21 ++- .../gpuistl/detail/deviceBlockOperations.hpp | 52 +++++- .../preconditionerKernels/ILU0Kernels.cu | 163 ++++++++++++------ .../preconditionerKernels/ILU0Kernels.hpp | 29 ++-- 8 files changed, 291 insertions(+), 103 deletions(-) diff --git a/opm/simulators/linalg/PreconditionerFactory_impl.hpp b/opm/simulators/linalg/PreconditionerFactory_impl.hpp index 0f368cd80..2f58a5309 100644 --- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp +++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp @@ -361,9 +361,10 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t, const C& comm) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); using field_type = typename V::field_type; using OpmGpuILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels); + auto gpuilu0 = std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float); auto adapted = std::make_shared>(gpuilu0); auto wrapped = std::make_shared>(adapted, comm); @@ -619,10 +620,12 @@ struct StandardPreconditioners { F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + const bool store_factorization_as_float = prm.get("store_factorization_as_float", false); + using field_type = typename V::field_type; using GPUILU0 = typename gpuistl::OpmGpuILU0, gpuistl::GpuVector>; - return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels)); + return std::make_shared>(std::make_shared(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float)); }); F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { @@ -636,6 +639,7 @@ struct StandardPreconditioners { F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function&, std::size_t) { const bool split_matrix = prm.get("split_matrix", true); const bool tune_gpu_kernels = prm.get("tune_gpu_kernels", true); + using block_type = typename V::block_type; using VTo = Dune::BlockVector>; using matrix_type_to = typename Dune::BCRSMatrix>; diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp index 3928784ca..19687b6cc 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.cpp @@ -81,6 +81,21 @@ GpuSparseMatrix::GpuSparseMatrix(const T* nonZeroElements, } } +template +GpuSparseMatrix::GpuSparseMatrix(const GpuVector rowIndices, + const GpuVector columnIndices, + size_t blockSize) + : m_nonZeroElements(columnIndices.dim() * blockSize * blockSize) + , m_columnIndices(columnIndices) + , 
m_rowIndices(rowIndices) + , m_numberOfNonzeroBlocks(detail::to_int(columnIndices.dim())) + , m_numberOfRows(detail::to_int(rowIndices.dim()-1)) + , m_blockSize(detail::to_int(blockSize)) + , m_matrixDescription(detail::createMatrixDescription()) + , m_cusparseHandle(detail::CuSparseHandle::getInstance()) +{ +} + template GpuSparseMatrix::~GpuSparseMatrix() { diff --git a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp index d3d6dcb66..2a2141a0f 100644 --- a/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp +++ b/opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp @@ -67,6 +67,20 @@ public: size_t blockSize, size_t numberOfRows); + //! Create a sparse matrix by copying the sparsity structure of another matrix, not filling in the values + //! + //! \note Prefer to use the constructor taking a const reference to a matrix instead. + //! + //! \param[in] rowIndices the row indices of the non-zero elements + //! \param[in] columnIndices the column indices of the non-zero elements + //! \param[in] blockSize size of each block matrix (typically 3) + //! + //! \note We assume numberOfNonzeroBlocks, blockSize and numberOfRows all are representable as int due to + //! restrictions in the current version of cusparse. This might change in future versions. + GpuSparseMatrix(const GpuVector rowIndices, + const GpuVector columnIndices, + size_t blockSize); + /** * We don't want to be able to copy this for now (too much hassle in copying the cusparse resources) */ diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp index 6bd008864..3782ff49b 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp @@ -41,7 +41,7 @@ namespace Opm::gpuistl { template -OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels) +OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat) : m_cpuMatrix(A) , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER)) , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets)) @@ -49,11 +49,14 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel , m_gpuMatrix(GpuSparseMatrix::fromMatrix(m_cpuMatrix, true)) , m_gpuMatrixReorderedLower(nullptr) , m_gpuMatrixReorderedUpper(nullptr) + , m_gpuMatrixReorderedLowerFloat(nullptr) + , m_gpuMatrixReorderedUpperFloat(nullptr) , m_gpuNaturalToReorder(m_naturalToReordered) , m_gpuReorderToNatural(m_reorderedToNatural) , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()) , m_splitMatrix(splitMatrix) , m_tuneThreadBlockSizes(tuneKernels) + , m_storeFactorizationAsFloat(storeFactorizationAsFloat) { // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode? // Some sanity check @@ -71,6 +74,7 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. 
", m_gpuMatrix.nonzeroes(), A.nonzeroes())); + if (m_splitMatrix) { m_gpuMatrixReorderedDiag.emplace(GpuVector(blocksize_ * blocksize_ * m_cpuMatrix.N())); std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper) @@ -80,6 +84,15 @@ OpmGpuILU0::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel m_gpuReorderedLU = detail::createReorderedMatrix>( m_cpuMatrix, m_reorderedToNatural); } + + if (m_storeFactorizationAsFloat){ + assert(m_splitMatrix && "Mixed precision GpuILU0 is currently only supported when using split_matrix=true"); + // initialize mixed precision datastructures + m_gpuMatrixReorderedLowerFloat = std::unique_ptr(new auto(FloatMat(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_))); + m_gpuMatrixReorderedUpperFloat = std::unique_ptr(new auto(FloatMat(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_))); + m_gpuMatrixReorderedDiagFloat.emplace(GpuVector(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())); + } + LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize); if (m_tuneThreadBlockSizes) { @@ -107,20 +120,37 @@ template void OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int upperSolveThreadBlockSize) { + // perform a lower solve and then an upper solve to apply the approximate inverse using ILU factorization + // for the lower and upper solve we have some if's that determine which underlying implementation to use + int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::ILU0::solveLowerLevelSetSplit( - m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - d.data(), - v.data(), - lowerSolveThreadBlockSize); + if (m_storeFactorizationAsFloat){ + detail::ILU0::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedLowerFloat->getRowIndices().data(), + m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } + else{ + detail::ILU0::solveLowerLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + d.data(), + v.data(), + lowerSolveThreadBlockSize); + } } else { detail::ILU0::solveLowerLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), m_gpuReorderedLU->getRowIndices().data(), @@ -140,16 +170,30 @@ OpmGpuILU0::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; if (m_splitMatrix) { - detail::ILU0::solveUpperLevelSetSplit( - m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuMatrixReorderedDiag.value().data(), - v.data(), - upperSolveThreadBlockSize); + if (m_storeFactorizationAsFloat) { + detail::ILU0::solveUpperLevelSetSplit( + 
m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(), + m_gpuMatrixReorderedUpperFloat->getRowIndices().data(), + m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiagFloat.value().data(), + v.data(), + upperSolveThreadBlockSize); + } + else{ + detail::ILU0::solveUpperLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuMatrixReorderedDiag.value().data(), + v.data(), + upperSolveThreadBlockSize); + } } else { detail::ILU0::solveUpperLevelSet(m_gpuReorderedLU->getNonZeroValues().data(), m_gpuReorderedLU->getRowIndices().data(), @@ -227,7 +271,7 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact const int numOfRowsInLevel = m_levelSets[level].size(); if (m_splitMatrix) { - detail::ILU0::LUFactorizationSplit( + detail::ILU0::LUFactorizationSplit( m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(), @@ -235,10 +279,14 @@ OpmGpuILU0::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(), m_gpuMatrixReorderedDiag.value().data(), + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data() : nullptr, + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data() : nullptr, + m_storeFactorizationAsFloat ? m_gpuMatrixReorderedDiagFloat.value().data() : nullptr, m_gpuReorderToNatural.data(), m_gpuNaturalToReorder.data(), levelStartIdx, numOfRowsInLevel, + m_storeFactorizationAsFloat, factorizationThreadBlockSize); } else { diff --git a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp index ec29fb566..025be1f5b 100644 --- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp +++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp @@ -54,7 +54,9 @@ public: //! \brief The field type of the preconditioner. using field_type = typename X::field_type; //! \brief The GPU matrix type - using CuMat = GpuSparseMatrix; + using GpuMat = GpuSparseMatrix; + //! \brief The Float matrix type for mixed precision + using FloatMat = GpuSparseMatrix; //! \brief Constructor. //! @@ -62,7 +64,7 @@ public: //! \param A The matrix to operate on. //! \param w The relaxation factor. //! - explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels); + explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat); //! \brief Prepare the preconditioner. //! \note Does nothing at the time being. @@ -120,11 +122,15 @@ private: //! \brief converts from index in natural ordered structure to index reordered strucutre std::vector m_naturalToReordered; //! \brief The A matrix stored on the gpu, and its reordred version - CuMat m_gpuMatrix; - std::unique_ptr m_gpuReorderedLU; + GpuMat m_gpuMatrix; + std::unique_ptr m_gpuReorderedLU; //! \brief If matrix splitting is enabled, then we store the lower and upper part separately - std::unique_ptr m_gpuMatrixReorderedLower; - std::unique_ptr m_gpuMatrixReorderedUpper; + std::unique_ptr m_gpuMatrixReorderedLower; + std::unique_ptr m_gpuMatrixReorderedUpper; + //! 
\brief If mixed precision is enabled, store a single precision copy of the factorization
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedLowerFloat;
+    std::unique_ptr<FloatMat> m_gpuMatrixReorderedUpperFloat;
+    std::optional<GpuVector<float>> m_gpuMatrixReorderedDiagFloat;
     //! \brief If matrix splitting is enabled, we also store the diagonal separately
     std::optional<GpuVector<field_type>> m_gpuMatrixReorderedDiag;
     //! row conversion from natural to reordered matrix indices stored on the GPU
@@ -137,6 +143,9 @@
     bool m_splitMatrix;
     //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
     bool m_tuneThreadBlockSizes;
+    //! \brief Bool storing whether or not we should store the ILU factorization in a float data structure.
+    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
+    bool m_storeFactorizationAsFloat;
     //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
     //! The default value of -1 indicates that we have not calibrated and selected a value yet
     int m_upperSolveThreadBlockSize = -1;
diff --git a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
index b3475591a..d590ca365 100644
--- a/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/deviceBlockOperations.hpp
@@ -37,9 +37,9 @@ __device__ __forceinline__ void
 invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
 {
     if (blocksize == 1) {
-        dstBlock[0] = 1.0 / (srcBlock[0]);
+        dstBlock[0] = T(1.0) / (srcBlock[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
+        T detInv = T(1.0) / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
         dstBlock[0] = srcBlock[3] * detInv;
         dstBlock[1] = -srcBlock[1] * detInv;
         dstBlock[2] = -srcBlock[2] * detInv;
@@ -53,7 +53,7 @@ invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
         T t12 = srcBlock[1] * srcBlock[6];
         T t14 = srcBlock[2] * srcBlock[6];

-        T t17 = 1.0
+        T t17 = T(1.0)
             / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
                - t14 * srcBlock[4]); // t17 is 1/determinant
@@ -75,9 +75,9 @@ __device__ __forceinline__ void
 invBlockInPlace(T* __restrict__ block)
 {
     if (blocksize == 1) {
-        block[0] = 1.0 / (block[0]);
+        block[0] = T(1.0) / (block[0]);
     } else if (blocksize == 2) {
-        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
+        T detInv = T(1.0) / (block[0] * block[3] - block[1] * block[2]);

         T temp = block[0];
         block[0] = block[3] * detInv;
@@ -93,7 +93,7 @@ invBlockInPlace(T* __restrict__ block)
         T t14 = block[2] * block[6];

         T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
-        T t17 = 1.0 / det;
+        T t17 = T(1.0) / det;

         T matrix01 = block[1];
         T matrix00 = block[0];
@@ -229,6 +229,46 @@ matrixSubtraction(T* A, T* B)
         }
     }
 }
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template
+__device__ __forceinline__ void
+moveBlock(const ScalarInputType* __restrict__ A, ScalarOutputType* __restrict__ B)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        for (int j = 0; j < blocksize; ++j) {
+            B[i * blocksize + j] = ScalarOutputType(A[i * blocksize + j]);
+        }
+    }
+}
+
+// TODO: possibly place somewhere else in this file
+// TODO: consider merging with existing block operations
+template
+__device__ __forceinline__ void
+mvMixedGeneral(const MatrixScalar* A,
const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + c[i] = 0; + + for (int j = 0; j < blocksize; ++j) { + c[i] += ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} + +// TODO: possibly place somewhere else in this file +// TODO: consider merging with existing block operations +template +__device__ __forceinline__ void +mmvMixedGeneral(const MatrixScalar* A, const VectorScalar* b, ResultScalar* c) +{ + for (int i = 0; i < blocksize; ++i) { + for (int j = 0; j < blocksize; ++j) { + c[i] -= ResultScalar(ComputeScalar(A[i * blocksize + j]) * ComputeScalar(b[j])); + } + } +} } // namespace #endif diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu index e58fbb18e..8d1dd642f 100644 --- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu +++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu @@ -120,17 +120,21 @@ namespace } } - template - __global__ void cuLUFactorizationSplit(T* reorderedLowerMat, + template + __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + InputScalar* srcReorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + InputScalar* srcDiagonal, + OutputScalar* dstReorderedLowerMat, + OutputScalar* dstReorderedUpperMat, + OutputScalar* dstDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, + const bool copyResultToOtherMatrix, int rowsInLevelSet) { auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; @@ -155,9 +159,12 @@ namespace // the DUNE implementation is inplace, and I need to take care to make sure // this out of place implementation is equivalent - mmOverlap(&reorderedLowerMat[ij * scalarsInBlock], - &diagonal[j * scalarsInBlock], - &reorderedLowerMat[ij * scalarsInBlock]); + mmOverlap(&srcReorderedLowerMat[ij * scalarsInBlock], + &srcDiagonal[j * scalarsInBlock], + &srcReorderedLowerMat[ij * scalarsInBlock]); + if (copyResultToOtherMatrix){ + moveBlock(&srcReorderedLowerMat[ij * scalarsInBlock], &dstReorderedLowerMat[ij * scalarsInBlock]); + } // we have now accounted for an element under the diagonal and the diagonal element above it // now iterate over the blocks that align in the @@ -172,40 +179,53 @@ namespace // the same block index might be found in the lower part of the matrix while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) { - T* ikBlockPtr; + InputScalar* ikBlockPtr; + OutputScalar* dstIkBlockPtr; int ikColumn; if (ikState == POSITION_TYPE::UNDER_DIAG) { - ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock]; + ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock]; ikColumn = lowerColIndices[ik]; } else if (ikState == POSITION_TYPE::ON_DIAG) { - ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock]; + ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock]; ikColumn = naturalIdx; } else { // ikState == POSITION_TYPE::ABOVE_DIAG - ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock]; + ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock]; + if (copyResultToOtherMatrix) + dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock]; ikColumn = 
upperColIndices[ik]; } if (ikColumn == upperColIndices[jk]) { // A_jk = A_ij * A_jk - T tmp[scalarsInBlock] = {0}; + InputScalar tmp[scalarsInBlock] = {0}; - mmNoOverlap( - &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp); - matrixSubtraction(ikBlockPtr, tmp); + mmNoOverlap( + &srcReorderedLowerMat[ij * scalarsInBlock], &srcReorderedUpperMat[jk * scalarsInBlock], tmp); + matrixSubtraction(ikBlockPtr, tmp); incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); - ; ++jk; } else { if (ikColumn < upperColIndices[jk]) { incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper); - ; } else { ++jk; } } } } - invBlockInPlace(&diagonal[reorderedIdx * scalarsInBlock]); + invBlockInPlace(&srcDiagonal[reorderedIdx * scalarsInBlock]); + if (copyResultToOtherMatrix){ + moveBlock(&srcDiagonal[reorderedIdx * scalarsInBlock], &dstDiagonal[reorderedIdx * scalarsInBlock]); + + // also move all values above the diagonal on this row + for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) { + moveBlock(&srcReorderedUpperMat[block * scalarsInBlock], &dstReorderedUpperMat[block * scalarsInBlock]); + } + } } } @@ -266,15 +286,15 @@ namespace } } - template - __global__ void cuSolveLowerLevelSetSplit(T* mat, + template + __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* d, - T* v) + const LinearSolverScalar* d, + LinearSolverScalar* v) { auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedIdx < rowsInLevelSet + startIdx) { @@ -282,30 +302,31 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; const int naturalRowIdx = indexConversion[reorderedIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = d[naturalRowIdx * blocksize + i]; } for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral( + &mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } for (int i = 0; i < blocksize; ++i) { - v[naturalRowIdx * blocksize + i] = rhs[i]; + v[naturalRowIdx * blocksize + i] = LinearSolverScalar(rhs[i]); } } } - template - __global__ void cuSolveUpperLevelSetSplit(T* mat, + template + __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v) + const MatrixScalar* dInv, + LinearSolverScalar* v) { auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); if (reorderedIdx < rowsInLevelSet + startIdx) { @@ -313,17 +334,17 @@ namespace const size_t nnzIdxLim = rowIndices[reorderedIdx + 1]; const int naturalRowIdx = indexConversion[reorderedIdx]; - T rhs[blocksize]; + LinearSolverScalar rhs[blocksize]; for (int i = 0; i < blocksize; i++) { rhs[i] = v[naturalRowIdx * blocksize + i]; } for (int block = nnzIdx; block < nnzIdxLim; ++block) { const int col = colIndices[block]; - mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); + mmvMixedGeneral(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); } - mv(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); + mvMixedGeneral(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]); } } } // namespace @@ -365,41 +386,41 @@ solveUpperLevelSet(T* 
reorderedMat, reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v); } -template +template void -solveLowerLevelSetSplit(T* reorderedMat, +solveLowerLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* d, - T* v, + const LinearSolverScalar* d, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveLowerLevelSetSplit, thrBlockSize); + cuSolveLowerLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveLowerLevelSetSplit<<>>( + cuSolveLowerLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v); } // perform the upper solve for all rows in the same level set -template +template void -solveUpperLevelSetSplit(T* reorderedMat, +solveUpperLevelSetSplit(MatrixScalar* reorderedMat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, - const T* dInv, - T* v, + const MatrixScalar* dInv, + LinearSolverScalar* v, int thrBlockSize) { int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize( - cuSolveUpperLevelSetSplit, thrBlockSize); + cuSolveUpperLevelSetSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuSolveUpperLevelSetSplit<<>>( + cuSolveUpperLevelSetSplit<<>>( reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v); } @@ -421,34 +442,42 @@ LUFactorization(T* srcMatrix, srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx); } -template +template void -LUFactorizationSplit(T* reorderedLowerMat, +LUFactorizationSplit(InputScalar* srcReorderedLowerMat, int* lowerRowIndices, int* lowerColIndices, - T* reorderedUpperMat, + InputScalar* reorderedUpperMat, int* upperRowIndices, int* upperColIndices, - T* diagonal, + InputScalar* srcDiagonal, + OutputScalar* dstReorderedLowerMat, + OutputScalar* dstReorderedUpperMat, + OutputScalar* dstDiagonal, int* reorderedToNatural, int* naturalToReordered, const int startIdx, int rowsInLevelSet, + bool copyResultToOtherMatrix, int thrBlockSize) { int threadBlockSize - = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit, thrBlockSize); + = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit, thrBlockSize); int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize); - cuLUFactorizationSplit<<>>(reorderedLowerMat, + cuLUFactorizationSplit<<>>(srcReorderedLowerMat, lowerRowIndices, lowerColIndices, reorderedUpperMat, upperRowIndices, upperColIndices, - diagonal, + srcDiagonal, + dstReorderedLowerMat, + dstReorderedUpperMat, + dstDiagonal, reorderedToNatural, naturalToReordered, startIdx, + copyResultToOtherMatrix, rowsInLevelSet); } @@ -456,10 +485,12 @@ LUFactorizationSplit(T* reorderedLowerMat, template void solveUpperLevelSet(T*, int*, int*, int*, int, int, T*, int); \ template void solveLowerLevelSet(T*, int*, int*, int*, int, int, const T*, T*, int); \ template void LUFactorization(T*, int*, int*, int*, int*, size_t, int, int); \ - template void LUFactorizationSplit( \ - T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \ - template void solveUpperLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int); \ - 
template void solveLowerLevelSetSplit(T*, int*, int*, int*, int, int, const T*, T*, int);
+    template void LUFactorizationSplit(                                                                       \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, bool, int);   \
+    template void LUFactorizationSplit(                                                                       \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, bool, int);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
 INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@@ -473,4 +504,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 3);
 INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);
+
+#define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize)                                                \
+    /* double preconditioner */                                                                               \
+    template void solveLowerLevelSetSplit(double*, int*, int*, int*, int, int, const double*, double*, int);  \
+    /* float matrix, double compute preconditioner */                                                         \
+    template void solveLowerLevelSetSplit(float*, int*, int*, int*, int, int, const double*, double*, int);   \
+    /* float preconditioner */                                                                                \
+    template void solveLowerLevelSetSplit(float*, int*, int*, int*, int, int, const float*, float*, int);     \
+                                                                                                              \
+    /* double preconditioner */                                                                               \
+    template void solveUpperLevelSetSplit(double*, int*, int*, int*, int, int, const double*, double*, int);  \
+    /* float matrix, double compute preconditioner */                                                         \
+    template void solveUpperLevelSetSplit(float*, int*, int*, int*, int, int, const float*, double*, int);    \
+    /* float preconditioner */                                                                                \
+    template void solveUpperLevelSetSplit(float*, int*, int*, int*, int, int, const float*, float*, int);
+
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(4);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(5);
+INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(6);
 } // namespace Opm::gpuistl::detail::ILU0
diff --git a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
index 99e59ba13..42e5c2fbb 100644
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@@ -89,15 +89,15 @@ void solveLowerLevelSet(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template
-void solveUpperLevelSetSplit(T* reorderedMat,
+template
+void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                            const T* dInv,
-                            T* v,
+                            const MatrixScalar* dInv,
+                            LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -117,15 +117,15 @@ void solveUpperLevelSetSplit(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template
-void solveLowerLevelSetSplit(T* reorderedLowerMat,
+template
+void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                            const T* d,
-                            T* v,
+                            const LinearSolverScalar* d,
+                            LinearSolverScalar* v,
                             int threadBlockSize);

 /**
@@ -155,6 +155,7 @@ void LUFactorization(T* reorderedMat,
                      int threadBlockSize);

 /**
+ * TODO: update this documentation to describe the new mixed precision (src/dst) parameters
  * @brief Computes the ILU0 factorization in-place of a bcsr matrix stored in a split format (lower, diagonal and upper
  * triangular part)
  * @param [out] reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
@@ -175,17 +176,21 @@
  * function
  * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
  */
-template
-void LUFactorizationSplit(T* reorderedLowerMat,
+template
+void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
-                         T* reorderedUpperMat,
+                         InputScalar* srcReorderedUpperMat,
                          int* upperRowIndices,
                          int* upperColIndices,
-                         T* diagonal,
+                         InputScalar* srcDiagonal,
+                         OutputScalar* dstReorderedLowerMat,
+                         OutputScalar* dstReorderedUpperMat,
+                         OutputScalar* dstDiagonal,
                          int* reorderedToNatural,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
+                         bool copyResultToOtherMatrix,
                          int threadBlockSize);
 } // namespace Opm::gpuistl::detail::ILU0
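
Note (editorial addition, not part of the patch): the new behaviour is opt-in. Through the
PreconditionerFactory it is selected with the property-tree key "store_factorization_as_float"
(default false), alongside the existing "split_matrix" and "tune_gpu_kernels" keys, and it is
only supported together with the split-matrix storage path. The sketch below shows direct
construction of the preconditioner with the option enabled. It is illustrative only: the
template arguments chosen for OpmGpuILU0 and GpuVector are assumptions (the factory code above
elides them), and the surrounding function is hypothetical.

    #include <dune/common/fmatrix.hh>
    #include <dune/istl/bcrsmatrix.hh>
    #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
    #include <opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp>
    #include <memory>

    using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
    using GpuVec = Opm::gpuistl::GpuVector<double>;
    using GpuILU0 = Opm::gpuistl::OpmGpuILU0<Matrix, GpuVec, GpuVec>;

    // Hypothetical helper: A is the assembled system matrix.
    std::shared_ptr<GpuILU0> makeMixedPrecisionIlu0(const Matrix& A)
    {
        // storeFactorizationAsFloat=true keeps the LU factors and the inverted diagonal blocks
        // in single precision to reduce memory traffic in the triangular solves; the
        // factorization itself is still computed in the matrix' native (double) precision
        // and then copied to the float buffers.
        return std::make_shared<GpuILU0>(A,
                                         /*splitMatrix=*/true, // required for the mixed precision path
                                         /*tuneKernels=*/true,
                                         /*storeFactorizationAsFloat=*/true);
    }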