Add new MP scheme to GPU ILU and DILU

2025-02-25 18:55:30 -06:00 · 2024-10-16 15:37:16 +02:00 · 2024-10-16 15:37:16 +02:00 · d1e5a69476
commit d1e5a69476
parent 119282bd6d
11 changed files with 340 additions and 145 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -254,6 +254,7 @@ if (HAVE_CUDA)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/kernel_enums.hpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuVector.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg GpuView.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
--- a/opm/simulators/linalg/PreconditionerFactory_impl.hpp
+++ b/opm/simulators/linalg/PreconditionerFactory_impl.hpp
@ -357,10 +357,10 @@ struct StandardPreconditioners {
        F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
            using field_type = typename V::field_type;
            using GpuDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-            auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+            auto gpuDILU = std::make_shared<GpuDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);

            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, GpuDILU>>(gpuDILU);
            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@ -370,10 +370,10 @@ struct StandardPreconditioners {
        F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
            using field_type = typename V::field_type;
            using OpmGpuILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-            auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float);
+            auto gpuilu0 = std::make_shared<OpmGpuILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme);

            auto adapted = std::make_shared<gpuistl::PreconditionerAdapter<V, V, OpmGpuILU0>>(gpuilu0);
            auto wrapped = std::make_shared<gpuistl::GpuBlockPreconditioner<V, V, Comm>>(adapted, comm);
@ -650,27 +650,27 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
        F::addCreator("OPMGPUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);

            using field_type = typename V::field_type;
            using GPUILU0 = typename gpuistl::OpmGpuILU0<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;

-            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUILU0>>(std::make_shared<GPUILU0>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
        });

        F::addCreator("GPUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);
            using field_type = typename V::field_type;
            using GPUDILU = typename gpuistl::GpuDILU<M, gpuistl::GpuVector<field_type>, gpuistl::GpuVector<field_type>>;
-            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+            return std::make_shared<gpuistl::PreconditionerAdapter<V, V, GPUDILU>>(std::make_shared<GPUDILU>(op.getmat(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
        });

        F::addCreator("GPUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
            const bool split_matrix = prm.get<bool>("split_matrix", true);
            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            const bool store_factorization_as_float = prm.get<bool>("store_factorization_as_float", false);
+            const int mixed_precision_scheme = prm.get<int>("mixed_precision_scheme", 0);

            using block_type = typename V::block_type;
            using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
@ -679,7 +679,7 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
            using Adapter = typename gpuistl::PreconditionerAdapter<VTo, VTo, GpuDILU>;
            using Converter = typename gpuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
            auto converted = std::make_shared<Converter>(op.getmat());
-            auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, store_factorization_as_float));
+            auto adapted = std::make_shared<Adapter>(std::make_shared<GpuDILU>(converted->getConvertedMatrix(), split_matrix, tune_gpu_kernels, mixed_precision_scheme));
            converted->setUnderlyingPreconditioner(adapted);
            return converted;
        });
--- a/opm/simulators/linalg/gpuistl/GpuDILU.cpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.cpp
@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
+GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@ -52,8 +52,7 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
    , m_splitMatrix(splitMatrix)
    , m_tuneThreadBlockSizes(tuneKernels)
-    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
-
+    , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
    // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
    // Some sanity check
@ -82,14 +81,16 @@ GpuDILU<M, X, Y, l>::GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, boo
            m_cpuMatrix, m_reorderedToNatural);
    }

-    if (m_storeFactorizationAsFloat) {
+    if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) {
        if (!m_splitMatrix){
            OPM_THROW(std::runtime_error, "Matrix must be split when storing as float.");
        }
        m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
        m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
-        m_gpuMatrixReorderedDiagFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
-        m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
+
+        if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+            m_gpuDInvFloat = std::make_unique<FloatVec>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize());
+        }
    }

    computeDiagAndMoveReorderedData(m_moveThreadBlockSize, m_DILUFactorizationThreadBlockSize);
@ -123,8 +124,8 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, float>(
                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
                    m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
@ -135,8 +136,20 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                    d.data(),
                    v.data(),
                    lowerSolveThreadBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, float, field_type>(
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedLowerFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    d.data(),
+                    v.data(),
+                    lowerSolveThreadBlockSize);
            } else {
-                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::DILU::solveLowerLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -170,7 +183,7 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
        const int numOfRowsInLevel = m_levelSets[level].size();
        levelStartIdx -= numOfRowsInLevel;
        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){
                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
@ -181,6 +194,17 @@ GpuDILU<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, int
                    m_gpuDInvFloat->data(),
                    v.data(),
                    upperSolveThreadBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
+                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                    m_gpuReorderToNatural.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    v.data(),
+                    upperSolveThreadBlockSize);
            } else {
                detail::DILU::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
@ -271,8 +295,8 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, true>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -289,9 +313,27 @@ GpuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData(int moveThreadBlockSize, in
                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                    factorizationBlockSize);
+            } else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag->data(),
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data(),
+                    nullptr,
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    factorizationBlockSize);
            } else {
                // TODO: should this be field type twice or field type then float in the template?
-                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, false>(
+                detail::DILU::computeDiluDiagonalSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
--- a/opm/simulators/linalg/gpuistl/GpuDILU.hpp
+++ b/opm/simulators/linalg/gpuistl/GpuDILU.hpp
@ -23,6 +23,7 @@
 #include <opm/grid/utility/SparseTable.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <vector>


@ -62,7 +63,7 @@ public:
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
-    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+    explicit GpuDILU(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
@ -144,8 +145,8 @@ private:
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
    bool m_tuneThreadBlockSizes;
-    //! \brief Bool storing whether or not we will store the factorization as float. Only used for mixed precision
-    bool m_storeFactorizationAsFloat;
+    //! \brief Enum describing how we should store the factorized matrix
+    const MatrixStorageMPScheme m_mixedPrecisionScheme;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
    int m_upperSolveThreadBlockSize = -1;
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.cpp
@ -41,7 +41,7 @@ namespace Opm::gpuistl
 {

 template <class M, class X, class Y, int l>
-OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat)
+OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
@ -56,8 +56,9 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
    , m_splitMatrix(splitMatrix)
    , m_tuneThreadBlockSizes(tuneKernels)
-    , m_storeFactorizationAsFloat(storeFactorizationAsFloat)
+    , m_mixedPrecisionScheme(makeMatrixStorageMPScheme(mixedPrecisionScheme))
 {
+
    // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
    // Some sanity check
    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
@ -85,13 +86,16 @@ OpmGpuILU0<M, X, Y, l>::OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernel
            m_cpuMatrix, m_reorderedToNatural);
    }

-    if (m_storeFactorizationAsFloat){
+    if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG){
        OPM_ERROR_IF(!m_splitMatrix, "Mixed precision GpuILU0 is currently only supported when using split_matrix=true");

        // initialize mixed precision datastructures
        m_gpuMatrixReorderedLowerFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedLower->getRowIndices(), m_gpuMatrixReorderedLower->getColumnIndices(), blocksize_);
        m_gpuMatrixReorderedUpperFloat = std::make_unique<FloatMat>(m_gpuMatrixReorderedUpper->getRowIndices(), m_gpuMatrixReorderedUpper->getColumnIndices(), blocksize_);
-        m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        // The MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG does not need to allocate this float vector
+        if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+            m_gpuMatrixReorderedDiagFloat.emplace(GpuVector<float>(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize()));
+        }
    }

    LUFactorizeAndMoveData(m_moveThreadBlockSize, m_ILU0FactorizationThreadBlockSize);
@ -128,7 +132,7 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
+            if (m_mixedPrecisionScheme != MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG) {
                detail::ILU0::solveLowerLevelSetSplit<blocksize_, field_type, float>(
                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLowerFloat->getRowIndices().data(),
@ -171,8 +175,8 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
        const int numOfRowsInLevel = m_levelSets[level].size();
        levelStartIdx -= numOfRowsInLevel;
        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat) {
-                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG) {
+                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, float>(
                        m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
                        m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
                        m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
@ -183,8 +187,20 @@ OpmGpuILU0<M, X, Y, l>::apply(X& v, const Y& d, int lowerSolveThreadBlockSize, i
                        v.data(),
                        upperSolveThreadBlockSize);
            }
+            else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG) {
+                    detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, float, field_type>(
+                        m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                        m_gpuMatrixReorderedUpperFloat->getRowIndices().data(),
+                        m_gpuMatrixReorderedUpperFloat->getColumnIndices().data(),
+                        m_gpuReorderToNatural.data(),
+                        levelStartIdx,
+                        numOfRowsInLevel,
+                        m_gpuMatrixReorderedDiag.value().data(),
+                        v.data(),
+                        upperSolveThreadBlockSize);
+            }
            else{
-                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type>(
+                detail::ILU0::solveUpperLevelSetSplit<blocksize_, field_type, field_type, field_type>(
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@ -272,8 +288,8 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
        const int numOfRowsInLevel = m_levelSets[level].size();

        if (m_splitMatrix) {
-            if (m_storeFactorizationAsFloat){
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, true>(
+            if (m_mixedPrecisionScheme == MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -290,8 +306,26 @@ OpmGpuILU0<M, X, Y, l>::LUFactorizeAndMoveData(int moveThreadBlockSize, int fact
                    numOfRowsInLevel,
                    factorizationThreadBlockSize);
            }
+            else if (m_mixedPrecisionScheme == MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG){
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    m_gpuMatrixReorderedLowerFloat->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpperFloat->getNonZeroValues().data(),
+                    nullptr,
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    factorizationThreadBlockSize);
+            }
            else{
-                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, false>(
+                detail::ILU0::LUFactorizationSplit<blocksize_, field_type, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
--- a/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
+++ b/opm/simulators/linalg/gpuistl/OpmGpuILU0.hpp
@ -24,6 +24,7 @@
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuSparseMatrix.hpp>
 #include <opm/simulators/linalg/gpuistl/GpuVector.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <optional>
 #include <type_traits>
 #include <vector>
@ -64,7 +65,7 @@ public:
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
-    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, bool storeFactorizationAsFloat);
+    explicit OpmGpuILU0(const M& A, bool splitMatrix, bool tuneKernels, int mixedPrecisionScheme);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
@ -143,9 +144,8 @@ private:
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
    bool m_tuneThreadBlockSizes;
-    //! \brief Bool storing whether or not we should store the ILU factorization in a float datastructure.
-    //! This uses a mixed precision preconditioner to trade numerical accuracy for memory transfer speed.
-    bool m_storeFactorizationAsFloat;
+    //! \brief Enum storing how we should store the factorized matrix
+    const MatrixStorageMPScheme m_mixedPrecisionScheme;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
    int m_upperSolveThreadBlockSize = -1;
--- a/opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp
@ -0,0 +1,109 @@
+/*
+  Copyright 2024 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_GPUISTL_KERNEL_ENUMS_HPP
+#define OPM_GPUISTL_KERNEL_ENUMS_HPP
+
+#include <cuda_runtime.h>
+
+/*
+    This file organizes a growing amount of different mixed precision options for the preconditioners.
+*/
+
+namespace Opm::gpuistl {
+    // Mixed precision schemes used for storing the matrix in GPU memory
+    enum class MatrixStorageMPScheme {
+        DOUBLE_DIAG_DOUBLE_OFFDIAG = 0, // full precision should be default
+        FLOAT_DIAG_FLOAT_OFFDIAG = 1,
+        DOUBLE_DIAG_FLOAT_OFFDIAG = 2
+    };
+
+    namespace detail {
+        bool isValidMatrixStorageMPScheme(int scheme);
+    }
+
+    inline MatrixStorageMPScheme makeMatrixStorageMPScheme(int scheme) {
+        if (!detail::isValidMatrixStorageMPScheme(scheme)) {
+            OPM_THROW(std::invalid_argument,
+                      fmt::format("Invalid matrix storage mixed precision scheme chosen: {}.\n"
+                                  "Valid Schemes:\n"
+                                  "\t0: DOUBLE_DIAG_DOUBLE_OFFDIAG\n"
+                                  "\t1: FLOAT_DIAG_FLOAT_OFFDIAG\n"
+                                  "\t2: DOUBLE_DIAG_FLOAT_OFFDIAG",
+                                  scheme));
+        }
+        return static_cast<MatrixStorageMPScheme>(scheme);
+    }
+
+    namespace detail {
+
+        __host__ __device__ constexpr bool storeDiagonalAsFloat(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+                case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                    return false;
+                case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                    return false;
+                default:
+                    return false;
+            }
+        }
+
+        __host__ __device__ constexpr bool storeOffDiagonalAsFloat(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+                case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                    return false;
+                case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // returns true if we use anything else that the the default double precision for everything
+        __host__ __device__ constexpr bool usingMixedPrecision(MatrixStorageMPScheme scheme) {
+            switch (scheme) {
+                case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                    return false;
+                case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        inline bool isValidMatrixStorageMPScheme(int scheme) {
+            switch (static_cast<MatrixStorageMPScheme>(scheme)) {
+                case MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG:
+                case MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG:
+                case MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+    }
+}
+
+#endif // OPM_GPUISTL_KERNEL_ENUMS_HPP
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.cu
@ -59,14 +59,14 @@ namespace
        }
    }

-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
    __global__ void cuSolveLowerLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const MatrixScalar* dInv,
+                                              const DiagonalScalar* dInv,
                                              const LinearSolverScalar* d,
                                              LinearSolverScalar* v)
    {
@ -88,7 +88,7 @@ namespace
                mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }

@ -118,14 +118,14 @@ namespace
        }
    }

-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
    __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const MatrixScalar* dInv,
+                                              const DiagonalScalar* dInv,
                                              LinearSolverScalar* v)
    {
        const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@ -140,7 +140,7 @@ namespace
                umvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mmvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+            mmvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }

@ -213,7 +213,7 @@ namespace

    // TODO: rewrite such that during the factorization there is a dInv of InputScalar type that stores intermediate results
    // TOOD: The important part is to only cast after that is fully computed
-    template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+    template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
    __global__ void cuComputeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                                               int* lowerRowIndices,
                                               int* lowerColIndices,
@ -255,7 +255,7 @@ namespace

                const int symOppositeBlock = symOppositeIdx;

-                if constexpr (copyResultToOtherMatrix) {
+                if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                    // TODO: think long and hard about whether this performs only the wanted memory transfers
                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[block * blocksize * blocksize], &dstLowerMat[block * blocksize * blocksize]);
                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[symOppositeBlock * blocksize * blocksize], &dstUpperMat[symOppositeBlock * blocksize * blocksize]);
@ -268,14 +268,9 @@ namespace
            }

            invBlockInPlace<InputScalar, blocksize>(dInvTmp);
-
-            // for (int i = 0; i < blocksize; ++i) {
-            //     for (int j = 0; j < blocksize; ++j) {
-            //         dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
-            //     }
-            // }
            moveBlock<blocksize, InputScalar, InputScalar>(dInvTmp, &dInv[reorderedRowIdx * blocksize * blocksize]);
-            if constexpr (copyResultToOtherMatrix) {
+
+            if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)) {
                moveBlock<blocksize, InputScalar, OutputScalar>(dInvTmp, &dstDiag[reorderedRowIdx * blocksize * blocksize]); // important!
            }
        }
@ -304,7 +299,7 @@ solveLowerLevelSet(T* reorderedMat,
 }


-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
@ -312,15 +307,15 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const MatrixScalar* dInv,
+                        const DiagonalScalar* dInv,
                        const LinearSolverScalar* d,
                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+        cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
 }
 // perform the upper solve for all rows in the same level set
@ -343,7 +338,7 @@ solveUpperLevelSet(T* reorderedMat,
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
@ -351,14 +346,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const MatrixScalar* dInv,
+                        const DiagonalScalar* dInv,
                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@ -391,7 +386,7 @@ computeDiluDiagonal(T* reorderedMat,
    }
 }

-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme scheme>
 void
 computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                         int* lowerRowIndices,
@ -412,9 +407,9 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 {
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
+            cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme>, thrBlockSize);
        int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
+        cuComputeDiluDiagonalSplit<blocksize, InputScalar, OutputScalar, scheme><<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
                                                                                     srcReorderedUpperMat,
@ -437,13 +432,17 @@ computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
 // TODO: format
 #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                      \
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);              \
-    template void computeDiluDiagonalSplit<blocksize, T, double, false>(                                                              \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(                                                              \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int);                                      \
-    template void computeDiluDiagonalSplit<blocksize, T, float, false>(                                                              \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(                                                              \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int);                                      \
-    template void computeDiluDiagonalSplit<blocksize, T, float, true>(                                                              \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(                                                              \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int);                                      \
-    template void computeDiluDiagonalSplit<blocksize, T, double, true>(                                                              \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(                                                              \
+        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, float*, float*, float*, int);                                      \
+    template void computeDiluDiagonalSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(                                                              \
        const T*, int*, int*, const T*, int*, int*, const T*, int*, int*, const int, int, T*, double*, double*, double*, int);                                      \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);                 \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
@ -461,29 +460,26 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 4);
 INSTANTIATE_KERNEL_WRAPPERS(double, 5);
 INSTANTIATE_KERNEL_WRAPPERS(double, 6);

-#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar)                                 \
-    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>(                                \
-        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, LinearSolverScalar*, int);                     \
-    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>(                                \
-        MatrixScalar*, int*, int*, int*, int, int, const MatrixScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar)                                 \
+    template void solveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>(                                \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, LinearSolverScalar*, int);                     \
+    template void solveLowerLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>(                                \
+        MatrixScalar*, int*, int*, int*, int, int, const DiagonalScalar*, const LinearSolverScalar*, LinearSolverScalar*, int);

-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, float, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, double);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(1, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(2, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(3, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(4, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(5, double, float);
-INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(6, double, float);
+// TODO: be smarter about this... Surely this instantiates many more combinations that are actually needed
+#define INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(blocksize) \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, float); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, float, float, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, double, double); \
+    INSTANTIATE_SOLVE_LEVEL_SET_SPLIT(blocksize, double, float, double);
+
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(1);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(2);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(3);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(4);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(5);
+INSTANTIATE_SOLVE_LEVEL_SET_SPLIT_ALL(6);

 } // namespace Opm::gpuistl::detail::DILU
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/DILUKernels.hpp
@ -22,6 +22,7 @@
 #include <cstddef>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 #include <vector>

 namespace Opm::gpuistl::detail::DILU
@ -71,14 +72,14 @@ void solveLowerLevelSet(T* reorderedMat,
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveLowerLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const MatrixScalar* dInv,
+                             const DiagonalScalar* dInv,
                             const LinearSolverScalar* d,
                             LinearSolverScalar* v,
                             int threadBlockSize);
@ -124,14 +125,14 @@ void solveUpperLevelSet(T* reorderedMat,
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const MatrixScalar* dInv,
+                             const DiagonalScalar* dInv,
                             LinearSolverScalar* v,
                             int threadBlockSize);

@ -183,7 +184,7 @@ void computeDiluDiagonal(T* reorderedMat,
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme>
 void computeDiluDiagonalSplit(const InputScalar* srcReorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.cu
@ -120,7 +120,7 @@ namespace
        }
    }

-    template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+    template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
    __global__ void cuLUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                                           int* lowerRowIndices,
                                           int* lowerColIndices,
@ -161,7 +161,7 @@ namespace
                mmOverlap<InputScalar, blocksize>(&srcReorderedLowerMat[ij * scalarsInBlock],
                                                  &srcDiagonal[j * scalarsInBlock],
                                                  &srcReorderedLowerMat[ij * scalarsInBlock]);
-                if (copyResultToOtherMatrix) {
+                if constexpr (detail::storeOffDiagonalAsFloat(mixedPrecisionScheme)) {
                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedLowerMat[ij * scalarsInBlock],
                                                                    &dstReorderedLowerMat[ij * scalarsInBlock]);
                }
@ -180,22 +180,15 @@ namespace
                while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {

                    InputScalar* ikBlockPtr;
-                    OutputScalar* dstIkBlockPtr;
                    int ikColumn;
                    if (ikState == POSITION_TYPE::UNDER_DIAG) {
                        ikBlockPtr = &srcReorderedLowerMat[ik * scalarsInBlock];
-                        if (copyResultToOtherMatrix)
-                            dstIkBlockPtr = &dstReorderedLowerMat[ik * scalarsInBlock];
                        ikColumn = lowerColIndices[ik];
                    } else if (ikState == POSITION_TYPE::ON_DIAG) {
                        ikBlockPtr = &srcDiagonal[reorderedIdx * scalarsInBlock];
-                        if (copyResultToOtherMatrix)
-                            dstIkBlockPtr = &dstDiagonal[reorderedIdx * scalarsInBlock];
                        ikColumn = naturalIdx;
                    } else { // ikState == POSITION_TYPE::ABOVE_DIAG
                        ikBlockPtr = &srcReorderedUpperMat[ik * scalarsInBlock];
-                        if (copyResultToOtherMatrix)
-                            dstIkBlockPtr = &dstReorderedUpperMat[ik * scalarsInBlock];
                        ikColumn = upperColIndices[ik];
                    }

@ -219,10 +212,15 @@ namespace
                }
            }
            invBlockInPlace<InputScalar, blocksize>(&srcDiagonal[reorderedIdx * scalarsInBlock]);
-            if (copyResultToOtherMatrix) {
-                moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
-                                                                &dstDiagonal[reorderedIdx * scalarsInBlock]);
-
+            // as of now, if we are using mixed precision, then we are always storing the off-diagonals as floats,
+            // and sometimes also the diagonal.
+            if constexpr (detail::usingMixedPrecision(mixedPrecisionScheme)) {
+                // if we are want to store the entire matrix as a float then we must also move the diagonal block from double to float
+                // if not then we just use the double diagonal that is already now stored in srcDiagonal
+                if constexpr (detail::storeDiagonalAsFloat(mixedPrecisionScheme)){
+                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcDiagonal[reorderedIdx * scalarsInBlock],
+                                                                    &dstDiagonal[reorderedIdx * scalarsInBlock]);
+                }
                // also move all values above the diagonal on this row
                for (int block = startOfRowIUpper; block < endOfRowIUpper; ++block) {
                    moveBlock<blocksize, InputScalar, OutputScalar>(&srcReorderedUpperMat[block * scalarsInBlock],
@ -321,14 +319,14 @@ namespace
        }
    }

-    template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+    template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
    __global__ void cuSolveUpperLevelSetSplit(MatrixScalar* mat,
                                              int* rowIndices,
                                              int* colIndices,
                                              int* indexConversion,
                                              int startIdx,
                                              int rowsInLevelSet,
-                                              const MatrixScalar* dInv,
+                                              const DiagonalScalar* dInv,
                                              LinearSolverScalar* v)
    {
        auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
@ -348,7 +346,7 @@ namespace
                    &mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
            }

-            mvMixedGeneral<blocksize, MatrixScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
+            mvMixedGeneral<blocksize, DiagonalScalar, LinearSolverScalar, LinearSolverScalar, LinearSolverScalar>(
                &dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
        }
    }
@ -410,7 +408,7 @@ solveLowerLevelSetSplit(MatrixScalar* reorderedMat,
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
 }
 // perform the upper solve for all rows in the same level set
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void
 solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* rowIndices,
@ -418,14 +416,14 @@ solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
-                        const MatrixScalar* dInv,
+                        const DiagonalScalar* dInv,
                        LinearSolverScalar* v,
                        int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar>, thrBlockSize);
+        cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar><<<nThreadBlocks, threadBlockSize>>>(
+    cuSolveUpperLevelSetSplit<blocksize, LinearSolverScalar, MatrixScalar, DiagonalScalar><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
 }

@ -447,7 +445,7 @@ LUFactorization(T* srcMatrix,
        srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
 }

-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
 void
 LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                     int* lowerRowIndices,
@ -466,9 +464,9 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                     int thrBlockSize)
 {
    int threadBlockSize = ::Opm::gpuistl::detail::getCudaRecomendedThreadBlockSize(
-        cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>, thrBlockSize);
+        cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, mixedPrecisionScheme>, thrBlockSize);
    int nThreadBlocks = ::Opm::gpuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-    cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, copyResultToOtherMatrix>
+    cuLUFactorizationSplit<blocksize, InputScalar, OutputScalar, mixedPrecisionScheme>
        <<<nThreadBlocks, threadBlockSize>>>(srcReorderedLowerMat,
                                             lowerRowIndices,
                                             lowerColIndices,
@ -489,27 +487,29 @@ LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int);                           \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);                 \
    template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int);                         \
-    template void LUFactorizationSplit<blocksize, T, float, true>(                                                     \
+    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(                                                     \
        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);                  \
-    template void LUFactorizationSplit<blocksize, T, double, true>(                                                    \
+    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_DOUBLE_OFFDIAG>(                                                    \
        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int);               \
-    template void LUFactorizationSplit<blocksize, T, float, false>(                                                    \
+    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(                                                    \
        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);                  \
-    template void LUFactorizationSplit<blocksize, T, double, false>(                                                   \
+    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::FLOAT_DIAG_FLOAT_OFFDIAG>(                                                   \
+        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); \
+    template void LUFactorizationSplit<blocksize, T, float, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(                                                    \
+        T*, int*, int*, T*, int*, int*, T*, float*, float*, float*, int*, int*, const int, int, int);                  \
+    template void LUFactorizationSplit<blocksize, T, double, MatrixStorageMPScheme::DOUBLE_DIAG_FLOAT_OFFDIAG>(                                                   \
        T*, int*, int*, T*, int*, int*, T*, double*, double*, double*, int*, int*, const int, int, int); 

-INSTANTIATE_KERNEL_WRAPPERS(float, 1);
-INSTANTIATE_KERNEL_WRAPPERS(float, 2);
-INSTANTIATE_KERNEL_WRAPPERS(float, 3);
-INSTANTIATE_KERNEL_WRAPPERS(float, 4);
-INSTANTIATE_KERNEL_WRAPPERS(float, 5);
-INSTANTIATE_KERNEL_WRAPPERS(float, 6);
-INSTANTIATE_KERNEL_WRAPPERS(double, 1);
-INSTANTIATE_KERNEL_WRAPPERS(double, 2);
-INSTANTIATE_KERNEL_WRAPPERS(double, 3);
-INSTANTIATE_KERNEL_WRAPPERS(double, 4);
-INSTANTIATE_KERNEL_WRAPPERS(double, 5);
-INSTANTIATE_KERNEL_WRAPPERS(double, 6);
+#define INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(T) \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 1); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 2); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 3); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 4); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 5); \
+    INSTANTIATE_KERNEL_WRAPPERS(T, 6);
+
+INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(float)
+INSTANTIATE_BLOCK_SIZED_KERNEL_WRAPPERS(double)

 #define INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(blocksize)                                                         \
    /* double preconditioner */                                                                                        \
@ -523,15 +523,25 @@ INSTANTIATE_KERNEL_WRAPPERS(double, 6);
        float*, int*, int*, int*, int, int, const float*, float*, int);                                                \
                                                                                                                       \
    /* double preconditioner */                                                                                        \
-    template void solveUpperLevelSetSplit<blocksize, double, double>(                                                  \
+    template void solveUpperLevelSetSplit<blocksize, double, double, double>(                                                  \
        double*, int*, int*, int*, int, int, const double*, double*, int);                                             \
    /* float matrix, double compute preconditioner */                                                                  \
-    template void solveUpperLevelSetSplit<blocksize, double, float>(                                                   \
+    template void solveUpperLevelSetSplit<blocksize, double, float, double>(                                                   \
+        float*, int*, int*, int*, int, int, const double*, double*, int);                                               \
+    /* float preconditioner */                                                                                         \
+    template void solveUpperLevelSetSplit<blocksize, float, float, double>(                                                    \
+        float*, int*, int*, int*, int, int, const double*, float*, int);                                                \
+    /* double preconditioner */                                                                                        \
+    template void solveUpperLevelSetSplit<blocksize, double, double, float>(                                                  \
+        double*, int*, int*, int*, int, int, const float*, double*, int);                                             \
+    /* float matrix, double compute preconditioner */                                                                  \
+    template void solveUpperLevelSetSplit<blocksize, double, float, float>(                                                   \
        float*, int*, int*, int*, int, int, const float*, double*, int);                                               \
    /* float preconditioner */                                                                                         \
-    template void solveUpperLevelSetSplit<blocksize, float, float>(                                                    \
+    template void solveUpperLevelSetSplit<blocksize, float, float, float>(                                                    \
        float*, int*, int*, int*, int, int, const float*, float*, int);

+
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(1);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(2);
 INSTANTIATE_MIXED_PRECISION_KERNEL_WRAPPERS(3);
--- a/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
+++ b/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
@ -20,6 +20,7 @@
 #define OPM_ILU0_KERNELS_HPP
 #include <cstddef>
 #include <vector>
+#include <opm/simulators/linalg/gpuistl/detail/kernel_enums.hpp>
 namespace Opm::gpuistl::detail::ILU0
 {

@ -89,14 +90,14 @@ void solveLowerLevelSet(T* reorderedMat,
 * solve
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <int blocksize, class LinearSolverScalar, class MatrixScalar>
+template <int blocksize, class LinearSolverScalar, class MatrixScalar, class DiagonalScalar>
 void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
-                             const MatrixScalar* dInv,
+                             const DiagonalScalar* dInv,
                             LinearSolverScalar* v,
                             int threadBlockSize);

@ -176,7 +177,7 @@ void LUFactorization(T* reorderedMat,
 * function
 * @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
 */
-template <int blocksize, class InputScalar, class OutputScalar, bool copyResultToOtherMatrix>
+template <int blocksize, class InputScalar, class OutputScalar, MatrixStorageMPScheme mixedPrecisionScheme>
 void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,