From 82ff782d5f300df0cf20decb94e521a843c15531 Mon Sep 17 00:00:00 2001
From: Tobias Meyer Andersen
Date: Tue, 18 Jun 2024 11:42:00 +0200
Subject: [PATCH] clang format

---
 opm/simulators/linalg/cuistl/CuDILU.cpp        | 183 +++++++--------
 .../detail/cusparse_matrix_operations.cu       | 211 ++++++++++--------
 tests/cuistl/test_cudilu.cpp                   |  90 ++++----
 3 files changed, 261 insertions(+), 223 deletions(-)

diff --git a/opm/simulators/linalg/cuistl/CuDILU.cpp b/opm/simulators/linalg/cuistl/CuDILU.cpp
index 00e11d176..f54d124ef 100644
--- a/opm/simulators/linalg/cuistl/CuDILU.cpp
+++ b/opm/simulators/linalg/cuistl/CuDILU.cpp
@@ -25,9 +25,9 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include
 #include
 #include
@@ -65,7 +65,9 @@ createNaturalToReordered(Opm::SparseTable levelSets)
 template
 void
-createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatural, std::unique_ptr& reorderedGpuMat)
+createReorderedMatrix(const M& naturalMatrix,
+                      std::vector<int> reorderedToNatural,
+                      std::unique_ptr& reorderedGpuMat)
 {
     M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
     for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
@@ -81,22 +83,26 @@ createReorderedMatrix(const M& naturalMatrix, std::vector reorderedToNatura
 template
 void
-extractLowerAndUpperMatrices(const M& naturalMatrix, std::vector<int> reorderedToNatural, std::unique_ptr& lower, std::unique_ptr& upper)
+extractLowerAndUpperMatrices(const M& naturalMatrix,
+                             std::vector<int> reorderedToNatural,
+                             std::unique_ptr& lower,
+                             std::unique_ptr& upper)
 {
-    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N())/2;
+    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
     M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
     M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
-    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin(); lowerIt != reorderedLower.createend(); ++lowerIt, ++upperIt) {
+    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
+         lowerIt != reorderedLower.createend();
+         ++lowerIt, ++upperIt) {
         auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
         for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
-            if (elem.index() < srcRow.index()){ // add index to lower matrix if under the diagonal
+            if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
                 lowerIt.insert(elem.index());
-            }
-            else if (elem.index() > srcRow.index()){ // add element to upper matrix if above the diagonal
+            } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
                 upperIt.insert(elem.index());
             }
         }
@@ -144,12 +150,13 @@ CuDILU::CuDILU(const M& A, bool split_matrix)
", m_gpuMatrix.nonzeroes(), A.nonzeroes())); - if (m_split_matrix){ - m_gpuMatrixReorderedDiag.emplace(CuVector(blocksize_*blocksize_*m_cpuMatrix.N())); - extractLowerAndUpperMatrices>(m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper); - } - else{ - createReorderedMatrix>(m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered); + if (m_split_matrix) { + m_gpuMatrixReorderedDiag.emplace(CuVector(blocksize_ * blocksize_ * m_cpuMatrix.N())); + extractLowerAndUpperMatrices>( + m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper); + } else { + createReorderedMatrix>( + m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered); } computeDiagAndMoveReorderedData(); } @@ -170,27 +177,28 @@ CuDILU::apply(X& v, const Y& d) int levelStartIdx = 0; for (int level = 0; level < m_levelSets.size(); ++level) { const int numOfRowsInLevel = m_levelSets[level].size(); - if (m_split_matrix){ - detail::computeLowerSolveLevelSetSplit(m_gpuMatrixReorderedLower->getNonZeroValues().data(), - m_gpuMatrixReorderedLower->getRowIndices().data(), - m_gpuMatrixReorderedLower->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - d.data(), - v.data()); - } - else{ - detail::computeLowerSolveLevelSet(m_gpuMatrixReordered->getNonZeroValues().data(), - m_gpuMatrixReordered->getRowIndices().data(), - m_gpuMatrixReordered->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - d.data(), - v.data()); + if (m_split_matrix) { + detail::computeLowerSolveLevelSetSplit( + m_gpuMatrixReorderedLower->getNonZeroValues().data(), + m_gpuMatrixReorderedLower->getRowIndices().data(), + m_gpuMatrixReorderedLower->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data()); + } else { + detail::computeLowerSolveLevelSet( + m_gpuMatrixReordered->getNonZeroValues().data(), + m_gpuMatrixReordered->getRowIndices().data(), + m_gpuMatrixReordered->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + d.data(), + v.data()); } levelStartIdx += numOfRowsInLevel; } @@ -200,25 +208,26 @@ CuDILU::apply(X& v, const Y& d) for (int level = m_levelSets.size() - 1; level >= 0; --level) { const int numOfRowsInLevel = m_levelSets[level].size(); levelStartIdx -= numOfRowsInLevel; - if (m_split_matrix){ - detail::computeUpperSolveLevelSetSplit(m_gpuMatrixReorderedUpper->getNonZeroValues().data(), - m_gpuMatrixReorderedUpper->getRowIndices().data(), - m_gpuMatrixReorderedUpper->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - v.data()); - } - else{ - detail::computeUpperSolveLevelSet(m_gpuMatrixReordered->getNonZeroValues().data(), - m_gpuMatrixReordered->getRowIndices().data(), - m_gpuMatrixReordered->getColumnIndices().data(), - m_gpuReorderToNatural.data(), - levelStartIdx, - numOfRowsInLevel, - m_gpuDInv.data(), - v.data()); + if (m_split_matrix) { + detail::computeUpperSolveLevelSetSplit( + m_gpuMatrixReorderedUpper->getNonZeroValues().data(), + m_gpuMatrixReorderedUpper->getRowIndices().data(), + m_gpuMatrixReorderedUpper->getColumnIndices().data(), + m_gpuReorderToNatural.data(), + levelStartIdx, + numOfRowsInLevel, + m_gpuDInv.data(), + v.data()); + } else { + detail::computeUpperSolveLevelSet( + m_gpuMatrixReordered->getNonZeroValues().data(), + 
+                m_gpuMatrixReordered->getRowIndices().data(),
+                m_gpuMatrixReordered->getColumnIndices().data(),
+                m_gpuReorderToNatural.data(),
+                levelStartIdx,
+                numOfRowsInLevel,
+                m_gpuDInv.data(),
+                v.data());
         }
     }
 }
@@ -254,45 +263,45 @@ CuDILU::computeDiagAndMoveReorderedData()
 {
     OPM_TIMEBLOCK(prec_update);
     {
-        if (m_split_matrix){
-            detail::copyMatDataToReorderedSplit(m_gpuMatrix.getNonZeroValues().data(),
-                                                m_gpuMatrix.getRowIndices().data(),
-                                                m_gpuMatrix.getColumnIndices().data(),
-                                                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                                                m_gpuMatrixReorderedLower->getRowIndices().data(),
-                                                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                                                m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                                                m_gpuMatrixReorderedDiag.value().data(),
-                                                m_gpuNaturalToReorder.data(),
-                                                m_gpuMatrixReorderedLower->N());
-        }
-        else{
+        if (m_split_matrix) {
+            detail::copyMatDataToReorderedSplit(
+                m_gpuMatrix.getNonZeroValues().data(),
+                m_gpuMatrix.getRowIndices().data(),
+                m_gpuMatrix.getColumnIndices().data(),
+                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                m_gpuMatrixReorderedLower->getRowIndices().data(),
+                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                m_gpuMatrixReorderedDiag.value().data(),
+                m_gpuNaturalToReorder.data(),
+                m_gpuMatrixReorderedLower->N());
+        } else {
             detail::copyMatDataToReordered(m_gpuMatrix.getNonZeroValues().data(),
-                                          m_gpuMatrix.getRowIndices().data(),
-                                          m_gpuMatrixReordered->getNonZeroValues().data(),
-                                          m_gpuMatrixReordered->getRowIndices().data(),
-                                          m_gpuNaturalToReorder.data(),
-                                          m_gpuMatrixReordered->N());
+                                           m_gpuMatrix.getRowIndices().data(),
+                                           m_gpuMatrixReordered->getNonZeroValues().data(),
+                                           m_gpuMatrixReordered->getRowIndices().data(),
+                                           m_gpuNaturalToReorder.data(),
+                                           m_gpuMatrixReordered->N());
         }
         int levelStartIdx = 0;
         for (int level = 0; level < m_levelSets.size(); ++level) {
             const int numOfRowsInLevel = m_levelSets[level].size();
-            if (m_split_matrix){
-                detail::computeDiluDiagonalSplit(m_gpuMatrixReorderedLower->getNonZeroValues().data(),
-                                                 m_gpuMatrixReorderedLower->getRowIndices().data(),
-                                                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
-                                                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
-                                                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
-                                                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
-                                                 m_gpuMatrixReorderedDiag.value().data(),
-                                                 m_gpuReorderToNatural.data(),
-                                                 m_gpuNaturalToReorder.data(),
-                                                 levelStartIdx,
-                                                 numOfRowsInLevel,
-                                                 m_gpuDInv.data());
-            }
-            else{
+            if (m_split_matrix) {
+                detail::computeDiluDiagonalSplit(
+                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedLower->getRowIndices().data(),
+                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
+                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
+                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
+                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
+                    m_gpuMatrixReorderedDiag.value().data(),
+                    m_gpuReorderToNatural.data(),
+                    m_gpuNaturalToReorder.data(),
+                    levelStartIdx,
+                    numOfRowsInLevel,
+                    m_gpuDInv.data());
+            } else {
                 detail::computeDiluDiagonal(m_gpuMatrixReordered->getNonZeroValues().data(),
                                             m_gpuMatrixReordered->getRowIndices().data(),
                                             m_gpuMatrixReordered->getColumnIndices().data(),
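Note on the math implemented above: CuDILU is the standard DILU preconditioner. Writing the block matrix as A = L + D_A + U, with strict lower part L, block diagonal D_A, and strict upper part U, DILU uses M = (D + L) D^{-1} (D + U), where the block diagonal D is chosen so that diag(M) = diag(A). computeDiagAndMoveReorderedData() computes and inverts D (stored in m_gpuDInv), and apply() performs the two triangular sweeps, reusing v for both the intermediate and the final result. With sums running over the nonzero pattern:

\[
D_{ii} = A_{ii} - \sum_{j<i} A_{ij}\, D_{jj}^{-1} A_{ji},
\]
\[
\text{forward sweep: } z_i = D_{ii}^{-1}\Bigl(d_i - \sum_{j<i} A_{ij} z_j\Bigr), \qquad
\text{backward sweep: } v_i = z_i - D_{ii}^{-1} \sum_{j>i} A_{ij} v_j .
\]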
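The level-set loops in apply() work because rows are reordered so that every row in one level set depends only on rows in earlier sets; each row in a set can therefore be handled by an independent GPU thread. The simulator obtains m_levelSets from the matrix coloring utilities in opm/simulators/linalg/GraphColoring.hpp. Purely as an illustration of the idea (this sketch is not the OPM routine, and the CSR-style rowPtr/colIdx inputs are hypothetical), level sets can be built like this:

#include <algorithm>
#include <vector>

// Assign each row the length of its longest chain of lower-triangular
// dependencies; rows that share a level can be processed in parallel.
std::vector<std::vector<int>>
buildLevelSets(const std::vector<int>& rowPtr, const std::vector<int>& colIdx)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    std::vector<int> level(n, 0);
    int maxLevel = 0;
    for (int i = 0; i < n; ++i) {
        for (int k = rowPtr[i]; k < rowPtr[i + 1]; ++k) {
            const int j = colIdx[k];
            if (j < i) { // entry below the diagonal: row i must wait for row j
                level[i] = std::max(level[i], level[j] + 1);
            }
        }
        maxLevel = std::max(maxLevel, level[i]);
    }
    std::vector<std::vector<int>> levelSets(maxLevel + 1);
    for (int i = 0; i < n; ++i) {
        levelSets[level[i]].push_back(i); // rows in one set are independent
    }
    return levelSets;
}

Concatenating the sets in order yields exactly the reordered-to-natural index map that m_gpuReorderToNatural holds, and levelStartIdx in apply() is the running offset of each set within that concatenation.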
diff --git a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
index 703892dd3..967160c72 100644
--- a/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
+++ b/opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.cu
@@ -229,14 +229,14 @@ namespace
     template <class T, int blocksize>
     __global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
-                                                int* rowIndices,
-                                                int* colIndices,
-                                                int* indexConversion,
-                                                int startIdx,
-                                                int rowsInLevelSet,
-                                                const T* dInv,
-                                                const T* d,
-                                                T* v)
+                                                     int* rowIndices,
+                                                     int* colIndices,
+                                                     int* indexConversion,
+                                                     int startIdx,
+                                                     int rowsInLevelSet,
+                                                     const T* dInv,
+                                                     const T* d,
+                                                     T* v)
     {
         const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
         if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@@ -250,7 +250,7 @@ namespace
                 rhs[i] = d[naturalRowIdx * blocksize + i];
             }

-            //TODO: removce the first condition in the for loop
+            // TODO: remove the first condition in the for loop
             for (int block = nnzIdx; block < nnzIdxLim; ++block) {
                 const int col = colIndices[block];
                 mmv(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
@@ -288,13 +288,13 @@ namespace
     template <class T, int blocksize>
     __global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
-                                                int* rowIndices,
-                                                int* colIndices,
-                                                int* indexConversion,
-                                                int startIdx,
-                                                int rowsInLevelSet,
-                                                const T* dInv,
-                                                T* v)
+                                                     int* rowIndices,
+                                                     int* colIndices,
+                                                     int* indexConversion,
+                                                     int startIdx,
+                                                     int rowsInLevelSet,
+                                                     const T* dInv,
+                                                     T* v)
     {
         const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
         if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@@ -381,23 +381,23 @@ namespace
     template <class T, int blocksize>
     __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
-                                          int* lowerRowIndices,
-                                          int* lowerColIndices,
-                                          T* reorderedUpperMat,
-                                          int* upperRowIndices,
-                                          int* upperColIndices,
-                                          T* diagonal,
-                                          int* reorderedToNatural,
-                                          int* naturalToReordered,
-                                          const int startIdx,
-                                          int rowsInLevelSet,
-                                          T* dInv)
+                                               int* lowerRowIndices,
+                                               int* lowerColIndices,
+                                               T* reorderedUpperMat,
+                                               int* upperRowIndices,
+                                               int* upperColIndices,
+                                               T* diagonal,
+                                               int* reorderedToNatural,
+                                               int* naturalToReordered,
+                                               const int startIdx,
+                                               int rowsInLevelSet,
+                                               T* dInv)
     {
         const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
         if (reorderedRowIdx < rowsInLevelSet + startIdx) {
             const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
             const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
-            const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx+1];
+            const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

             T dInvTmp[blocksize * blocksize];
             for (int i = 0; i < blocksize; ++i) {
@@ -410,8 +410,8 @@ namespace
                 const int col = naturalToReordered[lowerColIndices[block]];

                 int symOppositeIdx = upperRowIndices[col];
-                for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx){
-                    if (naturalRowIdx == upperColIndices[symOppositeIdx]){
+                for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
+                    if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
                         break;
                     }
                 }
@@ -457,15 +457,23 @@ namespace
     }

     template <class T, int blocksize>
-    __global__ void cuMoveDataToReorderedSplit(
-        T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows)
+    __global__ void cuMoveDataToReorderedSplit(T* srcMatrix,
+                                               int* srcRowIndices,
+                                               int* srcColumnIndices,
+                                               T* dstLowerMatrix,
+                                               int* dstLowerRowIndices,
+                                               T* dstUpperMatrix,
+                                               int* dstUpperRowIndices,
+                                               T* dstDiag,
+                                               int* naturalToReordered,
+                                               size_t numberOfRows)
     {
         const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x;
         if (srcRow < numberOfRows) {
            const auto dstRow = naturalToReordered[srcRow];
            const auto rowStart = srcRowIndices[srcRow];
-            const auto rowEnd = srcRowIndices[srcRow+1];
+            const auto rowEnd = srcRowIndices[srcRow + 1];

            auto lowerBlock = dstLowerRowIndices[dstRow];
            auto upperBlock = dstUpperRowIndices[dstRow];
@@ -474,17 +482,16 @@ namespace
                int dstBlock;
                T* dstBuffer;

-                if (srcColumnIndices[srcBlock] < srcRow){ // we are writing a value to the lower triangular matrix
+                if (srcColumnIndices[srcBlock] < srcRow) { // we are writing a value to the lower triangular matrix
                    dstBlock = lowerBlock;
                    ++lowerBlock;
                    dstBuffer = dstLowerMatrix;
-                }
-                else if (srcColumnIndices[srcBlock] > srcRow){ // we are writing a value to the upper triangular matrix
+                } else if (srcColumnIndices[srcBlock]
+                           > srcRow) { // we are writing a value to the upper triangular matrix
                    dstBlock = upperBlock;
                    ++upperBlock;
                    dstBuffer = dstUpperMatrix;
-                }
-                else{ // we are writing a value to the diagonal
+                } else { // we are writing a value to the diagonal
                    dstBlock = dstRow;
                    dstBuffer = dstDiag;
                }
@@ -511,14 +518,16 @@ namespace

    // Kernel here is the function object of the cuda kernel
    template <class Kernel>
-    inline int getCudaRecomendedThreadBlockSize(Kernel k){
+    inline int getCudaRecomendedThreadBlockSize(Kernel k)
+    {
        int blockSize;
        int tmpGridSize;
-        cudaOccupancyMaxPotentialBlockSize( &tmpGridSize, &blockSize, k, 0, 0);
+        cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
        return blockSize;
    }

-    inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize){
+    inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
+    {
        return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
    }

@@ -557,14 +566,14 @@ computeLowerSolveLevelSet(T* reorderedMat,
 template <class T, int blocksize>
 void
 computeLowerSolveLevelSetSplit(T* reorderedMat,
-                          int* rowIndices,
-                          int* colIndices,
-                          int* indexConversion,
-                          int startIdx,
-                          int rowsInLevelSet,
-                          const T* dInv,
-                          const T* d,
-                          T* v)
+                               int* rowIndices,
+                               int* colIndices,
+                               int* indexConversion,
+                               int startIdx,
+                               int rowsInLevelSet,
+                               const T* dInv,
+                               const T* d,
+                               T* v)
 {
     int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
     int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
@@ -590,13 +599,13 @@ computeUpperSolveLevelSet(T* reorderedMat,
 template <class T, int blocksize>
 void
 computeUpperSolveLevelSetSplit(T* reorderedMat,
-                          int* rowIndices,
-                          int* colIndices,
-                          int* indexConversion,
-                          int startIdx,
-                          int rowsInLevelSet,
-                          const T* dInv,
-                          T* v)
+                               int* rowIndices,
+                               int* colIndices,
+                               int* indexConversion,
+                               int startIdx,
+                               int rowsInLevelSet,
+                               const T* dInv,
+                               T* v)
 {
     int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
     int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
@@ -633,34 +642,33 @@ computeDiluDiagonal(T* reorderedMat,
 template <class T, int blocksize>
 void
 computeDiluDiagonalSplit(T* reorderedLowerMat,
-                    int* lowerRowIndices,
-                    int* lowerColIndices,
-                    T* reorderedUpperMat,
-                    int* upperRowIndices,
-                    int* upperColIndices,
-                    T* diagonal,
-                    int* reorderedToNatural,
-                    int* naturalToReordered,
-                    const int startIdx,
-                    int rowsInLevelSet,
-                    T* dInv)
+                         int* lowerRowIndices,
+                         int* lowerColIndices,
+                         T* reorderedUpperMat,
+                         int* upperRowIndices,
+                         int* upperColIndices,
+                         T* diagonal,
+                         int* reorderedToNatural,
+                         int* naturalToReordered,
+                         const int startIdx,
+                         int rowsInLevelSet,
+                         T* dInv)
 {
     if (blocksize <= 3) {
         int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
         int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
-        cuComputeDiluDiagonalSplit<T, blocksize>
-            <<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
-                                                 lowerRowIndices,
-                                                 lowerColIndices,
-                                                 reorderedUpperMat,
-                                                 upperRowIndices,
-                                                 upperColIndices,
-                                                 diagonal,
-                                                 reorderedToNatural,
-                                                 naturalToReordered,
-                                                 startIdx,
-                                                 rowsInLevelSet,
-                                                 dInv);
+        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
+                                                                                     lowerRowIndices,
+                                                                                     lowerColIndices,
+                                                                                     reorderedUpperMat,
+                                                                                     upperRowIndices,
+                                                                                     upperColIndices,
+                                                                                     diagonal,
+                                                                                     reorderedToNatural,
+                                                                                     naturalToReordered,
+                                                                                     startIdx,
+                                                                                     rowsInLevelSet,
+                                                                                     dInv);
     } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
     }
@@ -677,24 +685,41 @@ copyMatDataToReordered(
 template <class T, int blocksize>
 void
-copyMatDataToReorderedSplit(
-    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows)
+copyMatDataToReorderedSplit(T* srcMatrix,
+                            int* srcRowIndices,
+                            int* srcColumnIndices,
+                            T* dstLowerMatrix,
+                            int* dstLowerRowIndices,
+                            T* dstUpperMatrix,
+                            int* dstUpperRowIndices,
+                            T* dstDiag,
+                            int* naturalToReordered,
+                            size_t numberOfRows)
 {
     int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
     int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
-    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
-        srcMatrix, srcRowIndices, srcColumnIndices, dstLowerMatrix, dstLowerRowIndices, dstUpperMatrix, dstUpperRowIndices, dstDiag, naturalToReordered, numberOfRows);
+    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
+                                                                                 srcRowIndices,
+                                                                                 srcColumnIndices,
+                                                                                 dstLowerMatrix,
+                                                                                 dstLowerRowIndices,
+                                                                                 dstUpperMatrix,
+                                                                                 dstUpperRowIndices,
+                                                                                 dstDiag,
+                                                                                 naturalToReordered,
+                                                                                 numberOfRows);
 }

-#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
-    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
-    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t); \
-    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t); \
-    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*); \
-    template void computeDiluDiagonalSplit<T, blocksize>(T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*);\
-    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
-    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*); \
-    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
+#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
+    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);                                 \
+    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t);                             \
+    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t);    \
+    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*);                  \
+    template void computeDiluDiagonalSplit<T, blocksize>(                                                             \
+        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*);                                          \
+    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*);              \
+    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*);    \
+    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*);         \
     template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*);

 INSTANTIATE_KERNEL_WRAPPERS(float, 1);
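getCudaRecomendedThreadBlockSize above is a thin wrapper around the CUDA runtime's occupancy API, and getNumberOfBlocks is the usual round-up division. A self-contained sketch of the same launch-configuration pattern (dummyKernel and its arguments are placeholders, not kernels from this patch):

#include <cuda_runtime.h>

__global__ void dummyKernel(double* x, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0; // stand-in for real work
    }
}

void launchWithRecommendedBlockSize(double* x, int n)
{
    int blockSize = 0;
    int minGridSize = 0; // grid size for full occupancy; not needed here
    // Ask the runtime for the block size that maximizes occupancy of this
    // kernel, as getCudaRecomendedThreadBlockSize does in this patch.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, dummyKernel, 0, 0);
    // Round up so nBlocks * blockSize covers all n threads,
    // as getNumberOfBlocks does in this patch.
    const int nBlocks = (n + blockSize - 1) / blockSize;
    dummyKernel<<<nBlocks, blockSize>>>(x, n);
}

One detail worth noting when reading the wrappers: computeUpperSolveLevelSetSplit, computeDiluDiagonalSplit, and copyMatDataToReorderedSplit all query the occupancy of cuComputeLowerSolveLevelSetSplit rather than of the kernel they actually launch. The resulting block size is still a legal launch configuration, but it is tuned for a different kernel.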
diff --git a/tests/cuistl/test_cudilu.cpp b/tests/cuistl/test_cudilu.cpp
index 221948220..d18fdaf66 100644
--- a/tests/cuistl/test_cudilu.cpp
+++ b/tests/cuistl/test_cudilu.cpp
@@ -24,12 +24,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
@@ -44,47 +44,49 @@ using Sp2x2BlockMatrix = Dune::BCRSMatrix;
 using CuMatrix = Opm::cuistl::CuSparseMatrix;
 using CuIntVec = Opm::cuistl::CuVector;
 using CuFloatingPointVec = Opm::cuistl::CuVector;
-using CuDilu1x1 = Opm::cuistl::CuDILU;
-using CuDilu2x2 = Opm::cuistl::CuDILU;
+using CuDilu1x1 = Opm::cuistl::CuDILU;
+using CuDilu2x2 = Opm::cuistl::CuDILU;

-Sp1x1BlockMatrix get1x1BlockTestMatrix(){
-/*
-   matA:
-   1 2 0 3 0 0
-   4 5 0 6 0 7
-   0 0 8 0 0 0
-   9 10 0 11 12 0
-   0 0 0 13 14 0
-   0 15 0 0 0 16
+Sp1x1BlockMatrix
+get1x1BlockTestMatrix()
+{
+    /*
+        matA:
+        1 2 0 3 0 0
+        4 5 0 6 0 7
+        0 0 8 0 0 0
+        9 10 0 11 12 0
+        0 0 0 13 14 0
+        0 15 0 0 0 16

-   Expected reordering:
-   1 2 0 3 0 0
-   0 0 8 0 0 0
-   4 5 0 6 0 7
-   9 10 0 11 12 0
-   0 15 0 0 0 16
-   0 0 0 13 14 0
+        Expected reordering:
+        1 2 0 3 0 0
+        0 0 8 0 0 0
+        4 5 0 6 0 7
+        9 10 0 11 12 0
+        0 15 0 0 0 16
+        0 0 0 13 14 0

-   Expected lowerTriangularReorderedMatrix:
-   0 0 0 0 0 0
-   0 0 0 0 0 0
-   4 0 0 0 0 0
-   9 10 0 0 0 0
-   0 15 0 0 0 0
-   0 0 0 13 0 0
+        Expected lowerTriangularReorderedMatrix:
+        0 0 0 0 0 0
+        0 0 0 0 0 0
+        4 0 0 0 0 0
+        9 10 0 0 0 0
+        0 15 0 0 0 0
+        0 0 0 13 0 0

-   Expected lowerTriangularReorderedMatrix:
-   0 2 0 3 0 0
-   0 0 0 0 0 0
-   0 0 0 6 0 7
-   0 0 0 0 12 0
-   0 0 0 0 0 0
-*/
+        Expected upperTriangularReorderedMatrix:
+        0 2 0 3 0 0
+        0 0 0 0 0 0
+        0 0 0 6 0 7
+        0 0 0 0 12 0
+        0 0 0 0 0 0
+        0 0 0 0 0 0
+    */
     const int N = 6;
     const int nonZeroes = 16;
-    //Create the Dune A matrix
+    // Create the Dune A matrix
     Sp1x1BlockMatrix matA(N, N, nonZeroes, Sp1x1BlockMatrix::row_wise);
     for (auto row = matA.createbegin(); row != matA.createend(); ++row) {
         row.insert(row.index());
@@ -132,7 +134,9 @@ Sp1x1BlockMatrix get1x1BlockTestMatrix(){
     return matA;
 }

-Sp2x2BlockMatrix get2x2BlockTestMatrix(){
+Sp2x2BlockMatrix
+get2x2BlockTestMatrix()
+{
     /*
     matA:
     1 2 0 3 0 0
     4 5 0 6 0 7
     0 0 8 0 0 0
     9 10 0 11 12 0
     0 0 0 13 14 0
     0 15 0 0 0 16
     */
     const int N = 3;
     const int nonZeroes = 9;
-    //Create the Dune A matrix
+    // Create the Dune A matrix
     Sp2x2BlockMatrix matA(N, N, nonZeroes, Sp2x2BlockMatrix::row_wise);
     for (auto row = matA.createbegin(); row != matA.createend(); ++row) {
         row.insert(row.index());
@@ -215,13 +219,13 @@ BOOST_AUTO_TEST_CASE(TestDiluApply)

     // put results in std::vector
     std::vector cpudilures;
-    for (auto e : h_output){
+    for (auto e : h_output) {
         cpudilures.push_back(e);
     }
     auto cudilures = d_output.asStdVector();

     // check that CuDilu results matches that of CPU dilu
-    for (size_t i = 0; i < cudilures.size(); ++i){
+    for (size_t i = 0; i < cudilures.size(); ++i) {
         BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
     }
 }
@@ -255,14 +259,14 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked)

     auto cudilures = d_output.asStdVector();
     std::vector cpudilures;
-    for (auto v : h_output){
-        for (auto e : v){
+    for (auto v : h_output) {
+        for (auto e : v) {
             cpudilures.push_back(e);
         }
     }

     // check that the values are close
-    for (size_t i = 0; i < cudilures.size(); ++i){
+    for (size_t i = 0; i < cudilures.size(); ++i) {
         BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
     }
 }
@@ -316,13 +320,13 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge)

     // put results in std::vector
     std::vector cpudilures;
-    for (auto e : h_output){
+    for (auto e : h_output) {
         cpudilures.push_back(e);
     }
     auto cudilures = d_output.asStdVector();

     // check that CuDilu results matches that of CPU dilu
-    for (size_t i = 0; i < cudilures.size(); ++i){
+    for (size_t i = 0; i < cudilures.size(); ++i) {
         BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
     }
 }
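For readers new to this code, a sketch of how the preconditioner in this patch is driven, mirroring the constructor CuDILU(const M& A, bool split_matrix) and apply(X& v, const Y& d) seen above. The CuDILU template argument list, the CuVector construction from a std::vector, and the update() entry point are assumptions based on the test file and the Dune preconditioner interface, not guaranteed by this patch:

#include <vector>

#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>

int main()
{
    using M = Dune::BCRSMatrix<Dune::FieldMatrix<double, 1, 1>>;
    using V = Opm::cuistl::CuVector<double>;

    // Tiny 2x2 toy matrix in natural ordering, assembled Dune-style.
    M A(2, 2, 4, M::row_wise);
    for (auto row = A.createbegin(); row != A.createend(); ++row) {
        row.insert(0);
        row.insert(1);
    }
    A[0][0] = 4.0; A[0][1] = 1.0;
    A[1][0] = 1.0; A[1][1] = 4.0;

    // split_matrix = true selects the separate lower/diagonal/upper storage
    // added by the split code paths in this patch.
    Opm::cuistl::CuDILU<M, V, V> prec(A, /*split_matrix=*/true);

    V d(std::vector<double> {1.0, 2.0}); // right-hand side, copied to the GPU
    V v(2);                              // preconditioned result
    prec.apply(v, d);                    // v = M^{-1} d via the two level-set sweeps
    prec.update();                       // assumed re-factorization hook after A's values change
    return 0;
}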