Do not reorder the matrix for GPU

2025-02-25 18:55:30 -06:00 · 2022-09-22 10:15:23 +02:00
parent 2690917e93
commit e327142088
12 changed files with 76 additions and 126 deletions
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@@ -93,16 +93,16 @@ void reorderBlockedVectorByPattern(int Nb, double *vector, int *fromOrder, doubl
 bool canBeStarted(const int rowIndex, const  int *rowPointers, const  int *colIndices, const std::vector<bool>& doneRows);

 /// Find a level scheduling reordering for an input matrix
-/// The toOrder and fromOrder   arrays must be allocated already
+/// The toOrder and fromOrder arrays must be allocated already
 /// \param[in] CSRColIndices    column indices array, obtained from storing the input matrix in the CSR format
 /// \param[in] CSRRowPointers   row pointers array, obtained from storing the input matrix in the CSR format
 /// \param[in] CSCRowIndices    row indices array, obtained from storing the input matrix in the CSC format
 /// \param[in] CSCColPointers   column pointers array, obtained from storing the input matrix in the CSC format
 /// \param[in] Nb               number of blockrows in the matrix
 /// \param[out] numColors       a pointer to the number of colors needed for the level scheduling
-/// \param[inout] toOrder       the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
-/// \param[inout] fromOrder     the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
-/// \param[inout] rowsPerColor  for each used color, the number of rows assigned to that color, this function uses emplace_back() to fill
+/// \param[out] toOrder         the reorder pattern that was found, which lists for each index in the original order, to which index in the new order it should be moved
+/// \param[out] fromOrder       the reorder pattern that was found, which lists for each index in the new order, from which index in the original order it was moved
+/// \param[out] rowsPerColor    for each color, an array of all rowIndices in that color, this function uses emplace_back() to fill
 void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector<int>& rowsPerColor);

 /// Find a graph coloring reordering for an input matrix
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@@ -78,14 +78,8 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
        fromOrder.resize(Nb);
        CSCRowIndices.resize(matToDecompose->nnzbs);
        CSCColPointers.resize(Nb + 1);
-        rmat = std::make_shared<BlockedMatrix>(mat->Nb, mat->nnzbs, block_size);

-        if (jacMat) {
-            rJacMat = std::make_shared<BlockedMatrix>(jacMat->Nb, jacMat->nnzbs, block_size);
-            LUmat = std::make_unique<BlockedMatrix>(*rJacMat);
-        } else {
-            LUmat = std::make_unique<BlockedMatrix>(*rmat);
-        }
+        LUmat = std::make_unique<BlockedMatrix>(*matToDecompose);

        Timer t_convert;
        csrPatternToCsc(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb);
@@ -101,17 +95,9 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
        out << "BILU0 reordering strategy: " << "level_scheduling\n";
        findLevelScheduling(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
-        if (jacMat) {
-            reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
-        }
    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
        out << "BILU0 reordering strategy: " << "graph_coloring\n";
        findGraphColoring<block_size>(matToDecompose->colIndices, matToDecompose->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), Nb, Nb, Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
-        reorderBlockedMatrixByPattern(mat, reordermappingNonzeroes, toOrder.data(), fromOrder.data(), rmat.get());
-        if (jacMat) {
-            reorderBlockedMatrixByPattern(jacMat, jacReordermappingNonzeroes, toOrder.data(), fromOrder.data(), rJacMat.get());
-        }
    } else if (opencl_ilu_reorder == ILUReorder::NONE) {
        out << "BILU0 reordering strategy: none\n";
        // numColors = 1;
@@ -143,6 +129,7 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
    s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb);
    s.rowsPerColor = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (numColors + 1));
    s.diagIndex = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * LUmat->Nb);
+    s.rowIndices = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned) * LUmat->Nb);
 #if CHOW_PATEL
    s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
    s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
@@ -156,14 +143,16 @@ bool BILU0<block_size>::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat
    s.LUrows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (LUmat->Nb + 1));
 #endif

-    events.resize(2);
+    events.resize(3);
    err = queue->enqueueWriteBuffer(s.invDiagVals, CL_FALSE, 0, mat->Nb * sizeof(double) * bs * bs, invDiagVals.data(), nullptr, &events[0]);

    rowsPerColorPrefix.resize(numColors + 1); // resize initializes value 0.0
    for (int i = 0; i < numColors; ++i) {
-        rowsPerColorPrefix[i+1] = rowsPerColorPrefix[i] + rowsPerColor[i];
+        rowsPerColorPrefix[i + 1] = rowsPerColorPrefix[i] + rowsPerColor[i];
    }
+
    err |= queue->enqueueWriteBuffer(s.rowsPerColor, CL_FALSE, 0, (numColors + 1) * sizeof(int), rowsPerColorPrefix.data(), nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(s.rowIndices, CL_FALSE, 0, Nb * sizeof(unsigned), fromOrder.data(), nullptr, &events[2]);

    cl::WaitForEvents(events);
    events.clear();
@@ -189,27 +178,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
 {
    const unsigned int bs = block_size;

-    auto *matToDecompose = jacMat;
-
-    if (opencl_ilu_reorder == ILUReorder::NONE) { // NONE should only be used in debug
-        matToDecompose = jacMat ? jacMat : mat;
-    } else {
-        Timer t_reorder;
-        if (jacMat) {
-            matToDecompose = rJacMat.get();
-            reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
-            reorderNonzeroes(jacMat, jacReordermappingNonzeroes, rJacMat.get());
-        } else {
-            matToDecompose = rmat.get();
-            reorderNonzeroes(mat, reordermappingNonzeroes, rmat.get());
-        }
-
-        if (verbosity >= 3){
-            std::ostringstream out;
-            out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
-            OpmLog::info(out.str());
-        }
-    }
+    auto *matToDecompose = jacMat ? jacMat : mat;

    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
@@ -269,11 +238,11 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
    std::ostringstream out;
    for (int color = 0; color < numColors; ++color) {
        const unsigned int firstRow = rowsPerColorPrefix[color];
-        const unsigned int lastRow = rowsPerColorPrefix[color+1];
+        const unsigned int lastRow = rowsPerColorPrefix[color + 1];
        if (verbosity >= 5) {
            out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
        }
-        OpenclKernels::ILU_decomp(firstRow, lastRow,
+        OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
                                  s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
                                  s.invDiagVals, rowsPerColor[color], block_size);
    }
@@ -301,11 +270,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)

    for (int color = 0; color < numColors; ++color) {
 #if CHOW_PATEL
-        OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
+        OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
                                  s.diagIndex, y, x, s.rowsPerColor,
                                  color, rowsPerColor[color], block_size);
 #else
-        OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
+        OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
                                  s.diagIndex, y, x, s.rowsPerColor,
                                  color, rowsPerColor[color], block_size);
 #endif
@@ -313,11 +282,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)

    for (int color = numColors - 1; color >= 0; --color) {
 #if CHOW_PATEL
-        OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
+        OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
                                  s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
                                  color, rowsPerColor[color], block_size);
 #else
-        OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
+        OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
                                  s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
                                  color, rowsPerColor[color], block_size);
 #endif
--- a/opm/simulators/linalg/bda/opencl/BILU0.hpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.hpp
@@ -54,8 +54,6 @@ class BILU0 : public Preconditioner<block_size>

 private:
    std::unique_ptr<BlockedMatrix> LUmat = nullptr;
-    std::shared_ptr<BlockedMatrix> rmat = nullptr; // only used with PAR_SIM
-    std::shared_ptr<BlockedMatrix> rJacMat = nullptr; 
 #if CHOW_PATEL
    std::unique_ptr<BlockedMatrix> Lmat = nullptr, Umat = nullptr;
 #endif
@@ -76,6 +74,7 @@ private:
        cl::Buffer invDiagVals;
        cl::Buffer diagIndex;
        cl::Buffer rowsPerColor;
+        cl::Buffer rowIndices;
 #if CHOW_PATEL
        cl::Buffer Lvals, Lcols, Lrows;
        cl::Buffer Uvals, Ucols, Urows;
@@ -117,12 +116,7 @@ public:

    BlockedMatrix* getRMat() override
    {
-        return rmat.get();
-    }
-
-    BlockedMatrix* getRJacMat()
-    {
-        return rJacMat.get();
+        return LUmat.get();
    }

    std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> get_preconditioner_structure()
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl
@@ -3,6 +3,7 @@
 /// Here, L is it's own BSR matrix.
 /// Only used with ChowPatelIlu.
 __kernel void ILU_apply1(
+    __global const unsigned *rowIndices,
    __global const double *LUvals,
    __global const unsigned int *LUcols,
    __global const unsigned int *LUrows,
@@ -30,14 +31,16 @@ __kernel void ILU_apply1(
    target_block_row += nodesPerColorPrefix[color];

    while(target_block_row < nodesPerColorPrefix[color+1]){
-        const unsigned int first_block = LUrows[target_block_row];
-        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned row_idx = rowIndices[target_block_row];
+
+        const unsigned int first_block = LUrows[row_idx];
+        const unsigned int last_block = LUrows[row_idx+1];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
-                local_out = y[target_block_row*bs+lane];
+                local_out = y[row_idx*bs+lane];
            }
            for(; block < last_block; block += num_blocks_per_warp){
                const double x_elem = x[LUcols[block]*bs + c];
@@ -60,7 +63,7 @@ __kernel void ILU_apply1(
        }

        if(lane < bs){
-            const unsigned int row = target_block_row*bs + lane;
+            const unsigned int row = row_idx*bs + lane;
            x[row] = tmp[lane];
        }

--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
@@ -3,6 +3,7 @@
 /// Here, L is inside a normal, square matrix.
 /// In this case, diagIndex indicates where the rows of L end.
 __kernel void ILU_apply1(
+    __global const unsigned *rowIndices,
    __global const double *LUvals,
    __global const unsigned int *LUcols,
    __global const unsigned int *LUrows,
@@ -29,14 +30,17 @@ __kernel void ILU_apply1(
    const unsigned int r = lane % bs;

    while(target_block_row < nodesPerColorPrefix[color+1]){
-        const unsigned int first_block = LUrows[target_block_row];
-        const unsigned int last_block = diagIndex[target_block_row];
+        unsigned row_idx = rowIndices[target_block_row];
+
+        const unsigned int first_block = LUrows[row_idx];
+        const unsigned int last_block = diagIndex[row_idx];
+
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
-                local_out = y[target_block_row*bs+lane];
+                local_out = y[row_idx*bs+lane];
            }
            for(; block < last_block; block += num_blocks_per_warp){
                const double x_elem = x[LUcols[block]*bs + c];
@@ -59,7 +63,7 @@ __kernel void ILU_apply1(
        }

        if(lane < bs){
-            const unsigned int row = target_block_row*bs + lane;
+            const unsigned int row = row_idx*bs + lane;
            x[row] = tmp[lane];
        }

--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl
@@ -3,6 +3,7 @@
 /// Here, U is it's own BSR matrix.
 /// Only used with ChowPatelIlu.
 __kernel void ILU_apply2(
+    __global const unsigned *rowIndices,
    __global const double *LUvals,
    __global const int *LUcols,
    __global const int *LUrows,
@@ -29,14 +30,15 @@ __kernel void ILU_apply2(
    const unsigned int r = lane % bs;

    while(target_block_row < nodesPerColorPrefix[color+1]){
-        const unsigned int first_block = LUrows[target_block_row];
-        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned row_idx = rowIndices[target_block_row];
+        const unsigned int first_block = LUrows[row_idx];
+        const unsigned int last_block = LUrows[row_idx+1];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
-                const unsigned int row = target_block_row*bs+lane;
+                const unsigned int row = row_idx*bs+lane;
                local_out = x[row];
            }
            for(; block < last_block; block += num_blocks_per_warp){
@@ -64,10 +66,10 @@ __kernel void ILU_apply2(
            tmp[lane + bs*idx_t/warpsize] = local_out;
            double sum = 0.0;
            for(int i = 0; i < bs; ++i){
-                sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
            }

-            const unsigned int row = target_block_row*bs + lane;
+            const unsigned int row = row_idx*bs + lane;
            x[row] = sum;
        }

--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
@@ -3,6 +3,7 @@
 /// Here, U is inside a normal, square matrix.
 /// In this case diagIndex indicates where the rows of U start.
 __kernel void ILU_apply2(
+    __global const unsigned *rowIndices,
    __global const double *LUvals,
    __global const int *LUcols,
    __global const int *LUrows,
@@ -30,14 +31,15 @@ __kernel void ILU_apply2(
    target_block_row += nodesPerColorPrefix[color];

    while(target_block_row < nodesPerColorPrefix[color+1]){
-        const unsigned int first_block = diagIndex[target_block_row] + 1;
-        const unsigned int last_block = LUrows[target_block_row+1];
+        unsigned row_idx = rowIndices[target_block_row];
+        const unsigned int first_block = diagIndex[row_idx] + 1;
+        const unsigned int last_block = LUrows[row_idx+1];
        unsigned int block = first_block + lane / (bs*bs);
        double local_out = 0.0;

        if(lane < num_active_threads){
            if(lane < bs){
-                const unsigned int row = target_block_row*bs+lane;
+                const unsigned int row = row_idx*bs+lane;
                local_out = x[row];
            }
            for(; block < last_block; block += num_blocks_per_warp){
@@ -65,10 +67,10 @@ __kernel void ILU_apply2(
            tmp[lane + bs*idx_t/warpsize] = local_out;
            double sum = 0.0;
            for(int i = 0; i < bs; ++i){
-                sum += invDiagVals[target_block_row*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
+                sum += invDiagVals[row_idx*bs*bs + i + lane*bs] * tmp[i + bs*idx_t/warpsize];
            }

-            const unsigned int row = target_block_row*bs + lane;
+            const unsigned int row = row_idx*bs + lane;
            x[row] = sum;
        }

--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_decomp.cl
@@ -68,6 +68,7 @@ __kernel void inverter(__global double *matrix, __global double *inverse)
 /// The kernel takes a full BSR matrix and performs inplace ILU decomposition
 __kernel void ilu_decomp(const unsigned int firstRow,
                         const unsigned int lastRow,
+                         __global const unsigned *rowIndices,
                         __global double *LUvals,
                         __global const int *LUcols,
                         __global const int *LUrows,
@@ -91,14 +92,15 @@ __kernel void ilu_decomp(const unsigned int firstRow,
    // go through all rows
    for (int i = firstRow + work_group_id * hwarps_per_group + hwarp_id_in_group; i < lastRow; i += num_groups * hwarps_per_group)
    {
-        int iRowStart = LUrows[i];
-        int iRowEnd = LUrows[i + 1];
+        const unsigned row = rowIndices[i];
+        int iRowStart = LUrows[row];
+        int iRowEnd = LUrows[row + 1];

        // go through all elements of the row
        for (int ij = iRowStart; ij < iRowEnd; ij++) {
            int j = LUcols[ij];

-            if (j < i) {
+            if (j < row) {
                // calculate the pivot of this row
                block_mult(LUvals + ij * bs * bs, invDiagVals + j * bs * bs, pivot + lmem_offset);

@@ -127,11 +129,11 @@ __kernel void ilu_decomp(const unsigned int firstRow,
        }

        // store the inverse in the diagonal
-        inverter(LUvals + diagIndex[i] * bs * bs, invDiagVals + i * bs * bs);
+        inverter(LUvals + diagIndex[row] * bs * bs, invDiagVals + row * bs * bs);

        // copy inverse
        if (thread_id_in_hwarp < bs * bs) {
-            LUvals[diagIndex[i] * bs * bs + thread_id_in_hwarp] = invDiagVals[i * bs * bs + thread_id_in_hwarp];
+            LUvals[diagIndex[row] * bs * bs + thread_id_in_hwarp] = invDiagVals[row * bs * bs + thread_id_in_hwarp];
        }
    }
 }
--- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp
@@ -390,7 +390,7 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
    }
 }

-void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
+void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
                               cl::Buffer& rows, cl::Buffer& diagIndex,
                               const cl::Buffer& y, cl::Buffer& x,
                               cl::Buffer& rowsPerColor, int color,
@@ -403,8 +403,8 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
    Timer t_ilu_apply1;

    cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                          vals, cols, rows, diagIndex, y, x,
-                          rowsPerColor, color, block_size,
+                          rowIndices, vals, cols, rows, diagIndex,
+                          y, x, rowsPerColor, color, block_size,
                          cl::Local(lmem_per_work_group));

    if (verbosity >= 5) {
@@ -415,7 +415,7 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
    }
 }

-void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
+void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
                               cl::Buffer& rows, cl::Buffer& diagIndex,
                               cl::Buffer& invDiagVals, cl::Buffer& x,
                               cl::Buffer& rowsPerColor, int color,
@@ -428,8 +428,8 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
    Timer t_ilu_apply2;

    cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                          vals, cols, rows, diagIndex, invDiagVals,
-                          x, rowsPerColor, color, block_size,
+                          rowIndices, vals, cols, rows, diagIndex,
+                          invDiagVals, x, rowsPerColor, color, block_size,
                          cl::Local(lmem_per_work_group));

    if (verbosity >= 5) {
@@ -440,7 +440,7 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
    }
 }

-void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
+void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
                               cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
                               cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
                               int rowsThisColor, unsigned int block_size)
@@ -453,7 +453,8 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
    Timer t_ilu_decomp;

    cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                          firstRow, lastRow, vals, cols, rows,
+                          firstRow, lastRow, rowIndices,
+                          vals, cols, rows,
                          invDiagVals, diagIndex, rowsThisColor,
                          cl::Local(lmem_per_work_group));

--- a/opm/simulators/linalg/bda/opencl/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclKernels.hpp
@@ -39,9 +39,9 @@ using residual_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&,
                                         cl::Buffer&, const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
 using residual_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
                                         cl::Buffer&, const cl::Buffer&, cl::Buffer&, cl::LocalSpaceArg>;
-using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
+using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
                                               cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
-using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                               cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
 using stdwell_apply_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
@@ -51,7 +51,7 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
                                                             cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                             const unsigned int, const unsigned int, cl::Buffer&,
                                                             cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
-using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
+using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                               cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
 using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
@@ -135,13 +135,13 @@ public:
    static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true, bool add = false);
    static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);

-    static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
+    static void ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
        const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);

-    static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
+    static void ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex,
        cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);

-    static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
+    static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
        cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);

    static void apply_stdwells_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp
@@ -232,12 +232,6 @@ void openclSolverBackend<block_size>::setOpencl(std::shared_ptr<cl::Context>& co
    queue = queue_;
 }

-template <unsigned int block_size>
-openclSolverBackend<block_size>::~openclSolverBackend() {
-    finalize();
-}
-
-

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@@ -464,7 +458,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>

        bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
        if (reorder) {
-            rb = new double[N];
            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
        }

@@ -482,13 +475,6 @@ void openclSolverBackend<block_size>::initialize(std::shared_ptr<BlockedMatrix>
    initialized = true;
 } // end initialize()

-template <unsigned int block_size>
-void openclSolverBackend<block_size>::finalize() {
-    if (opencl_ilu_reorder != ILUReorder::NONE) {
-        delete[] rb;
-    }
-} // end finalize()
-
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::copy_system_to_gpu() {
    Timer t;
@@ -508,7 +494,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {

    err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
    err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
-    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[3]);
    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
    if (opencl_ilu_reorder != ILUReorder::NONE) {
        events.resize(6);
@@ -546,7 +532,7 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
    err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
 #endif

-    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
+    err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, h_b, nullptr, &events[1]);
    err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
    cl::WaitForEvents(events);
    events.clear();
@@ -602,13 +588,8 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
    Timer t;

    mat->nnzValues = vals;
-    if (opencl_ilu_reorder != ILUReorder::NONE) {
-        reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
-        static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
-    } else {
-        rb = b;
-        static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
-    }
+    h_b = b;
+    static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);

    if (verbosity > 2) {
        std::ostringstream out;
@@ -670,12 +651,7 @@ template <unsigned int block_size>
 void openclSolverBackend<block_size>::get_result(double *x) {
    Timer t;

-    if (opencl_ilu_reorder != ILUReorder::NONE) {
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
-        reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
-    } else {
-        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
-    }
+    queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);

    if (verbosity > 2) {
        std::ostringstream out;
--- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.hpp
@@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
    using Base::initialized;

 private:
-    double *rb = nullptr;                 // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
+    double *h_b = nullptr;                // b vector, on host
    std::vector<double> vals_contiguous;  // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp

    // OpenCL variables must be reusable, they are initialized in initialize()
@@ -184,9 +184,6 @@ public:
    /// For the CPR coarse solver
    openclSolverBackend(int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);

-    /// Destroy a openclSolver, and free memory
-    ~openclSolverBackend();
-
    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
    /// \param[in] matrix         matrix A
    /// \param[in] b              input vector, contains N values