Reordering is now actually skipped when NONE is chosen.

Also made reordering optional for openclSolver.
2025-02-25 18:55:30 -06:00 · 2021-01-11 16:51:05 +01:00 · 2021-01-11 16:51:05 +01:00 · f26e58c6ca
commit f26e58c6ca
parent 71692ff52b
4 changed files with 80 additions and 45 deletions
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -53,9 +53,12 @@ namespace bda
    {
        delete[] invDiagVals;
        delete[] diagIndex;
+        if (opencl_ilu_reorder != ILUReorder::NONE) {
            delete[] toOrder;
            delete[] fromOrder;
        }
+        delete[] LUmat->nnzValues;
+    }

    template <unsigned int block_size>
    bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
@ -67,11 +70,18 @@ namespace bda
        this->nnz = mat->nnzbs * block_size * block_size;
        this->nnzbs = mat->nnzbs;

+        int *CSCRowIndices = nullptr;
+        int *CSCColPointers = nullptr;
+
+        if (opencl_ilu_reorder == ILUReorder::NONE) {
+            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
+        } else {
            toOrder = new int[Nb];
            fromOrder = new int[Nb];
-
-        int *CSCRowIndices = new int[nnzbs];
-        int *CSCColPointers = new int[Nb + 1];
+            CSCRowIndices = new int[nnzbs];
+            CSCColPointers = new int[Nb + 1];
+            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
+            LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);

            Timer t_convert;
            csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb);
@ -80,10 +90,9 @@ namespace bda
                out << "BILU0 convert CSR to CSC: " << t_convert.stop() << " s";
                OpmLog::info(out.str());
            }
+        }

        Timer t_analysis;
-        rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
-        LUmat = std::make_unique<BlockedMatrix<block_size> >(*rmat);
        std::ostringstream out;
        if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
            out << "BILU0 reordering strategy: " << "level_scheduling\n";
@ -93,15 +102,11 @@ namespace bda
            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
            out << "BILU0 reordering strategy: none\n";
-            numColors = 1;
-            rowsPerColor.emplace_back(Nb);
-            // numColors = Nb;
-            // for(int i = 0; i < Nb; ++i){
-            //     rowsPerColor.emplace_back(1);
-            // }
+            // numColors = 1;
+            // rowsPerColor.emplace_back(Nb);
+            numColors = Nb;
            for(int i = 0; i < Nb; ++i){
-                toOrder[i] = i;
-                fromOrder[i] = i;
+                rowsPerColor.emplace_back(1);
            }
        } else {
            OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
@ -114,8 +119,10 @@ namespace bda
        }
        OpmLog::info(out.str());

+        if (opencl_ilu_reorder != ILUReorder::NONE) {
            delete[] CSCRowIndices;
            delete[] CSCColPointers;
+        }

        diagIndex = new int[mat->Nb];
        invDiagVals = new double[mat->Nb * bs * bs];
@ -385,6 +392,10 @@ namespace bda
    bool BILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
    {
        const unsigned int bs = block_size;
+        auto *m = mat;
+
+        if (opencl_ilu_reorder != ILUReorder::NONE) {
+            m = rmat.get();
            Timer t_reorder;
            reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());

@ -393,10 +404,12 @@ namespace bda
                out << "BILU0 reorder matrix: " << t_reorder.stop() << " s";
                OpmLog::info(out.str());
            }
+        }

        // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+        // this copy can have mat or rmat ->nnzValues as origin, depending on the reorder strategy
        Timer t_copy;
-        memcpy(LUmat->nnzValues, rmat->nnzValues, sizeof(double) * bs * bs * rmat->nnzbs);
+        memcpy(LUmat->nnzValues, m->nnzValues, sizeof(double) * bs * bs * m->nnzbs);

        if (verbosity >= 3){
            std::ostringstream out;
--- a/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
+++ b/opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
@ -126,6 +126,7 @@ public:
    /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
    /// Those indices need to be mapped via toOrder
    /// \param[in] toOrder    array with mappings
+    /// \param[in] reorder    whether reordering is actually used or not
    void setReordering(int *toOrder, bool reorder);
 };

--- a/opm/simulators/linalg/bda/openclSolverBackend.cpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp
@ -44,7 +44,7 @@ using Opm::OpmLog;
 using Dune::Timer;

 template <unsigned int block_size>
-openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
+openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
    prec = new Preconditioner(opencl_ilu_reorder, verbosity_);

    cl_int err = CL_SUCCESS;
@ -485,7 +485,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        prec->setOpenCLContext(context.get());
        prec->setOpenCLQueue(queue.get());

-        rb = new double[N];
        tmp = new double[N];
 #if COPY_ROW_BY_ROW
        vals_contiguous = new double[N];
@ -508,7 +507,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
        d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
        d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));

+        bool reorder = (opencl_ilu_reorder != ILUReorder::NONE);
+        if (reorder) {
+            rb = new double[N];
            d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
+        }
+        wcontainer->init(wellContribs, N, Nb, reorder);

        // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
        // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -544,7 +548,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
        delete[] rb;
+    }
    delete[] tmp;
 #if COPY_ROW_BY_ROW
    delete[] vals_contiguous;
@ -572,7 +578,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
    queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
    queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
    queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
        queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
+    }
    queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
    event.wait();

@ -624,9 +632,13 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
    int lmem_per_work_group = work_group_size * sizeof(double);
    prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);

+    if (opencl_ilu_reorder == ILUReorder::NONE) {
+        rmat = mat.get();
+    } else {
        toOrder = prec->getToOrder();
        fromOrder = prec->getFromOrder();
        rmat = prec->getRMat();
+    }

    if (verbosity > 2) {
        std::ostringstream out;
@ -645,7 +657,11 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
    Timer t;

    mat->nnzValues = vals;
+    if (opencl_ilu_reorder != ILUReorder::NONE) {
        reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+    } else {
+        rb = b;
+    }

    if (verbosity > 2) {
        std::ostringstream out;
@ -703,8 +719,12 @@ template <unsigned int block_size>
 void openclSolverBackend<block_size>::get_result(double *x) {
    Timer t;

+    if (opencl_ilu_reorder != ILUReorder::NONE) {
        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, rb);
        reorderBlockedVectorByPattern<block_size>(mat->Nb, rb, toOrder, x);
+    } else {
+        queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, x);
+    }

    if (verbosity > 2) {
        std::ostringstream out;
--- a/opm/simulators/linalg/bda/openclSolverBackend.hpp
+++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp
@ -51,7 +51,7 @@ class openclSolverBackend : public BdaSolver<block_size>
    using Base::initialized;

 private:
-    double *rb = nullptr;                 // reordered b vector, the matrix is reordered, so b must also be
+    double *rb = nullptr;                 // reordered b vector, if the matrix is reordered, rb is newly allocated, otherwise it just points to b
    double *vals_contiguous = nullptr;    // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp

    // OpenCL variables must be reusable, they are initialized in initialize()
@ -59,7 +59,7 @@ private:
    cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p;   // vectors, used during linear solve
    cl::Buffer d_pw, d_s, d_t, d_v;              // vectors, used during linear solve
    cl::Buffer d_tmp;                            // used as tmp GPU buffer for dot() and norm()
-    cl::Buffer d_toOrder;
+    cl::Buffer d_toOrder;                        // only used when reordering is used
    double *tmp = nullptr;                       // used as tmp CPU buffer for dot() and norm()

    // shared pointers are also passed to other objects
@ -81,7 +81,8 @@ private:
    int *toOrder = nullptr, *fromOrder = nullptr;                 // BILU0 reorders rows of the matrix via these mappings
    bool analysis_done = false;
    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;    // original matrix 
-    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix, used for spmv
+    BlockedMatrix<block_size> *rmat = nullptr;                    // reordered matrix (or original if no reordering), used for spmv
+    ILUReorder opencl_ilu_reorder;                                // reordering strategy

    /// Divide A by B, and round up: return (int)ceil(A/B)
    /// \param[in] A    dividend