From 0f9ae3695d90f44476582eeab09fb2a7d8f1779c Mon Sep 17 00:00:00 2001 From: tqiu Date: Wed, 13 Jan 2021 11:54:40 +0100 Subject: [PATCH] Made reordering optional for WellContributions Removed unnecessary alloc --- opm/simulators/linalg/bda/BILU0.cpp | 3 - .../linalg/bda/WellContributions.cpp | 45 ++++++------ .../linalg/bda/WellContributions.hpp | 16 +++- opm/simulators/linalg/bda/openclKernels.hpp | 73 +++++++++++++++++++ .../linalg/bda/openclSolverBackend.cpp | 17 +++-- .../linalg/bda/openclSolverBackend.hpp | 7 +- 6 files changed, 124 insertions(+), 37 deletions(-) diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 165106b50..e5f95ac87 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -57,7 +57,6 @@ namespace bda delete[] toOrder; delete[] fromOrder; } - delete[] LUmat->nnzValues; } template @@ -130,8 +129,6 @@ namespace bda Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); - LUmat->nnzValues = new double[mat->nnzbs * bs * bs]; - s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs); s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs); s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs); diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index 3eca058af..f17c53c3b 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -56,11 +56,6 @@ WellContributions::~WellContributions() if(num_std_wells > 0){ delete[] val_pointers; -#if HAVE_OPENCL - if(opencl_gpu){ - delete[] h_toOrder; - } -#endif } #if HAVE_OPENCL @@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu this->queue = queue_; } -void WellContributions::setKernel(kernel_type *kernel_){ +void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){ this->kernel = kernel_; + this->kernel_no_reorder = kernel_no_reorder_; +} + +void WellContributions::setReordering(int *h_toOrder_, bool reorder_) +{ + this->h_toOrder = h_toOrder_; + this->reorder = reorder_; } void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ @@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe const unsigned int lmem2 = sizeof(double) * dim_wells; cl::Event event; - event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, - cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + if (reorder) { + event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + } else { + event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + *d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + } + event.wait(); } -void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){ +void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){ if(h_x == nullptr){ h_x = new double[N]; h_y = new double[N]; } - if(h_toOrder == nullptr){ - h_toOrder = new int[Nb]; - } - - if(!read_toOrder){ - events.resize(1); - queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]); - events[0].wait(); - events.clear(); - read_toOrder = true; - } - events.resize(2); queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]); queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]); @@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer // actually apply MultisegmentWells for(Opm::MultisegmentWellContribution *well: multisegments){ - well->setReordering(h_toOrder, true); + well->setReordering(h_toOrder, reorder); well->apply(h_x, h_y); } @@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd } if(num_ms_wells > 0){ - apply_mswells(d_x, d_y, d_toOrder); + apply_mswells(d_x, d_y); } } #endif diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index 7267fe1f0..f3f8e3fd2 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -95,17 +95,21 @@ private: typedef cl::make_kernel kernel_type; + typedef cl::make_kernel kernel_type_no_reorder; cl::Context *context; cl::CommandQueue *queue; kernel_type *kernel; + kernel_type_no_reorder *kernel_no_reorder; std::vector events; std::unique_ptr d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl; std::unique_ptr d_Ccols_ocl, d_Bcols_ocl; std::unique_ptr d_val_pointers_ocl; - bool read_toOrder = false; + bool reorder = false; int *h_toOrder = nullptr; #endif @@ -150,10 +154,16 @@ public: #endif #if HAVE_OPENCL - void setKernel(kernel_type *kernel_); + void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_); void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_); + + /// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect + /// Those indices need to be mapped via toOrder + /// \param[in] toOrder array with mappings + /// \param[in] reorder whether reordering is actually used or not + void setReordering(int *toOrder, bool reorder); void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); - void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); + void apply_mswells(cl::Buffer d_x, cl::Buffer d_y); void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder); #endif diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index 021905e18..e3ffadf60 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -440,6 +440,79 @@ namespace bda } )"; + inline const char* stdwell_apply_no_reorder_s = R"( + __kernel void stdwell_apply_no_reorder(__global const double *Cnnzs, + __global const double *Dnnzs, + __global const double *Bnnzs, + __global const int *Ccols, + __global const int *Bcols, + __global const double *x, + __global double *y, + const unsigned int dim, + const unsigned int dim_wells, + __global const unsigned int *val_pointers, + __local double *localSum, + __local double *z1, + __local double *z2){ + int wgId = get_group_id(0); + int wiId = get_local_id(0); + int valSize = val_pointers[wgId + 1] - val_pointers[wgId]; + int valsPerBlock = dim*dim_wells; + int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock; + int numBlocksPerWarp = get_local_size(0)/valsPerBlock; + int c = wiId % dim; + int r = (wiId/dim) % dim_wells; + double temp; + + barrier(CLK_LOCAL_MEM_FENCE); + + localSum[wiId] = 0; + if(wiId < numActiveWorkItems){ + int b = wiId/valsPerBlock + val_pointers[wgId]; + while(b < valSize + val_pointers[wgId]){ + int colIdx = Bcols[b]; + localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c]; + b += numBlocksPerWarp; + } + + if(wiId < valsPerBlock){ + localSum[wiId] += localSum[wiId + valsPerBlock]; + } + + b = wiId/valsPerBlock + val_pointers[wgId]; + + if(c == 0 && wiId < valsPerBlock){ + for(unsigned int stride = 2; stride > 0; stride >>= 1){ + localSum[wiId] += localSum[wiId + stride]; + } + z1[r] = localSum[wiId]; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim_wells){ + temp = 0.0; + for(unsigned int i = 0; i < dim_wells; ++i){ + temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i]; + } + z2[wiId] = temp; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if(wiId < dim*valSize){ + temp = 0.0; + int bb = wiId/dim + val_pointers[wgId]; + for (unsigned int j = 0; j < dim_wells; ++j){ + temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j]; + } + int colIdx = Ccols[bb]; + y[colIdx*dim + c] -= temp; + } + } + )"; + } // end namespace bda #endif diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index 3c7aad43a..5e10ad8ec 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -318,7 +317,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr double norm, norm_0; if(wellContribs.getNumWells() > 0){ - wellContribs.setKernel(stdwell_apply_k.get()); + wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get()); } Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); @@ -479,6 +478,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s))); source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s))); source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s))); + source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s))); program = cl::Program(*context, source); program.build(devices); @@ -512,7 +512,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub rb = new double[N]; d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); } - wcontainer->init(wellContribs, N, Nb, reorder); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking @@ -529,6 +528,10 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply"))); + stdwell_apply_no_reorder_k.reset(new cl::make_kernel(cl::Kernel(program, "stdwell_apply_no_reorder"))); prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get()); @@ -653,14 +656,16 @@ bool openclSolverBackend::analyse_matrix() { template -void openclSolverBackend::update_system(double *vals, double *b) { +void openclSolverBackend::update_system(double *vals, double *b, WellContributions &wellContribs) { Timer t; mat->nnzValues = vals; if (opencl_ilu_reorder != ILUReorder::NONE) { reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); + wellContribs.setReordering(toOrder, true); } else { rb = b; + wellContribs.setReordering(nullptr, false); } if (verbosity > 2) { @@ -743,13 +748,13 @@ SolverStatus openclSolverBackend::solve_system(int N_, int nnz_, int return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; } } - update_system(vals, b); + update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } copy_system_to_gpu(); } else { - update_system(vals, b); + update_system(vals, b, wellContribs); if (!create_preconditioner()) { return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; } diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index 1a8dfa73b..aae527643 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -76,6 +76,10 @@ private: cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k; + std::shared_ptr > stdwell_apply_no_reorder_k; Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings @@ -152,7 +156,8 @@ private: /// Reorder the linear system so it corresponds with the coloring /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values /// \param[in] b input vectors, contains N values - void update_system(double *vals, double *b); + /// \param[out] wellContribs WellContributions, to set reordering + void update_system(double *vals, double *b, WellContributions &wellContribs); /// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same void update_system_on_gpu();