diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index 6f8b856b9..60142f014 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -63,22 +63,23 @@ public: B }; + unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq + unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq + #if HAVE_OPENCL std::vector h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl; std::vector h_Ccols_ocl, h_Bcols_ocl; std::vector h_val_pointers_ocl; std::vector h_x_ocl, h_y_ocl; -#endif - -private: - unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq - unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq - unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() - unsigned int N; // number of rows (not blockrows) in vectors x and y - std::vector multisegments; int *toOrder = nullptr; bool reorder = false; +#endif + +private: + unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() + unsigned int N; // number of rows (not blockrows) in vectors x and y + std::vector multisegments; bool opencl_gpu = false; bool cuda_gpu = false; diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp index 716d05e57..1c9e1d504 100644 --- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp +++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp @@ -24,19 +24,18 @@ #include #include - +#include namespace bda { using Opm::OpmLog; using Dune::Timer; - void WellContributionsOCLContainer::initBuffers(WellContributions &wellContribs) - { + void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int Nb_){ + Nb = Nb_; dim = wellContribs.dim; dim_wells = wellContribs.dim_wells; num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1; - toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + wellContribs.h_Ccols_ocl.size()); s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size()); s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size()); @@ -44,10 +43,12 @@ namespace bda s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size()); s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size()); s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size()); - s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * toOrder.size()); + s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); } - void WellcontributionsOCLContainer::copy_to_gpu(WellContributions &wellContribs){ + void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs){ + toOrder.insert(toOrder.end(), wellContribs.toOrder, wellContribs.toOrder + Nb); + cl::Event event; queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data()); queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data()); @@ -59,7 +60,7 @@ namespace bda event.wait(); } - void WellcontributionsOCLContainer::update_on_gpu(WellContributions &wellContribs){ + void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){ cl::Event event; queue->enqueueWriteBuffer(s.Cnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data()); queue->enqueueWriteBuffer(s.Dnnzs, CL_TRUE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data()); @@ -90,7 +91,7 @@ namespace bda cl::Event event; event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, s.toOrder,x, y, dim, dim_wells, s.val_pointers, + s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); } diff --git a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp index 70e85fb90..3f6182a20 100644 --- a/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp +++ b/opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp @@ -31,6 +31,7 @@ namespace bda unsigned int dim, dim_wells; unsigned int num_std_wells = 0; unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size() + int Nb; std::vector toOrder; typedef struct { @@ -50,16 +51,15 @@ namespace bda void applyStdWells(cl::Buffer& x, cl::Buffer& y); public: - WellContributionsOCLContainer(); - ~WellContributionsOCLContainer(); + WellContributionsOCLContainer() {}; + ~WellContributionsOCLContainer() {}; void apply(cl::Buffer& x, cl::Buffer& y); - void initBuffers(WellContributions &wellContribs); - void copy_to_gpu(WellContributions &wellContribs); - void update_on_gpu(WellContributions &wellContribs); + void init(Opm::WellContributions &wellContribs, int Nb); + void copy_to_gpu(Opm::WellContributions &wellContribs); + void update_on_gpu(Opm::WellContributions &wellContribs); void setOpenCLContext(cl::Context *context); void setOpenCLQueue(cl::CommandQueue *queue); - void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group); void setKernel(cl::make_kernel::gpu_pbicgstab(BdaResult& res) { // v = A * pw t_spmv.start(); - wcontainer->apply(d_pw, d_v); + spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v); t_spmv.stop(); // apply wellContributions t_well.start(); - stdwell_w(d_Cnnzs, d_Dnnzs, d_Bnnzs, d_Ccols, d_Bcols, d_pw, d_v, d_val_pointers); + wcontainer->apply(d_pw, d_v); t_well.stop(); t_rest.start(); @@ -496,7 +496,7 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb); d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1)); - wcontainer->initBuffers(wellContribs); + wcontainer->init(wellContribs, Nb); // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking @@ -566,6 +566,7 @@ void openclSolverBackend::copy_system_to_gpu(WellContributions &well queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event); event.wait(); + wellContribs.setReordering(toOrder, true); wcontainer->copy_to_gpu(wellContribs); if (verbosity > 2) { diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index ec103f725..e01bd4de1 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -129,8 +129,6 @@ private: /// \param[out] b output vector void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b); - void stdwell_w(cl::Buffer Cnnzs, cl::Buffer Dnnzs, cl::Buffer Bnnzs, cl::Buffer Ccols, cl::Buffer Bcols, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers); - /// Solve linear system using ilu0-bicgstab /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result @@ -170,7 +168,7 @@ private: /// Solve linear system /// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A /// \param[inout] res summary of solver result - void solve_system(WellContributions &wellContribs, BdaResult &res); + void solve_system(BdaResult &res); public: