Add more waiting for OpenCL functions, use shortened OpenCL kernel params in WellContributions

This commit is contained in:
Tong Dong Qiu 2021-03-03 14:04:06 +01:00
parent 688d8ff627
commit 0caae966b8
4 changed files with 49 additions and 37 deletions

View File

@ -74,7 +74,7 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
this->queue = queue_;
}
void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
void WellContributions::setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}

View File

@ -26,6 +26,7 @@
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#endif
#include <vector>
@ -39,6 +40,9 @@
namespace Opm
{
using bda::stdwell_apply_kernel_type;
using bda::stdwell_apply_no_reorder_kernel_type;
/// This class serves to eliminate the need to include the WellContributions into the matrix (with --matrix-add-well-contributions=true) for the cusparseSolver
/// If the --matrix-add-well-contributions commandline parameter is true, this class should not be used
/// So far, StandardWell and MultisegmentWell are supported
@ -92,17 +96,10 @@ private:
std::vector<MultisegmentWellContribution*> multisegments;
#if HAVE_OPENCL
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
cl::Context *context;
cl::CommandQueue *queue;
kernel_type *kernel;
kernel_type_no_reorder *kernel_no_reorder;
stdwell_apply_kernel_type *kernel;
stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
@ -154,7 +151,7 @@ public:
#endif
#if HAVE_OPENCL
void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
void setKernel(stdwell_apply_kernel_type *kernel_, stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the column indices of the matrix data are incorrect

View File

@ -45,7 +45,6 @@ template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_), opencl_ilu_reorder(opencl_ilu_reorder_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
cl_int err = CL_SUCCESS;
std::ostringstream out;
try {
std::vector<cl::Platform> platforms;
@ -331,20 +330,23 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
//applyblockedscaleadd(-1.0, mat, x, r);
// set initial values
cl::Event event;
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
events.resize(5);
queue->enqueueFillBuffer(d_p, 0, 0, sizeof(double) * N, nullptr, &events[0]);
queue->enqueueFillBuffer(d_v, 0, 0, sizeof(double) * N, nullptr, &events[1]);
rho = 1.0;
alpha = 1.0;
omega = 1.0;
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
queue->enqueueCopyBuffer(d_b, d_r, 0, 0, sizeof(double) * N, nullptr, &events[2]);
queue->enqueueCopyBuffer(d_r, d_rw, 0, 0, sizeof(double) * N, nullptr, &events[3]);
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &events[4]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueue[Fill|Copy]Buffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = norm_w(d_r, d_tmp);
norm_0 = norm;
@ -591,7 +593,7 @@ void openclSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
cl::Event event;
events.resize(5);
#if COPY_ROW_BY_ROW
int sum = 0;
@ -600,19 +602,25 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
#else
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
err |= queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices, nullptr, &events[1]);
err |= queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers, nullptr, &events[2]);
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[3]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[4]);
if (opencl_ilu_reorder != ILUReorder::NONE) {
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
events.resize(6);
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder, nullptr, &events[5]);
}
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
}
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
if (verbosity > 2) {
std::ostringstream out;
@ -625,7 +633,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
Timer t;
cl::Event event;
events.resize(3);
#if COPY_ROW_BY_ROW
int sum = 0;
@ -634,14 +642,19 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
memcpy(vals_contiguous + sum, reinterpret_cast<double*>(rmat->nnzValues) + sum, size_row * sizeof(double) * block_size * block_size);
sum += size_row * block_size * block_size;
}
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, vals_contiguous, nullptr, &events[0]);
#else
queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues);
err = queue->enqueueWriteBuffer(d_Avals, CL_TRUE, 0, sizeof(double) * nnz, rmat->nnzValues, nullptr, &events[0]);
#endif
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
err |= queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb, nullptr, &events[1]);
err |= queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &events[2]);
cl::WaitForEvents(events);
events.clear();
if (err != CL_SUCCESS) {
// enqueueWriteBuffer is C and does not throw exceptions like C++ OpenCL
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueueWriteBuffer error");
}
if (verbosity > 2) {
std::ostringstream out;

View File

@ -82,6 +82,8 @@ private:
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix (or original if no reordering), used for spmv
ILUReorder opencl_ilu_reorder; // reordering strategy
std::vector<cl::Event> events;
cl_int err;
/// Divide A by B, and round up: return (int)ceil(A/B)
/// \param[in] A dividend