Merge pull request #2762 from ducbueno/opencl-stdwell-clean

Fixed out-of-resources problem
This commit is contained in:
Markus Blatt 2020-09-03 19:50:39 +02:00 committed by GitHub
commit 53005c477d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 63 additions and 47 deletions

View File

@ -102,8 +102,19 @@ WellContributions::~WellContributions()
}
#if HAVE_OPENCL
/// Remember the OpenCL context to use for this object.
/// Only the pointer is stored; init() dereferences it when creating the device buffers.
/// \param[in] context_   pointer to the OpenCL context
void WellContributions::setOpenCLContext(cl::Context *context_)
{
    context = context_;
}
void WellContributions::init(cl::Context *context){
/// Remember the OpenCL command queue to use for this object.
/// Only the pointer is stored; it is dereferenced later when work is enqueued.
/// \param[in] queue_   pointer to the OpenCL command queue
void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
{
    queue = queue_;
}
/// Remember the kernel functor that applyStdWell() launches.
/// Only the pointer is stored.
/// \param[in] stdwell_apply_   pointer to the kernel functor for standard wells
void WellContributions::setKernel(kernel_type *stdwell_apply_)
{
    stdwell_apply = stdwell_apply_;
}
void WellContributions::init(){
if(num_std_wells > 0){
d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
@ -111,24 +122,17 @@ void WellContributions::init(cl::Context *context){
d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
}
void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
if(num_std_wells > 0){
cl::Event event;
queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
event.wait();
queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
}
}
void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
// apply MultisegmentWells
if (num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
@ -151,26 +155,25 @@ void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl
}
}
void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
event.wait();
event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
if(num_std_wells > 0){
applyStdWell(queue, d_x, d_y, kernel);
applyStdWell(d_x, d_y);
}
if(num_ms_wells > 0){
applyMSWell(queue, d_x, d_y);
applyMSWell(d_x, d_y);
}
}

View File

@ -115,6 +115,9 @@ private:
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int,
cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
kernel_type *stdwell_apply;
cl::Context *context;
cl::CommandQueue *queue;
#endif
#if HAVE_CUDA
@ -133,8 +136,8 @@ private:
#endif
#if HAVE_OPENCL
void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
#endif
public:
@ -155,9 +158,11 @@ public:
#endif
#if HAVE_OPENCL
void init(cl::Context *context);
void copyDataToGPU(cl::CommandQueue *queue);
void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
/// Create the OpenCL device buffers for the standard wells (only when num_std_wells > 0).
/// Dereferences the context given to setOpenCLContext(), so that must be called first.
void init();
/// Apply the well contributions: forwards x and y to applyStdWell() when there are
/// standard wells and to applyMSWell() when there are multisegment wells.
/// \param[in]    x   device buffer passed through as the first vector argument
/// \param[inout] y   device buffer passed through as the second vector argument
///                   NOTE(review): presumably y is updated in place by the kernel; confirm against the kernel source
void apply(cl::Buffer& x, cl::Buffer& y);
/// Store the pointer to the OpenCL context used by init()
void setOpenCLContext(cl::Context *context);
/// Store the pointer to the OpenCL command queue on which the standard-well kernel is enqueued
void setOpenCLQueue(cl::CommandQueue *queue);
/// Store the pointer to the kernel functor launched by applyStdWell()
void setKernel(kernel_type *stdwell_apply);
#endif
/// Create a new WellContributions

View File

@ -242,7 +242,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
wellContribs.apply(d_pw, d_v);
t_well.stop();
t_rest.start();
@ -271,7 +271,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
wellContribs.apply(d_s, d_t);
t_well.stop();
t_rest.start();
@ -319,7 +319,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
@ -462,9 +462,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
cl::Program program_ = cl::Program(*context, source);
program = cl::Program(*context, source);
program_.build(devices);
program.build(devices);
cl::Event event;
queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
@ -495,20 +495,17 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
wellContribs.init(context.get());
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
@ -523,10 +520,19 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
throw error;
}
initialized = true;
} // end initialize()
/// Prepare a WellContributions object for use with this backend:
/// (re)create the "add_well_contributions" kernel functor from the built program,
/// pass the backend's context and queue to wellContribs, let it initialize itself,
/// and finally give it the kernel functor.
/// \param[inout] wellContribs   object that receives the context, queue and kernel
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs)
{
    // functor type of the standard-well kernel, spelled out once for readability
    using stdwell_kernel_t = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                             cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                             const unsigned int, const unsigned int,
                                             cl::Buffer&,
                                             cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;

    add_well_contributions_k.reset(new stdwell_kernel_t(cl::Kernel(program, "add_well_contributions")));

    // context and queue must be in place before init(), which dereferences the context
    cl::Context *ctx = context.get();
    cl::CommandQueue *q = queue.get();
    wellContribs.setOpenCLContext(ctx);
    wellContribs.setOpenCLQueue(q);
    wellContribs.init();

    wellContribs.setKernel(add_well_contributions_k.get());
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
delete[] rb;
@ -539,7 +545,7 @@ void openclSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
cl::Event event;
@ -561,8 +567,6 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wellContribs.copyDataToGPU(queue.get());
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@ -702,11 +706,11 @@ void openclSolverBackend<block_size>::get_result(double *x) {
} // end get_result()
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
initialize(N_, nnz_, dim, vals, rows, cols);
initialize_wellContribs(wellContribs);
if (analysis_done == false) {
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@ -716,9 +720,10 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu(wellContribs);
copy_system_to_gpu();
} else {
update_system(vals, b);
initialize_wellContribs(wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}

View File

@ -72,6 +72,7 @@ private:
//cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
// shared pointers are also passed to other objects
cl::Program program;
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
@ -149,13 +150,15 @@ private:
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Create the well-contributions kernel and hand the OpenCL context, queue and kernel to wellContribs
/// \param[inout] wellContribs   object that is initialized for use with this backend
void initialize_wellContribs(WellContributions& wellContribs);
/// Clean memory
void finalize();
/// Copy linear system to GPU
void copy_system_to_gpu(WellContributions& wellContribs);
void copy_system_to_gpu();
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values