Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)
Merge pull request #2762 from ducbueno/opencl-stdwell-clean

Fixed out-of-resources problem

Commit 53005c477d
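The change below addresses the out-of-resources failure (presumably CL_OUT_OF_RESOURCES) by making the OpenCL objects long-lived: the solver keeps the built cl::Program and the add_well_contributions kernel functor as members, and WellContributions receives its context, queue and kernel once through setters instead of taking them as arguments on every apply() call. A minimal sketch of the resulting call order on the solver side, with the surrounding setup (context, queue, the kernel functor and the vector buffers) assumed to exist:

    // Sketch of the call order implied by the diff below (solver side); names follow the patch.
    wellContribs.setOpenCLContext(context.get());            // raw cl::Context*
    wellContribs.setOpenCLQueue(queue.get());                // raw cl::CommandQueue*
    wellContribs.init();                                     // allocate device buffers once
    wellContribs.setKernel(add_well_contributions_k.get());  // reuse the stored kernel functor
    // later, inside the BiCGSTAB iterations:
    wellContribs.apply(d_pw, d_v);                           // uses the stored queue and kernel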
@@ -102,8 +102,19 @@ WellContributions::~WellContributions()
 }
 
 #if HAVE_OPENCL
-void WellContributions::init(cl::Context *context){
+void WellContributions::setOpenCLContext(cl::Context *context_){
+    this->context = context_;
+}
+
+void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){
+    this->queue = queue_;
+}
+
+void WellContributions::setKernel(kernel_type *stdwell_apply_){
+    this->stdwell_apply = stdwell_apply_;
+}
+
+void WellContributions::init(){
     if(num_std_wells > 0){
         d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
         d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
@@ -111,24 +122,17 @@ void WellContributions::init(cl::Context *context){
         d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
         d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
         d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
     }
 }
 
 void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
     if(num_std_wells > 0){
-        cl::Event event;
-
         queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
         queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
         queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
         queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
         queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
-        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
-        event.wait();
+        queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
     }
 }
 
-void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
+void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
     // apply MultisegmentWells
     if (num_ms_wells > 0) {
         // allocate pinned memory on host if not yet done
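The last write in copyDataToGPU() previously requested a blocking transfer (CL_TRUE) and additionally attached an event that was then waited on; a blocking enqueueWriteBuffer already returns only after the host data has been copied, so the event and the extra wait are redundant and are dropped. Illustration with hypothetical buffer names:

    // With CL_TRUE the call blocks until 'src' has been transferred; no
    // cl::Event / event.wait() pair is required (hypothetical names d_buf, src, n).
    queue->enqueueWriteBuffer(d_buf, CL_TRUE, 0, n * sizeof(double), src);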
@@ -151,26 +155,25 @@ void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl
     }
 }
 
-void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
     const unsigned int work_group_size = 32;
     const unsigned int total_work_items = num_std_wells * work_group_size;
     const unsigned int lmem1 = sizeof(double) * work_group_size;
     const unsigned int lmem2 = sizeof(double) * dim_wells;
 
     cl::Event event;
-    event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
-                      d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
-                      d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
+    event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
+                             d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
+                             d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
     event.wait();
 }
 
-void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
+void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
     if(num_std_wells > 0){
-        applyStdWell(queue, d_x, d_y, kernel);
+        applyStdWell(d_x, d_y);
     }
 
     if(num_ms_wells > 0){
-        applyMSWell(queue, d_x, d_y);
+        applyMSWell(d_x, d_y);
     }
 }
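applyStdWell() now launches the kernel through the stored cl::make_kernel functor: the functor is called with a cl::EnqueueArgs (queue plus global and local NDRange) followed by the kernel arguments, it enqueues the kernel and returns a cl::Event, and cl::Local(n) reserves n bytes of per-work-group local memory for the __local parameters. A generic illustration of that pattern with a hypothetical one-buffer kernel named "scale" (program, queue and d_data assumed to exist):

    // Hypothetical kernel: __kernel void scale(__global double *x, double a,
    //                                          unsigned int n, __local double *tmp)
    cl::make_kernel<cl::Buffer&, const double, const unsigned int, cl::LocalSpaceArg>
        scale_k(cl::Kernel(program, "scale"));
    cl::Event ev = scale_k(cl::EnqueueArgs(queue, cl::NDRange(1024), cl::NDRange(32)),
                           d_data, 2.0, 1024u, cl::Local(32 * sizeof(double)));
    ev.wait();   // block until the kernel has finished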
@@ -115,6 +115,9 @@ private:
                             cl::Buffer&, cl::Buffer&, cl::Buffer&,
                             cl::Buffer&, const unsigned int, const unsigned int,
                             cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
+    kernel_type *stdwell_apply;
+    cl::Context *context;
+    cl::CommandQueue *queue;
 #endif
 
 #if HAVE_CUDA
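The three members added here are raw, non-owning pointers: they point at the cl::Context, cl::CommandQueue and kernel functor owned by openclSolverBackend, which hands them over via .get() in initialize_wellContribs() further down. The kernel_type template arguments mirror the parameter list of the add_well_contributions kernel, one cl::Buffer& per __global pointer, const unsigned int per scalar, and cl::LocalSpaceArg per __local scratch array. For a hypothetical two-argument kernel the mapping would look like:

    // __kernel void axpby(__global double *x, __local double *tmp)   (hypothetical)
    typedef cl::make_kernel<cl::Buffer&, cl::LocalSpaceArg> axpby_kernel_type;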
@@ -133,8 +136,8 @@ private:
 #endif
 
 #if HAVE_OPENCL
-    void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
-    void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
+    void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
 #endif
 
 public:
@@ -155,9 +158,11 @@ public:
 #endif
 
 #if HAVE_OPENCL
-    void init(cl::Context *context);
     void copyDataToGPU(cl::CommandQueue *queue);
-    void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
+    void init();
+    void apply(cl::Buffer& x, cl::Buffer& y);
+    void setOpenCLContext(cl::Context *context);
+    void setOpenCLQueue(cl::CommandQueue *queue);
+    void setKernel(kernel_type *stdwell_apply);
 #endif
 
     /// Create a new WellContributions
@@ -242,7 +242,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
 
         // apply wellContributions
         t_well.start();
-        wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
+        wellContribs.apply(d_pw, d_v);
         t_well.stop();
 
         t_rest.start();
@@ -271,7 +271,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
 
         // apply wellContributions
         t_well.start();
-        wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
+        wellContribs.apply(d_s, d_t);
         t_well.stop();
 
         t_rest.start();
@@ -319,7 +319,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContr
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs) {
+void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
     this->N = N_;
     this->nnz = nnz_;
     this->nnzb = nnz_ / block_size / block_size;
@@ -462,9 +462,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
     source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
     source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
-    cl::Program program_ = cl::Program(*context, source);
+    program = cl::Program(*context, source);
 
-    program_.build(devices);
+    program.build(devices);
 
     cl::Event event;
     queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
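Building the program into the local variable program_ meant the compiled cl::Program disappeared at the end of initialize(), so every kernel had to be created right there. With the new cl::Program program member (declared in the header hunk further down), initialize_wellContribs() can re-create the well-contribution kernel from the already-built program whenever it is called. A sketch, using a local alias for the full cl::make_kernel specialization that appears verbatim in the patch:

    // The alias only shortens the template argument list used in the patch.
    using WellKernel = cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                       cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int,
                                       cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>;
    // No recompilation happens here; 'program' was built once in initialize().
    add_well_contributions_k.reset(new WellKernel(cl::Kernel(program, "add_well_contributions")));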
@@ -495,20 +495,17 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
     d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
 
-    wellContribs.init(context.get());
-
     // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
     // cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
 
     // actually creating the kernels
-    dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "dot_1")));
-    norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "norm")));
-    axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program_, "axpy")));
-    custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program_, "custom")));
-    spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "spmv_blocked")));
-    ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply1")));
-    ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program_, "ILU_apply2")));
-    add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));
+    dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
+    norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
+    axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
+    custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
+    spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
+    ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
+    ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
 
     prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
@@ -523,10 +520,19 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         throw error;
     }
 
     initialized = true;
 } // end initialize()
 
+template <unsigned int block_size>
+void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
+    add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "add_well_contributions")));
+
+    wellContribs.setOpenCLContext(context.get());
+    wellContribs.setOpenCLQueue(queue.get());
+    wellContribs.init();
+    wellContribs.setKernel(add_well_contributions_k.get());
+}
+
 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     delete[] rb;
@@ -539,7 +545,7 @@ void openclSolverBackend<block_size>::finalize() {
 
 
 template <unsigned int block_size>
-void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
+void openclSolverBackend<block_size>::copy_system_to_gpu() {
     Timer t;
     cl::Event event;
 
@@ -561,8 +567,6 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
     queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
     event.wait();
 
-    wellContribs.copyDataToGPU(queue.get());
-
     if (verbosity > 2) {
         std::ostringstream out;
         out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -702,11 +706,11 @@ void openclSolverBackend<block_size>::get_result(double *x) {
 } // end get_result()
 
 
 template <unsigned int block_size>
 SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
     if (initialized == false) {
-        initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
+        initialize(N_, nnz_, dim, vals, rows, cols);
+        initialize_wellContribs(wellContribs);
         if (analysis_done == false) {
             if (!analyse_matrix()) {
                 return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@@ -716,9 +720,10 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
-        copy_system_to_gpu(wellContribs);
+        copy_system_to_gpu();
     } else {
         update_system(vals, b);
+        initialize_wellContribs(wellContribs);
         if (!create_preconditioner()) {
             return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
         }
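After this change, solve_system() splits the first-call setup into a matrix part (initialize()) and a well part (initialize_wellContribs()), and it repeats the well part after every update_system() call, so the context/queue/kernel wiring and the well buffers are refreshed each time step while the OpenCL program itself is compiled only once. Condensed view of the flow visible in this diff (error handling abbreviated, later steps of the else branch not shown here):

    if (initialized == false) {
        initialize(N_, nnz_, dim, vals, rows, cols);   // matrix, program, solver kernels
        initialize_wellContribs(wellContribs);         // well kernel plus context/queue wiring
        // analyse_matrix() and create_preconditioner() checks ...
        copy_system_to_gpu();
    } else {
        update_system(vals, b);
        initialize_wellContribs(wellContribs);         // re-wire wells for the new time step
        // create_preconditioner() check ...
    }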
@@ -72,6 +72,7 @@ private:
     //cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
 
     // shared pointers are also passed to other objects
+    cl::Program program;
     std::shared_ptr<cl::Context> context;
     std::shared_ptr<cl::CommandQueue> queue;
     std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
@@ -149,13 +150,15 @@ private:
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
     /// \param[in] rows array of rowPointers, contains N/dim+1 values
     /// \param[in] cols array of columnIndices, contains nnz values
-    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions& wellContribs);
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+
+    void initialize_wellContribs(WellContributions& wellContribs);
 
     /// Clean memory
     void finalize();
 
     /// Copy linear system to GPU
-    void copy_system_to_gpu(WellContributions& wellContribs);
+    void copy_system_to_gpu();
 
     /// Reorder the linear system so it corresponds with the coloring
     /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values