Initial commit

This commit is contained in:
Jose Eduardo Bueno
2020-09-03 09:46:44 -03:00
parent 4d10d9ac76
commit c7adc3495f
4 changed files with 40 additions and 31 deletions

View File

@@ -102,8 +102,19 @@ WellContributions::~WellContributions()
}
#if HAVE_OPENCL
// Remember the OpenCL context this object should work with.
// Only the pointer is stored; the cl::Context itself is not copied.
// NOTE(review): the caller appears to keep ownership (it passes the result of
// a .get() call) - confirm the context outlives this object.
void WellContributions::setOpenCLContext(cl::Context *context_)
{
    context = context_;
}
void WellContributions::init(cl::Context *context){
// Remember the OpenCL command queue on which this object enqueues its work.
// Only the pointer is stored; the cl::CommandQueue itself is not copied.
// NOTE(review): the caller appears to keep ownership (it passes the result of
// a .get() call) - confirm the queue outlives this object.
void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_)
{
    queue = queue_;
}
// Remember the kernel functor that is launched when applying standard wells.
// Only the pointer is stored; the functor itself is not copied.
// NOTE(review): the caller appears to keep ownership (it passes the result of
// a .get() call) - confirm the functor outlives this object.
void WellContributions::setKernel(kernel_type *stdwell_apply_)
{
    stdwell_apply = stdwell_apply_;
}
void WellContributions::init(){
if(num_std_wells > 0){
d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
@@ -111,24 +122,17 @@ void WellContributions::init(cl::Context *context){
d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
}
void WellContributions::copyDataToGPU(cl::CommandQueue *queue){
if(num_std_wells > 0){
cl::Event event;
queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &event);
event.wait();
queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
}
}
void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y) {
void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
// apply MultisegmentWells
if (num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
@@ -151,26 +155,25 @@ void WellContributions::applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl
}
}
void WellContributions::applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
event.wait();
event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
void WellContributions::apply(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel){
void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
if(num_std_wells > 0){
applyStdWell(queue, d_x, d_y, kernel);
applyStdWell(d_x, d_y);
}
if(num_ms_wells > 0){
applyMSWell(queue, d_x, d_y);
applyMSWell(d_x, d_y);
}
}

View File

@@ -115,6 +115,9 @@ private:
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int,
cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
kernel_type *stdwell_apply;
cl::Context *context;
cl::CommandQueue *queue;
#endif
#if HAVE_CUDA
@@ -133,8 +136,8 @@ private:
#endif
#if HAVE_OPENCL
void applyStdWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y, kernel_type *kernel);
void applyMSWell(cl::CommandQueue *queue, cl::Buffer& d_x, cl::Buffer& d_y);
void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
#endif
public:
@@ -155,9 +158,11 @@ public:
#endif
#if HAVE_OPENCL
void init(cl::Context *context);
void copyDataToGPU(cl::CommandQueue *queue);
void apply(cl::CommandQueue *queue, cl::Buffer& x, cl::Buffer& y, kernel_type *kernel);
void init();
void apply(cl::Buffer& x, cl::Buffer& y);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernel(kernel_type *stdwell_apply);
#endif
/// Create a new WellContributions

View File

@@ -242,7 +242,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(queue.get(), d_pw, d_v, add_well_contributions_k.get());
wellContribs.apply(d_pw, d_v);
t_well.stop();
t_rest.start();
@@ -271,7 +271,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(queue.get(), d_s, d_t, add_well_contributions_k.get());
wellContribs.apply(d_s, d_t);
t_well.stop();
t_rest.start();
@@ -495,7 +495,9 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
wellContribs.init(context.get());
wellContribs.setOpenCLContext(context.get());
wellContribs.setOpenCLQueue(queue.get());
wellContribs.init();
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -511,6 +513,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program_, "add_well_contributions")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
wellContribs.setKernel(add_well_contributions_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
@@ -539,7 +542,7 @@ void openclSolverBackend<block_size>::finalize() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& wellContribs) {
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
cl::Event event;
@@ -561,8 +564,6 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions& well
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wellContribs.copyDataToGPU(queue.get());
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@@ -716,7 +717,7 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu(wellContribs);
copy_system_to_gpu();
} else {
update_system(vals, b);
if (!create_preconditioner()) {

View File

@@ -155,7 +155,7 @@ private:
void finalize();
/// Copy linear system to GPU
void copy_system_to_gpu(WellContributions& wellContribs);
void copy_system_to_gpu();
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values