diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 5a530a776..907bdc683 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -241,16 +241,10 @@ BILU0::~BILU0() for (int color = 0; color < numColors; ++color) { const unsigned int firstRow = rowsPerColorPrefix[color]; const unsigned int lastRow = rowsPerColorPrefix[color+1]; - const unsigned int work_group_size2 = 128; - const unsigned int num_work_groups2 = 1024; - const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; - const unsigned int num_hwarps_per_group = work_group_size2 / 16; - const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot if (verbosity >= 4) { out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; } - event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2)); - event.wait(); + OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size); } if (verbosity >= 3) { @@ -266,7 +260,7 @@ BILU0::~BILU0() // however, if individual kernel calls are timed, waiting for events is needed // behavior on other GPUs is untested template - void BILU0::apply(cl::Buffer& y, cl::Buffer& x) + void BILU0::apply(const cl::Buffer& y, cl::Buffer& x) { const double relaxation = 0.9; cl::Event event; @@ -274,28 +268,24 @@ BILU0::~BILU0() for(int color = 0; color < numColors; ++color){ #if CHOW_PATEL - event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); #else - event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); #endif - // event.wait(); } for(int color = numColors-1; color >= 0; --color){ #if CHOW_PATEL - event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); #else - event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); #endif - // event.wait(); } // apply relaxation - event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N); - event.wait(); + OpenclKernels::scale(x, relaxation, N); if (verbosity >= 4) { - event.wait(); std::ostringstream out; out << "BILU0 apply: " << t_apply.stop() << " s"; OpmLog::info(out.str()); @@ -311,41 +301,17 @@ template void BILU0::setOpenCLQueue(cl::CommandQueue *queue_) { this->queue = queue_; } -template -void BILU0::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) { - this->work_group_size = work_group_size_; - this->total_work_items = total_work_items_; - this->lmem_per_work_group = lmem_per_work_group_; -} -template -void BILU0::setKernels( - cl::KernelFunctor *ILU_apply1_, - cl::KernelFunctor *ILU_apply2_, - cl::KernelFunctor *scale_, - cl::KernelFunctor *ilu_decomp_ -){ - this->ILU_apply1 = ILU_apply1_; - this->ILU_apply2 = ILU_apply2_; - this->scale = scale_; - this->ilu_decomp = ilu_decomp_; -} -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template BILU0::BILU0(ILUReorder, int); \ -template BILU0::~BILU0(); \ -template bool BILU0::init(BlockedMatrix*); \ -template bool BILU0::create_preconditioner(BlockedMatrix*); \ -template void BILU0::apply(cl::Buffer& x, cl::Buffer& y); \ -template void BILU0::setOpenCLContext(cl::Context*); \ -template void BILU0::setOpenCLQueue(cl::CommandQueue*); \ -template void BILU0::setKernelParameters(unsigned int, unsigned int, unsigned int); \ -template void BILU0::setKernels( \ - cl::KernelFunctor *, \ - cl::KernelFunctor *, \ - cl::KernelFunctor *, \ - cl::KernelFunctor * \ - ); +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template BILU0::BILU0(ILUReorder, int); \ +template BILU0::~BILU0(); \ +template bool BILU0::init(BlockedMatrix*); \ +template bool BILU0::create_preconditioner(BlockedMatrix*); \ +template void BILU0::apply(const cl::Buffer&, cl::Buffer&); \ +template void BILU0::setOpenCLContext(cl::Context*); \ +template void BILU0::setOpenCLQueue(cl::CommandQueue*); + INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index 2ee9a8928..e14f844de 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -72,21 +72,11 @@ namespace bda #endif } GPU_storage; - ilu_apply1_kernel_type *ILU_apply1; - ilu_apply2_kernel_type *ILU_apply2; - cl::KernelFunctor *scale; - cl::KernelFunctor *ilu_decomp; - GPU_storage s; cl::Context *context; cl::CommandQueue *queue; std::vector events; cl_int err; - int work_group_size = 0; - int total_work_items = 0; - int lmem_per_work_group = 0; #if CHOW_PATEL ChowPatelIlu chowPatelIlu; @@ -105,17 +95,10 @@ namespace bda bool create_preconditioner(BlockedMatrix *mat); // apply preconditioner, x = prec(y) - void apply(cl::Buffer& y, cl::Buffer& x); + void apply(const cl::Buffer& y, cl::Buffer& x); void setOpenCLContext(cl::Context *context); void setOpenCLQueue(cl::CommandQueue *queue); - void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group); - void setKernels( - cl::KernelFunctor *ILU_apply1, - cl::KernelFunctor *ILU_apply2, - cl::KernelFunctor *scale, - cl::KernelFunctor *ilu_decomp - ); int* getToOrder() { diff --git a/opm/simulators/linalg/bda/openclKernels.cpp b/opm/simulators/linalg/bda/openclKernels.cpp index 0ebe16e9e..7a42f63e0 100644 --- a/opm/simulators/linalg/bda/openclKernels.cpp +++ b/opm/simulators/linalg/bda/openclKernels.cpp @@ -18,12 +18,300 @@ */ #include +#include +#include + +#include +#include + #include +#include // defines CHOW_PATEL namespace bda { - std::string get_axpy_string() { +using Opm::OpmLog; +using Dune::Timer; + +// define static variables and kernels +int OpenclKernels::verbosity; +cl::CommandQueue *OpenclKernels::queue; +std::vector OpenclKernels::tmp; +bool OpenclKernels::initialized = false; + +std::unique_ptr > OpenclKernels::dot_k; +std::unique_ptr > OpenclKernels::norm_k; +std::unique_ptr > OpenclKernels::axpy_k; +std::unique_ptr > OpenclKernels::scale_k; +std::unique_ptr > OpenclKernels::custom_k; +std::unique_ptr OpenclKernels::spmv_blocked_k; +std::unique_ptr OpenclKernels::ILU_apply1_k; +std::unique_ptr OpenclKernels::ILU_apply2_k; +std::unique_ptr OpenclKernels::stdwell_apply_k; +std::unique_ptr OpenclKernels::stdwell_apply_no_reorder_k; +std::unique_ptr OpenclKernels::ilu_decomp_k; + + +// divide A by B, and round up: return (int)ceil(A/B) +unsigned int ceilDivision(const unsigned int A, const unsigned int B) +{ + return A / B + (A % B > 0); +} + +void add_kernel_string(cl::Program::Sources &sources, std::string &source) { + sources.emplace_back(source); +} + +void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector& devices, int verbosity_) { + + if (initialized) { + OpmLog::warning("Warning OpenclKernels is already initialized"); + return; + } + + queue = queue_; + verbosity = verbosity_; + + cl::Program::Sources sources; + std::string axpy_s = get_axpy_string(); + add_kernel_string(sources, axpy_s); + std::string scale_s = get_scale_string(); + add_kernel_string(sources, scale_s); + std::string dot_1_s = get_dot_1_string(); + add_kernel_string(sources, dot_1_s); + std::string norm_s = get_norm_string(); + add_kernel_string(sources, norm_s); + std::string custom_s = get_custom_string(); + add_kernel_string(sources, custom_s); + std::string spmv_blocked_s = get_spmv_blocked_string(); + add_kernel_string(sources, spmv_blocked_s); +#if CHOW_PATEL + bool ilu_operate_on_full_matrix = false; +#else + bool ilu_operate_on_full_matrix = true; +#endif + std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix); + add_kernel_string(sources, ILU_apply1_s); + std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix); + add_kernel_string(sources, ILU_apply2_s); + std::string stdwell_apply_s = get_stdwell_apply_string(true); + add_kernel_string(sources, stdwell_apply_s); + std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false); + add_kernel_string(sources, stdwell_apply_no_reorder_s); + std::string ilu_decomp_s = get_ilu_decomp_string(); + add_kernel_string(sources, ilu_decomp_s); + + cl::Program program = cl::Program(*context, sources); + program.build(devices); + + // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA + // cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking + + // actually creating the kernels + dot_k.reset(new cl::KernelFunctor(cl::Kernel(program, "dot_1"))); + norm_k.reset(new cl::KernelFunctor(cl::Kernel(program, "norm"))); + axpy_k.reset(new cl::KernelFunctor(cl::Kernel(program, "axpy"))); + scale_k.reset(new cl::KernelFunctor(cl::Kernel(program, "scale"))); + custom_k.reset(new cl::KernelFunctor(cl::Kernel(program, "custom"))); + spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked"))); + ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1"))); + ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2"))); + stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply"))); + stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder"))); + ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp"))); + + initialized = true; +} // end get_opencl_kernels() + + + + +double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) +{ + const unsigned int work_group_size = 256; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_dot; + tmp.resize(num_work_groups); + + cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); + + queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); + + double gpu_sum = 0.0; + for (unsigned int i = 0; i < num_work_groups; ++i) { + gpu_sum += tmp[i]; + } + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels dot() time: " << t_dot.stop() << " s"; + OpmLog::info(oss.str()); + } + + return gpu_sum; +} + +double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N) +{ + const unsigned int work_group_size = 256; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_norm; + tmp.resize(num_work_groups); + + cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); + + queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); + + double gpu_norm = 0.0; + for (unsigned int i = 0; i < num_work_groups; ++i) { + gpu_norm += tmp[i]; + } + gpu_norm = sqrt(gpu_norm); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels norm() time: " << t_norm.stop() << " s"; + OpmLog::info(oss.str()); + } + + return gpu_norm; +} + +void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t_axpy; + + cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels axpy() time: " << t_axpy.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::scale(cl::Buffer& in, const double a, int N) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t_scale; + + cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, N); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels scale() time: " << t_scale.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(N, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + Timer t_custom; + + cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels custom() time: " << t_custom.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_spmv; + + cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels spmv_blocked() time: " << t_spmv.stop() << " s"; + OpmLog::info(oss.str()); + } +} + + +void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_ilu_apply1; + + cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + + if (verbosity >= 5) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels ILU_apply1() time: " << t_ilu_apply1.stop() << " s"; + OpmLog::info(oss.str()); + } +} + + +void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) +{ + const unsigned int work_group_size = 32; + const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + Timer t_ilu_apply2; + + cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + + if (verbosity >= 5) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels ILU_apply2() time: " << t_ilu_apply2.stop() << " s"; + OpmLog::info(oss.str()); + } +} + +void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size) +{ + const unsigned int work_group_size2 = 128; + const unsigned int num_work_groups2 = 1024; + const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; + const unsigned int num_hwarps_per_group = work_group_size2 / 16; + const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot + Timer t_ilu_decomp; + + cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2)); + + if (verbosity >= 4) { + event.wait(); + std::ostringstream oss; + oss << std::scientific << "OpenclKernels ILU_decomp() time: " << t_ilu_decomp.stop() << " s"; + OpmLog::info(oss.str()); + } +} + + + + std::string OpenclKernels::get_axpy_string() { return R"( __kernel void axpy( __global double *in, @@ -44,7 +332,7 @@ namespace bda // scale vector with scalar - std::string get_scale_string() { + std::string OpenclKernels::get_scale_string() { return R"( __kernel void scale( __global double *vec, @@ -64,7 +352,7 @@ namespace bda // returns partial sums, instead of the final dot product - std::string get_dot_1_string() { + std::string OpenclKernels::get_dot_1_string() { return R"( __kernel void dot_1( __global double *in1, @@ -107,7 +395,7 @@ namespace bda // returns partial sums, instead of the final norm // the square root must be computed on CPU - std::string get_norm_string() { + std::string OpenclKernels::get_norm_string() { return R"( __kernel void norm( __global double *in, @@ -148,7 +436,7 @@ namespace bda // p = (p - omega * v) * beta + r - std::string get_custom_string() { + std::string OpenclKernels::get_custom_string() { return R"( __kernel void custom( __global double *p, @@ -173,7 +461,7 @@ namespace bda )"; } - std::string get_spmv_blocked_string() { + std::string OpenclKernels::get_spmv_blocked_string() { return R"( __kernel void spmv_blocked( __global const double *vals, @@ -243,7 +531,7 @@ namespace bda - std::string get_ILU_apply1_string(bool full_matrix) { + std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) { std::string s = R"( __kernel void ILU_apply1( __global const double *LUvals, @@ -319,7 +607,7 @@ namespace bda } - std::string get_ILU_apply2_string(bool full_matrix) { + std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) { std::string s = R"( __kernel void ILU_apply2( __global const double *LUvals, @@ -402,7 +690,7 @@ namespace bda return s; } - std::string get_stdwell_apply_string(bool reorder) { + std::string OpenclKernels::get_stdwell_apply_string(bool reorder) { std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder"; std::string s = "__kernel void " + kernel_name + R"(( __global const double *Cnnzs, @@ -497,7 +785,7 @@ namespace bda } - std::string get_ilu_decomp_string() { + std::string OpenclKernels::get_ilu_decomp_string() { return R"( // a = a - (b * c) diff --git a/opm/simulators/linalg/bda/openclKernels.hpp b/opm/simulators/linalg/bda/openclKernels.hpp index f744ecb9d..7a9887c39 100644 --- a/opm/simulators/linalg/bda/openclKernels.hpp +++ b/opm/simulators/linalg/bda/openclKernels.hpp @@ -21,6 +21,7 @@ #define OPENCL_HPP #include +#include #include @@ -29,7 +30,7 @@ namespace bda using spmv_kernel_type = cl::KernelFunctor; -using ilu_apply1_kernel_type = cl::KernelFunctor; using ilu_apply2_kernel_type = cl::KernelFunctor; @@ -44,56 +45,93 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor; +class OpenclKernels +{ +private: + static int verbosity; + static cl::CommandQueue *queue; + static std::vector tmp; // used as tmp CPU buffer for dot() and norm() + static bool initialized; + + static std::unique_ptr > dot_k; + static std::unique_ptr > norm_k; + static std::unique_ptr > axpy_k; + static std::unique_ptr > scale_k; + static std::unique_ptr > custom_k; + static std::unique_ptr spmv_blocked_k; + static std::unique_ptr ILU_apply1_k; + static std::unique_ptr ILU_apply2_k; + static std::unique_ptr stdwell_apply_k; + static std::unique_ptr stdwell_apply_no_reorder_k; + static std::unique_ptr ilu_decomp_k; + /// Generate string with axpy kernel /// a = a + alpha * b - std::string get_axpy_string(); + static std::string get_axpy_string(); /// Generate string with scale kernel /// a = a * alpha - std::string get_scale_string(); + static std::string get_scale_string(); /// returns partial sums, instead of the final dot product /// partial sums are added on CPU - std::string get_dot_1_string(); + static std::string get_dot_1_string(); /// returns partial sums, instead of the final norm /// the square root must be computed on CPU - std::string get_norm_string(); + static std::string get_norm_string(); /// Generate string with custom kernel /// This kernel combines some ilubicgstab vector operations into 1 /// p = (p - omega * v) * beta + r - std::string get_custom_string(); + static std::string get_custom_string(); /// b = mat * x /// algorithm based on: /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures, /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42 - std::string get_spmv_blocked_string(); + static std::string get_spmv_blocked_string(); /// ILU apply part 1: forward substitution /// solves L*x=y where L is a lower triangular sparse blocked matrix /// this L can be it's own BSR matrix (if full_matrix is false), /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not - std::string get_ILU_apply1_string(bool full_matrix); + static std::string get_ILU_apply1_string(bool full_matrix); /// ILU apply part 2: backward substitution /// solves U*x=y where U is an upper triangular sparse blocked matrix /// this U can be it's own BSR matrix (if full_matrix is false), /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start /// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not - std::string get_ILU_apply2_string(bool full_matrix); + static std::string get_ILU_apply2_string(bool full_matrix); /// Generate string with the stdwell_apply kernels /// If reorder is true, the B/Ccols do not correspond with the x/y vector /// the x/y vector is reordered, use toOrder to address that /// \param[in] reorder whether the matrix is reordered or not - std::string get_stdwell_apply_string(bool reorder); + static std::string get_stdwell_apply_string(bool reorder); /// Generate string with the exact ilu decomposition kernel /// The kernel takes a full BSR matrix and performs inplace ILU decomposition - std::string get_ilu_decomp_string(); + static std::string get_ilu_decomp_string(); + + OpenclKernels(){}; // disable instantiation + +public: + static void init(cl::Context *context, cl::CommandQueue *queue, std::vector& devices, int verbosity); + + static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); + static double norm(cl::Buffer& in, cl::Buffer& out, int N); + static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N); + static void scale(cl::Buffer& in, const double a, int N); + static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N); + static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size); + static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); + static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size); + + static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size); +}; } // end namespace bda diff --git a/opm/simulators/linalg/bda/openclSolverBackend.cpp b/opm/simulators/linalg/bda/openclSolverBackend.cpp index f87f46ed2..18bb14021 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.cpp @@ -174,6 +174,8 @@ openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, context = std::make_shared(devices[0]); queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err)); + OpenclKernels::init(context.get(), queue.get(), devices, verbosity); + } catch (const cl::Error& error) { std::ostringstream oss; oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; @@ -193,125 +195,6 @@ openclSolverBackend::~openclSolverBackend() { } -// divide A by B, and round up: return (int)ceil(A/B) -template -unsigned int openclSolverBackend::ceilDivision(const unsigned int A, const unsigned int B) -{ - return A / B + (A % B > 0); -} - - -template -double openclSolverBackend::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out) -{ - const unsigned int work_group_size = 256; - const unsigned int num_work_groups = ceilDivision(N, work_group_size); - const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; - Timer t_dot; - - cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); - - queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp); - - double gpu_sum = 0.0; - for (unsigned int i = 0; i < num_work_groups; ++i) { - gpu_sum += tmp[i]; - } - - if (verbosity >= 4) { - event.wait(); - std::ostringstream oss; - oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s"; - OpmLog::info(oss.str()); - } - - return gpu_sum; -} - -template -double openclSolverBackend::norm_w(cl::Buffer in, cl::Buffer out) -{ - const unsigned int work_group_size = 256; - const unsigned int num_work_groups = ceilDivision(N, work_group_size); - const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; - Timer t_norm; - - cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); - - queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp); - - double gpu_norm = 0.0; - for (unsigned int i = 0; i < num_work_groups; ++i) { - gpu_norm += tmp[i]; - } - gpu_norm = sqrt(gpu_norm); - - if (verbosity >= 4) { - event.wait(); - std::ostringstream oss; - oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s"; - OpmLog::info(oss.str()); - } - - return gpu_norm; -} - -template -void openclSolverBackend::axpy_w(cl::Buffer in, const double a, cl::Buffer out) -{ - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(N, work_group_size); - const unsigned int total_work_items = num_work_groups * work_group_size; - Timer t_axpy; - - cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N); - - if (verbosity >= 4) { - event.wait(); - std::ostringstream oss; - oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s"; - OpmLog::info(oss.str()); - } -} - -template -void openclSolverBackend::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta) -{ - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(N, work_group_size); - const unsigned int total_work_items = num_work_groups * work_group_size; - Timer t_custom; - - cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N); - - if (verbosity >= 4) { - event.wait(); - std::ostringstream oss; - oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s"; - OpmLog::info(oss.str()); - } -} - -template -void openclSolverBackend::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b) -{ - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(N, work_group_size); - const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; - Timer t_spmv; - - cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); - - if (verbosity >= 4) { - event.wait(); - std::ostringstream oss; - oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s"; - OpmLog::info(oss.str()); - } -} template void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) { @@ -320,7 +203,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr double norm, norm_0; if(wellContribs.getNumWells() > 0){ - wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get()); + // wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get()); } Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false); @@ -348,7 +231,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error"); } - norm = norm_w(d_r, d_tmp); + norm = OpenclKernels::norm(d_r, d_tmp, N); norm_0 = norm; if (verbosity > 1) { @@ -360,11 +243,11 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr t_rest.start(); for (it = 0.5; it < maxit; it += 0.5) { rhop = rho; - rho = dot_w(d_rw, d_r, d_tmp); + rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N); if (it > 1) { beta = (rho / rhop) * (alpha / omega); - custom_w(d_p, d_v, d_r, omega, beta); + OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N); } t_rest.stop(); @@ -375,7 +258,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // v = A * pw t_spmv.start(); - spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v); + OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); t_spmv.stop(); // apply wellContributions @@ -386,11 +269,11 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr t_well.stop(); t_rest.start(); - tmp1 = dot_w(d_rw, d_v, d_tmp); + tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N); alpha = rho / tmp1; - axpy_w(d_v, -alpha, d_r); // r = r - alpha * v - axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw - norm = norm_w(d_r, d_tmp); + OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v + OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw + norm = OpenclKernels::norm(d_r, d_tmp, N); t_rest.stop(); if (norm < tolerance * norm_0) { @@ -406,7 +289,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr // t = A * s t_spmv.start(); - spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t); + OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); t_spmv.stop(); // apply wellContributions @@ -417,12 +300,12 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr t_well.stop(); t_rest.start(); - tmp1 = dot_w(d_t, d_r, d_tmp); - tmp2 = dot_w(d_t, d_t, d_tmp); + tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N); + tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N); omega = tmp1 / tmp2; - axpy_w(d_s, omega, d_x); // x = x + omega * s - axpy_w(d_t, -omega, d_r); // r = r - omega * t - norm = norm_w(d_r, d_tmp); + OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s + OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t + norm = OpenclKernels::norm(d_r, d_tmp, N); t_rest.stop(); if (norm < tolerance * norm_0) { @@ -479,7 +362,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub prec->setOpenCLContext(context.get()); prec->setOpenCLQueue(queue.get()); - tmp = new double[N]; #if COPY_ROW_BY_ROW vals_contiguous = new double[N]; #endif @@ -507,10 +389,6 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb); } - get_opencl_kernels(); - - prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get()); - } catch (const cl::Error& error) { std::ostringstream oss; oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n"; @@ -525,68 +403,12 @@ void openclSolverBackend::initialize(int N_, int nnz_, int dim, doub initialized = true; } // end initialize() -void add_kernel_string(cl::Program::Sources &sources, std::string &source) { - sources.emplace_back(source); -} - -template -void openclSolverBackend::get_opencl_kernels() { - - cl::Program::Sources sources; - std::string axpy_s = get_axpy_string(); - add_kernel_string(sources, axpy_s); - std::string scale_s = get_scale_string(); - add_kernel_string(sources, scale_s); - std::string dot_1_s = get_dot_1_string(); - add_kernel_string(sources, dot_1_s); - std::string norm_s = get_norm_string(); - add_kernel_string(sources, norm_s); - std::string custom_s = get_custom_string(); - add_kernel_string(sources, custom_s); - std::string spmv_blocked_s = get_spmv_blocked_string(); - add_kernel_string(sources, spmv_blocked_s); -#if CHOW_PATEL - bool ilu_operate_on_full_matrix = false; -#else - bool ilu_operate_on_full_matrix = true; -#endif - std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix); - add_kernel_string(sources, ILU_apply1_s); - std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix); - add_kernel_string(sources, ILU_apply2_s); - std::string stdwell_apply_s = get_stdwell_apply_string(true); - add_kernel_string(sources, stdwell_apply_s); - std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false); - add_kernel_string(sources, stdwell_apply_no_reorder_s); - std::string ilu_decomp_s = get_ilu_decomp_string(); - add_kernel_string(sources, ilu_decomp_s); - - cl::Program program = cl::Program(*context, sources); - program.build(devices); - - // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA - // cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking - - // actually creating the kernels - dot_k.reset(new cl::KernelFunctor(cl::Kernel(program, "dot_1"))); - norm_k.reset(new cl::KernelFunctor(cl::Kernel(program, "norm"))); - axpy_k.reset(new cl::KernelFunctor(cl::Kernel(program, "axpy"))); - scale_k.reset(new cl::KernelFunctor(cl::Kernel(program, "scale"))); - custom_k.reset(new cl::KernelFunctor(cl::Kernel(program, "custom"))); - spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked"))); - ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1"))); - ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2"))); - stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply"))); - stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder"))); - ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp"))); -} // end get_opencl_kernels() template void openclSolverBackend::finalize() { if (opencl_ilu_reorder != ILUReorder::NONE) { delete[] rb; } - delete[] tmp; #if COPY_ROW_BY_ROW delete[] vals_contiguous; #endif @@ -672,11 +494,6 @@ bool openclSolverBackend::analyse_matrix() { Timer t; bool success = prec->init(mat.get()); - int work_group_size = 32; - int num_work_groups = ceilDivision(N, work_group_size); - int total_work_items = num_work_groups * work_group_size; - int lmem_per_work_group = work_group_size * sizeof(double); - prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group); if (opencl_ilu_reorder == ILUReorder::NONE) { rmat = mat.get(); diff --git a/opm/simulators/linalg/bda/openclSolverBackend.hpp b/opm/simulators/linalg/bda/openclSolverBackend.hpp index d105ad482..a2114a0ce 100644 --- a/opm/simulators/linalg/bda/openclSolverBackend.hpp +++ b/opm/simulators/linalg/bda/openclSolverBackend.hpp @@ -61,21 +61,8 @@ private: cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm() cl::Buffer d_toOrder; // only used when reordering is used - double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm() - // shared pointers are also passed to other objects std::vector devices; - std::unique_ptr > dot_k; - std::unique_ptr > norm_k; - std::unique_ptr > axpy_k; - std::unique_ptr > scale_k; - std::unique_ptr > custom_k; - std::unique_ptr spmv_blocked_k; - std::shared_ptr ILU_apply1_k; - std::shared_ptr ILU_apply2_k; - std::shared_ptr stdwell_apply_k; - std::shared_ptr stdwell_apply_no_reorder_k; - std::shared_ptr ilu_decomp_k; Preconditioner *prec = nullptr; // only supported preconditioner is BILU0 int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings @@ -150,9 +137,6 @@ private: /// \param[in] cols array of columnIndices, contains nnz values void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); - /// Generate and compile opencl kernels - void get_opencl_kernels(); - /// Clean memory void finalize();