Add prolongate_vector() kernel

This commit is contained in:
Tong Dong Qiu 2021-11-26 09:36:18 +01:00
parent 9d611f04ee
commit eaded9dcf7
2 changed files with 47 additions and 0 deletions

View File

@ -50,6 +50,7 @@ std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Bu
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
@ -99,6 +100,8 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
add_kernel_source(sources, full_to_pressure_restriction_s);
const std::string& add_coarse_pressure_correction_s = get_add_coarse_pressure_correction_source();
add_kernel_source(sources, add_coarse_pressure_correction_s);
const std::string& prolongate_vector_s = get_prolongate_vector_source();
add_kernel_source(sources, prolongate_vector_s);
const std::string& spmv_blocked_s = get_blocked_matrix_operation_source(matrix_operation::spmv_op);
add_kernel_source(sources, spmv_blocked_s);
const std::string& spmv_s = get_matrix_operation_source(matrix_operation::spmv_op, true);
@ -140,6 +143,7 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
spmv_blocked_k.reset(new spmv_blocked_kernel_type(cl::Kernel(program, "spmv_blocked")));
spmv_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv")));
spmv_noreset_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_noreset")));
@ -316,6 +320,23 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
}
}
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t;
cl::Event event = (*prolongate_vector_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, cols, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels prolongate_vector() time: " << t.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset)
{
const unsigned int work_group_size = 32;
@ -690,6 +711,27 @@ void OpenclKernels::apply_stdwells_no_reorder(cl::Buffer& d_Cnnzs_ocl, cl::Buffe
)";
}
// prolongate vector during amg cycle
// every workitem handles one row
std::string OpenclKernels::get_prolongate_vector_source() {
return R"(
__kernel void prolongate_vector(
__global const double *in,
__global double *out,
__global const int *cols,
const unsigned int N)
{
const unsigned int NUM_THREADS = get_global_size(0);
unsigned int row = get_global_id(0);
while(row < N){
out[row] += in[cols[row]];
row += NUM_THREADS;
}
}
)";
}
/// either b = mat * x
/// or res = rhs - mat * x

View File

@ -74,6 +74,7 @@ private:
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
static std::unique_ptr<spmv_blocked_kernel_type> spmv_blocked_k;
static std::unique_ptr<spmv_kernel_type> spmv_k;
static std::unique_ptr<spmv_kernel_type> spmv_noreset_k;
@ -116,6 +117,9 @@ private:
/// Add the coarse pressure solution back to the finer, complete solution
static std::string get_add_coarse_pressure_correction_source();
/// Prolongate a vector during the AMG cycle
static std::string get_prolongate_vector_source();
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
@ -162,6 +166,7 @@ public:
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
static void spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset = true);
static void residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size);