Tweak ILU opencl kernel launch params

This commit is contained in:
Tong Dong Qiu 2022-06-23 10:22:48 +02:00
parent 842a9a9317
commit 85776f528a
5 changed files with 29 additions and 22 deletions

View File

@ -274,7 +274,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
if (verbosity >= 5) { if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
} }
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size); OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, rowsPerColor[color], block_size);
} }
if (verbosity >= 3) { if (verbosity >= 3) {
@ -299,17 +299,17 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = 0; color < numColors; ++color) { for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, rowsPerColor[color], block_size);
#endif #endif
} }
for (int color = numColors - 1; color >= 0; --color) { for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, rowsPerColor[color], block_size);
#endif #endif
} }

View File

@ -14,7 +14,7 @@ __kernel void ILU_apply1(
const unsigned int block_size, const unsigned int block_size,
__local double *tmp) __local double *tmp)
{ {
const unsigned int warpsize = 32; const unsigned int warpsize = get_local_size(0);
const unsigned int bs = block_size; const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0); const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;

View File

@ -14,7 +14,7 @@ __kernel void ILU_apply2(
const unsigned int block_size, const unsigned int block_size,
__local double *tmp) __local double *tmp)
{ {
const unsigned int warpsize = 32; const unsigned int warpsize = get_local_size(0);
const unsigned int bs = block_size; const unsigned int bs = block_size;
const unsigned int idx_t = get_local_id(0); const unsigned int idx_t = get_local_id(0);
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;

View File

@ -41,6 +41,7 @@ int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue; cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp; std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false; bool OpenclKernels::initialized = false;
size_t OpenclKernels::preferred_workgroup_size = 0;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
@ -140,6 +141,11 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL"))); isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU"))); isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
// testing shows all kernels have the same preferred_workgroup_size_multiple
// 32 for NVIDIA
// 64 for AMD
cl::Kernel(program, "ILU_apply1").getWorkGroupInfo(devices[0], CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferred_workgroup_size_multiple);
initialized = true; initialized = true;
} // end get_opencl_kernels() } // end get_opencl_kernels()
@ -388,10 +394,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x, const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color, cl::Buffer& rowsPerColor, int color,
int Nb, unsigned int block_size) int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply1; Timer t_ilu_apply1;
@ -413,10 +419,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color, cl::Buffer& rowsPerColor, int color,
int Nb, unsigned int block_size) int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply2; Timer t_ilu_apply2;
@ -437,19 +443,19 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int Nb, unsigned int block_size) int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size2 = 128; const unsigned int work_group_size = 128;
const unsigned int num_work_groups2 = 1024; const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int num_hwarps_per_group = work_group_size2 / 16; const unsigned int num_hwarps_per_group = work_group_size / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
Timer t_ilu_decomp; Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, vals, cols, rows, firstRow, lastRow, vals, cols, rows,
invDiagVals, diagIndex, Nb, invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group2)); cl::Local(lmem_per_work_group));
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();

View File

@ -64,6 +64,7 @@ private:
static cl::CommandQueue *queue; static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm() static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized; static bool initialized;
static size_t preferred_workgroup_size; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;