From 58b20fe2fbb9480f0ed975325d1b5f8a14e422ef Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Tue, 9 Aug 2022 11:50:31 +0200 Subject: [PATCH] Tweak ILU opencl kernel launch params --- opm/simulators/linalg/bda/opencl/BILU0.cpp | 10 +++--- .../bda/opencl/kernels/ILU_apply1_fm.cl | 2 +- .../bda/opencl/kernels/ILU_apply2_fm.cl | 2 +- .../linalg/bda/opencl/openclKernels.cpp | 36 +++++++++++-------- .../linalg/bda/opencl/openclKernels.hpp | 1 + 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index 538a55a21..2a8cc8050 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -275,7 +275,7 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix } OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, - s.invDiagVals, Nb, block_size); + s.invDiagVals, rowsPerColor[color], block_size); } if (verbosity >= 3) { @@ -303,11 +303,11 @@ void BILU0::apply(const cl::Buffer& y, cl::Buffer& x) #if CHOW_PATEL OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, - color, Nb, block_size); + color, rowsPerColor[color], block_size); #else OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, - color, Nb, block_size); + color, rowsPerColor[color], block_size); #endif } @@ -315,11 +315,11 @@ void BILU0::apply(const cl::Buffer& y, cl::Buffer& x) #if CHOW_PATEL OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, - color, Nb, block_size); + color, rowsPerColor[color], block_size); #else OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, - color, Nb, block_size); + color, rowsPerColor[color], block_size); #endif } diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl index 41d46e381..483d4e7f2 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl @@ -14,7 +14,7 @@ __kernel void ILU_apply1( const unsigned int block_size, __local double *tmp) { - const unsigned int warpsize = 32; + const unsigned int warpsize = get_local_size(0); const unsigned int bs = block_size; const unsigned int idx_t = get_local_id(0); const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl index f888b826e..d910552b6 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl @@ -14,7 +14,7 @@ __kernel void ILU_apply2( const unsigned int block_size, __local double *tmp) { - const unsigned int warpsize = 32; + const unsigned int warpsize = get_local_size(0); const unsigned int bs = block_size; const unsigned int idx_t = get_local_id(0); const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.cpp b/opm/simulators/linalg/bda/opencl/openclKernels.cpp index 08d670b8e..b43f70e66 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp @@ -41,6 +41,7 @@ int OpenclKernels::verbosity; cl::CommandQueue *OpenclKernels::queue; std::vector OpenclKernels::tmp; bool OpenclKernels::initialized = false; +size_t OpenclKernels::preferred_workgroup_size_multiple = 0; std::unique_ptr > OpenclKernels::dot_k; std::unique_ptr > OpenclKernels::norm_k; @@ -140,6 +141,11 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL"))); isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU"))); + // testing shows all kernels have the same preferred_workgroup_size_multiple + // 32 for NVIDIA + // 64 for AMD + cl::Kernel(program, "ILU_apply1").getWorkGroupInfo(devices[0], CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferred_workgroup_size_multiple); + initialized = true; } // end get_opencl_kernels() @@ -388,10 +394,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int work_group_size = preferred_workgroup_size_multiple; + const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply1; @@ -413,10 +419,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int work_group_size = preferred_workgroup_size_multiple; + const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply2; @@ -437,19 +443,19 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size2 = 128; - const unsigned int num_work_groups2 = 1024; - const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; - const unsigned int num_hwarps_per_group = work_group_size2 / 16; - const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot + const unsigned int work_group_size = 128; + const unsigned int num_work_groups = rowsThisColor; + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int num_hwarps_per_group = work_group_size / 16; + const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot Timer t_ilu_decomp; - cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), + cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), firstRow, lastRow, vals, cols, rows, - invDiagVals, diagIndex, Nb, - cl::Local(lmem_per_work_group2)); + invDiagVals, diagIndex, rowsThisColor, + cl::Local(lmem_per_work_group)); if (verbosity >= 4) { event.wait(); diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.hpp b/opm/simulators/linalg/bda/opencl/openclKernels.hpp index c56b748fb..e85b2ff66 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.hpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.hpp @@ -64,6 +64,7 @@ private: static cl::CommandQueue *queue; static std::vector tmp; // used as tmp CPU buffer for dot() and norm() static bool initialized; + static size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE static std::unique_ptr > dot_k; static std::unique_ptr > norm_k;