From 58b20fe2fbb9480f0ed975325d1b5f8a14e422ef Mon Sep 17 00:00:00 2001
From: Tong Dong Qiu <tongdong.qiu@bigdataccelerate.com>
Date: Tue, 9 Aug 2022 11:50:31 +0200
Subject: [PATCH] Tweak ILU opencl kernel launch params

---
 opm/simulators/linalg/bda/opencl/BILU0.cpp    | 10 +++---
 .../bda/opencl/kernels/ILU_apply1_fm.cl       |  2 +-
 .../bda/opencl/kernels/ILU_apply2_fm.cl       |  2 +-
 .../linalg/bda/opencl/openclKernels.cpp       | 36 +++++++++++--------
 .../linalg/bda/opencl/openclKernels.hpp       |  1 +
 5 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp
index 538a55a21..2a8cc8050 100644
--- a/opm/simulators/linalg/bda/opencl/BILU0.cpp
+++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp
@@ -275,7 +275,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
         }
         OpenclKernels::ILU_decomp(firstRow, lastRow,
                                   s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
-                                  s.invDiagVals, Nb, block_size);
+                                  s.invDiagVals, rowsPerColor[color], block_size);
     }
 
     if (verbosity >= 3) {
@@ -303,11 +303,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
 #if CHOW_PATEL
         OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
                                   s.diagIndex, y, x, s.rowsPerColor,
-                                  color, Nb, block_size);
+                                  color, rowsPerColor[color], block_size);
 #else
         OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
                                   s.diagIndex, y, x, s.rowsPerColor,
-                                  color, Nb, block_size);
+                                  color, rowsPerColor[color], block_size);
 #endif
     }
 
@@ -315,11 +315,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
 #if CHOW_PATEL
         OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
                                   s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
-                                  color, Nb, block_size);
+                                  color, rowsPerColor[color], block_size);
 #else
         OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
                                   s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
-                                  color, Nb, block_size);
+                                  color, rowsPerColor[color], block_size);
 #endif
     }
 
diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
index 41d46e381..483d4e7f2 100644
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl
@@ -14,7 +14,7 @@ __kernel void ILU_apply1(
     const unsigned int block_size,
     __local double *tmp)
 {
-    const unsigned int warpsize = 32;
+    const unsigned int warpsize = get_local_size(0);
     const unsigned int bs = block_size;
     const unsigned int idx_t = get_local_id(0);
     const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
index f888b826e..d910552b6 100644
--- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
+++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl
@@ -14,7 +14,7 @@ __kernel void ILU_apply2(
     const unsigned int block_size,
     __local double *tmp)
 {
-    const unsigned int warpsize = 32;
+    const unsigned int warpsize = get_local_size(0);
     const unsigned int bs = block_size;
     const unsigned int idx_t = get_local_id(0);
     const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.cpp b/opm/simulators/linalg/bda/opencl/openclKernels.cpp
index 08d670b8e..b43f70e66 100644
--- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp
+++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp
@@ -41,6 +41,7 @@ int OpenclKernels::verbosity;
 cl::CommandQueue *OpenclKernels::queue;
 std::vector<double> OpenclKernels::tmp;
 bool OpenclKernels::initialized = false;
+size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
 
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
 std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
@@ -140,6 +141,11 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
     isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
     isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
 
+    // testing shows all kernels have the same preferred_workgroup_size_multiple
+    // 32 for NVIDIA
+    // 64 for AMD
+    cl::Kernel(program, "ILU_apply1").getWorkGroupInfo(devices[0], CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferred_workgroup_size_multiple);
+
     initialized = true;
 } // end get_opencl_kernels()
 
@@ -388,10 +394,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
                                cl::Buffer& rows, cl::Buffer& diagIndex,
                                const cl::Buffer& y, cl::Buffer& x,
                                cl::Buffer& rowsPerColor, int color,
-                               int Nb, unsigned int block_size)
+                               int rowsThisColor, unsigned int block_size)
 {
-    const unsigned int work_group_size = 32;
-    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int work_group_size = preferred_workgroup_size_multiple;
+    const unsigned int num_work_groups = rowsThisColor;
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
     Timer t_ilu_apply1;
@@ -413,10 +419,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
                                cl::Buffer& rows, cl::Buffer& diagIndex,
                                cl::Buffer& invDiagVals, cl::Buffer& x,
                                cl::Buffer& rowsPerColor, int color,
-                               int Nb, unsigned int block_size)
+                               int rowsThisColor, unsigned int block_size)
 {
-    const unsigned int work_group_size = 32;
-    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int work_group_size = preferred_workgroup_size_multiple;
+    const unsigned int num_work_groups = rowsThisColor;
     const unsigned int total_work_items = num_work_groups * work_group_size;
     const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
     Timer t_ilu_apply2;
@@ -437,19 +443,19 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
 void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
                                cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
                                cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
-                               int Nb, unsigned int block_size)
+                               int rowsThisColor, unsigned int block_size)
 {
-    const unsigned int work_group_size2 = 128;
-    const unsigned int num_work_groups2 = 1024;
-    const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
-    const unsigned int num_hwarps_per_group = work_group_size2 / 16;
-    const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double);           // each block needs a pivot
+    const unsigned int work_group_size = 128;
+    const unsigned int num_work_groups = rowsThisColor;
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int num_hwarps_per_group = work_group_size / 16;
+    const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double);           // each block needs a pivot
     Timer t_ilu_decomp;
 
-    cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)),
+    cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
                           firstRow, lastRow, vals, cols, rows,
-                          invDiagVals, diagIndex, Nb,
-                          cl::Local(lmem_per_work_group2));
+                          invDiagVals, diagIndex, rowsThisColor,
+                          cl::Local(lmem_per_work_group));
 
     if (verbosity >= 4) {
         event.wait();
diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.hpp b/opm/simulators/linalg/bda/opencl/openclKernels.hpp
index c56b748fb..e85b2ff66 100644
--- a/opm/simulators/linalg/bda/opencl/openclKernels.hpp
+++ b/opm/simulators/linalg/bda/opencl/openclKernels.hpp
@@ -64,6 +64,7 @@ private:
     static cl::CommandQueue *queue;
     static std::vector<double> tmp;     // used as tmp CPU buffer for dot() and norm()
     static bool initialized;
+    static size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
 
     static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
     static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;