Allow CMake to set CHOW_PATEL variables

2025-02-25 18:55:30 -06:00 · 2021-10-25 14:57:46 +02:00 · 2021-10-25 14:57:46 +02:00 · 803a2ac2f4
commit 803a2ac2f4
parent 920eeee426
3 changed files with 35 additions and 13 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -26,6 +26,9 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
 option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
 option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
 option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
+option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF)
+option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF)
+option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU_PARALLEL" OFF)

 if(SIBLING_SEARCH AND NOT opm-common_DIR)
  # guess the sibling dir
@ -211,6 +214,24 @@ if(OpenCL_FOUND)
    set(OpenCL_FOUND OFF)
    set(OPENCL_FOUND OFF)
  endif()
+  if(USE_CHOW_PATEL_ILU)
+    add_compile_options(-DCHOW_PATEL=1)
+    if(USE_CHOW_PATEL_ILU_GPU)
+      add_compile_options(-DCHOW_PATEL_GPU=1)
+      if(USE_CHOW_PATEL_ILU_GPU_PARALLEL)
+        add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=1)
+      else()
+        add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
+      endif()
+    else()
+      add_compile_options(-DCHOW_PATEL_GPU=0)
+      add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
+    endif()
+  endif()
+else()
+  if(USE_CHOW_PATEL_ILU)
+    message(FATAL_ERROR " CHOW_PATEL_ILU only works for openclSolver, but OpenCL was not found")
+  endif()
 endif()

 find_package(amgcl)
@ -242,6 +263,7 @@ if(OpenCL_FOUND)
  endif()
 endif()

+
 # read the list of components from this file (in the project directory);
 # it should set various lists with the names of the files to include
 include (CMakeLists_files.cmake)
--- a/opm/simulators/linalg/bda/ChowPatelIlu.cpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.cpp
@ -36,7 +36,7 @@ namespace Accelerator
 using Opm::OpmLog;
 using Dune::Timer;

-// if PARALLEL is 0:
+// if CHOW_PATEL_GPU_PARALLEL is 0:
 //    Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
 //    Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
 //    except when the number of blocks in that row exceeds the number of workitems per workgroup.
@ -48,15 +48,13 @@ using Dune::Timer;
 //    If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.

 // Notes:
-// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
-// PARALLEL 1 should be run with at least 32 workitems per workgroup.
+// CHOW_PATEL_GPU_PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
+// CHOW_PATEL_GPU_PARALLEL 1 should be run with at least 32 workitems per workgroup.
 // The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
-// PARALLEL 0 is generally faster, despite not having parallelization.
+// CHOW_PATEL_GPU_PARALLEL 0 is generally faster, despite not having parallelization.
 // only 3x3 blocks are supported

-#define PARALLEL 0
-
-#if PARALLEL
+#if CHOW_PATEL_GPU_PARALLEL
 inline const char* chow_patel_ilu_sweep_s  = R"(

 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
@ -272,7 +270,7 @@ __kernel void chow_patel_ilu_sweep(
 }
 )";

-#else // PARALLEL
+#else // CHOW_PATEL_GPU_PARALLEL

 inline const char* chow_patel_ilu_sweep_s  = R"(

@ -475,7 +473,7 @@ __kernel void chow_patel_ilu_sweep(
 }
 )";

-#endif // PARALLEL
+#endif // CHOW_PATEL_GPU_PARALLEL



@ -898,7 +896,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
                OpmLog::info(out.str());
            }
            std::ostringstream out;
-            out << "ChowPatelIlu PARALLEL: " << PARALLEL;
+            out << "ChowPatelIlu PARALLEL: " << CHOW_PATEL_GPU_PARALLEL;
            OpmLog::info(out.str());
        });

@ -932,7 +930,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
            auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
            auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
            int num_work_groups = Nb;
-#if PARALLEL
+#if CHOW_PATEL_GPU_PARALLEL
            int work_group_size = 32;
 #else
            int work_group_size = 16;
--- a/opm/simulators/linalg/bda/ChowPatelIlu.hpp
+++ b/opm/simulators/linalg/bda/ChowPatelIlu.hpp
@ -26,6 +26,10 @@
 #include <opm/simulators/linalg/bda/opencl.hpp>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>

+// Variables CHOW_PATEL, CHOW_PATEL_GPU and CHOW_PATEL_GPU_PARALLEL are set by CMake
+// Pass -DUSE_CHOW_PATEL_ILU=1 to cmake to define CHOW_PATEL and use the iterative ILU decomposition
+// Pass -DUSE_CHOW_PATEL_ILU_GPU=1 to run the ILU decomposition sweeps on the GPU
+// Pass -DUSE_CHOW_PATEL_ILU_GPU_PARALLEL=1 to use more parallelisation in the GPU kernel, see ChowPatelIlu.cpp

 // if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
 // if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
@ -35,8 +39,6 @@
 // the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
 // the exact decomposition uses a full matrix LU which is the superposition of L and U
 // ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
-#define CHOW_PATEL     0
-#define CHOW_PATEL_GPU 1

 #if CHOW_PATEL