Allow CMake to set CHOW_PATEL variables

This commit is contained in:
Tong Dong Qiu 2021-10-25 14:57:46 +02:00
parent 920eeee426
commit 803a2ac2f4
3 changed files with 35 additions and 13 deletions

View File

@ -26,6 +26,9 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF)
option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF)
option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU_PARALLEL" OFF)
if(SIBLING_SEARCH AND NOT opm-common_DIR)
# guess the sibling dir
@ -211,6 +214,24 @@ if(OpenCL_FOUND)
set(OpenCL_FOUND OFF)
set(OPENCL_FOUND OFF)
endif()
if(USE_CHOW_PATEL_ILU)
add_compile_options(-DCHOW_PATEL=1)
if(USE_CHOW_PATEL_ILU_GPU)
add_compile_options(-DCHOW_PATEL_GPU=1)
if(USE_CHOW_PATEL_ILU_GPU_PARALLEL)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=1)
else()
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
else()
add_compile_options(-DCHOW_PATEL_GPU=0)
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
endif()
endif()
else()
if(USE_CHOW_PATEL_ILU)
message(FATAL_ERROR " CHOW_PATEL_ILU only works for openclSolver, but OpenCL was not found")
endif()
endif()
find_package(amgcl)
@ -242,6 +263,7 @@ if(OpenCL_FOUND)
endif()
endif()
# read the list of components from this file (in the project directory);
# it should set various lists with the names of the files to include
include (CMakeLists_files.cmake)

View File

@ -36,7 +36,7 @@ namespace Accelerator
using Opm::OpmLog;
using Dune::Timer;
// if PARALLEL is 0:
// if CHOW_PATEL_GPU_PARALLEL is 0:
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
@ -48,15 +48,13 @@ using Dune::Timer;
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
// Notes:
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
// CHOW_PATEL_GPU_PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
// CHOW_PATEL_GPU_PARALLEL 1 should be run with at least 32 workitems per workgroup.
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
// PARALLEL 0 is generally faster, despite not having parallelization.
// CHOW_PATEL_GPU_PARALLEL 0 is generally faster, despite not having parallelization.
// only 3x3 blocks are supported
#define PARALLEL 0
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
@ -272,7 +270,7 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#else // PARALLEL
#else // CHOW_PATEL_GPU_PARALLEL
inline const char* chow_patel_ilu_sweep_s = R"(
@ -475,7 +473,7 @@ __kernel void chow_patel_ilu_sweep(
}
)";
#endif // PARALLEL
#endif // CHOW_PATEL_GPU_PARALLEL
@ -898,7 +896,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
OpmLog::info(out.str());
}
std::ostringstream out;
out << "ChowPatelIlu PARALLEL: " << PARALLEL;
out << "ChowPatelIlu PARALLEL: " << CHOW_PATEL_GPU_PARALLEL;
OpmLog::info(out.str());
});
@ -932,7 +930,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
int num_work_groups = Nb;
#if PARALLEL
#if CHOW_PATEL_GPU_PARALLEL
int work_group_size = 32;
#else
int work_group_size = 16;

View File

@ -26,6 +26,10 @@
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
// Variables CHOW_PATEL, CHOW_PATEL_GPU and CHOW_PATEL_GPU_PARALLEL are set by CMake
// Pass -DUSE_CHOW_PATEL_ILU=1 to cmake to define CHOW_PATEL and use the iterative ILU decomposition
// Pass -DUSE_CHOW_PATEL_ILU_GPU=1 to run the ILU decomposition sweeps on the GPU
// Pass -DUSE_CHOW_PATEL_ILU_GPU_PARALLEL=1 to use more parallelisation in the GPU kernel, see ChowPatelIlu.cpp
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
@ -35,8 +39,6 @@
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
// the exact decomposition uses a full matrix LU which is the superposition of L and U
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
#define CHOW_PATEL 0
#define CHOW_PATEL_GPU 1
#if CHOW_PATEL