mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Allow CMake to set CHOW_PATEL variables
This commit is contained in:
parent
920eeee426
commit
803a2ac2f4
@ -26,6 +26,9 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
|
||||
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
|
||||
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
|
||||
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
|
||||
option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF)
|
||||
option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF)
|
||||
option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU_PARALLEL" OFF)
|
||||
|
||||
if(SIBLING_SEARCH AND NOT opm-common_DIR)
|
||||
# guess the sibling dir
|
||||
@ -211,6 +214,24 @@ if(OpenCL_FOUND)
|
||||
set(OpenCL_FOUND OFF)
|
||||
set(OPENCL_FOUND OFF)
|
||||
endif()
|
||||
if(USE_CHOW_PATEL_ILU)
|
||||
add_compile_options(-DCHOW_PATEL=1)
|
||||
if(USE_CHOW_PATEL_ILU_GPU)
|
||||
add_compile_options(-DCHOW_PATEL_GPU=1)
|
||||
if(USE_CHOW_PATEL_ILU_GPU_PARALLEL)
|
||||
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=1)
|
||||
else()
|
||||
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
|
||||
endif()
|
||||
else()
|
||||
add_compile_options(-DCHOW_PATEL_GPU=0)
|
||||
add_compile_options(-DCHOW_PATEL_GPU_PARALLEL=0)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
if(USE_CHOW_PATEL_ILU)
|
||||
message(FATAL_ERROR " CHOW_PATEL_ILU only works for openclSolver, but OpenCL was not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
find_package(amgcl)
|
||||
@ -242,6 +263,7 @@ if(OpenCL_FOUND)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
# read the list of components from this file (in the project directory);
|
||||
# it should set various lists with the names of the files to include
|
||||
include (CMakeLists_files.cmake)
|
||||
|
@ -36,7 +36,7 @@ namespace Accelerator
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
// if PARALLEL is 0:
|
||||
// if CHOW_PATEL_GPU_PARALLEL is 0:
|
||||
// Each row gets 1 workgroup, 1 workgroup can do multiple rows sequentially.
|
||||
// Each block in a row gets 1 workitem, all blocks are expected to be processed simultaneously,
|
||||
// except when the number of blocks in that row exceeds the number of workitems per workgroup.
|
||||
@ -48,15 +48,13 @@ using Dune::Timer;
|
||||
// If the number of blocks exceeds the number of warps, some warps will process multiple blocks sequentially.
|
||||
|
||||
// Notes:
|
||||
// PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
|
||||
// PARALLEL 1 should be run with at least 32 workitems per workgroup.
|
||||
// CHOW_PATEL_GPU_PARALLEL 0 should be able to run with any number of workitems per workgroup, but 8 and 16 tend to be quicker than 32.
|
||||
// CHOW_PATEL_GPU_PARALLEL 1 should be run with at least 32 workitems per workgroup.
|
||||
// The recommended number of workgroups for both options is Nb, which gives every row their own workgroup.
|
||||
// PARALLEL 0 is generally faster, despite not having parallelization.
|
||||
// CHOW_PATEL_GPU_PARALLEL 0 is generally faster, despite not having parallelization.
|
||||
// only 3x3 blocks are supported
|
||||
|
||||
#define PARALLEL 0
|
||||
|
||||
#if PARALLEL
|
||||
#if CHOW_PATEL_GPU_PARALLEL
|
||||
inline const char* chow_patel_ilu_sweep_s = R"(
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
@ -272,7 +270,7 @@ __kernel void chow_patel_ilu_sweep(
|
||||
}
|
||||
)";
|
||||
|
||||
#else // PARALLEL
|
||||
#else // CHOW_PATEL_GPU_PARALLEL
|
||||
|
||||
inline const char* chow_patel_ilu_sweep_s = R"(
|
||||
|
||||
@ -475,7 +473,7 @@ __kernel void chow_patel_ilu_sweep(
|
||||
}
|
||||
)";
|
||||
|
||||
#endif // PARALLEL
|
||||
#endif // CHOW_PATEL_GPU_PARALLEL
|
||||
|
||||
|
||||
|
||||
@ -898,7 +896,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
std::ostringstream out;
|
||||
out << "ChowPatelIlu PARALLEL: " << PARALLEL;
|
||||
out << "ChowPatelIlu PARALLEL: " << CHOW_PATEL_GPU_PARALLEL;
|
||||
OpmLog::info(out.str());
|
||||
});
|
||||
|
||||
@ -932,7 +930,7 @@ void ChowPatelIlu<block_size>::gpu_decomposition(
|
||||
auto *Uarg1 = (sweep % 2 == 0) ? &d_Ut_vals : &d_Utmp;
|
||||
auto *Uarg2 = (sweep % 2 == 0) ? &d_Utmp : &d_Ut_vals;
|
||||
int num_work_groups = Nb;
|
||||
#if PARALLEL
|
||||
#if CHOW_PATEL_GPU_PARALLEL
|
||||
int work_group_size = 32;
|
||||
#else
|
||||
int work_group_size = 16;
|
||||
|
@ -26,6 +26,10 @@
|
||||
#include <opm/simulators/linalg/bda/opencl.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
// Variables CHOW_PATEL, CHOW_PATEL_GPU and CHOW_PATEL_GPU_PARALLEL are set by CMake
|
||||
// Pass -DUSE_CHOW_PATEL_ILU=1 to cmake to define CHOW_PATEL and use the iterative ILU decomposition
|
||||
// Pass -DUSE_CHOW_PATEL_ILU_GPU=1 to run the ILU decomposition sweeps on the GPU
|
||||
// Pass -DUSE_CHOW_PATEL_ILU_GPU_PARALLEL=1 to use more parallelisation in the GPU kernel, see ChowPatelIlu.cpp
|
||||
|
||||
// if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
|
||||
// if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
|
||||
@ -35,8 +39,6 @@
|
||||
// the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
|
||||
// the exact decomposition uses a full matrix LU which is the superposition of L and U
|
||||
// ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
|
||||
#define CHOW_PATEL 0
|
||||
#define CHOW_PATEL_GPU 1
|
||||
|
||||
#if CHOW_PATEL
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user