Merge pull request #2998 from g-marchiori/fpgasolver-integration

Added fpgaSolver, requires Xilinx Alveo U280 FPGA board
This commit is contained in:
Markus Blatt 2021-04-15 11:21:39 +02:00 committed by GitHub
commit 13f62a718b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 2697 additions and 162 deletions

View File

@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for
option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF) option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF) option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON) option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
if(SIBLING_SEARCH AND NOT opm-common_DIR) if(SIBLING_SEARCH AND NOT opm-common_DIR)
# guess the sibling dir # guess the sibling dir
@ -132,9 +133,72 @@ if(CUDA_FOUND)
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
endif() endif()
find_package(OpenCL) find_package(OpenCL)
# include FPGA target only when ENABLE_FPGA is set to ON
if(ENABLE_FPGA)
# must include opencl.h which is a wrapper for all the OpenCL extensions
find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS})
# FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in <opm-simulators>/../FPGA
find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA)
# this code only runs if the user explicitly enabled the FPGA
# hence fatal_error instead of warning
if(NOT OPENCL_H OR NOT OpenCL_FOUND)
set(HAVE_FPGA 0)
message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.")
elseif(NOT FPGA_MODULE)
set(HAVE_FPGA 0)
message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.")
elseif(NOT DEFINED ENV{XILINX_XRT})
# Xilinx XRT must be installed and properly setup
set(HAVE_FPGA 0)
message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.")
else()
set(HAVE_FPGA 1)
message(STATUS "FPGA library and kernel integration active.")
# FIXME: set the correct path to the FPGA module
set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA)
# configuration variables with default values: they can be overridden on the cmake command line
# FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream
if(NOT FPGA_PORTS_CONFIG)
set(FPGA_PORTS_CONFIG 2r_3r3w_ddr)
endif()
# FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build)
if(NOT FPGA_DEBUG_LEVEL)
set(FPGA_DEBUG_LEVEL 0)
endif()
message(STATUS "Using the following settings for the FPGA library compilation: "
"FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, "
"FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}")
add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG})
add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL})
# include directories for the FPGA library
include_directories(${FPGA_SOURCE_DIR})
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app)
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common)
# add external project to compile the FPGA library
include (ExternalProject)
ExternalProject_Add(FPGA_library
# force the build step to always be run because source dependencies cannot be made explicit
BUILD_ALWAYS 1
DOWNLOAD_COMMAND ""
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make
-f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile
SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app
PORTS_CONFIG=${FPGA_PORTS_CONFIG}
DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}
INSTALL_COMMAND ""
TEST_COMMAND ""
)
endif()
endif()
if(OpenCL_FOUND) if(OpenCL_FOUND)
# the current OpenCL implementation relies on cl.hpp, not cl2.hpp # the current OpenCL implementation relies on cl.hpp, not cl2.hpp
# make sure it is available, otherwise disable OpenCL # make sure it is available, otherwise disable OpenCL
@ -454,7 +518,8 @@ endif()
add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests) add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
# must link libraries after target 'flow' has been defined # must link libraries after target 'opmsimulators' has been defined
if(CUDA_FOUND) if(CUDA_FOUND)
target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} ) target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} ) target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
@ -463,3 +528,9 @@ endif()
if(OpenCL_FOUND) if(OpenCL_FOUND)
target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} ) target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
endif() endif()
if(HAVE_FPGA)
add_dependencies(opmsimulators FPGA_library)
ExternalProject_Get_Property(FPGA_library binary_dir)
target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a)
endif()

View File

@ -66,6 +66,16 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
endif() endif()
if(HAVE_FPGA)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
endif()
if(MPI_FOUND) if(MPI_FOUND)
list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
opm/simulators/utils/ParallelSerialization.cpp) opm/simulators/utils/ParallelSerialization.cpp)
@ -184,6 +194,10 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cuda_header.hpp
opm/simulators/linalg/bda/cusparseSolverBackend.hpp opm/simulators/linalg/bda/cusparseSolverBackend.hpp
opm/simulators/linalg/bda/ChowPatelIlu.hpp opm/simulators/linalg/bda/ChowPatelIlu.hpp
opm/simulators/linalg/bda/FPGAMatrix.hpp
opm/simulators/linalg/bda/FPGABILU0.hpp
opm/simulators/linalg/bda/FPGASolverBackend.hpp
opm/simulators/linalg/bda/FPGAUtils.hpp
opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/Reorder.hpp
opm/simulators/linalg/bda/ILUReorder.hpp opm/simulators/linalg/bda/ILUReorder.hpp
opm/simulators/linalg/bda/opencl.hpp opm/simulators/linalg/bda/opencl.hpp

View File

@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR
HAVE_PETSC HAVE_PETSC
HAVE_CUDA HAVE_CUDA
HAVE_OPENCL HAVE_OPENCL
HAVE_FPGA
HAVE_SUITESPARSE_UMFPACK_H HAVE_SUITESPARSE_UMFPACK_H
HAVE_DUNE_ISTL HAVE_DUNE_ISTL
DUNE_ISTL_VERSION_MAJOR DUNE_ISTL_VERSION_MAJOR

View File

@ -118,7 +118,7 @@ struct Linsolver {
using type = UndefinedProperty; using type = UndefinedProperty;
}; };
template<class TypeTag, class MyTypeTag> template<class TypeTag, class MyTypeTag>
struct GpuMode { struct AcceleratorMode {
using type = UndefinedProperty; using type = UndefinedProperty;
}; };
template<class TypeTag, class MyTypeTag> template<class TypeTag, class MyTypeTag>
@ -133,6 +133,10 @@ template<class TypeTag, class MyTypeTag>
struct OpenclIluReorder { struct OpenclIluReorder {
using type = UndefinedProperty; using type = UndefinedProperty;
}; };
template<class TypeTag, class MyTypeTag>
struct FpgaBitstream {
using type = UndefinedProperty;
};
template<class TypeTag> template<class TypeTag>
struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> { struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
@ -213,7 +217,7 @@ struct Linsolver<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr auto value = "ilu0"; static constexpr auto value = "ilu0";
}; };
template<class TypeTag> template<class TypeTag>
struct GpuMode<TypeTag, TTag::FlowIstlSolverParams> { struct AcceleratorMode<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr auto value = "none"; static constexpr auto value = "none";
}; };
template<class TypeTag> template<class TypeTag>
@ -226,7 +230,11 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
}; };
template<class TypeTag> template<class TypeTag>
struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> { struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr auto value = "graph_coloring"; static constexpr auto value = ""; // note: default value is chosen depending on the solver used
};
template<class TypeTag>
struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
static constexpr auto value = "";
}; };
} // namespace Opm::Properties } // namespace Opm::Properties
@ -252,12 +260,13 @@ namespace Opm
bool ignoreConvergenceFailure_; bool ignoreConvergenceFailure_;
bool scale_linear_system_; bool scale_linear_system_;
std::string linsolver_; std::string linsolver_;
std::string gpu_mode_; std::string accelerator_mode_;
int bda_device_id_; int bda_device_id_;
int opencl_platform_id_; int opencl_platform_id_;
int cpr_max_ell_iter_ = 20; int cpr_max_ell_iter_ = 20;
int cpr_reuse_setup_ = 0; int cpr_reuse_setup_ = 0;
std::string opencl_ilu_reorder_; std::string opencl_ilu_reorder_;
std::string fpga_bitstream_;
template <class TypeTag> template <class TypeTag>
void init() void init()
@ -279,10 +288,11 @@ namespace Opm
cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter); cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup); cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver); linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
} }
template <class TypeTag> template <class TypeTag>
@ -304,10 +314,11 @@ namespace Opm
EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver"); EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate"); EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'"); EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'"); EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly."); EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
} }
FlowLinearSolverParameters() { reset(); } FlowLinearSolverParameters() { reset(); }
@ -327,10 +338,11 @@ namespace Opm
ilu_milu_ = MILU_VARIANT::ILU; ilu_milu_ = MILU_VARIANT::ILU;
ilu_redblack_ = false; ilu_redblack_ = false;
ilu_reorder_sphere_ = true; ilu_reorder_sphere_ = true;
gpu_mode_ = "none"; accelerator_mode_ = "none";
bda_device_id_ = 0; bda_device_id_ = 0;
opencl_platform_id_ = 0; opencl_platform_id_ = 0;
opencl_ilu_reorder_ = "graph_coloring"; opencl_ilu_reorder_ = ""; // note: the default value is chosen depending on the solver used
fpga_bitstream_ = "";
} }
}; };

View File

@ -35,7 +35,7 @@
#include <opm/simulators/linalg/setupPropertyTree.hpp> #include <opm/simulators/linalg/setupPropertyTree.hpp>
#if HAVE_CUDA || HAVE_OPENCL #if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
#include <opm/simulators/linalg/bda/BdaBridge.hpp> #include <opm/simulators/linalg/bda/BdaBridge.hpp>
#endif #endif
@ -92,7 +92,7 @@ namespace Opm
using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>; using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>; using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
#if HAVE_CUDA || HAVE_OPENCL #if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
static const unsigned int block_size = Matrix::block_type::rows; static const unsigned int block_size = Matrix::block_type::rows;
std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge; std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
#endif #endif
@ -126,14 +126,14 @@ namespace Opm
#endif #endif
parameters_.template init<TypeTag>(); parameters_.template init<TypeTag>();
prm_ = setupPropertyTree<TypeTag>(parameters_); prm_ = setupPropertyTree<TypeTag>(parameters_);
#if HAVE_CUDA || HAVE_OPENCL #if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
{ {
std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) { if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
if (on_io_rank) { if (on_io_rank) {
OpmLog::warning("Cannot use GPU with MPI, GPU is disabled"); OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled");
} }
gpu_mode = "none"; accelerator_mode = "none";
} }
const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
@ -141,11 +141,12 @@ namespace Opm
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder)); std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
} }
#else #else
if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") { if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake"); OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
} }
#endif #endif
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_); extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@ -157,6 +158,12 @@ namespace Opm
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_); detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
#if HAVE_FPGA
// check usage of MatrixAddWellContributions: for FPGA they must be included
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) {
OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true");
}
#endif
const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst); const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
if (!ownersFirst) { if (!ownersFirst) {
const std::string msg = "The linear solver no longer supports --owner-cells-first=false."; const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
@ -242,43 +249,42 @@ namespace Opm
// Solve system. // Solve system.
Dune::InverseOperatorResult result; Dune::InverseOperatorResult result;
bool gpu_was_used = false; bool accelerator_was_used = false;
// Use GPU if: available, chosen by user, and successful. // Use GPU if: available, chosen by user, and successful.
#if HAVE_CUDA || HAVE_OPENCL // Use FPGA if: support compiled, chosen by user, and successful.
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
bool use_gpu = bdaBridge->getUseGpu(); bool use_gpu = bdaBridge->getUseGpu();
if (use_gpu) { bool use_fpga = bdaBridge->getUseFpga();
const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); if (use_gpu || use_fpga) {
WellContributions wellContribs(gpu_mode); const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
WellContributions wellContribs(accelerator_mode);
bdaBridge->initWellContributions(wellContribs); bdaBridge->initWellContributions(wellContribs);
if (!useWellConn_) { if (!useWellConn_) {
simulator_.problem().wellModel().getWellContributions(wellContribs); simulator_.problem().wellModel().getWellContributions(wellContribs);
} }
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition.. // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result); bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
if (result.converged) { if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful // get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x); bdaBridge->get_result(x);
gpu_was_used = true; accelerator_was_used = true;
} else { } else {
// CPU fallback // warn about CPU fallback
use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver // BdaBridge might have disabled its BdaSolver for this simulation due to some error
if (use_gpu && simulator_.gridView().comm().rank() == 0) { // in that case the BdaBridge is disabled and flexibleSolver is always used
if (gpu_mode.compare("cusparse") == 0) { // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system..."); if (simulator_.gridView().comm().rank() == 0) {
} OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
if (gpu_mode.compare("opencl") == 0) {
OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
}
} }
} }
} }
#endif #endif
// Otherwise, use flexible istl solver. // Otherwise, use flexible istl solver.
if (!gpu_was_used) { if (!accelerator_was_used) {
assert(flexibleSolver_); assert(flexibleSolver_);
flexibleSolver_->apply(x, *rhs_, result); flexibleSolver_->apply(x, *rhs_, result);
} }

View File

@ -45,11 +45,6 @@ namespace bda
BILU0<block_size>::~BILU0() BILU0<block_size>::~BILU0()
{ {
delete[] invDiagVals; delete[] invDiagVals;
delete[] diagIndex;
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] toOrder;
delete[] fromOrder;
}
} }
template <unsigned int block_size> template <unsigned int block_size>
@ -68,8 +63,8 @@ namespace bda
if (opencl_ilu_reorder == ILUReorder::NONE) { if (opencl_ilu_reorder == ILUReorder::NONE) {
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat); LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
} else { } else {
toOrder = new int[Nb]; toOrder.resize(Nb);
fromOrder = new int[Nb]; fromOrder.resize(Nb);
CSCRowIndices = new int[nnzbs]; CSCRowIndices = new int[nnzbs];
CSCColPointers = new int[Nb + 1]; CSCColPointers = new int[Nb + 1];
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs); rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
@ -88,10 +83,10 @@ namespace bda
std::ostringstream out; std::ostringstream out;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
out << "BILU0 reordering strategy: " << "level_scheduling\n"; out << "BILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
out << "BILU0 reordering strategy: " << "graph_coloring\n"; out << "BILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (opencl_ilu_reorder == ILUReorder::NONE) { } else if (opencl_ilu_reorder == ILUReorder::NONE) {
out << "BILU0 reordering strategy: none\n"; out << "BILU0 reordering strategy: none\n";
// numColors = 1; // numColors = 1;
@ -111,12 +106,13 @@ namespace bda
} }
OpmLog::info(out.str()); OpmLog::info(out.str());
if (opencl_ilu_reorder != ILUReorder::NONE) { if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] CSCRowIndices; delete[] CSCRowIndices;
delete[] CSCColPointers; delete[] CSCColPointers;
} }
diagIndex = new int[mat->Nb]; diagIndex.resize(mat->Nb);
invDiagVals = new double[mat->Nb * bs * bs]; invDiagVals = new double[mat->Nb * bs * bs];
#if CHOW_PATEL #if CHOW_PATEL
@ -482,7 +478,7 @@ namespace bda
diagIndex[row] = candidate - LUmat->colIndices; diagIndex[row] = candidate - LUmat->colIndices;
} }
events.resize(8); events.resize(8);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]); queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]); queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]); queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]); queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
@ -516,7 +512,7 @@ namespace bda
if (opencl_ilu_reorder != ILUReorder::NONE) { if (opencl_ilu_reorder != ILUReorder::NONE) {
m = rmat.get(); m = rmat.get();
Timer t_reorder; Timer t_reorder;
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get()); reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
if (verbosity >= 3){ if (verbosity >= 3){
std::ostringstream out; std::ostringstream out;
@ -556,7 +552,7 @@ namespace bda
diagIndex[row] = candidate - LUmat->colIndices; diagIndex[row] = candidate - LUmat->colIndices;
} }
events.resize(4); events.resize(4);
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]); queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
}); });

View File

@ -61,10 +61,10 @@ namespace bda
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr; std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
#endif #endif
double *invDiagVals = nullptr; double *invDiagVals = nullptr;
int *diagIndex = nullptr; std::vector<int> diagIndex;
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
int *toOrder = nullptr, *fromOrder = nullptr; std::vector<int> toOrder, fromOrder;
int numColors; int numColors;
int verbosity; int verbosity;
std::once_flag pattern_uploaded; std::once_flag pattern_uploaded;
@ -128,12 +128,12 @@ namespace bda
int* getToOrder() int* getToOrder()
{ {
return toOrder; return toOrder.data();
} }
int* getFromOrder() int* getFromOrder()
{ {
return fromOrder; return fromOrder.data();
} }
BlockedMatrix<block_size>* getRMat() BlockedMatrix<block_size>* getRMat()

View File

@ -18,8 +18,6 @@
*/ */
#include <config.h> #include <config.h>
#include <memory>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp> #include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>
@ -54,21 +52,23 @@ namespace Opm
using bda::ILUReorder; using bda::ILUReorder;
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
: gpu_mode(gpu_mode_) : accelerator_mode(accelerator_mode_)
{ {
if (gpu_mode.compare("cusparse") == 0) { if (accelerator_mode.compare("cusparse") == 0) {
#if HAVE_CUDA #if HAVE_CUDA
use_gpu = true; use_gpu = true;
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID)); backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
#else #else
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake"); OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
#endif #endif
} else if (gpu_mode.compare("opencl") == 0) { } else if (accelerator_mode.compare("opencl") == 0) {
#if HAVE_OPENCL #if HAVE_OPENCL
use_gpu = true; use_gpu = true;
ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING; ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "level_scheduling") { if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") { } else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
@ -81,10 +81,28 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
#else #else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif #endif
} else if (gpu_mode.compare("none") == 0) { } else if (accelerator_mode.compare("fpga") == 0) {
use_gpu = false; #if HAVE_FPGA
use_fpga = true;
ILUReorder ilu_reorder;
if (opencl_ilu_reorder == "") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
} else if (opencl_ilu_reorder == "level_scheduling") {
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
} else if (opencl_ilu_reorder == "graph_coloring") {
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
} else { } else {
OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]"); OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
}
backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
#else
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
#endif
} else if (accelerator_mode.compare("none") == 0) {
use_gpu = false;
use_fpga = false;
} else {
OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
} }
} }
@ -130,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
// sparsity pattern should stay the same due to matrix-add-well-contributions // sparsity pattern should stay the same
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
template <class BridgeMatrix> template <class BridgeMatrix>
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) { void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
@ -161,14 +179,16 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
{ {
if (use_gpu) { if (use_gpu || use_fpga) {
BdaResult result; BdaResult result;
result.converged = false; result.converged = false;
static std::vector<int> h_rows; static std::vector<int> h_rows;
static std::vector<int> h_cols; static std::vector<int> h_cols;
const int dim = (*mat)[0][0].N(); const int dim = (*mat)[0][0].N();
const int N = mat->N()*dim; const int Nb = mat->N();
const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim; const int N = Nb * dim;
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
const int nnz = nnzb * dim * dim;
if (dim != 3) { if (dim != 3) {
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@ -177,8 +197,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
} }
if (h_rows.capacity() == 0) { if (h_rows.capacity() == 0) {
h_rows.reserve(N+1); h_rows.reserve(Nb+1);
h_cols.reserve(nnz); h_cols.reserve(nnzb);
#if PRINT_TIMERS_BRIDGE #if PRINT_TIMERS_BRIDGE
Dune::Timer t; Dune::Timer t;
#endif #endif
@ -233,14 +253,14 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) { void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
if (use_gpu) { if (use_gpu || use_fpga) {
backend->get_result(static_cast<double*>(&(x[0][0]))); backend->get_result(static_cast<double*>(&(x[0][0])));
} }
} }
template <class BridgeMatrix, class BridgeVector, int block_size> template <class BridgeMatrix, class BridgeVector, int block_size>
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) { void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
if(gpu_mode.compare("opencl") == 0){ if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL #if HAVE_OPENCL
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get()); const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
@ -250,12 +270,12 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(We
} }
} }
#define INSTANTIATE_BDA_FUNCTIONS(n) \ #define INSTANTIATE_BDA_FUNCTIONS(n) \
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \ template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \ Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>::BdaBridge \ n>::BdaBridge \
(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \ (std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, \
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \
\ \
template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \ template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \ Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \

View File

@ -30,6 +30,10 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp> #include <opm/simulators/linalg/bda/WellContributions.hpp>
#if HAVE_FPGA
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
#endif
namespace Opm namespace Opm
{ {
@ -42,19 +46,22 @@ class BdaBridge
{ {
private: private:
bool use_gpu = false; bool use_gpu = false;
std::string gpu_mode; bool use_fpga = false;
std::string accelerator_mode;
std::unique_ptr<bda::BdaSolver<block_size> > backend; std::unique_ptr<bda::BdaSolver<block_size> > backend;
public: public:
/// Construct a BdaBridge /// Construct a BdaBridge
/// \param[in] gpu_mode to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]' /// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]'
/// \param[in] fpga_bitstream FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[<filename>]'
/// \param[in] linear_solver_verbosity verbosity of BdaSolver /// \param[in] linear_solver_verbosity verbosity of BdaSolver
/// \param[in] maxit maximum number of iterations for BdaSolver /// \param[in] maxit maximum number of iterations for BdaSolver
/// \param[in] tolerance required relative tolerance for BdaSolver /// \param[in] tolerance required relative tolerance for BdaSolver
/// \param[in] platformID the OpenCL platform ID to be used /// \param[in] platformID the OpenCL platform ID to be used
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
/// Solve linear system, A*x = b /// Solve linear system, A*x = b
/// \warning Values of A might get overwritten! /// \warning Values of A might get overwritten!
@ -79,6 +86,16 @@ public:
/// \param[in] wellContribs container to hold all WellContributions /// \param[in] wellContribs container to hold all WellContributions
void initWellContributions(WellContributions& wellContribs); void initWellContributions(WellContributions& wellContribs);
/// Return whether the BdaBridge will use the FPGA or not
/// return whether the BdaBridge will use the FPGA or not
bool getUseFpga(){
return use_fpga;
}
/// Return the selected accelerator mode, this is input via the command-line
std::string getAccleratorName(){
return accelerator_mode;
}
}; // end class BdaBridge }; // end class BdaBridge

View File

@ -55,6 +55,8 @@ namespace bda
int maxit = 200; int maxit = 200;
double tolerance = 1e-2; double tolerance = 1e-2;
std::string bitstream = "";
int N; // number of rows int N; // number of rows
int Nb; // number of blocked rows (Nb*block_size == N) int Nb; // number of blocked rows (Nb*block_size == N)
int nnz; // number of nonzeroes (scalars) int nnz; // number of nonzeroes (scalars)
@ -66,7 +68,8 @@ namespace bda
bool initialized = false; bool initialized = false;
public: public:
/// Construct a BdaSolver, can be cusparseSolver or openclSolver /// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver
/// \param[in] fpga_bitstream FPGA bitstream file name (only for fpgaSolver)
/// \param[in] linear_solver_verbosity verbosity of solver /// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver /// \param[in] maxit maximum number of iterations for solver
/// \param[in] tolerance required relative tolerance for solver /// \param[in] tolerance required relative tolerance for solver
@ -74,6 +77,7 @@ namespace bda
/// \param[in] deviceID the device to be used /// \param[in] deviceID the device to be used
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
/// Define virtual destructor, so that the derivedclass destructor will be called /// Define virtual destructor, so that the derivedclass destructor will be called
virtual ~BdaSolver() {}; virtual ~BdaSolver() {};

View File

@ -20,13 +20,19 @@
#include <cstring> #include <cstring>
#include <cmath> #include <cmath>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp> #include <config.h>
using bda::BlockedMatrix; #include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda namespace bda
{ {
using Opm::OpmLog;
/*Sort a row of matrix elements from a blocked CSR-format.*/ /*Sort a row of matrix elements from a blocked CSR-format.*/
template <unsigned int block_size> template <unsigned int block_size>
@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
} }
} }
// subtract c from b and store in a #if HAVE_FPGA
// a = b - c
/*Subtract two blocks from one another element by element*/
template <unsigned int block_size> template <unsigned int block_size>
void blockSub(double *a, double *b, double *c) void blockSub(double *mat1, double *mat2, double *resMat) {
{
for (unsigned int row = 0; row < block_size; row++) { for (unsigned int row = 0; row < block_size; row++) {
for (unsigned int col = 0; col < block_size; col++) { for (unsigned int col = 0; col < block_size; col++) {
a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col]; resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
} }
} }
} }
/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
template <unsigned int block_size>
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
for (unsigned int row = 0; row < block_size; row++) {
if (resetRes) {
resVect[row] = 0.0;
}
for (unsigned int col = 0; col < block_size; col++) {
resVect[row] += scale * mat[row * block_size + col] * vect[col];
}
}
}
template <unsigned int block_size>
int BlockedMatrix<block_size>::countUnblockedNnzs() {
int numNnzsOverThreshold = 0;
int totalNnzs = rowPointers[Nb];
for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
if (fabs(nnzValues[idx]) > nnzThreshold) {
numNnzsOverThreshold++;
}
}
return numNnzsOverThreshold;
}
/*
* Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
* If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
*/
template <unsigned int block_size>
void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
const unsigned int bs = block_size;
int valIndex = 0, nnzsPerRow;
mat->rowPointers[0] = 0;
// go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and
// write all non-zero values and corresponding column indices that belong to the same row into the new matrix.
for (int row = 0; row < Nb; row++) {
for (unsigned int bRow = 0; bRow < bs; bRow++) {
nnzsPerRow = 0;
for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) {
for (unsigned int bCol = 0; bCol < bs; bCol++) {
int idx = 0;
// If the matrix is the U matrix, store the rows inside a block in reverse order.
if (isUMatrix) {
idx = col * bs * bs + (bs - bRow - 1) * bs + bCol;
} else {
idx = col * bs * bs + bRow * bs + bCol;
}
if (fabs(nnzValues[idx]) > nnzThreshold) {
mat->nnzValues[valIndex] = nnzValues[idx];
mat->colIndices[valIndex] = colIndices[col] * bs + bCol;
valIndex++;
nnzsPerRow++;
}
}
}
// Update the rowpointers of the new matrix
mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow;
}
}
}
/*Optimized version*/
// ub* prefixes indicate unblocked data
template <unsigned int block_size>
int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
{
int res;
int numUnblockedNnzs = countUnblockedNnzs();
// initialize the non-blocked matrix with the obtained size.
std::unique_ptr<Matrix> ubMat = std::make_unique<Matrix>(Nb * block_size, numUnblockedNnzs);
unblock(ubMat.get(), isUMatrix);
std::vector<int> ubNodesPerColor(numColors);
for (int i = 0; i < numColors; i++) {
ubNodesPerColor[i] = nodesPerColor[i] * block_size;
}
*valSize = ubMat->nnzs;
res = ubMat->toRDF(numColors, ubNodesPerColor,
colIndicesInColor, nnzsPerRowLimit,
ubNnzValues, ubColIndices, nnzValsSizes,
NROffsets, colorSizes);
return res;
}
// coloring is already done, numColors and nodesPerColor are set
// [rows|columns]PerColorLimit are already queried from the FPGA
// colIndicesInColor, PIndicesAddr and colorSizes are written here
// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU
// node == row
// color == partition
// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs
// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF().
// due to partitioning, lots of columns are removed, this matrix keeps track of the mapping
// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
template <unsigned int block_size>
int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
int rowsPerColorLimit, int columnsPerColorLimit,
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes)
{
// Data related to column indices per partition
int doneRows = 0;
std::vector<bool> isColAccessed(Nb); // std::vector<bool> might have some different optimized implementation, initialize in a loop
std::vector<bool> isLColAccessed(Nb);
int totalCols = 0; // sum of numColAccesses for each color, blocked
int LTotalCols = 0, UTotalCols = 0;
int maxCols = 0; // max value of numColAccesses for any color
int maxRowsPerColor = 0; // max value of numRows for any color
int maxColsPerRow = 0; // max value of colsPerRow for any color
// colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates
// colsInColor[c][i] contains the ith column that color c accesses
// initial size allows for each color to access all columns, with space for padding
std::vector<std::vector<int> > colsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
std::vector<std::vector<int> > LColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
std::vector<std::vector<int> > UColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
// find which columns are accessed in each color, as well as how many non-zeroes there are per color.
for (int c = 0; c < numColors; c++) {
int numRows = 0;
// initialize
for (int row = 0; row < Nb; row++) {
isColAccessed[row] = false;
isLColAccessed[row] = false;
}
if (c > 0) {
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
isLColAccessed[i] = true;
}
}
int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0; // number of unique accesses, blocked
// for every row in this color
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
int colsPerRow = 0; // number of blocks for this row
bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]);
for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) {
// for every column in the current row, check if that column was accessed before this color
int col = colIndices[idx];
if (isColAccessed[col] == false) {
colsInColor[c][numColAccesses] = col;
isColAccessed[col] = true;
numColAccesses++;
if (col > row) {
UColsInColor[numColors - c - 1][UNumColAccesses] = col;
UNumColAccesses++;
}
}
if (isLColAccessed[col] == false) {
if (col < row) {
LColsInColor[c][LNumColAccesses] = col;
LNumColAccesses++;
isLColAccessed[col] = true;
}
}
colsPerRow++;
}
if (rowIsEmpty != true) {
numRows++;
}
maxColsPerRow = std::max(maxColsPerRow, colsPerRow);
}
// add columns from previous color into L partition to simplify data forwarding
if (c > 0) {
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
LColsInColor[c][LNumColAccesses] = i;
LNumColAccesses++;
}
}
colorSizes[c * 4 + 10] = numColAccesses * block_size;
LColorSizes[c * 4 + 10] = LNumColAccesses * block_size;
UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size;
// store mapping
for (int col = 0; col < numColAccesses; col++) {
for (unsigned int i = 0; i < block_size; i++) {
colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i;
}
}
for (int col = 0; col < LNumColAccesses; col++) {
for (unsigned int i = 0; i < block_size; i++) {
LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i;
}
}
for (int col = 0; col < UNumColAccesses; col++) {
for (unsigned int i = 0; i < block_size; i++) {
UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i;
}
}
// zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline
while (numColAccesses % 16 != 0) {
colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1];
numColAccesses++;
}
while (LNumColAccesses % 16 != 0) {
LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1];
LNumColAccesses++;
}
while (UNumColAccesses % 16 != 0) {
UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1];
UNumColAccesses++;
}
maxCols = std::max(numColAccesses, maxCols);
totalCols += numColAccesses;
LTotalCols += LNumColAccesses;
UTotalCols += UNumColAccesses;
doneRows = doneRows + nodesPerColor[c];
maxRowsPerColor = std::max(numRows, maxRowsPerColor);
}
if (maxCols * static_cast<int>(block_size) > columnsPerColorLimit) {
std::ostringstream errorstring;
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit;
OPM_THROW(std::logic_error, errorstring.str());
}
doneRows = 0;
int diagValsSize = 0;
int maxRows = 0;
for (int c = 0; c < numColors; c++) {
// calculate sizes that include zeropadding
diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8);
doneRows += nodesPerColor[c];
if (nodesPerColor[c] * static_cast<int>(block_size) > maxRows)
maxRows = nodesPerColor[c];
colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size;
}
if (maxRows * static_cast<int>(block_size) > rowsPerColorLimit) {
std::ostringstream errorstring;
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit;
OPM_THROW(std::logic_error, errorstring.str());
}
// create and fill sizes array as far as already possible
colorSizes[0] = Nb * block_size;
LColorSizes[0] = Nb * block_size;
UColorSizes[0] = Nb * block_size;
// col_sizes (but the matrix is square)
colorSizes[1] = Nb * block_size;
LColorSizes[1] = Nb * block_size;
UColorSizes[1] = Nb * block_size;
colorSizes[2] = totalCols * block_size;
LColorSizes[2] = LTotalCols * block_size;
UColorSizes[2] = UTotalCols * block_size;
// missing val_size, written in Matrix::toRDF()
colorSizes[4] = numColors;
LColorSizes[4] = numColors;
UColorSizes[4] = numColors;
// missing NRFlagsSize, written in Matrix::toRDF()
colorSizes[6] = diagValsSize;
LColorSizes[6] = diagValsSize;
UColorSizes[6] = diagValsSize;
int paddingIdx = numColors;
while (paddingIdx % 4 != 0) {
for (unsigned int i = 0; i < 4; i++) {
colorSizes[paddingIdx * 4 + 8 + i] = 0;
LColorSizes[paddingIdx * 4 + 8 + i] = 0;
UColorSizes[paddingIdx * 4 + 8 + i] = 0;
}
paddingIdx++;
}
int index = 0, Lindex = 0, Uindex = 0;
for (int c = 0; c < numColors; c++) {
// for each unique col access
for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
for (unsigned int i = 0; i < block_size; i++) {
PIndicesAddr[index] = colsInColor[c][col] * block_size + i;
index++;
}
}
// add padding
while (index % 16 != 0) {
PIndicesAddr[index] = PIndicesAddr[index - 1];
index++;
}
for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
for (unsigned int i = 0; i < block_size; i++) {
LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i;
Lindex++;
}
}
while (Lindex % 16 != 0) {
LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1];
Lindex++;
}
for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
for (unsigned int i = 0; i < block_size; i++) {
UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i;
Uindex++;
}
}
while (Uindex % 16 != 0) {
UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1];
Uindex++;
}
}
return 0;
}
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag) {
const unsigned int block_size = 3;
int doneRows = rowSize - 1; // since the rows of U are reversed, the rows of the diag are also reversed
int RDFIndex = 0;
for (int c = 0; c < numColors; c++) {
for (int r = 0; r < rowsPerColor[c]; r++) {
// the rows in the block are reversed
for (int i = static_cast<int>(block_size) - 1; i >= 0; i--) {
for (unsigned int j = 0; j < block_size; j++) {
RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j];
RDFIndex++;
}
// add 4th column, zeropadding
RDFDiag[RDFIndex] = 0.0;
RDFIndex++;
}
}
doneRows -= rowsPerColor[c];
// make sure the color completely fills a cacheline
// a color with 3 blocks would otherwise leave space
while (RDFIndex % 8 != 0) {
RDFDiag[RDFIndex] = 0.0;
RDFIndex++;
}
}
assert(RDFIndex % 8 == 0);
}
#endif // HAVE_FPGA
#define INSTANTIATE_BDA_FUNCTIONS(n) \ #define INSTANTIATE_BDA_FUNCTIONS(n) \
template void sortBlockedRow<n>(int *, double *, int, int); \ template void sortBlockedRow<n>(int *, double *, int, int); \
template void blockMultSub<n>(double *, double *, double *); \ template void blockMultSub<n>(double *, double *, double *); \
template void blockMult<n>(double *, double *, double *); \ template void blockMult<n>(double *, double *, double *); \
template void blockSub<n>(double *, double *, double *); \
INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2); INSTANTIATE_BDA_FUNCTIONS(2);
@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS #undef INSTANTIATE_BDA_FUNCTIONS
#if HAVE_FPGA
#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \
template void blockSub<n>(double *, double *, double *); \
template void blockVectMult<n>(double *, double *, double, double *, bool); \
template int BlockedMatrix<n>::toRDF(int, int *, bool, \
std::vector<std::vector<int> >& , int, int *, \
std::vector<std::vector<double> >&, short int *, unsigned char *, int *, int *); \
template int BlockedMatrix<n>::findPartitionColumns(int, int *, \
int, int, \
std::vector<std::vector<int> >& , int *, int *, \
std::vector<std::vector<int> >& , int *, int *, \
std::vector<std::vector<int> >& , int *, int *);
INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
#endif // HAVE_FPGA
} // end namespace bda } // end namespace bda

View File

@ -20,29 +20,40 @@
#ifndef BLOCKED_MATRIX_HPP #ifndef BLOCKED_MATRIX_HPP
#define BLOCKED_MATRIX_HPP #define BLOCKED_MATRIX_HPP
#if HAVE_FPGA
#include <vector>
#endif
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
namespace bda namespace bda
{ {
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix. /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer. /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
template<int BS> template<unsigned int block_size>
struct BlockedMatrix{ class BlockedMatrix
{
public:
/// Allocate BlockedMatrix and data arrays with given sizes /// Allocate BlockedMatrix and data arrays with given sizes
/// \param[in] Nb number of blockrows /// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks /// \param[in] nnzbs number of nonzero blocks
BlockedMatrix(int Nb_, int nnzbs_) BlockedMatrix(int Nb_, int nnzbs_)
: nnzValues(new double[nnzbs_*BS*BS]), : nnzValues(new double[nnzbs_*block_size*block_size]),
colIndices(new int[nnzbs_*BS*BS]), colIndices(new int[nnzbs_*block_size*block_size]),
rowPointers(new int[Nb_+1]), rowPointers(new int[Nb_+1]),
Nb(Nb_), Nb(Nb_),
nnzbs(nnzbs_), nnzbs(nnzbs_),
deleteNnzs(true), deleteNnzs(true),
deleteSparsity(true) deleteSparsity(true)
{} {}
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
/// \param[in] M matrix to be copied /// \param[in] M matrix to be copied
BlockedMatrix(const BlockedMatrix& M) BlockedMatrix(const BlockedMatrix& M)
: nnzValues(new double[M.nnzbs*BS*BS]), : nnzValues(new double[M.nnzbs*block_size*block_size]),
colIndices(M.colIndices), colIndices(M.colIndices),
rowPointers(M.rowPointers), rowPointers(M.rowPointers),
Nb(M.Nb), Nb(M.Nb),
@ -50,10 +61,11 @@ struct BlockedMatrix{
deleteNnzs(true), deleteNnzs(true),
deleteSparsity(false) deleteSparsity(false)
{} {}
/// Allocate BlockedMatrix, but let data arrays point to existing arrays /// Allocate BlockedMatrix, but let data arrays point to existing arrays
/// \param[in] Nb number of blockrows /// \param[in] Nb number of blockrows
/// \param[in] nnzbs number of nonzero blocks /// \param[in] nnzbs number of nonzero blocks
/// \param[in] nnzValues array of nonzero values, contains nnzb*BS*BS scalars /// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
/// \param[in] colIndices array of column indices, contains nnzb entries /// \param[in] colIndices array of column indices, contains nnzb entries
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries /// \param[in] rowPointers array of row pointers, contains Nb+1 entries
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_) BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
@ -76,6 +88,28 @@ struct BlockedMatrix{
} }
} }
#if HAVE_FPGA
constexpr static double nnzThreshold = 1e-80; // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero
int countUnblockedNnzs();
void unblock(Matrix *mat, bool isUMatrix);
/// Converts this matrix to the dataformat used by the FPGA
/// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking
int toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
std::vector<std::vector<double> >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize);
/// Analyses the sparsity pattern and prepares for toRDF()
/// Is only called once
int findPartitionColumns(int numColors, int *nodesPerColor,
int rowsPerColorLimit, int columnsPerColorLimit,
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes);
#endif
double *nnzValues; double *nnzValues;
int *colIndices; int *colIndices;
int *rowPointers; int *rowPointers;
@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c);
template <unsigned int block_size> template <unsigned int block_size>
void blockMult(double *mat1, double *mat2, double *resMat); void blockMult(double *mat1, double *mat2, double *resMat);
/// Subtract blocks
/// a = b - c #if HAVE_FPGA
/// \param[out] a result block
/// \param[in] b input block
/// \param[in] c input block
template <unsigned int block_size> template <unsigned int block_size>
void blockSub(double *a, double *b, double *c); void blockSub(double *mat1, double *mat2, double *resMat);
template <unsigned int block_size>
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
/// Convert a blocked inverse diagonal to the FPGA format.
/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
/// Since the rows of U are reversed, the rows of the diag are also reversed.
/// The cachelines can hold 8 doubles, a block has 9 doubles.
/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows.
/// Then 2 blocks (24 doubles) fit on 3 cachelines.
/// Example:
/// [1 2 3] [1 2 3 0] [1 2 3 0 4 5 6 0]
/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1]
/// [7 8 9] [7 8 9 0] [block2 row2 block2 row3]
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
#endif
} // end namespace bda } // end namespace bda

View File

@ -0,0 +1,413 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/MatrixBlock.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
{
using Opm::OpmLog;
using Dune::Timer;
template <unsigned int block_size>
FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_)
{
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
level_scheduling = true;
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
graph_coloring = true;
} else {
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
}
}
template <unsigned int block_size>
FPGABILU0<block_size>::~FPGABILU0()
{
delete[] invDiagVals;
}
template <unsigned int block_size>
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
{
const unsigned int bs = block_size;
resultPointers.resize(numResultPointers, nullptr);
resultSizes.resize(numResultSizes);
// Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
const unsigned int nnzSplit = 1;
this->N = mat->Nb * block_size;
this->Nb = mat->Nb;
this->nnz = mat->nnzbs * block_size * block_size;
this->nnzbs = mat->nnzbs;
toOrder.resize(Nb);
fromOrder.resize(Nb);
std::vector<int> CSCRowIndices(nnzbs);
std::vector<int> CSCColPointers(Nb + 1);
if (level_scheduling) {
Timer t_convert;
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
if (verbosity >= 3) {
std::ostringstream out;
out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
OpmLog::info(out.str());
}
}
Timer t_analysis;
rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
std::ostringstream out;
if (level_scheduling) {
out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
} else if (graph_coloring) {
out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
}
if (numColors > maxNumColors) {
std::ostringstream errorstring;
errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
OPM_THROW(std::logic_error, errorstring.str());
}
if (verbosity >= 3) {
out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
}
OpmLog::info(out.str());
int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
int blockDiagSize = 0;
// This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
int doneRows = 0;
for (int c = 0; c < numColors; c++) {
for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
int columnIndex = rMat->colIndices[j];
if (columnIndex < i) {
LColorRoundedValSize += 9;
LNROffsetSize += 9;
}
if (columnIndex > i) {
UColorRoundedValSize += 9;
UNROffsetSize += 9;
}
colorRoundedValSize += 9;
NROffsetSize += 9;
}
blockDiagSize += 12;
}
// End of color: round all sizes to nearest cacheline
colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
NROffsetSize = roundUpTo(NROffsetSize, 64);
LNROffsetSize = roundUpTo(LNROffsetSize, 64);
UNROffsetSize = roundUpTo(UNROffsetSize, 64);
blockDiagSize = roundUpTo(blockDiagSize, 8);
doneRows += rowsPerColor[c];
}
int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
int worstCaseColumnAccessNum = numColors * maxColsPerColor;
nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
// initial number of nnz, used to allocate
nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
colIndices.resize(colorRoundedValSize);
LColIndices.resize(LColorRoundedValSize);
UColIndices.resize(UColorRoundedValSize);
NROffsets.resize(NROffsetSize);
LNROffsets.resize(LNROffsetSize);
UNROffsets.resize(UNROffsetSize);
PIndicesAddr.resize(worstCaseColumnAccessNum);
LPIndicesAddr.resize(worstCaseColumnAccessNum);
UPIndicesAddr.resize(worstCaseColumnAccessNum);
colorSizes.resize(colorSizesNum);
LColorSizes.resize(colorSizesNum);
UColorSizes.resize(colorSizesNum);
blockDiag.resize(blockDiagSize);
colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
maxRowsPerColor, maxColsPerColor,
colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
if (err != 0) {
std::ostringstream errorstring;
errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
OPM_THROW(std::logic_error, errorstring.str());
}
diagIndex.resize(mat->Nb, 0);
invDiagVals = new double[mat->Nb * bs * bs];
LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
resultPointers[0] = (void *) colorSizes.data();
resultPointers[1] = (void *) PIndicesAddr.data();
resultPointers[2] = (void *) nnzValues.data();
resultPointers[3] = (void *) colIndices.data();
resultPointers[4] = (void *) NROffsets.data();
resultPointers[5] = (void *) nnzValsSizes.data();
resultPointers[6] = (void *) LColorSizes.data();
resultPointers[7] = (void *) LPIndicesAddr.data();
resultPointers[8] = (void *) LnnzValues.data();
resultPointers[9] = (void *) LColIndices.data();
resultPointers[10] = (void *) LNROffsets.data();
resultPointers[11] = (void *) LnnzValsSizes.data();
resultPointers[12] = (void *) UColorSizes.data();
resultPointers[13] = (void *) UPIndicesAddr.data();
resultPointers[14] = (void *) UnnzValues.data();
resultPointers[15] = (void *) UColIndices.data();
resultPointers[16] = (void *) UNROffsets.data();
resultPointers[17] = (void *) UnnzValsSizes.data();
resultPointers[18] = (void *) blockDiag.data();
//resultPointers[19] and [20] are set by the caller
resultSizes[0] = mat->Nb * block_size;
resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
resultSizes[2] = numColors;
resultSizes[3] = worstCaseColumnAccessNum; //totalCols
resultSizes[4] = NROffsetSize; //NRFlagSize
resultSizes[5] = blockDiagSize; //diagValsSize
resultSizes[6] = mat->Nb * block_size;
resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
resultSizes[8] = numColors;
resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
resultSizes[10] = LNROffsetSize; //LNRFlagSize
resultSizes[11] = blockDiagSize; //LDiagValsSize
resultSizes[12] = mat->Nb * block_size;
resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
resultSizes[14] = numColors;
resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
resultSizes[16] = UNROffsetSize; //UNRFlagSize
resultSizes[17] = blockDiagSize; //UDiagValsSize
return true;
} // end init()
template <unsigned int block_size>
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
{
const unsigned int bs = block_size;
Timer t_reorder;
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
if (verbosity >= 3) {
std::ostringstream out;
out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
OpmLog::info(out.str());
}
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
Timer t_memcpy;
memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
if (verbosity >= 3) {
std::ostringstream out;
out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
OpmLog::info(out.str());
}
int i, j, ij, ik, jk;
int iRowStart, iRowEnd, jRowEnd;
double pivot[bs * bs];
int LSize = 0;
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
Timer t_decomposition;
// go through all rows
for (i = 0; i < LUMat->Nb; i++) {
iRowStart = LUMat->rowPointers[i];
iRowEnd = LUMat->rowPointers[i + 1];
// go through all elements of the row
for (ij = iRowStart; ij < iRowEnd; ij++) {
j = LUMat->colIndices[ij];
// if the element is the diagonal, store the index and go to next row
if (j == i) {
diagIndex[i] = ij;
break;
}
// if an element beyond the diagonal is reach, no diagonal was found
// throw an error now. TODO: perform reordering earlier to prevent this
if (j > i) {
std::ostringstream out;
out << "BILU0 Error could not find diagonal value in row: " << i;
OpmLog::error(out.str());
return false;
}
LSize++;
// calculate the pivot of this row
blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
jRowEnd = LUMat->rowPointers[j + 1];
jk = diagIndex[j] + 1;
ik = ij + 1;
// substract that row scaled by the pivot from this row.
while (ik < iRowEnd && jk < jRowEnd) {
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
ik++;
jk++;
} else {
if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
{ ik++; }
else
{ jk++; }
}
}
}
// store the inverse in the diagonal!
inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
}
LMat->rowPointers[0] = 0;
UMat->rowPointers[0] = 0;
// Split the LU matrix into two by comparing column indices to diagonal indices
for (i = 0; i < LUMat->Nb; i++) {
LMat->rowPointers[i + 1] = LMat->rowPointers[i];
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
if (j < diagIndex[i]) {
memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
}
}
}
// Reverse the order or the (blocked) rows for the U matrix,
// because the rows are accessed in reverse order when applying the ILU0
int URowIndex = 0;
for (i = LUMat->Nb - 1; i >= 0; i--) {
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
if (j > diagIndex[i]) {
memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
}
}
URowIndex++;
}
if (verbosity >= 3) {
std::ostringstream out;
out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
OpmLog::info(out.str());
}
std::vector<int> URowsPerColor(numColors);
rowSize = block_size * rMat->Nb;
LRowSize = block_size * LMat->Nb;
URowSize = block_size * UMat->Nb;
LNumColors = numColors;
UNumColors = numColors;
for (int c = 0; c < numColors; c++) {
URowsPerColor[numColors - c - 1] = rowsPerColor[c];
}
int err;
err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
if (err != 0) {
return false;
}
err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
if (err != 0) {
return false;
}
err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
if (err != 0) {
return false;
}
blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
// resultPointers are set in the init method
resultSizes[0] = rowSize;
resultSizes[1] = colorSizes[3]; // zeropadded valSize;
resultSizes[2] = numColors;
resultSizes[3] = colorSizes[2]; //totalCols
resultSizes[4] = colorSizes[5]; //NRFlagSize
resultSizes[5] = colorSizes[6]; //diagValsSize
resultSizes[6] = LRowSize;
resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
resultSizes[8] = LNumColors;
resultSizes[9] = LColorSizes[2]; //LTotalCols
resultSizes[10] = LColorSizes[5]; //LNRFlagSize
resultSizes[11] = LColorSizes[6]; //LDiagValsSize
resultSizes[12] = URowSize;
resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
resultSizes[14] = UNumColors;
resultSizes[15] = UColorSizes[2]; //UTotalCols
resultSizes[16] = UColorSizes[5]; //UNRFlagSize
resultSizes[17] = UColorSizes[6]; //UDiagValsSize
return true;
} // end create_preconditioner()
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int); \
template FPGABILU0<n>::~FPGABILU0(); \
template bool FPGABILU0<n>::init(BlockedMatrix<n> *); \
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda

View File

@ -0,0 +1,117 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FPGA_BILU0_HEADER_INCLUDED
#define FPGA_BILU0_HEADER_INCLUDED
#include <vector>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda
{
/*
* This class implements a Blocked ILU0 preconditioner, with output data
* specifically formatted for the FPGA.
* The decomposition and reorders of the rows of the matrix are done on CPU.
*/
template <unsigned int block_size>
class FPGABILU0
{
private:
int N; // number of rows of the matrix
int Nb; // number of blockrows of the matrix
int nnz; // number of nonzeroes of the matrix (scalar)
int nnzbs; // number of blocks of the matrix
std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
double *invDiagVals = nullptr;
std::vector<int> diagIndex;
std::vector<int> toOrder, fromOrder;
std::vector<int> rowsPerColor;
int numColors;
int verbosity;
// sizes and arrays used during RDF generation
std::vector<std::vector<double> > nnzValues, LnnzValues, UnnzValues;
std::vector<short int> colIndices, LColIndices, UColIndices;
std::vector<unsigned char> NROffsets, LNROffsets, UNROffsets;
std::vector<int> PIndicesAddr, LPIndicesAddr, UPIndicesAddr;
std::vector<int> colorSizes, LColorSizes, UColorSizes;
std::vector<int> nnzValsSizes, LnnzValsSizes, UnnzValsSizes;
std::vector<std::vector<int> > colIndicesInColor, LColIndicesInColor, UColIndicesInColor;
int rowSize, valSize;
int LRowSize, LValSize, LNumColors;
int URowSize, UValSize, UNumColors;
std::vector<double> blockDiag;
ILUReorder opencl_ilu_reorder;
bool level_scheduling = false, graph_coloring = false;
int numResultPointers = 21;
std::vector<void *> resultPointers;
int numResultSizes = 18;
std::vector<int> resultSizes;
int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor
public:
FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors);
~FPGABILU0();
// analysis (optional)
bool init(BlockedMatrix<block_size> *mat);
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<block_size> *mat);
int* getToOrder()
{
return toOrder.data();
}
int* getFromOrder()
{
return fromOrder.data();
}
BlockedMatrix<block_size>* getRMat()
{
return rMat.get();
}
void **getResultPointers()
{
return resultPointers.data();
}
int *getResultSizes()
{
return resultSizes.data();
}
};
} //namespace bda
#endif // FPGA_BILU0_HEADER_INCLUDED

View File

@ -0,0 +1,249 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
namespace bda
{
/*Sort a row of matrix elements from a CSR-format.*/
void sortRow(int *colIndices, double *data, int left, int right) {
int l = left;
int r = right;
int middle = colIndices[(l + r) >> 1];
do {
while (colIndices[l] < middle)
l++;
while (colIndices[r] > middle)
r--;
if (l <= r) {
int lColIndex = colIndices[l];
colIndices[l] = colIndices[r];
colIndices[r] = lColIndex;
double lDatum = data[l];
data[l] = data[r];
data[r] = lDatum;
l++;
r--;
}
} while (l < r);
if (left < r)
sortRow(colIndices, data, left, r);
if (right > l)
sortRow(colIndices, data, l, right);
}
/*
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
* and colors, and the size, in elements, of the NROffsets array, followed by:
* the number of rows (rounded to the nearest 32), the number of rows (not rounded),
* the number of columns (not rounded) and the number of non-zero values
* (rounded to the nearest 32) for every partition.
* This array is zero padded up to the nearest 64-byte cacheline.
* - The "colIndicesInColor" array, which contains for every partition, from which elements
* in the global X vector the elements of that X vector partition came.
* For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then
* that X vector partition will only have three elements, and the color_col_indices array
* will contain 1, 3 and 6 for that partition.
* This array is zero padded up to the nearest 64-byte cacheline for every partition.
* - The "nnzValues" array contains all non-zero values of each partition of the matrix.
* This array is zero-padded so that each color has a multiple of 32 elements (to have the
* same number of elements per partition as the column indices array).
* - The "colIndices" array contains all column indices of each partition of the matrix.
* These column indices are the local indices for that partition, so to be used, first a
* local X vector partition needs to be loaded into some local memory (this is done using
* data from the _color_col_indices array), before these column indices can be used as
* addresses to that local memory to read the desired X vector values.
* This array is zero-padded so that data for every partition fills up a number of complete
* cachelines (this means every color has a multiple of 32 elements).
* - "NROffsets" is the name of the array that contains the new row offsets for
* all elements of every partition of the matrix. New row offsets are 8-bit values which
* are 0 if that element is not the first element in a row, or which, if that element is
* the first element of a row) is equal to the amount of empty rows between that new row
* and the row before it plus 1. This array is zero-padded so that data for every partition
* fills up a number of complete cachelines (this means every color has a multiple of 64 elements).
*/
int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsThisRowLimit,
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes)
{
auto mat = this;
int doneRows = 0;
int totalRowNum = 0; // total number of non-empty rows
int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color
int maxNNZsPerColor = 0; // max of nnzsPerColor
int totalValSize = 0; // sum of nnzsPerColor, padded
std::vector<int> nnzRowsPerColor(numColors);
// find number of nnzs per color and number of non-empty rows
for (int c = 0; c < numColors; c++) {
int numRows = 0;
nnzRowsPerColor[c] = 0;
int firstNnzOfColor = mat->rowPointers[doneRows];
int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]];
nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices
totalValSize += nnzsPerColor;
maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor);
int row = doneRows;
for (; row < doneRows + nodesPerColor[c]; row++) {
if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) {
numRows++;
nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1;
}
}
doneRows = row;
totalRowNum += numRows;
}
int conseqZeroRows = 0; // number of consecutive empty rows
int maxConseqZeroRows = 0;
int numEmptyRows = 0; // total number of empty rows
std::vector<int> rowOffsets(totalRowNum);
std::vector<int> nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows
std::vector<int> colorValPointers(numColors + 1); // points to first nnz of first row of each color
std::vector<int> colorValZeroPointers(numColors); // points to first padded zero for each color
int nonEmptyRowIdx = 0; // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen
doneRows = 0;
int totalPaddingSize = 0; // number of padded zeros from previous colors
int NROffsetSize = 0; // number of NROffsets entries, padded to multiple of 64 for each color
int maxRows = 0;
int maxNNZsPerRow = 0;
// determine the row offset of each row (amount of zero rows between it and the previous non-zero row)
// this is later converted to rowOffset for each nnz
for (int c = 0; c < numColors; c++) {
conseqZeroRows = 0;
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row];
if (nnzsThisRow == 0) {
conseqZeroRows++;
numEmptyRows++;
} else {
maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow);
nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1];
rowOffsets[nonEmptyRowIdx] = conseqZeroRows;
maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows);
conseqZeroRows = 0;
nonEmptyRowIdx++;
}
}
// calculate sizes that include zeropadding
colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize;
colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32);
totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c];
NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64);
doneRows += nodesPerColor[c];
maxRows = std::max(nodesPerColor[c], maxRows);
}
if (maxNNZsPerRow > nnzsThisRowLimit) {
std::ostringstream errorstring;
errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit;
OPM_THROW(std::logic_error, errorstring.str());
}
// create and fill RDF arrays
colorSizes[3] = colorValPointers[numColors]; // total number of nnzs the FPGA has to process, including zeropadding
colorSizes[5] = NROffsetSize;
for (int c = 0; c < numColors; c++) {
colorSizes[c * 4 + 8] = nnzRowsPerColor[c];
colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c];
}
int rowIndex = 0; // keep track of where to read/write
int valIndex = 0;
int NRIndex = 0;
int halfwayPoint = colorValPointers[numColors] / 2;
nnzValsSizes[0] = colorValPointers[numColors];
colorSizes[7] = halfwayPoint;
for (int c = 0; c < numColors; c++) {
int nnzsThisRow;
// make sure 32 values are written in batches (pad with zeros if needed)
for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) {
for (int vb = 0; vb < 32; vb++) {
// if there are enough values for the whole cacheline
if (v + vb < colorValZeroPointers[c]) {
ubNnzValues[0][v + vb] = mat->nnzValues[valIndex];
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex]]);
// if this val is the first of a row
if (nnzRowPointers[rowIndex] == valIndex) {
if (rowOffsets[rowIndex] + 1 >= 255) {
std::ostringstream errorstring;
errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1;
OPM_THROW(std::logic_error, errorstring.str());
}
NROffsets[NRIndex] = static_cast<unsigned char>(rowOffsets[rowIndex] + 1);
// skip all empty rows
while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) {
rowIndex++;
nnzsThisRow = 0;
}
nnzsThisRow++;
}
else
{
NROffsets[NRIndex] = (unsigned char) 0;
nnzsThisRow++;
}
valIndex++;
}
else // zeropadding is needed
{
ubNnzValues[0][v + vb] = 0.0;
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex - 1]]);
NROffsets[NRIndex] = 0;
}
NRIndex++;
}
}
// zeropad the NROffsets file
while (NRIndex % 64 != 0) {
NROffsets[NRIndex] = 0;
NRIndex++;
}
}
return 0;
}
} // end namespace bda

View File

@ -0,0 +1,84 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FPGA_MATRIX_HEADER_INCLUDED
#define FPGA_MATRIX_HEADER_INCLUDED
#include <vector>
namespace bda
{
/// This struct resembles a csr matrix, only doubles are supported
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
class Matrix {
public:
/// Allocate Matrix and data arrays with given sizes
/// \param[in] N number of rows
/// \param[in] nnzs number of nonzeros
Matrix(int N_, int nnzs_)
: nnzValues(new double[nnzs_]),
colIndices(new int[nnzs_]),
rowPointers(new int[N_+1]),
N(N_),
nnzs(nnzs_)
{}
/// All constructors allocate new memory, so always delete here
~Matrix(){
delete[] nnzValues;
delete[] colIndices;
delete[] rowPointers;
}
/// Converts this matrix to the dataformat used by the FPGA.
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
/// The purpose of this format is to allow the data to be streamable.
/// The rowPointers array has an unpredictable reading pattern/timing,
/// it also needs a extra work if a row is shorter than a cacheline.
/// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets.
/// The value of this offset is 0, unless the corresponding nnz is the first of a row,
/// in that case it is 'the number of empty rows preceeding it + 1'.
/// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx.
/// Example:
/// [1 0 0 3 0] nnzValues [1 3 2 2 1 4 3 4 1]
/// [0 2 2 0 1] colIndices [0 3 1 2 4 0 1 2 4]
/// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9]
/// [0 0 0 0 0] rowOffsets [1 0 1 0 0 1 2 0 0]
/// [0 3 4 0 1]
/// The rowOffset is stored in 1 byte, meaning the maximum value is 255.
int toRDF(int numColors, std::vector<int>& nodesPerColor,
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
double *nnzValues;
int *colIndices;
int *rowPointers;
int N;
int nnzs;
};
void sortRow(int *colIndices, double *data, int left, int right);
} // end namespace bda
#endif // FPGA_MATRIX_HEADER_INCLUDED

View File

@ -0,0 +1,732 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <cmath>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/material/common/Unused.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA
// kernel will be disabled and execution will continue using DUNE
#define FPGA_EXIT_WITH_HW_FAILURE
//#undef FPGA_EXIT_WITH_HW_FAILURE
// if defined, the function generate_statistics will create a CSV-formatted file
// with detailed statistics about the FPGA backend performance
//#define FPGA_STATISTICS_FILE_ENABLED
#undef FPGA_STATISTICS_FILE_ENABLED
namespace bda
{
using Opm::OpmLog;
template <unsigned int block_size>
FpgaSolverBackend<block_size>::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(fpga_bitstream, verbosity_, maxit_, tolerance_)
{
int err;
std::ostringstream oss;
double start = second();
// currently, only block size == 3 is supported by the FPGA backend
assert(block_size == 3);
if (verbosity < 1) {
perf_call_enabled = false;
}
// setup bitstream name and other parameters
if (fpga_bitstream.compare("") == 0) {
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified");
}
if (!fileExists(fpga_bitstream.c_str())) {
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable");
}
// -----------------------------
// FPGA: setup the OpenCL platform
// -----------------------------
std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp
// auto-select the proper FPGA device and create context and other CL objects
err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1);
if (err != 0) {
oss << "Failed to setup the OpenCL device (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
oss << "Detected FPGA platform type is ";
if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; }
OpmLog::info(oss.str());
oss.str("");
oss.clear();
// -----------------------------
// FPGA: setup the debug buffer
// -----------------------------
// set kernel debug lines depending on an environment variable
const char *xem = getenv("XCL_EMULATION_MODE");
if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) {
debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU;
oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n";
OpmLog::info(oss.str());
oss.str("");
oss.clear();
} else {
// set to 2 to reduce overhead in reading back and interpreting the debug lines;
// increase to get more debug info from the kernel
// range is 2..DEBUG_OUTBUF_WORDS_MAX-1
debug_outbuf_words = 2;
}
// host debug buffer setup
err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize);
if (err != 0) {
oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// device debug buffer setup
err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize);
if (err != 0) {
oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n";
OPM_THROW(std::logic_error, oss.str());
}
// copy debug buffer to device
err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words);
if (err != 0) {
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n";
OPM_THROW(std::logic_error, oss.str());
}
// ------------------------------------------------
// FPGA: query the kernel for limits/configuration
// ------------------------------------------------
err = fpga_kernel_query(context, commands, kernel, cldebug,
debugBuffer, debug_outbuf_words,
rst_assert_cycles, rst_settle_cycles,
&hw_x_vector_elem, &hw_max_row_size,
&hw_max_column_size, &hw_max_colors_size,
&hw_max_nnzs_per_row, &hw_max_matrix_size,
&hw_use_uram, &hw_write_ilu0_results,
&hw_dma_data_width, &hw_mult_num,
&hw_x_vector_latency, &hw_add_latency, &hw_mult_latency,
&hw_num_read_ports, &hw_num_write_ports,
&hw_reset_cycles, &hw_reset_settle);
if (err != 0) {
oss << "Failed to call fpga_kernel_query (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
if (verbosity >= 1) {
oss << "FPGA kernel limits/configuration:\n";
oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n";
oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n";
oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n";
oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n";
oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n";
oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n";
oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n";
oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle;
OpmLog::info(oss.str());
oss.str("");
oss.clear();
}
// check that LU results are generated by the kernel
if (use_LU_res && !hw_write_ilu0_results) {
OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage");
oss.str("");
oss.clear();
use_LU_res = false;
}
// setup preconditioner
double start_prec = second();
prec = std::make_unique<Preconditioner>(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size);
perf_total.s_preconditioner_setup = second() - start_prec;
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
level_scheduling = true;
}
perf_total.s_initialization = second() - start;
} // end fpgaSolverBackend
template <unsigned int block_size>
FpgaSolverBackend<block_size>::~FpgaSolverBackend()
{
if (verbosity >= 1) {
generate_statistics();
}
delete[] rx;
delete[] rb;
if (nnzValArrays != nullptr) { free(nnzValArrays); }
if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); }
if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); }
// FPGA: buffers
free(debugBuffer);
for (int b = 0; b < RW_BUF; b++) {
free(dataBuffer[b]);
}
free(databufferSize);
// FPGA: OpenCL objects
if (cldebug != nullptr) { clReleaseMemObject(cldebug); }
for (int b = 0; b < RW_BUF; b++) {
if (cldata[b] != nullptr) {
clReleaseMemObject(cldata[b]);
}
}
clReleaseCommandQueue(commands);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseDevice(device_id);
} // end ~fpgaSolverBackend()
// copy result to host memory
// caller must be sure that x is a valid array
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::get_result(double *x_)
{
double start = 0;
if (perf_call_enabled) {
start = second();
}
// apply to results the reordering (stored in toOrder)
reorderBlockedVectorByPattern<block_size>(mat->Nb, rx, toOrder, x_);
// TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private)
if (perf_call_enabled) {
perf_call.back().s_postprocess = second() - start;
}
} // end get_result()
template <unsigned int block_size>
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res)
{
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
perf_call.emplace_back();
update_system(vals, b);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
solve_system(res);
if (verbosity >= 1) {
std::ostringstream oss;
oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \
", iterations: " << res.iterations << ", reduction: " << res.reduction << \
", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed;
OpmLog::info(oss.str());
}
return SolverStatus::BDA_SOLVER_SUCCESS;
}
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
{
double start = second();
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
Nb = (N + dim - 1) / dim;
// allocate host memory for matrices and vectors
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
std::ostringstream oss;
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
OpmLog::info(oss.str());
rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
perf_total.s_initialization += second() - start;
initialized = true;
} // end initialize()
template <unsigned int block_size>
bool FpgaSolverBackend<block_size>::analyse_matrix()
{
std::ostringstream oss;
int err;
double start = second();
bool success = prec->init(mat.get());
if (!success) {
OpmLog::warning("Preconditioner for FPGA solver failed to initialize");
return success;
}
toOrder = prec->getToOrder();
fromOrder = prec->getFromOrder();
rMat = prec->getRMat();
processedPointers = prec->getResultPointers();
processedSizes = prec->getResultSizes();
processedPointers[19] = rb;
processedPointers[20] = rx;
nnzValArrays_size = static_cast<int*>(processedPointers[5])[0];
L_nnzValArrays_size = static_cast<int*>(processedPointers[11])[0];
U_nnzValArrays_size = static_cast<int*>(processedPointers[17])[0];
// -------------------------------------
// FPGA: setup host/device data buffers
// -------------------------------------
// allocate memory and setup data layout
err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits,
processedSizes,
&setupArray,
&nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray,
&L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray,
&U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray,
&BLKDArray, &X1Array, &R1Array,
&X2Array, &R2Array,
&LresArray, &UresArray,
&databufferSize, dataBuffer,
result_offsets, 1 /*num_nnz_arrays*/,
true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */
debugbufferSize);
if (err) {
oss << "Failed to call fpga_setup_host_datamem (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// results buffers setup
if (use_LU_res) {
resultsBufferNum = 4;
} else {
resultsBufferNum = 2;
}
if (resultsBufferNum > RES_BUF_MAX) {
oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")";
OPM_THROW(std::logic_error, oss.str());
}
resultsNum = processedSizes[0]; // rowSize, invariant between system solves
for (int i = 0; i < resultsBufferNum; i++) {
resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double);
}
// device data memory setup
err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata);
if (err != 0) {
oss << "Failed to call fpga_setup_device_datamem (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// ------------------------------------
// FPGA: setup the kernel's parameters
// ------------------------------------
err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit,
debug_sample_rate, tolerance, cldata, cldebug);
if (err != 0) {
oss << "Failed to call fpga_set_kernel_parameters (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
perf_total.s_analysis = second() - start;
analysis_done = true;
return success;
} // end analyse_matrix()
template <unsigned int block_size>
bool FpgaSolverBackend<block_size>::create_preconditioner()
{
double start = 0;
if (perf_call_enabled) {
start = second();
}
memset(rx, 0, sizeof(double) * N);
bool result = prec->create_preconditioner(mat.get());
if (!result) {
OpmLog::warning("fpgaSolverBackend: create_preconditioner failed");
}
if (perf_call_enabled) {
perf_call.back().s_preconditioner_create = second() - start;
}
return result;
} // end create_preconditioner()
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::solve_system(BdaResult &res)
{
std::ostringstream oss;
int err;
double start = 0, start_total = 0;
// ------------------------------------
// FPGA: return immediately if FPGA is disabled
// ------------------------------------
if (fpga_disabled) {
res.converged = false;
OpmLog::warning("FPGA is disabled, fallback to SW execution");
return;
}
fpga_calls++;
if (perf_call_enabled) {
start = second();
start_total = start;
}
// check if any buffer is larger than the size set in preconditioner->init
// TODO: add check for all other buffer sizes that may overflow?
err = 0;
if ( ((int *)processedPointers[5])[0] > nnzValArrays_size ||
((int *)processedPointers[11])[0] > L_nnzValArrays_size ||
((int *)processedPointers[17])[0] > U_nnzValArrays_size ) {
err = 1;
}
if (err != 0) {
OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)");
}
// ------------------------------------
// FPGA: copy input data to host data buffers
// ------------------------------------
if (perf_call_enabled) {
start = second();
}
err = fpga_copy_host_datamem(
processedPointers, processedSizes, setupArray,
nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray,
L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray,
U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray,
BLKDArray, X1Array, R1Array, X2Array, R2Array,
use_LU_res, LresArray, UresArray,
databufferSize, dataBuffer,
1 /* nnzValArrays_num */,
reset_data_buffers, fill_results_buffers,
dump_data_buffers, fpga_calls);
if (perf_call_enabled) {
perf_call.back().s_mem_setup = second() - start;
}
if (err != 0) {
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// ------------------------------------
// FPGA: copy buffers to device
// ------------------------------------
// copy debug buffer to device
if (perf_call_enabled) {
start = second();
}
err = fpga_copy_to_device_debugbuf(commands,
cldebug, debugBuffer, debugbufferSize,
debug_outbuf_words);
if (err != 0) {
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// copy data buffers to device
err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata);
if (err != 0) {
oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
if (perf_call_enabled) {
perf_call.back().s_mem_h2d = second() - start;
}
// ------------------------------------
// FPGA: execute the kernel
// ------------------------------------
double time_elapsed_ms;
if (perf_call_enabled) {
start = second();
}
err = fpga_kernel_run(commands, kernel, &time_elapsed_ms);
if (perf_call_enabled) {
perf_call.back().s_kernel_exec = second() - start;
}
if (err != 0) {
oss << "Failed to call fpga_kernel_run (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// ----------------------------------------
// FPGA: read back debug buffer from device
// ----------------------------------------
if (perf_call_enabled) {
start = second();
}
err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10),
commands,
debug_outbuf_words, debugbufferSize,
cldebug, debugBuffer,
abort_cycles,
&kernel_cycles, &kernel_iter_run,
norms, &last_norm_idx,
&kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults,
&kernel_wrafterend, &kernel_dbgfifofull);
if (err != 0) {
oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
if (kernel_wrafterend) {
OpmLog::warning("Detected recoverable FPGA error: kernel write after end");
}
if (kernel_dbgfifofull) {
OpmLog::warning("Detected recoverable FPGA error: debug FIFO full");
}
if (kernel_aborted || kernel_signature || kernel_overflow) {
#if defined(FPGA_EXIT_WITH_HW_FAILURE)
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")";
OPM_THROW(std::logic_error, oss.str());
#else
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n";
oss << "Disabling FPGA kernel: execution will continue with SW kernel";
OpmLog::warning(oss.str());
oss.str("");
oss.clear();
fpga_disabled = true;
#endif
}
if (perf_call_enabled) {
perf_call.back().n_kernel_exec_cycles = kernel_cycles;
}
// copy (back) results only if FPGA is not disabled
if (!fpga_disabled) {
if (kernel_noresults) {
OpmLog::warning("FPGA kernel did not return results because the required precision is already reached");
// rx still contains zeros from initial guess
} else {
// ------------------------------------
// FPGA: read back results from device
// ------------------------------------
err = fpga_map_results(even(kernel_iter_run),
use_residuals, use_LU_res, commands,
resultsNum, resultsBufferNum, resultsBufferSize,
debugbufferSize,
cldata, resultsBuffer,
result_offsets,
dump_results, data_dir, basename, sequence);
if (err != 0) {
oss << "Failed to call fpga_map_results (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
// TODO: copy results buffers to reordering output buffers
memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double));
err = fpga_unmap_results(even(kernel_iter_run),
use_residuals, use_LU_res,
commands, cldata, resultsBuffer);
if (err != 0) {
oss << "Failed to call fpga_unmap_results (" << err << ")";
OPM_THROW(std::logic_error, oss.str());
}
}
}
// set results and update statistics (if enabled)
if (perf_call_enabled) {
perf_call.back().s_mem_d2h = second() - start;
}
float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns
res.iterations = (int)iter;
res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm
res.conv_rate = pow(res.reduction, 1.0 / iter);
res.elapsed = second() - start_total;
if (perf_call_enabled) {
perf_call.back().s_solve = res.elapsed;
perf_call.back().n_kernel_exec_iters = iter;
}
// convergence depends on number of iterations reached and hw execution errors
res.converged = true;
if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) {
res.converged = false;
if (verbosity >= 1) {
oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \
", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \
", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit);
OpmLog::warning(oss.str());
oss.str("");
oss.clear();
}
}
if (perf_call_enabled) {
perf_call.back().converged = res.converged;
perf_call.back().converged_flags = ((unsigned int)fpga_disabled) +
((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) +
((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4);
}
} // end solve_system()
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::update_system(double *vals, double *b)
{
double start = 0;
mat->nnzValues = vals;
// reorder inputs using previously found ordering (stored in fromOrder)
if (perf_call_enabled) {
start = second();
}
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
if (perf_call_enabled) {
perf_call.back().s_reorder = second() - start;
}
} // end update_system()
template <unsigned int block_size>
void FpgaSolverBackend<block_size>::generate_statistics()
{
std::ostringstream oss;
unsigned int conv_iter = 0, conv_ovf = 0;
if (!perf_call_enabled || fpga_calls == 0) {
OpmLog::warning("FPGA statistics were not collected");
return;
}
std::printf("--- FPGA statistics ---\n");
std::printf("total solver calls..........: %u\n", fpga_calls);
std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization);
std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup);
#if defined(FPGA_STATISTICS_FILE_ENABLED)
// DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file
FILE *fout = fopen("fpga_statistics_details.csv", "w");
if (fout != nullptr) {
std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n");
}
#endif
unsigned int num_data_points = perf_call.size();
for (unsigned int i = 0; i < num_data_points; i++) {
perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create;
if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; }
if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; }
perf_total.s_analysis += perf_call[i].s_analysis;
if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; }
if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; }
perf_total.s_reorder += perf_call[i].s_reorder;
if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; }
if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; }
perf_total.s_mem_setup += perf_call[i].s_mem_setup;
if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; }
if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; }
perf_total.s_mem_h2d += perf_call[i].s_mem_h2d;
if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; }
if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; }
perf_total.s_kernel_exec += perf_call[i].s_kernel_exec;
if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; }
if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; }
perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles;
if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; }
if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; }
perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters;
if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; }
if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; }
perf_total.s_mem_d2h += perf_call[i].s_mem_d2h;
if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; }
if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; }
perf_total.s_solve += perf_call[i].s_solve;
if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; }
if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; }
perf_total.s_postprocess += perf_call[i].s_postprocess;
if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; }
if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; }
perf_total.n_converged += (unsigned int)perf_call[i].converged;
if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; }
if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; }
#if defined(FPGA_STATISTICS_FILE_ENABLED)
if (fout != nullptr) {
std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n",
i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder,
perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles,
perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess,
(unsigned int)perf_call[i].converged);
}
#endif
}
#if defined(FPGA_STATISTICS_FILE_ENABLED)
if (fout != nullptr) {
fclose(fout);
}
#endif
perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points;
perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points;
perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points;
perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points;
perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points;
perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points;
perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points;
perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points;
perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points;
perf_total.s_solve_avg = perf_total.s_solve / num_data_points;
perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points;
std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max);
std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max);
std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max);
std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max);
std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max);
std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max);
std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n",
perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max);
std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n",
perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max);
std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max);
std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max);
std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max);
std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n",
perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf);
std::printf("-----------------------\n");
} //end generate_statistics()
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template FpgaSolverBackend<n>::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);
INSTANTIATE_BDA_FUNCTIONS(3);
INSTANTIATE_BDA_FUNCTIONS(4);
#undef INSTANTIATE_BDA_FUNCTIONS
} //namespace bda

View File

@ -0,0 +1,265 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
namespace bda
{
/// This class implements an ilu0-bicgstab solver on FPGA
template <unsigned int block_size>
class FpgaSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef FPGABILU0<block_size> Preconditioner;
using Base::N;
using Base::Nb;
using Base::nnz;
using Base::nnzb;
using Base::verbosity;
using Base::maxit;
using Base::tolerance;
using Base::initialized;
private:
double *rx = nullptr; // reordered x
double *rb = nullptr; // reordered b
int *fromOrder = nullptr, *toOrder = nullptr;
bool analysis_done = false;
bool level_scheduling = false;
// LUMat will shallow copy rowPointers and colIndices of mat/rMat
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
BlockedMatrix<block_size> *rMat = nullptr;
std::unique_ptr<Preconditioner> prec = nullptr;
// vectors with data processed by the preconditioner (input to the kernel)
void **processedPointers = nullptr;
int *processedSizes = nullptr;
unsigned int fpga_calls = 0;
bool perf_call_enabled = true;
// per call performance metrics
typedef struct {
double s_preconditioner_create = 0.0;
double s_analysis = 0.0;
double s_reorder = 0.0;
double s_mem_setup = 0.0;
double s_mem_h2d = 0.0;
double s_kernel_exec = 0.0;
unsigned int n_kernel_exec_cycles = 0;
float n_kernel_exec_iters = 0.0;
double s_mem_d2h = 0.0;
double s_solve = 0.0;
double s_postprocess = 0.0;
bool converged = false;
unsigned int converged_flags = 0;
} perf_call_metrics_t;
// cumulative performance metrics
typedef struct {
double s_initialization;
double s_preconditioner_setup;
double s_preconditioner_create;
double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
double s_analysis;
double s_analysis_min,s_analysis_max,s_analysis_avg;
double s_reorder;
double s_reorder_min,s_reorder_max,s_reorder_avg;
double s_mem_setup;
double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
double s_mem_h2d;
double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
double s_kernel_exec;
double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
unsigned long n_kernel_exec_cycles;
unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
float n_kernel_exec_iters;
float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
double s_mem_d2h;
double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
double s_solve;
double s_solve_min,s_solve_max,s_solve_avg;
double s_postprocess;
double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
unsigned int n_converged;
} perf_total_metrics_t;
std::vector<perf_call_metrics_t> perf_call;
perf_total_metrics_t perf_total;
// fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
// fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
unsigned int fpga_config_bits = 0;
bool fpga_disabled = false;
bool platform_awsf1;
unsigned int debugbufferSize;
unsigned long int *debugBuffer = nullptr;
unsigned int *databufferSize = nullptr;
unsigned char *dataBuffer[RW_BUF] = {nullptr};
unsigned int debug_outbuf_words;
int resultsNum;
int resultsBufferNum;
unsigned int resultsBufferSize[RES_BUF_MAX];
unsigned int result_offsets[6];
unsigned int kernel_cycles, kernel_iter_run;
double norms[4];
unsigned char last_norm_idx;
bool kernel_aborted, kernel_signature, kernel_overflow;
bool kernel_noresults;
bool kernel_wrafterend, kernel_dbgfifofull;
bool use_residuals = false;
bool use_LU_res = false;
int sequence = 0;
// TODO: these values may be sent via command line parameters
unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
int nnzValArrays_size = 0;
int L_nnzValArrays_size = 0;
int U_nnzValArrays_size = 0;
// aliases to areas of the host data buffers
long unsigned int *setupArray = nullptr;
double **nnzValArrays = nullptr;
short unsigned int *columnIndexArray = nullptr;
unsigned char *newRowOffsetArray = nullptr;
unsigned int *PIndexArray = nullptr;
unsigned int *colorSizesArray = nullptr;
double **L_nnzValArrays = nullptr;
short unsigned int *L_columnIndexArray = nullptr;
unsigned char *L_newRowOffsetArray = nullptr;
unsigned int *L_PIndexArray = nullptr;
unsigned int *L_colorSizesArray = nullptr;
double **U_nnzValArrays = nullptr;
short unsigned int *U_columnIndexArray = nullptr;
unsigned char *U_newRowOffsetArray = nullptr;
unsigned int *U_PIndexArray = nullptr;
unsigned int *U_colorSizesArray = nullptr;
double *BLKDArray = nullptr;
double *X1Array = nullptr, *X2Array = nullptr;
double *R1Array = nullptr, *R2Array = nullptr;
double *LresArray = nullptr, *UresArray = nullptr;
double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
// OpenCL variables
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_mem cldata[RW_BUF] = {nullptr};
cl_mem cldebug = nullptr;
// HW limits/configuration variables
unsigned int hw_x_vector_elem;
unsigned int hw_max_row_size;
unsigned int hw_max_column_size;
unsigned int hw_max_colors_size;
unsigned short hw_max_nnzs_per_row;
unsigned int hw_max_matrix_size;
bool hw_use_uram;
bool hw_write_ilu0_results;
unsigned short hw_dma_data_width;
unsigned char hw_x_vector_latency;
unsigned char hw_add_latency;
unsigned char hw_mult_latency;
unsigned char hw_mult_num;
unsigned char hw_num_read_ports;
unsigned char hw_num_write_ports;
unsigned short hw_reset_cycles;
unsigned short hw_reset_settle;
// debug
bool reset_data_buffers = false;
bool fill_results_buffers = false;
int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
bool dump_results = false;
char *data_dir = nullptr;
char *basename = nullptr;
unsigned short rst_assert_cycles = 0;
unsigned short rst_settle_cycles = 0;
/// Allocate host memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vector
void update_system(double *vals, double *b);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
bool analyse_matrix();
/// Perform ilu0-decomposition
/// \return true iff decomposition was successful
bool create_preconditioner();
/// Solve linear system
/// \param[inout] res summary of solver result
void solve_system(BdaResult &res);
/// Generate FPGA backend statistics
void generate_statistics(void);
public:
/// Construct an fpgaSolver
/// \param[in] fpga_bitstream FPGA bitstream file name
/// \param[in] linear_solver_verbosity verbosity of fpgaSolver
/// \param[in] maxit maximum number of iterations for fpgaSolver
/// \param[in] tolerance required relative tolerance for fpgaSolver
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
/// Destroy an fpgaSolver, and free memory
~FpgaSolverBackend();
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
/// \param[in] N number of rows, divide by dim to get number of blockrows
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
/// \param[in] dim size of block
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
/// \param[in] b input vector, contains N values
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
/// \param[inout] res summary of solver result
/// \return status code
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
/// Get result after linear solve, and peform postprocessing if necessary
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
void get_result(double *x) override;
}; // end class fpgaSolverBackend
} //namespace bda
#endif

View File

@ -0,0 +1,63 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/time.h>
#include <fstream>
namespace bda
{
double second(void)
{
struct timeval tv;
gettimeofday(&tv, nullptr);
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
}
bool even(int n)
{
if (n % 2 == 0) {
return true;
} else {
return false;
}
}
int roundUpTo(int i, int n)
{
if (i % n == 0) {
return i;
} else {
return (i / n + 1) * n;
}
}
bool fileExists(const char *filename)
{
FILE *fin;
fin = fopen(filename, "r");
if (fin == nullptr) {
return false;
} else {
fclose(fin);
return true;
}
}
} //namespace bda

View File

@ -0,0 +1,39 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef FPGA_UTILS_HEADER_INCLUDED
#define FPGA_UTILS_HEADER_INCLUDED
namespace bda
{
union double2int
{
unsigned long int int_val;
double double_val;
};
double second(void);
bool even(int n);
int roundUpTo(int i, int n);
bool fileExists(const char *filename);
} // end namespace bda
#endif // FPGA_UTILS_HEADER_INCLUDED

View File

@ -17,12 +17,7 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>. along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/ */
#include <vector>
#include <cstring>
#include <algorithm> // for fill()
#include <random> #include <random>
#include <limits>
#include <sstream>
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>

View File

@ -20,6 +20,8 @@
#ifndef REORDER_HPP #ifndef REORDER_HPP
#define REORDER_HPP #define REORDER_HPP
#include <vector>
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp> #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
namespace bda namespace bda

View File

@ -28,15 +28,18 @@
namespace Opm namespace Opm
{ {
WellContributions::WellContributions(std::string gpu_mode){ WellContributions::WellContributions(std::string accelerator_mode){
if(gpu_mode.compare("cusparse") == 0){ if(accelerator_mode.compare("cusparse") == 0){
cuda_gpu = true; cuda_gpu = true;
} }
else if(gpu_mode.compare("opencl") == 0){ else if(accelerator_mode.compare("opencl") == 0){
opencl_gpu = true; opencl_gpu = true;
} }
else if(accelerator_mode.compare("fpga") == 0){
// unused for FPGA, but must be defined to avoid error
}
else{ else{
OPM_THROW(std::logic_error, "Error: invalid GPU mode"); OPM_THROW(std::logic_error, "Invalid accelerator mode");
} }
} }

View File

@ -176,7 +176,7 @@ public:
void alloc(); void alloc();
/// Create a new WellContributions /// Create a new WellContributions
WellContributions(std::string gpu_mode); WellContributions(std::string accelerator_mode);
/// Destroy a WellContributions, and free memory /// Destroy a WellContributions, and free memory
~WellContributions(); ~WellContributions();