mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #2998 from g-marchiori/fpgasolver-integration
Added fpgaSolver, requires Xilinx Alveo U280 FPGA board
This commit is contained in:
commit
13f62a718b
@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for
|
||||
option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
|
||||
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
|
||||
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
|
||||
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
|
||||
|
||||
if(SIBLING_SEARCH AND NOT opm-common_DIR)
|
||||
# guess the sibling dir
|
||||
@ -132,9 +133,72 @@ if(CUDA_FOUND)
|
||||
include_directories(${CUDA_INCLUDE_DIRS})
|
||||
endif()
|
||||
|
||||
|
||||
find_package(OpenCL)
|
||||
|
||||
# include FPGA target only when ENABLE_FPGA is set to ON
|
||||
if(ENABLE_FPGA)
|
||||
# must include opencl.h which is a wrapper for all the OpenCL extensions
|
||||
find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS})
|
||||
|
||||
# FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in <opm-simulators>/../FPGA
|
||||
find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA)
|
||||
|
||||
# this code only runs if the user explicitly enabled the FPGA
|
||||
# hence fatal_error instead of warning
|
||||
if(NOT OPENCL_H OR NOT OpenCL_FOUND)
|
||||
set(HAVE_FPGA 0)
|
||||
message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.")
|
||||
elseif(NOT FPGA_MODULE)
|
||||
set(HAVE_FPGA 0)
|
||||
message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.")
|
||||
elseif(NOT DEFINED ENV{XILINX_XRT})
|
||||
# Xilinx XRT must be installed and properly setup
|
||||
set(HAVE_FPGA 0)
|
||||
message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.")
|
||||
else()
|
||||
set(HAVE_FPGA 1)
|
||||
message(STATUS "FPGA library and kernel integration active.")
|
||||
|
||||
# FIXME: set the correct path to the FPGA module
|
||||
set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA)
|
||||
|
||||
# configuration variables with default values: they can be overridden on the cmake command line
|
||||
# FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream
|
||||
if(NOT FPGA_PORTS_CONFIG)
|
||||
set(FPGA_PORTS_CONFIG 2r_3r3w_ddr)
|
||||
endif()
|
||||
# FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build)
|
||||
if(NOT FPGA_DEBUG_LEVEL)
|
||||
set(FPGA_DEBUG_LEVEL 0)
|
||||
endif()
|
||||
message(STATUS "Using the following settings for the FPGA library compilation: "
|
||||
"FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, "
|
||||
"FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}")
|
||||
add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG})
|
||||
add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL})
|
||||
# include directories for the FPGA library
|
||||
include_directories(${FPGA_SOURCE_DIR})
|
||||
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app)
|
||||
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common)
|
||||
# add external project to compile the FPGA library
|
||||
include (ExternalProject)
|
||||
ExternalProject_Add(FPGA_library
|
||||
# force the build step to always be run because source dependencies cannot be made explicit
|
||||
BUILD_ALWAYS 1
|
||||
DOWNLOAD_COMMAND ""
|
||||
UPDATE_COMMAND ""
|
||||
CONFIGURE_COMMAND ""
|
||||
BUILD_COMMAND make
|
||||
-f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile
|
||||
SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app
|
||||
PORTS_CONFIG=${FPGA_PORTS_CONFIG}
|
||||
DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}
|
||||
INSTALL_COMMAND ""
|
||||
TEST_COMMAND ""
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(OpenCL_FOUND)
|
||||
# the current OpenCL implementation relies on cl.hpp, not cl2.hpp
|
||||
# make sure it is available, otherwise disable OpenCL
|
||||
@ -454,7 +518,8 @@ endif()
|
||||
|
||||
add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
|
||||
|
||||
# must link libraries after target 'flow' has been defined
|
||||
# must link libraries after target 'opmsimulators' has been defined
|
||||
|
||||
if(CUDA_FOUND)
|
||||
target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
|
||||
target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
|
||||
@ -463,3 +528,9 @@ endif()
|
||||
if(OpenCL_FOUND)
|
||||
target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
|
||||
endif()
|
||||
|
||||
if(HAVE_FPGA)
|
||||
add_dependencies(opmsimulators FPGA_library)
|
||||
ExternalProject_Get_Property(FPGA_library binary_dir)
|
||||
target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a)
|
||||
endif()
|
||||
|
@ -66,6 +66,16 @@ if(OPENCL_FOUND)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||
endif()
|
||||
if(HAVE_FPGA)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||
endif()
|
||||
|
||||
if(MPI_FOUND)
|
||||
list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
|
||||
opm/simulators/utils/ParallelSerialization.cpp)
|
||||
@ -184,6 +194,10 @@ list (APPEND PUBLIC_HEADER_FILES
|
||||
opm/simulators/linalg/bda/cuda_header.hpp
|
||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
||||
opm/simulators/linalg/bda/FPGAMatrix.hpp
|
||||
opm/simulators/linalg/bda/FPGABILU0.hpp
|
||||
opm/simulators/linalg/bda/FPGASolverBackend.hpp
|
||||
opm/simulators/linalg/bda/FPGAUtils.hpp
|
||||
opm/simulators/linalg/bda/Reorder.hpp
|
||||
opm/simulators/linalg/bda/ILUReorder.hpp
|
||||
opm/simulators/linalg/bda/opencl.hpp
|
||||
|
@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR
|
||||
HAVE_PETSC
|
||||
HAVE_CUDA
|
||||
HAVE_OPENCL
|
||||
HAVE_FPGA
|
||||
HAVE_SUITESPARSE_UMFPACK_H
|
||||
HAVE_DUNE_ISTL
|
||||
DUNE_ISTL_VERSION_MAJOR
|
||||
|
@ -118,7 +118,7 @@ struct Linsolver {
|
||||
using type = UndefinedProperty;
|
||||
};
|
||||
template<class TypeTag, class MyTypeTag>
|
||||
struct GpuMode {
|
||||
struct AcceleratorMode {
|
||||
using type = UndefinedProperty;
|
||||
};
|
||||
template<class TypeTag, class MyTypeTag>
|
||||
@ -133,6 +133,10 @@ template<class TypeTag, class MyTypeTag>
|
||||
struct OpenclIluReorder {
|
||||
using type = UndefinedProperty;
|
||||
};
|
||||
template<class TypeTag, class MyTypeTag>
|
||||
struct FpgaBitstream {
|
||||
using type = UndefinedProperty;
|
||||
};
|
||||
|
||||
template<class TypeTag>
|
||||
struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
@ -213,7 +217,7 @@ struct Linsolver<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
static constexpr auto value = "ilu0";
|
||||
};
|
||||
template<class TypeTag>
|
||||
struct GpuMode<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
struct AcceleratorMode<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
static constexpr auto value = "none";
|
||||
};
|
||||
template<class TypeTag>
|
||||
@ -226,7 +230,11 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
};
|
||||
template<class TypeTag>
|
||||
struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
static constexpr auto value = "graph_coloring";
|
||||
static constexpr auto value = ""; // note: default value is chosen depending on the solver used
|
||||
};
|
||||
template<class TypeTag>
|
||||
struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
|
||||
static constexpr auto value = "";
|
||||
};
|
||||
|
||||
} // namespace Opm::Properties
|
||||
@ -252,12 +260,13 @@ namespace Opm
|
||||
bool ignoreConvergenceFailure_;
|
||||
bool scale_linear_system_;
|
||||
std::string linsolver_;
|
||||
std::string gpu_mode_;
|
||||
std::string accelerator_mode_;
|
||||
int bda_device_id_;
|
||||
int opencl_platform_id_;
|
||||
int cpr_max_ell_iter_ = 20;
|
||||
int cpr_reuse_setup_ = 0;
|
||||
std::string opencl_ilu_reorder_;
|
||||
std::string fpga_bitstream_;
|
||||
|
||||
template <class TypeTag>
|
||||
void init()
|
||||
@ -279,10 +288,11 @@ namespace Opm
|
||||
cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
|
||||
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
|
||||
linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
|
||||
gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
||||
accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||
bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
||||
opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
||||
opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||
fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||
}
|
||||
|
||||
template <class TypeTag>
|
||||
@ -304,10 +314,11 @@ namespace Opm
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
|
||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
|
||||
}
|
||||
|
||||
FlowLinearSolverParameters() { reset(); }
|
||||
@ -327,10 +338,11 @@ namespace Opm
|
||||
ilu_milu_ = MILU_VARIANT::ILU;
|
||||
ilu_redblack_ = false;
|
||||
ilu_reorder_sphere_ = true;
|
||||
gpu_mode_ = "none";
|
||||
accelerator_mode_ = "none";
|
||||
bda_device_id_ = 0;
|
||||
opencl_platform_id_ = 0;
|
||||
opencl_ilu_reorder_ = "graph_coloring";
|
||||
opencl_ilu_reorder_ = ""; // note: the default value is chosen depending on the solver used
|
||||
fpga_bitstream_ = "";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -35,7 +35,7 @@
|
||||
#include <opm/simulators/linalg/setupPropertyTree.hpp>
|
||||
|
||||
|
||||
#if HAVE_CUDA || HAVE_OPENCL
|
||||
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||
#endif
|
||||
|
||||
@ -92,7 +92,7 @@ namespace Opm
|
||||
using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
|
||||
using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
|
||||
|
||||
#if HAVE_CUDA || HAVE_OPENCL
|
||||
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||
static const unsigned int block_size = Matrix::block_type::rows;
|
||||
std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
|
||||
#endif
|
||||
@ -126,14 +126,14 @@ namespace Opm
|
||||
#endif
|
||||
parameters_.template init<TypeTag>();
|
||||
prm_ = setupPropertyTree<TypeTag>(parameters_);
|
||||
#if HAVE_CUDA || HAVE_OPENCL
|
||||
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||
{
|
||||
std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
||||
if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) {
|
||||
std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||
if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
|
||||
if (on_io_rank) {
|
||||
OpmLog::warning("Cannot use GPU with MPI, GPU is disabled");
|
||||
OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled");
|
||||
}
|
||||
gpu_mode = "none";
|
||||
accelerator_mode = "none";
|
||||
}
|
||||
const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
||||
const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
||||
@ -141,11 +141,12 @@ namespace Opm
|
||||
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
|
||||
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
||||
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
||||
}
|
||||
#else
|
||||
if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") {
|
||||
OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
|
||||
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
|
||||
OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
|
||||
}
|
||||
#endif
|
||||
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
|
||||
@ -157,6 +158,12 @@ namespace Opm
|
||||
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
|
||||
|
||||
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
||||
#if HAVE_FPGA
|
||||
// check usage of MatrixAddWellContributions: for FPGA they must be included
|
||||
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) {
|
||||
OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true");
|
||||
}
|
||||
#endif
|
||||
const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
|
||||
if (!ownersFirst) {
|
||||
const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
|
||||
@ -242,43 +249,42 @@ namespace Opm
|
||||
|
||||
// Solve system.
|
||||
Dune::InverseOperatorResult result;
|
||||
bool gpu_was_used = false;
|
||||
bool accelerator_was_used = false;
|
||||
|
||||
// Use GPU if: available, chosen by user, and successful.
|
||||
#if HAVE_CUDA || HAVE_OPENCL
|
||||
// Use FPGA if: support compiled, chosen by user, and successful.
|
||||
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||
bool use_gpu = bdaBridge->getUseGpu();
|
||||
if (use_gpu) {
|
||||
const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
||||
WellContributions wellContribs(gpu_mode);
|
||||
|
||||
bool use_fpga = bdaBridge->getUseFpga();
|
||||
if (use_gpu || use_fpga) {
|
||||
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||
WellContributions wellContribs(accelerator_mode);
|
||||
bdaBridge->initWellContributions(wellContribs);
|
||||
|
||||
if (!useWellConn_) {
|
||||
simulator_.problem().wellModel().getWellContributions(wellContribs);
|
||||
}
|
||||
|
||||
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
|
||||
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
|
||||
if (result.converged) {
|
||||
// get result vector x from non-Dune backend, iff solve was successful
|
||||
bdaBridge->get_result(x);
|
||||
gpu_was_used = true;
|
||||
accelerator_was_used = true;
|
||||
} else {
|
||||
// CPU fallback
|
||||
use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
|
||||
if (use_gpu && simulator_.gridView().comm().rank() == 0) {
|
||||
if (gpu_mode.compare("cusparse") == 0) {
|
||||
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
|
||||
}
|
||||
if (gpu_mode.compare("opencl") == 0) {
|
||||
OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
|
||||
}
|
||||
// warn about CPU fallback
|
||||
// BdaBridge might have disabled its BdaSolver for this simulation due to some error
|
||||
// in that case the BdaBridge is disabled and flexibleSolver is always used
|
||||
// or maybe the BdaSolver did not converge in time, then it will be used next linear solve
|
||||
if (simulator_.gridView().comm().rank() == 0) {
|
||||
OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Otherwise, use flexible istl solver.
|
||||
if (!gpu_was_used) {
|
||||
if (!accelerator_was_used) {
|
||||
assert(flexibleSolver_);
|
||||
flexibleSolver_->apply(x, *rhs_, result);
|
||||
}
|
||||
|
@ -33,24 +33,19 @@
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
|
||||
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
|
||||
{}
|
||||
{}
|
||||
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::~BILU0()
|
||||
{
|
||||
template <unsigned int block_size>
|
||||
BILU0<block_size>::~BILU0()
|
||||
{
|
||||
delete[] invDiagVals;
|
||||
delete[] diagIndex;
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
delete[] toOrder;
|
||||
delete[] fromOrder;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
@ -68,8 +63,8 @@ namespace bda
|
||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
||||
} else {
|
||||
toOrder = new int[Nb];
|
||||
fromOrder = new int[Nb];
|
||||
toOrder.resize(Nb);
|
||||
fromOrder.resize(Nb);
|
||||
CSCRowIndices = new int[nnzbs];
|
||||
CSCColPointers = new int[Nb + 1];
|
||||
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||
@ -88,10 +83,10 @@ namespace bda
|
||||
std::ostringstream out;
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||
out << "BILU0 reordering strategy: none\n";
|
||||
// numColors = 1;
|
||||
@ -111,12 +106,13 @@ namespace bda
|
||||
}
|
||||
OpmLog::info(out.str());
|
||||
|
||||
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
delete[] CSCRowIndices;
|
||||
delete[] CSCColPointers;
|
||||
}
|
||||
|
||||
diagIndex = new int[mat->Nb];
|
||||
diagIndex.resize(mat->Nb);
|
||||
invDiagVals = new double[mat->Nb * bs * bs];
|
||||
|
||||
#if CHOW_PATEL
|
||||
@ -159,7 +155,7 @@ namespace bda
|
||||
}
|
||||
|
||||
return true;
|
||||
} // end init()
|
||||
} // end init()
|
||||
|
||||
|
||||
// implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
|
||||
@ -482,7 +478,7 @@ namespace bda
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(8);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
|
||||
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
|
||||
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
|
||||
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
|
||||
@ -516,7 +512,7 @@ namespace bda
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
m = rmat.get();
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
|
||||
reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
|
||||
|
||||
if (verbosity >= 3){
|
||||
std::ostringstream out;
|
||||
@ -556,7 +552,7 @@ namespace bda
|
||||
diagIndex[row] = candidate - LUmat->colIndices;
|
||||
}
|
||||
events.resize(4);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||
});
|
||||
@ -637,30 +633,30 @@ namespace bda
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLContext(cl::Context *context_){
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLContext(cl::Context *context_) {
|
||||
this->context = context_;
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_){
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
|
||||
this->queue = queue_;
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_){
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
|
||||
this->work_group_size = work_group_size_;
|
||||
this->total_work_items = total_work_items_;
|
||||
this->lmem_per_work_group = lmem_per_work_group_;
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernels(
|
||||
}
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
|
||||
){
|
||||
){
|
||||
this->ILU_apply1 = ILU_apply1_;
|
||||
this->ILU_apply2 = ILU_apply2_;
|
||||
this->ilu_decomp_k = ilu_decomp_k_;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
|
@ -61,10 +61,10 @@ namespace bda
|
||||
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
|
||||
#endif
|
||||
double *invDiagVals = nullptr;
|
||||
int *diagIndex = nullptr;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
||||
int *toOrder = nullptr, *fromOrder = nullptr;
|
||||
std::vector<int> toOrder, fromOrder;
|
||||
int numColors;
|
||||
int verbosity;
|
||||
std::once_flag pattern_uploaded;
|
||||
@ -128,12 +128,12 @@ namespace bda
|
||||
|
||||
int* getToOrder()
|
||||
{
|
||||
return toOrder;
|
||||
return toOrder.data();
|
||||
}
|
||||
|
||||
int* getFromOrder()
|
||||
{
|
||||
return fromOrder;
|
||||
return fromOrder.data();
|
||||
}
|
||||
|
||||
BlockedMatrix<block_size>* getRMat()
|
||||
|
@ -18,8 +18,6 @@
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
@ -54,21 +52,23 @@ namespace Opm
|
||||
using bda::ILUReorder;
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
|
||||
: gpu_mode(gpu_mode_)
|
||||
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
|
||||
: accelerator_mode(accelerator_mode_)
|
||||
{
|
||||
if (gpu_mode.compare("cusparse") == 0) {
|
||||
if (accelerator_mode.compare("cusparse") == 0) {
|
||||
#if HAVE_CUDA
|
||||
use_gpu = true;
|
||||
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
||||
#endif
|
||||
} else if (gpu_mode.compare("opencl") == 0) {
|
||||
} else if (accelerator_mode.compare("opencl") == 0) {
|
||||
#if HAVE_OPENCL
|
||||
use_gpu = true;
|
||||
ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||
if (opencl_ilu_reorder == "level_scheduling") {
|
||||
ILUReorder ilu_reorder;
|
||||
if (opencl_ilu_reorder == "") {
|
||||
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user
|
||||
} else if (opencl_ilu_reorder == "level_scheduling") {
|
||||
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
||||
} else if (opencl_ilu_reorder == "graph_coloring") {
|
||||
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||
@ -81,10 +81,28 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||
#endif
|
||||
} else if (gpu_mode.compare("none") == 0) {
|
||||
use_gpu = false;
|
||||
} else if (accelerator_mode.compare("fpga") == 0) {
|
||||
#if HAVE_FPGA
|
||||
use_fpga = true;
|
||||
ILUReorder ilu_reorder;
|
||||
if (opencl_ilu_reorder == "") {
|
||||
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
|
||||
} else if (opencl_ilu_reorder == "level_scheduling") {
|
||||
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
||||
} else if (opencl_ilu_reorder == "graph_coloring") {
|
||||
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
|
||||
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
||||
}
|
||||
backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
|
||||
#else
|
||||
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
|
||||
#endif
|
||||
} else if (accelerator_mode.compare("none") == 0) {
|
||||
use_gpu = false;
|
||||
use_fpga = false;
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
|
||||
}
|
||||
}
|
||||
|
||||
@ -130,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
|
||||
|
||||
|
||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||
// sparsity pattern should stay the same due to matrix-add-well-contributions
|
||||
// sparsity pattern should stay the same
|
||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||
template <class BridgeMatrix>
|
||||
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||
@ -161,14 +179,16 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
|
||||
{
|
||||
|
||||
if (use_gpu) {
|
||||
if (use_gpu || use_fpga) {
|
||||
BdaResult result;
|
||||
result.converged = false;
|
||||
static std::vector<int> h_rows;
|
||||
static std::vector<int> h_cols;
|
||||
const int dim = (*mat)[0][0].N();
|
||||
const int N = mat->N()*dim;
|
||||
const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
|
||||
const int Nb = mat->N();
|
||||
const int N = Nb * dim;
|
||||
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
|
||||
const int nnz = nnzb * dim * dim;
|
||||
|
||||
if (dim != 3) {
|
||||
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
|
||||
@ -177,8 +197,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
}
|
||||
|
||||
if (h_rows.capacity() == 0) {
|
||||
h_rows.reserve(N+1);
|
||||
h_cols.reserve(nnz);
|
||||
h_rows.reserve(Nb+1);
|
||||
h_cols.reserve(nnzb);
|
||||
#if PRINT_TIMERS_BRIDGE
|
||||
Dune::Timer t;
|
||||
#endif
|
||||
@ -225,7 +245,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
res.converged = result.converged;
|
||||
res.conv_rate = result.conv_rate;
|
||||
res.elapsed = result.elapsed;
|
||||
}else{
|
||||
} else {
|
||||
res.converged = false;
|
||||
}
|
||||
}
|
||||
@ -233,14 +253,14 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
|
||||
if (use_gpu) {
|
||||
if (use_gpu || use_fpga) {
|
||||
backend->get_result(static_cast<double*>(&(x[0][0])));
|
||||
}
|
||||
}
|
||||
|
||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
|
||||
if(gpu_mode.compare("opencl") == 0){
|
||||
if(accelerator_mode.compare("opencl") == 0){
|
||||
#if HAVE_OPENCL
|
||||
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
|
||||
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
|
||||
@ -250,12 +270,12 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(We
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||
n>::BdaBridge \
|
||||
(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \
|
||||
(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, \
|
||||
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \
|
||||
\
|
||||
template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||
|
@ -30,6 +30,10 @@
|
||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||
|
||||
|
||||
#if HAVE_FPGA
|
||||
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
@ -42,19 +46,22 @@ class BdaBridge
|
||||
{
|
||||
private:
|
||||
bool use_gpu = false;
|
||||
std::string gpu_mode;
|
||||
bool use_fpga = false;
|
||||
std::string accelerator_mode;
|
||||
std::unique_ptr<bda::BdaSolver<block_size> > backend;
|
||||
|
||||
public:
|
||||
/// Construct a BdaBridge
|
||||
/// \param[in] gpu_mode to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
|
||||
/// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]'
|
||||
/// \param[in] fpga_bitstream FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[<filename>]'
|
||||
/// \param[in] linear_solver_verbosity verbosity of BdaSolver
|
||||
/// \param[in] maxit maximum number of iterations for BdaSolver
|
||||
/// \param[in] tolerance required relative tolerance for BdaSolver
|
||||
/// \param[in] platformID the OpenCL platform ID to be used
|
||||
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
|
||||
BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
||||
|
||||
|
||||
/// Solve linear system, A*x = b
|
||||
/// \warning Values of A might get overwritten!
|
||||
@ -79,6 +86,16 @@ public:
|
||||
/// \param[in] wellContribs container to hold all WellContributions
|
||||
void initWellContributions(WellContributions& wellContribs);
|
||||
|
||||
/// Return whether the BdaBridge will use the FPGA or not
|
||||
/// return whether the BdaBridge will use the FPGA or not
|
||||
bool getUseFpga(){
|
||||
return use_fpga;
|
||||
}
|
||||
|
||||
/// Return the selected accelerator mode, this is input via the command-line
|
||||
std::string getAccleratorName(){
|
||||
return accelerator_mode;
|
||||
}
|
||||
|
||||
}; // end class BdaBridge
|
||||
|
||||
|
@ -55,6 +55,8 @@ namespace bda
|
||||
int maxit = 200;
|
||||
double tolerance = 1e-2;
|
||||
|
||||
std::string bitstream = "";
|
||||
|
||||
int N; // number of rows
|
||||
int Nb; // number of blocked rows (Nb*block_size == N)
|
||||
int nnz; // number of nonzeroes (scalars)
|
||||
@ -66,7 +68,8 @@ namespace bda
|
||||
bool initialized = false;
|
||||
|
||||
public:
|
||||
/// Construct a BdaSolver, can be cusparseSolver or openclSolver
|
||||
/// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver
|
||||
/// \param[in] fpga_bitstream FPGA bitstream file name (only for fpgaSolver)
|
||||
/// \param[in] linear_solver_verbosity verbosity of solver
|
||||
/// \param[in] maxit maximum number of iterations for solver
|
||||
/// \param[in] tolerance required relative tolerance for solver
|
||||
@ -74,6 +77,7 @@ namespace bda
|
||||
/// \param[in] deviceID the device to be used
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
||||
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
|
||||
|
||||
/// Define virtual destructor, so that the derivedclass destructor will be called
|
||||
virtual ~BdaSolver() {};
|
||||
|
@ -20,13 +20,19 @@
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <config.h>
|
||||
|
||||
using bda::BlockedMatrix;
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
|
||||
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
||||
|
||||
template <unsigned int block_size>
|
||||
@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
|
||||
}
|
||||
}
|
||||
|
||||
// subtract c from b and store in a
|
||||
// a = b - c
|
||||
#if HAVE_FPGA
|
||||
|
||||
/*Subtract two blocks from one another element by element*/
|
||||
template <unsigned int block_size>
|
||||
void blockSub(double *a, double *b, double *c)
|
||||
{
|
||||
void blockSub(double *mat1, double *mat2, double *resMat) {
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
for (unsigned int col = 0; col < block_size; col++) {
|
||||
a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
|
||||
resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
|
||||
template <unsigned int block_size>
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
|
||||
for (unsigned int row = 0; row < block_size; row++) {
|
||||
if (resetRes) {
|
||||
resVect[row] = 0.0;
|
||||
}
|
||||
for (unsigned int col = 0; col < block_size; col++) {
|
||||
resVect[row] += scale * mat[row * block_size + col] * vect[col];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::countUnblockedNnzs() {
|
||||
int numNnzsOverThreshold = 0;
|
||||
int totalNnzs = rowPointers[Nb];
|
||||
for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
|
||||
if (fabs(nnzValues[idx]) > nnzThreshold) {
|
||||
numNnzsOverThreshold++;
|
||||
}
|
||||
}
|
||||
return numNnzsOverThreshold;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
|
||||
* If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
|
||||
*/
|
||||
template <unsigned int block_size>
|
||||
void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
|
||||
const unsigned int bs = block_size;
|
||||
int valIndex = 0, nnzsPerRow;
|
||||
|
||||
mat->rowPointers[0] = 0;
|
||||
// go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and
|
||||
// write all non-zero values and corresponding column indices that belong to the same row into the new matrix.
|
||||
for (int row = 0; row < Nb; row++) {
|
||||
for (unsigned int bRow = 0; bRow < bs; bRow++) {
|
||||
nnzsPerRow = 0;
|
||||
for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) {
|
||||
for (unsigned int bCol = 0; bCol < bs; bCol++) {
|
||||
int idx = 0;
|
||||
// If the matrix is the U matrix, store the rows inside a block in reverse order.
|
||||
if (isUMatrix) {
|
||||
idx = col * bs * bs + (bs - bRow - 1) * bs + bCol;
|
||||
} else {
|
||||
idx = col * bs * bs + bRow * bs + bCol;
|
||||
}
|
||||
|
||||
if (fabs(nnzValues[idx]) > nnzThreshold) {
|
||||
mat->nnzValues[valIndex] = nnzValues[idx];
|
||||
mat->colIndices[valIndex] = colIndices[col] * bs + bCol;
|
||||
valIndex++;
|
||||
nnzsPerRow++;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Update the rowpointers of the new matrix
|
||||
mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*Optimized version*/
|
||||
// ub* prefixes indicate unblocked data
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
|
||||
{
|
||||
int res;
|
||||
int numUnblockedNnzs = countUnblockedNnzs();
|
||||
|
||||
// initialize the non-blocked matrix with the obtained size.
|
||||
std::unique_ptr<Matrix> ubMat = std::make_unique<Matrix>(Nb * block_size, numUnblockedNnzs);
|
||||
|
||||
unblock(ubMat.get(), isUMatrix);
|
||||
|
||||
std::vector<int> ubNodesPerColor(numColors);
|
||||
for (int i = 0; i < numColors; i++) {
|
||||
ubNodesPerColor[i] = nodesPerColor[i] * block_size;
|
||||
}
|
||||
|
||||
*valSize = ubMat->nnzs;
|
||||
|
||||
res = ubMat->toRDF(numColors, ubNodesPerColor,
|
||||
colIndicesInColor, nnzsPerRowLimit,
|
||||
ubNnzValues, ubColIndices, nnzValsSizes,
|
||||
NROffsets, colorSizes);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
// coloring is already done, numColors and nodesPerColor are set
|
||||
// [rows|columns]PerColorLimit are already queried from the FPGA
|
||||
// colIndicesInColor, PIndicesAddr and colorSizes are written here
|
||||
// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU
|
||||
// node == row
|
||||
// color == partition
|
||||
// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs
|
||||
// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF().
|
||||
// due to partitioning, lots of columns are removed, this matrix keeps track of the mapping
|
||||
// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
|
||||
// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
|
||||
// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
|
||||
template <unsigned int block_size>
|
||||
int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
|
||||
int rowsPerColorLimit, int columnsPerColorLimit,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
|
||||
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
|
||||
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes)
|
||||
{
|
||||
// Data related to column indices per partition
|
||||
int doneRows = 0;
|
||||
std::vector<bool> isColAccessed(Nb); // std::vector<bool> might have some different optimized implementation, initialize in a loop
|
||||
std::vector<bool> isLColAccessed(Nb);
|
||||
int totalCols = 0; // sum of numColAccesses for each color, blocked
|
||||
int LTotalCols = 0, UTotalCols = 0;
|
||||
int maxCols = 0; // max value of numColAccesses for any color
|
||||
int maxRowsPerColor = 0; // max value of numRows for any color
|
||||
int maxColsPerRow = 0; // max value of colsPerRow for any color
|
||||
// colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates
|
||||
// colsInColor[c][i] contains the ith column that color c accesses
|
||||
// initial size allows for each color to access all columns, with space for padding
|
||||
std::vector<std::vector<int> > colsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||
std::vector<std::vector<int> > LColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||
std::vector<std::vector<int> > UColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||
|
||||
// find which columns are accessed in each color, as well as how many non-zeroes there are per color.
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
int numRows = 0;
|
||||
// initialize
|
||||
for (int row = 0; row < Nb; row++) {
|
||||
isColAccessed[row] = false;
|
||||
isLColAccessed[row] = false;
|
||||
}
|
||||
if (c > 0) {
|
||||
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
|
||||
isLColAccessed[i] = true;
|
||||
}
|
||||
}
|
||||
int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0; // number of unique accesses, blocked
|
||||
// for every row in this color
|
||||
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
|
||||
int colsPerRow = 0; // number of blocks for this row
|
||||
bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]);
|
||||
for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) {
|
||||
// for every column in the current row, check if that column was accessed before this color
|
||||
int col = colIndices[idx];
|
||||
if (isColAccessed[col] == false) {
|
||||
colsInColor[c][numColAccesses] = col;
|
||||
isColAccessed[col] = true;
|
||||
numColAccesses++;
|
||||
if (col > row) {
|
||||
UColsInColor[numColors - c - 1][UNumColAccesses] = col;
|
||||
UNumColAccesses++;
|
||||
}
|
||||
}
|
||||
if (isLColAccessed[col] == false) {
|
||||
if (col < row) {
|
||||
LColsInColor[c][LNumColAccesses] = col;
|
||||
LNumColAccesses++;
|
||||
isLColAccessed[col] = true;
|
||||
}
|
||||
}
|
||||
colsPerRow++;
|
||||
}
|
||||
if (rowIsEmpty != true) {
|
||||
numRows++;
|
||||
}
|
||||
maxColsPerRow = std::max(maxColsPerRow, colsPerRow);
|
||||
}
|
||||
|
||||
// add columns from previous color into L partition to simplify data forwarding
|
||||
if (c > 0) {
|
||||
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
|
||||
LColsInColor[c][LNumColAccesses] = i;
|
||||
LNumColAccesses++;
|
||||
}
|
||||
}
|
||||
|
||||
colorSizes[c * 4 + 10] = numColAccesses * block_size;
|
||||
LColorSizes[c * 4 + 10] = LNumColAccesses * block_size;
|
||||
UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size;
|
||||
|
||||
// store mapping
|
||||
for (int col = 0; col < numColAccesses; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i;
|
||||
}
|
||||
}
|
||||
for (int col = 0; col < LNumColAccesses; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i;
|
||||
}
|
||||
}
|
||||
for (int col = 0; col < UNumColAccesses; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i;
|
||||
}
|
||||
}
|
||||
|
||||
// zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline
|
||||
while (numColAccesses % 16 != 0) {
|
||||
colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1];
|
||||
numColAccesses++;
|
||||
}
|
||||
while (LNumColAccesses % 16 != 0) {
|
||||
LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1];
|
||||
LNumColAccesses++;
|
||||
}
|
||||
while (UNumColAccesses % 16 != 0) {
|
||||
UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1];
|
||||
UNumColAccesses++;
|
||||
}
|
||||
maxCols = std::max(numColAccesses, maxCols);
|
||||
totalCols += numColAccesses;
|
||||
LTotalCols += LNumColAccesses;
|
||||
UTotalCols += UNumColAccesses;
|
||||
doneRows = doneRows + nodesPerColor[c];
|
||||
maxRowsPerColor = std::max(numRows, maxRowsPerColor);
|
||||
}
|
||||
|
||||
if (maxCols * static_cast<int>(block_size) > columnsPerColorLimit) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit;
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
doneRows = 0;
|
||||
int diagValsSize = 0;
|
||||
int maxRows = 0;
|
||||
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
// calculate sizes that include zeropadding
|
||||
diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8);
|
||||
doneRows += nodesPerColor[c];
|
||||
if (nodesPerColor[c] * static_cast<int>(block_size) > maxRows)
|
||||
maxRows = nodesPerColor[c];
|
||||
colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
|
||||
LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
|
||||
UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size;
|
||||
}
|
||||
|
||||
if (maxRows * static_cast<int>(block_size) > rowsPerColorLimit) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit;
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
// create and fill sizes array as far as already possible
|
||||
colorSizes[0] = Nb * block_size;
|
||||
LColorSizes[0] = Nb * block_size;
|
||||
UColorSizes[0] = Nb * block_size;
|
||||
// col_sizes (but the matrix is square)
|
||||
colorSizes[1] = Nb * block_size;
|
||||
LColorSizes[1] = Nb * block_size;
|
||||
UColorSizes[1] = Nb * block_size;
|
||||
colorSizes[2] = totalCols * block_size;
|
||||
LColorSizes[2] = LTotalCols * block_size;
|
||||
UColorSizes[2] = UTotalCols * block_size;
|
||||
// missing val_size, written in Matrix::toRDF()
|
||||
colorSizes[4] = numColors;
|
||||
LColorSizes[4] = numColors;
|
||||
UColorSizes[4] = numColors;
|
||||
// missing NRFlagsSize, written in Matrix::toRDF()
|
||||
colorSizes[6] = diagValsSize;
|
||||
LColorSizes[6] = diagValsSize;
|
||||
UColorSizes[6] = diagValsSize;
|
||||
|
||||
int paddingIdx = numColors;
|
||||
while (paddingIdx % 4 != 0) {
|
||||
for (unsigned int i = 0; i < 4; i++) {
|
||||
colorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||
LColorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||
UColorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||
}
|
||||
paddingIdx++;
|
||||
}
|
||||
|
||||
int index = 0, Lindex = 0, Uindex = 0;
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
// for each unique col access
|
||||
for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
PIndicesAddr[index] = colsInColor[c][col] * block_size + i;
|
||||
index++;
|
||||
}
|
||||
}
|
||||
// add padding
|
||||
while (index % 16 != 0) {
|
||||
PIndicesAddr[index] = PIndicesAddr[index - 1];
|
||||
index++;
|
||||
}
|
||||
for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i;
|
||||
Lindex++;
|
||||
}
|
||||
}
|
||||
while (Lindex % 16 != 0) {
|
||||
LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1];
|
||||
Lindex++;
|
||||
}
|
||||
for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||
for (unsigned int i = 0; i < block_size; i++) {
|
||||
UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i;
|
||||
Uindex++;
|
||||
}
|
||||
}
|
||||
while (Uindex % 16 != 0) {
|
||||
UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1];
|
||||
Uindex++;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag) {
|
||||
const unsigned int block_size = 3;
|
||||
int doneRows = rowSize - 1; // since the rows of U are reversed, the rows of the diag are also reversed
|
||||
int RDFIndex = 0;
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
for (int r = 0; r < rowsPerColor[c]; r++) {
|
||||
|
||||
// the rows in the block are reversed
|
||||
for (int i = static_cast<int>(block_size) - 1; i >= 0; i--) {
|
||||
for (unsigned int j = 0; j < block_size; j++) {
|
||||
RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j];
|
||||
RDFIndex++;
|
||||
}
|
||||
// add 4th column, zeropadding
|
||||
RDFDiag[RDFIndex] = 0.0;
|
||||
RDFIndex++;
|
||||
}
|
||||
}
|
||||
doneRows -= rowsPerColor[c];
|
||||
|
||||
// make sure the color completely fills a cacheline
|
||||
// a color with 3 blocks would otherwise leave space
|
||||
while (RDFIndex % 8 != 0) {
|
||||
RDFDiag[RDFIndex] = 0.0;
|
||||
RDFIndex++;
|
||||
}
|
||||
}
|
||||
assert(RDFIndex % 8 == 0);
|
||||
}
|
||||
|
||||
#endif // HAVE_FPGA
|
||||
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template void sortBlockedRow<n>(int *, double *, int, int); \
|
||||
template void blockMultSub<n>(double *, double *, double *); \
|
||||
template void blockMult<n>(double *, double *, double *); \
|
||||
template void blockSub<n>(double *, double *, double *); \
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
#if HAVE_FPGA
|
||||
#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \
|
||||
template void blockSub<n>(double *, double *, double *); \
|
||||
template void blockVectMult<n>(double *, double *, double, double *, bool); \
|
||||
template int BlockedMatrix<n>::toRDF(int, int *, bool, \
|
||||
std::vector<std::vector<int> >& , int, int *, \
|
||||
std::vector<std::vector<double> >&, short int *, unsigned char *, int *, int *); \
|
||||
template int BlockedMatrix<n>::findPartitionColumns(int, int *, \
|
||||
int, int, \
|
||||
std::vector<std::vector<int> >& , int *, int *, \
|
||||
std::vector<std::vector<int> >& , int *, int *, \
|
||||
std::vector<std::vector<int> >& , int *, int *);
|
||||
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
|
||||
|
||||
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
|
||||
#endif // HAVE_FPGA
|
||||
|
||||
|
||||
} // end namespace bda
|
||||
|
@ -20,29 +20,40 @@
|
||||
#ifndef BLOCKED_MATRIX_HPP
|
||||
#define BLOCKED_MATRIX_HPP
|
||||
|
||||
#if HAVE_FPGA
|
||||
#include <vector>
|
||||
#endif
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
|
||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||
template<int BS>
|
||||
struct BlockedMatrix{
|
||||
template<unsigned int block_size>
|
||||
class BlockedMatrix
|
||||
{
|
||||
|
||||
public:
|
||||
|
||||
/// Allocate BlockedMatrix and data arrays with given sizes
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
BlockedMatrix(int Nb_, int nnzbs_)
|
||||
: nnzValues(new double[nnzbs_*BS*BS]),
|
||||
colIndices(new int[nnzbs_*BS*BS]),
|
||||
: nnzValues(new double[nnzbs_*block_size*block_size]),
|
||||
colIndices(new int[nnzbs_*block_size*block_size]),
|
||||
rowPointers(new int[Nb_+1]),
|
||||
Nb(Nb_),
|
||||
nnzbs(nnzbs_),
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(true)
|
||||
{}
|
||||
|
||||
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
|
||||
/// \param[in] M matrix to be copied
|
||||
BlockedMatrix(const BlockedMatrix& M)
|
||||
: nnzValues(new double[M.nnzbs*BS*BS]),
|
||||
: nnzValues(new double[M.nnzbs*block_size*block_size]),
|
||||
colIndices(M.colIndices),
|
||||
rowPointers(M.rowPointers),
|
||||
Nb(M.Nb),
|
||||
@ -50,10 +61,11 @@ struct BlockedMatrix{
|
||||
deleteNnzs(true),
|
||||
deleteSparsity(false)
|
||||
{}
|
||||
|
||||
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
|
||||
/// \param[in] Nb number of blockrows
|
||||
/// \param[in] nnzbs number of nonzero blocks
|
||||
/// \param[in] nnzValues array of nonzero values, contains nnzb*BS*BS scalars
|
||||
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
|
||||
/// \param[in] colIndices array of column indices, contains nnzb entries
|
||||
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
|
||||
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
||||
@ -76,6 +88,28 @@ struct BlockedMatrix{
|
||||
}
|
||||
}
|
||||
|
||||
#if HAVE_FPGA
|
||||
constexpr static double nnzThreshold = 1e-80; // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero
|
||||
|
||||
int countUnblockedNnzs();
|
||||
|
||||
void unblock(Matrix *mat, bool isUMatrix);
|
||||
|
||||
/// Converts this matrix to the dataformat used by the FPGA
|
||||
/// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking
|
||||
int toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
|
||||
std::vector<std::vector<double> >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize);
|
||||
|
||||
/// Analyses the sparsity pattern and prepares for toRDF()
|
||||
/// Is only called once
|
||||
int findPartitionColumns(int numColors, int *nodesPerColor,
|
||||
int rowsPerColorLimit, int columnsPerColorLimit,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
|
||||
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
|
||||
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes);
|
||||
#endif
|
||||
|
||||
double *nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c);
|
||||
template <unsigned int block_size>
|
||||
void blockMult(double *mat1, double *mat2, double *resMat);
|
||||
|
||||
/// Subtract blocks
|
||||
/// a = b - c
|
||||
/// \param[out] a result block
|
||||
/// \param[in] b input block
|
||||
/// \param[in] c input block
|
||||
|
||||
#if HAVE_FPGA
|
||||
template <unsigned int block_size>
|
||||
void blockSub(double *a, double *b, double *c);
|
||||
void blockSub(double *mat1, double *mat2, double *resMat);
|
||||
|
||||
template <unsigned int block_size>
|
||||
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
|
||||
|
||||
/// Convert a blocked inverse diagonal to the FPGA format.
|
||||
/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
|
||||
/// Since the rows of U are reversed, the rows of the diag are also reversed.
|
||||
/// The cachelines can hold 8 doubles, a block has 9 doubles.
|
||||
/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows.
|
||||
/// Then 2 blocks (24 doubles) fit on 3 cachelines.
|
||||
/// Example:
|
||||
/// [1 2 3] [1 2 3 0] [1 2 3 0 4 5 6 0]
|
||||
/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1]
|
||||
/// [7 8 9] [7 8 9 0] [block2 row2 block2 row3]
|
||||
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
|
||||
#endif
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
|
413
opm/simulators/linalg/bda/FPGABILU0.cpp
Normal file
413
opm/simulators/linalg/bda/FPGABILU0.cpp
Normal file
@ -0,0 +1,413 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <opm/simulators/linalg/MatrixBlock.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
template <unsigned int block_size>
|
||||
FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
|
||||
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_)
|
||||
{
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
level_scheduling = true;
|
||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||
graph_coloring = true;
|
||||
} else {
|
||||
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
FPGABILU0<block_size>::~FPGABILU0()
|
||||
{
|
||||
delete[] invDiagVals;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
resultPointers.resize(numResultPointers, nullptr);
|
||||
resultSizes.resize(numResultSizes);
|
||||
|
||||
// Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
|
||||
const unsigned int nnzSplit = 1;
|
||||
|
||||
this->N = mat->Nb * block_size;
|
||||
this->Nb = mat->Nb;
|
||||
this->nnz = mat->nnzbs * block_size * block_size;
|
||||
this->nnzbs = mat->nnzbs;
|
||||
|
||||
toOrder.resize(Nb);
|
||||
fromOrder.resize(Nb);
|
||||
|
||||
std::vector<int> CSCRowIndices(nnzbs);
|
||||
std::vector<int> CSCColPointers(Nb + 1);
|
||||
|
||||
if (level_scheduling) {
|
||||
Timer t_convert;
|
||||
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
}
|
||||
|
||||
Timer t_analysis;
|
||||
rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||
LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
|
||||
std::ostringstream out;
|
||||
if (level_scheduling) {
|
||||
out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
|
||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
} else if (graph_coloring) {
|
||||
out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
|
||||
findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||
}
|
||||
|
||||
if (numColors > maxNumColors) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
|
||||
}
|
||||
OpmLog::info(out.str());
|
||||
|
||||
int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
|
||||
int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
|
||||
int blockDiagSize = 0;
|
||||
// This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
|
||||
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
int doneRows = 0;
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
|
||||
for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
|
||||
int columnIndex = rMat->colIndices[j];
|
||||
if (columnIndex < i) {
|
||||
LColorRoundedValSize += 9;
|
||||
LNROffsetSize += 9;
|
||||
}
|
||||
if (columnIndex > i) {
|
||||
UColorRoundedValSize += 9;
|
||||
UNROffsetSize += 9;
|
||||
}
|
||||
colorRoundedValSize += 9;
|
||||
NROffsetSize += 9;
|
||||
}
|
||||
blockDiagSize += 12;
|
||||
}
|
||||
// End of color: round all sizes to nearest cacheline
|
||||
colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
|
||||
LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
|
||||
UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
|
||||
NROffsetSize = roundUpTo(NROffsetSize, 64);
|
||||
LNROffsetSize = roundUpTo(LNROffsetSize, 64);
|
||||
UNROffsetSize = roundUpTo(UNROffsetSize, 64);
|
||||
blockDiagSize = roundUpTo(blockDiagSize, 8);
|
||||
|
||||
doneRows += rowsPerColor[c];
|
||||
}
|
||||
int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
|
||||
int worstCaseColumnAccessNum = numColors * maxColsPerColor;
|
||||
|
||||
nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
|
||||
LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
|
||||
UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
|
||||
// initial number of nnz, used to allocate
|
||||
nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
|
||||
LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
|
||||
UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
|
||||
colIndices.resize(colorRoundedValSize);
|
||||
LColIndices.resize(LColorRoundedValSize);
|
||||
UColIndices.resize(UColorRoundedValSize);
|
||||
NROffsets.resize(NROffsetSize);
|
||||
LNROffsets.resize(LNROffsetSize);
|
||||
UNROffsets.resize(UNROffsetSize);
|
||||
PIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||
LPIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||
UPIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||
colorSizes.resize(colorSizesNum);
|
||||
LColorSizes.resize(colorSizesNum);
|
||||
UColorSizes.resize(colorSizesNum);
|
||||
blockDiag.resize(blockDiagSize);
|
||||
colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||
LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||
UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||
|
||||
int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
|
||||
maxRowsPerColor, maxColsPerColor,
|
||||
colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
|
||||
LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
|
||||
UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
|
||||
if (err != 0) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
diagIndex.resize(mat->Nb, 0);
|
||||
invDiagVals = new double[mat->Nb * bs * bs];
|
||||
LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
resultPointers[0] = (void *) colorSizes.data();
|
||||
resultPointers[1] = (void *) PIndicesAddr.data();
|
||||
resultPointers[2] = (void *) nnzValues.data();
|
||||
resultPointers[3] = (void *) colIndices.data();
|
||||
resultPointers[4] = (void *) NROffsets.data();
|
||||
resultPointers[5] = (void *) nnzValsSizes.data();
|
||||
resultPointers[6] = (void *) LColorSizes.data();
|
||||
resultPointers[7] = (void *) LPIndicesAddr.data();
|
||||
resultPointers[8] = (void *) LnnzValues.data();
|
||||
resultPointers[9] = (void *) LColIndices.data();
|
||||
resultPointers[10] = (void *) LNROffsets.data();
|
||||
resultPointers[11] = (void *) LnnzValsSizes.data();
|
||||
resultPointers[12] = (void *) UColorSizes.data();
|
||||
resultPointers[13] = (void *) UPIndicesAddr.data();
|
||||
resultPointers[14] = (void *) UnnzValues.data();
|
||||
resultPointers[15] = (void *) UColIndices.data();
|
||||
resultPointers[16] = (void *) UNROffsets.data();
|
||||
resultPointers[17] = (void *) UnnzValsSizes.data();
|
||||
resultPointers[18] = (void *) blockDiag.data();
|
||||
//resultPointers[19] and [20] are set by the caller
|
||||
resultSizes[0] = mat->Nb * block_size;
|
||||
resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
|
||||
resultSizes[2] = numColors;
|
||||
resultSizes[3] = worstCaseColumnAccessNum; //totalCols
|
||||
resultSizes[4] = NROffsetSize; //NRFlagSize
|
||||
resultSizes[5] = blockDiagSize; //diagValsSize
|
||||
resultSizes[6] = mat->Nb * block_size;
|
||||
resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
|
||||
resultSizes[8] = numColors;
|
||||
resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
|
||||
resultSizes[10] = LNROffsetSize; //LNRFlagSize
|
||||
resultSizes[11] = blockDiagSize; //LDiagValsSize
|
||||
resultSizes[12] = mat->Nb * block_size;
|
||||
resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
|
||||
resultSizes[14] = numColors;
|
||||
resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
|
||||
resultSizes[16] = UNROffsetSize; //UNRFlagSize
|
||||
resultSizes[17] = blockDiagSize; //UDiagValsSize
|
||||
return true;
|
||||
} // end init()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
||||
{
|
||||
const unsigned int bs = block_size;
|
||||
Timer t_reorder;
|
||||
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||
Timer t_memcpy;
|
||||
memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
int i, j, ij, ik, jk;
|
||||
int iRowStart, iRowEnd, jRowEnd;
|
||||
double pivot[bs * bs];
|
||||
int LSize = 0;
|
||||
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
||||
|
||||
Timer t_decomposition;
|
||||
|
||||
// go through all rows
|
||||
for (i = 0; i < LUMat->Nb; i++) {
|
||||
iRowStart = LUMat->rowPointers[i];
|
||||
iRowEnd = LUMat->rowPointers[i + 1];
|
||||
|
||||
// go through all elements of the row
|
||||
for (ij = iRowStart; ij < iRowEnd; ij++) {
|
||||
j = LUMat->colIndices[ij];
|
||||
// if the element is the diagonal, store the index and go to next row
|
||||
if (j == i) {
|
||||
diagIndex[i] = ij;
|
||||
break;
|
||||
}
|
||||
// if an element beyond the diagonal is reach, no diagonal was found
|
||||
// throw an error now. TODO: perform reordering earlier to prevent this
|
||||
if (j > i) {
|
||||
std::ostringstream out;
|
||||
out << "BILU0 Error could not find diagonal value in row: " << i;
|
||||
OpmLog::error(out.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
LSize++;
|
||||
// calculate the pivot of this row
|
||||
blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
|
||||
|
||||
memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
|
||||
|
||||
jRowEnd = LUMat->rowPointers[j + 1];
|
||||
jk = diagIndex[j] + 1;
|
||||
ik = ij + 1;
|
||||
// substract that row scaled by the pivot from this row.
|
||||
while (ik < iRowEnd && jk < jRowEnd) {
|
||||
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
|
||||
blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
|
||||
ik++;
|
||||
jk++;
|
||||
} else {
|
||||
if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
|
||||
{ ik++; }
|
||||
else
|
||||
{ jk++; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// store the inverse in the diagonal!
|
||||
inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
|
||||
memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
|
||||
}
|
||||
|
||||
LMat->rowPointers[0] = 0;
|
||||
UMat->rowPointers[0] = 0;
|
||||
|
||||
// Split the LU matrix into two by comparing column indices to diagonal indices
|
||||
for (i = 0; i < LUMat->Nb; i++) {
|
||||
LMat->rowPointers[i + 1] = LMat->rowPointers[i];
|
||||
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
|
||||
if (j < diagIndex[i]) {
|
||||
memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
|
||||
LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
|
||||
LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reverse the order or the (blocked) rows for the U matrix,
|
||||
// because the rows are accessed in reverse order when applying the ILU0
|
||||
int URowIndex = 0;
|
||||
for (i = LUMat->Nb - 1; i >= 0; i--) {
|
||||
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
|
||||
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
|
||||
if (j > diagIndex[i]) {
|
||||
memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
|
||||
UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
|
||||
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
|
||||
}
|
||||
}
|
||||
URowIndex++;
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
std::ostringstream out;
|
||||
out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||
OpmLog::info(out.str());
|
||||
}
|
||||
|
||||
std::vector<int> URowsPerColor(numColors);
|
||||
rowSize = block_size * rMat->Nb;
|
||||
LRowSize = block_size * LMat->Nb;
|
||||
URowSize = block_size * UMat->Nb;
|
||||
LNumColors = numColors;
|
||||
UNumColors = numColors;
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
URowsPerColor[numColors - c - 1] = rowsPerColor[c];
|
||||
}
|
||||
int err;
|
||||
err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
|
||||
colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
|
||||
nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
|
||||
if (err != 0) {
|
||||
return false;
|
||||
}
|
||||
err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
|
||||
LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
|
||||
LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
|
||||
if (err != 0) {
|
||||
return false;
|
||||
}
|
||||
err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
|
||||
UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
|
||||
UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
|
||||
if (err != 0) {
|
||||
return false;
|
||||
}
|
||||
blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
|
||||
// resultPointers are set in the init method
|
||||
resultSizes[0] = rowSize;
|
||||
resultSizes[1] = colorSizes[3]; // zeropadded valSize;
|
||||
resultSizes[2] = numColors;
|
||||
resultSizes[3] = colorSizes[2]; //totalCols
|
||||
resultSizes[4] = colorSizes[5]; //NRFlagSize
|
||||
resultSizes[5] = colorSizes[6]; //diagValsSize
|
||||
resultSizes[6] = LRowSize;
|
||||
resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
|
||||
resultSizes[8] = LNumColors;
|
||||
resultSizes[9] = LColorSizes[2]; //LTotalCols
|
||||
resultSizes[10] = LColorSizes[5]; //LNRFlagSize
|
||||
resultSizes[11] = LColorSizes[6]; //LDiagValsSize
|
||||
resultSizes[12] = URowSize;
|
||||
resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
|
||||
resultSizes[14] = UNumColors;
|
||||
resultSizes[15] = UColorSizes[2]; //UTotalCols
|
||||
resultSizes[16] = UColorSizes[5]; //UNRFlagSize
|
||||
resultSizes[17] = UColorSizes[6]; //UDiagValsSize
|
||||
return true;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int); \
|
||||
template FPGABILU0<n>::~FPGABILU0(); \
|
||||
template bool FPGABILU0<n>::init(BlockedMatrix<n> *); \
|
||||
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *); \
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} //namespace bda
|
117
opm/simulators/linalg/bda/FPGABILU0.hpp
Normal file
117
opm/simulators/linalg/bda/FPGABILU0.hpp
Normal file
@ -0,0 +1,117 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef FPGA_BILU0_HEADER_INCLUDED
|
||||
#define FPGA_BILU0_HEADER_INCLUDED
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/*
|
||||
* This class implements a Blocked ILU0 preconditioner, with output data
|
||||
* specifically formatted for the FPGA.
|
||||
* The decomposition and reorders of the rows of the matrix are done on CPU.
|
||||
*/
|
||||
|
||||
template <unsigned int block_size>
|
||||
class FPGABILU0
|
||||
{
|
||||
|
||||
private:
|
||||
int N; // number of rows of the matrix
|
||||
int Nb; // number of blockrows of the matrix
|
||||
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||
int nnzbs; // number of blocks of the matrix
|
||||
std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
|
||||
std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
|
||||
double *invDiagVals = nullptr;
|
||||
std::vector<int> diagIndex;
|
||||
std::vector<int> toOrder, fromOrder;
|
||||
std::vector<int> rowsPerColor;
|
||||
int numColors;
|
||||
int verbosity;
|
||||
|
||||
// sizes and arrays used during RDF generation
|
||||
std::vector<std::vector<double> > nnzValues, LnnzValues, UnnzValues;
|
||||
std::vector<short int> colIndices, LColIndices, UColIndices;
|
||||
std::vector<unsigned char> NROffsets, LNROffsets, UNROffsets;
|
||||
std::vector<int> PIndicesAddr, LPIndicesAddr, UPIndicesAddr;
|
||||
std::vector<int> colorSizes, LColorSizes, UColorSizes;
|
||||
std::vector<int> nnzValsSizes, LnnzValsSizes, UnnzValsSizes;
|
||||
std::vector<std::vector<int> > colIndicesInColor, LColIndicesInColor, UColIndicesInColor;
|
||||
|
||||
int rowSize, valSize;
|
||||
int LRowSize, LValSize, LNumColors;
|
||||
int URowSize, UValSize, UNumColors;
|
||||
std::vector<double> blockDiag;
|
||||
ILUReorder opencl_ilu_reorder;
|
||||
bool level_scheduling = false, graph_coloring = false;
|
||||
int numResultPointers = 21;
|
||||
std::vector<void *> resultPointers;
|
||||
int numResultSizes = 18;
|
||||
std::vector<int> resultSizes;
|
||||
int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor
|
||||
|
||||
public:
|
||||
|
||||
FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors);
|
||||
|
||||
~FPGABILU0();
|
||||
|
||||
// analysis (optional)
|
||||
bool init(BlockedMatrix<block_size> *mat);
|
||||
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||
|
||||
int* getToOrder()
|
||||
{
|
||||
return toOrder.data();
|
||||
}
|
||||
|
||||
int* getFromOrder()
|
||||
{
|
||||
return fromOrder.data();
|
||||
}
|
||||
|
||||
BlockedMatrix<block_size>* getRMat()
|
||||
{
|
||||
return rMat.get();
|
||||
}
|
||||
|
||||
void **getResultPointers()
|
||||
{
|
||||
return resultPointers.data();
|
||||
}
|
||||
|
||||
int *getResultSizes()
|
||||
{
|
||||
return resultSizes.data();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} //namespace bda
|
||||
|
||||
#endif // FPGA_BILU0_HEADER_INCLUDED
|
249
opm/simulators/linalg/bda/FPGAMatrix.cpp
Normal file
249
opm/simulators/linalg/bda/FPGAMatrix.cpp
Normal file
@ -0,0 +1,249 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/*Sort a row of matrix elements from a CSR-format.*/
|
||||
void sortRow(int *colIndices, double *data, int left, int right) {
|
||||
int l = left;
|
||||
int r = right;
|
||||
int middle = colIndices[(l + r) >> 1];
|
||||
do {
|
||||
while (colIndices[l] < middle)
|
||||
l++;
|
||||
while (colIndices[r] > middle)
|
||||
r--;
|
||||
if (l <= r) {
|
||||
int lColIndex = colIndices[l];
|
||||
colIndices[l] = colIndices[r];
|
||||
colIndices[r] = lColIndex;
|
||||
double lDatum = data[l];
|
||||
data[l] = data[r];
|
||||
data[r] = lDatum;
|
||||
|
||||
l++;
|
||||
r--;
|
||||
}
|
||||
} while (l < r);
|
||||
if (left < r)
|
||||
sortRow(colIndices, data, left, r);
|
||||
if (right > l)
|
||||
sortRow(colIndices, data, l, right);
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
|
||||
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
|
||||
* and colors, and the size, in elements, of the NROffsets array, followed by:
|
||||
* the number of rows (rounded to the nearest 32), the number of rows (not rounded),
|
||||
* the number of columns (not rounded) and the number of non-zero values
|
||||
* (rounded to the nearest 32) for every partition.
|
||||
* This array is zero padded up to the nearest 64-byte cacheline.
|
||||
* - The "colIndicesInColor" array, which contains for every partition, from which elements
|
||||
* in the global X vector the elements of that X vector partition came.
|
||||
* For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then
|
||||
* that X vector partition will only have three elements, and the color_col_indices array
|
||||
* will contain 1, 3 and 6 for that partition.
|
||||
* This array is zero padded up to the nearest 64-byte cacheline for every partition.
|
||||
* - The "nnzValues" array contains all non-zero values of each partition of the matrix.
|
||||
* This array is zero-padded so that each color has a multiple of 32 elements (to have the
|
||||
* same number of elements per partition as the column indices array).
|
||||
* - The "colIndices" array contains all column indices of each partition of the matrix.
|
||||
* These column indices are the local indices for that partition, so to be used, first a
|
||||
* local X vector partition needs to be loaded into some local memory (this is done using
|
||||
* data from the _color_col_indices array), before these column indices can be used as
|
||||
* addresses to that local memory to read the desired X vector values.
|
||||
* This array is zero-padded so that data for every partition fills up a number of complete
|
||||
* cachelines (this means every color has a multiple of 32 elements).
|
||||
* - "NROffsets" is the name of the array that contains the new row offsets for
|
||||
* all elements of every partition of the matrix. New row offsets are 8-bit values which
|
||||
* are 0 if that element is not the first element in a row, or which, if that element is
|
||||
* the first element of a row) is equal to the amount of empty rows between that new row
|
||||
* and the row before it plus 1. This array is zero-padded so that data for every partition
|
||||
* fills up a number of complete cachelines (this means every color has a multiple of 64 elements).
|
||||
*/
|
||||
int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsThisRowLimit,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes)
|
||||
{
|
||||
auto mat = this;
|
||||
|
||||
int doneRows = 0;
|
||||
int totalRowNum = 0; // total number of non-empty rows
|
||||
int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color
|
||||
int maxNNZsPerColor = 0; // max of nnzsPerColor
|
||||
|
||||
int totalValSize = 0; // sum of nnzsPerColor, padded
|
||||
|
||||
std::vector<int> nnzRowsPerColor(numColors);
|
||||
|
||||
// find number of nnzs per color and number of non-empty rows
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
int numRows = 0;
|
||||
nnzRowsPerColor[c] = 0;
|
||||
int firstNnzOfColor = mat->rowPointers[doneRows];
|
||||
int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]];
|
||||
nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices
|
||||
totalValSize += nnzsPerColor;
|
||||
maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor);
|
||||
int row = doneRows;
|
||||
for (; row < doneRows + nodesPerColor[c]; row++) {
|
||||
if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) {
|
||||
numRows++;
|
||||
nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
doneRows = row;
|
||||
totalRowNum += numRows;
|
||||
}
|
||||
|
||||
int conseqZeroRows = 0; // number of consecutive empty rows
|
||||
int maxConseqZeroRows = 0;
|
||||
int numEmptyRows = 0; // total number of empty rows
|
||||
std::vector<int> rowOffsets(totalRowNum);
|
||||
std::vector<int> nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows
|
||||
std::vector<int> colorValPointers(numColors + 1); // points to first nnz of first row of each color
|
||||
std::vector<int> colorValZeroPointers(numColors); // points to first padded zero for each color
|
||||
|
||||
int nonEmptyRowIdx = 0; // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen
|
||||
doneRows = 0;
|
||||
|
||||
int totalPaddingSize = 0; // number of padded zeros from previous colors
|
||||
int NROffsetSize = 0; // number of NROffsets entries, padded to multiple of 64 for each color
|
||||
int maxRows = 0;
|
||||
int maxNNZsPerRow = 0;
|
||||
|
||||
// determine the row offset of each row (amount of zero rows between it and the previous non-zero row)
|
||||
// this is later converted to rowOffset for each nnz
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
conseqZeroRows = 0;
|
||||
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
|
||||
int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row];
|
||||
if (nnzsThisRow == 0) {
|
||||
conseqZeroRows++;
|
||||
numEmptyRows++;
|
||||
} else {
|
||||
maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow);
|
||||
nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1];
|
||||
rowOffsets[nonEmptyRowIdx] = conseqZeroRows;
|
||||
maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows);
|
||||
conseqZeroRows = 0;
|
||||
nonEmptyRowIdx++;
|
||||
}
|
||||
}
|
||||
// calculate sizes that include zeropadding
|
||||
colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize;
|
||||
colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32);
|
||||
totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c];
|
||||
NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64);
|
||||
|
||||
doneRows += nodesPerColor[c];
|
||||
maxRows = std::max(nodesPerColor[c], maxRows);
|
||||
}
|
||||
|
||||
if (maxNNZsPerRow > nnzsThisRowLimit) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit;
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
// create and fill RDF arrays
|
||||
colorSizes[3] = colorValPointers[numColors]; // total number of nnzs the FPGA has to process, including zeropadding
|
||||
colorSizes[5] = NROffsetSize;
|
||||
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
colorSizes[c * 4 + 8] = nnzRowsPerColor[c];
|
||||
colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c];
|
||||
}
|
||||
|
||||
int rowIndex = 0; // keep track of where to read/write
|
||||
int valIndex = 0;
|
||||
int NRIndex = 0;
|
||||
int halfwayPoint = colorValPointers[numColors] / 2;
|
||||
nnzValsSizes[0] = colorValPointers[numColors];
|
||||
|
||||
colorSizes[7] = halfwayPoint;
|
||||
|
||||
for (int c = 0; c < numColors; c++) {
|
||||
int nnzsThisRow;
|
||||
// make sure 32 values are written in batches (pad with zeros if needed)
|
||||
for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) {
|
||||
for (int vb = 0; vb < 32; vb++) {
|
||||
|
||||
// if there are enough values for the whole cacheline
|
||||
if (v + vb < colorValZeroPointers[c]) {
|
||||
ubNnzValues[0][v + vb] = mat->nnzValues[valIndex];
|
||||
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex]]);
|
||||
|
||||
// if this val is the first of a row
|
||||
if (nnzRowPointers[rowIndex] == valIndex) {
|
||||
|
||||
if (rowOffsets[rowIndex] + 1 >= 255) {
|
||||
std::ostringstream errorstring;
|
||||
errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1;
|
||||
OPM_THROW(std::logic_error, errorstring.str());
|
||||
}
|
||||
|
||||
NROffsets[NRIndex] = static_cast<unsigned char>(rowOffsets[rowIndex] + 1);
|
||||
|
||||
// skip all empty rows
|
||||
while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) {
|
||||
rowIndex++;
|
||||
nnzsThisRow = 0;
|
||||
}
|
||||
nnzsThisRow++;
|
||||
}
|
||||
else
|
||||
{
|
||||
NROffsets[NRIndex] = (unsigned char) 0;
|
||||
nnzsThisRow++;
|
||||
}
|
||||
valIndex++;
|
||||
}
|
||||
else // zeropadding is needed
|
||||
{
|
||||
ubNnzValues[0][v + vb] = 0.0;
|
||||
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex - 1]]);
|
||||
NROffsets[NRIndex] = 0;
|
||||
}
|
||||
NRIndex++;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// zeropad the NROffsets file
|
||||
while (NRIndex % 64 != 0) {
|
||||
NROffsets[NRIndex] = 0;
|
||||
NRIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // end namespace bda
|
84
opm/simulators/linalg/bda/FPGAMatrix.hpp
Normal file
84
opm/simulators/linalg/bda/FPGAMatrix.hpp
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef FPGA_MATRIX_HEADER_INCLUDED
|
||||
#define FPGA_MATRIX_HEADER_INCLUDED
|
||||
|
||||
#include <vector>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This struct resembles a csr matrix, only doubles are supported
|
||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||
class Matrix {
|
||||
|
||||
public:
|
||||
|
||||
/// Allocate Matrix and data arrays with given sizes
|
||||
/// \param[in] N number of rows
|
||||
/// \param[in] nnzs number of nonzeros
|
||||
Matrix(int N_, int nnzs_)
|
||||
: nnzValues(new double[nnzs_]),
|
||||
colIndices(new int[nnzs_]),
|
||||
rowPointers(new int[N_+1]),
|
||||
N(N_),
|
||||
nnzs(nnzs_)
|
||||
{}
|
||||
|
||||
/// All constructors allocate new memory, so always delete here
|
||||
~Matrix(){
|
||||
delete[] nnzValues;
|
||||
delete[] colIndices;
|
||||
delete[] rowPointers;
|
||||
}
|
||||
|
||||
|
||||
/// Converts this matrix to the dataformat used by the FPGA.
|
||||
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
|
||||
/// The purpose of this format is to allow the data to be streamable.
|
||||
/// The rowPointers array has an unpredictable reading pattern/timing,
|
||||
/// it also needs a extra work if a row is shorter than a cacheline.
|
||||
/// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets.
|
||||
/// The value of this offset is 0, unless the corresponding nnz is the first of a row,
|
||||
/// in that case it is 'the number of empty rows preceeding it + 1'.
|
||||
/// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx.
|
||||
/// Example:
|
||||
/// [1 0 0 3 0] nnzValues [1 3 2 2 1 4 3 4 1]
|
||||
/// [0 2 2 0 1] colIndices [0 3 1 2 4 0 1 2 4]
|
||||
/// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9]
|
||||
/// [0 0 0 0 0] rowOffsets [1 0 1 0 0 1 2 0 0]
|
||||
/// [0 3 4 0 1]
|
||||
/// The rowOffset is stored in 1 byte, meaning the maximum value is 255.
|
||||
int toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
|
||||
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
|
||||
|
||||
double *nnzValues;
|
||||
int *colIndices;
|
||||
int *rowPointers;
|
||||
int N;
|
||||
int nnzs;
|
||||
};
|
||||
|
||||
void sortRow(int *colIndices, double *data, int left, int right);
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif // FPGA_MATRIX_HEADER_INCLUDED
|
732
opm/simulators/linalg/bda/FPGASolverBackend.cpp
Normal file
732
opm/simulators/linalg/bda/FPGASolverBackend.cpp
Normal file
@ -0,0 +1,732 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <cmath>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/material/common/Unused.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||
|
||||
// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA
|
||||
// kernel will be disabled and execution will continue using DUNE
|
||||
#define FPGA_EXIT_WITH_HW_FAILURE
|
||||
//#undef FPGA_EXIT_WITH_HW_FAILURE
|
||||
|
||||
// if defined, the function generate_statistics will create a CSV-formatted file
|
||||
// with detailed statistics about the FPGA backend performance
|
||||
//#define FPGA_STATISTICS_FILE_ENABLED
|
||||
#undef FPGA_STATISTICS_FILE_ENABLED
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
using Opm::OpmLog;
|
||||
|
||||
template <unsigned int block_size>
|
||||
FpgaSolverBackend<block_size>::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(fpga_bitstream, verbosity_, maxit_, tolerance_)
|
||||
{
|
||||
int err;
|
||||
std::ostringstream oss;
|
||||
double start = second();
|
||||
|
||||
// currently, only block size == 3 is supported by the FPGA backend
|
||||
assert(block_size == 3);
|
||||
|
||||
if (verbosity < 1) {
|
||||
perf_call_enabled = false;
|
||||
}
|
||||
// setup bitstream name and other parameters
|
||||
if (fpga_bitstream.compare("") == 0) {
|
||||
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified");
|
||||
}
|
||||
if (!fileExists(fpga_bitstream.c_str())) {
|
||||
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable");
|
||||
}
|
||||
// -----------------------------
|
||||
// FPGA: setup the OpenCL platform
|
||||
// -----------------------------
|
||||
std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp
|
||||
// auto-select the proper FPGA device and create context and other CL objects
|
||||
err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1);
|
||||
if (err != 0) {
|
||||
oss << "Failed to setup the OpenCL device (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
oss << "Detected FPGA platform type is ";
|
||||
if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; }
|
||||
OpmLog::info(oss.str());
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
// -----------------------------
|
||||
// FPGA: setup the debug buffer
|
||||
// -----------------------------
|
||||
// set kernel debug lines depending on an environment variable
|
||||
const char *xem = getenv("XCL_EMULATION_MODE");
|
||||
if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) {
|
||||
debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU;
|
||||
oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n";
|
||||
OpmLog::info(oss.str());
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
} else {
|
||||
// set to 2 to reduce overhead in reading back and interpreting the debug lines;
|
||||
// increase to get more debug info from the kernel
|
||||
// range is 2..DEBUG_OUTBUF_WORDS_MAX-1
|
||||
debug_outbuf_words = 2;
|
||||
}
|
||||
|
||||
// host debug buffer setup
|
||||
err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// device debug buffer setup
|
||||
err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// copy debug buffer to device
|
||||
err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// ------------------------------------------------
|
||||
// FPGA: query the kernel for limits/configuration
|
||||
// ------------------------------------------------
|
||||
err = fpga_kernel_query(context, commands, kernel, cldebug,
|
||||
debugBuffer, debug_outbuf_words,
|
||||
rst_assert_cycles, rst_settle_cycles,
|
||||
&hw_x_vector_elem, &hw_max_row_size,
|
||||
&hw_max_column_size, &hw_max_colors_size,
|
||||
&hw_max_nnzs_per_row, &hw_max_matrix_size,
|
||||
&hw_use_uram, &hw_write_ilu0_results,
|
||||
&hw_dma_data_width, &hw_mult_num,
|
||||
&hw_x_vector_latency, &hw_add_latency, &hw_mult_latency,
|
||||
&hw_num_read_ports, &hw_num_write_ports,
|
||||
&hw_reset_cycles, &hw_reset_settle);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_kernel_query (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
|
||||
if (verbosity >= 1) {
|
||||
oss << "FPGA kernel limits/configuration:\n";
|
||||
oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n";
|
||||
oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n";
|
||||
oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n";
|
||||
oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n";
|
||||
oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n";
|
||||
oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n";
|
||||
oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n";
|
||||
oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle;
|
||||
OpmLog::info(oss.str());
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
}
|
||||
|
||||
// check that LU results are generated by the kernel
|
||||
if (use_LU_res && !hw_write_ilu0_results) {
|
||||
OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage");
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
use_LU_res = false;
|
||||
}
|
||||
|
||||
// setup preconditioner
|
||||
double start_prec = second();
|
||||
prec = std::make_unique<Preconditioner>(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size);
|
||||
perf_total.s_preconditioner_setup = second() - start_prec;
|
||||
|
||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||
level_scheduling = true;
|
||||
}
|
||||
|
||||
perf_total.s_initialization = second() - start;
|
||||
} // end fpgaSolverBackend
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
FpgaSolverBackend<block_size>::~FpgaSolverBackend()
|
||||
{
|
||||
if (verbosity >= 1) {
|
||||
generate_statistics();
|
||||
}
|
||||
delete[] rx;
|
||||
delete[] rb;
|
||||
if (nnzValArrays != nullptr) { free(nnzValArrays); }
|
||||
if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); }
|
||||
if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); }
|
||||
// FPGA: buffers
|
||||
free(debugBuffer);
|
||||
for (int b = 0; b < RW_BUF; b++) {
|
||||
free(dataBuffer[b]);
|
||||
}
|
||||
free(databufferSize);
|
||||
// FPGA: OpenCL objects
|
||||
if (cldebug != nullptr) { clReleaseMemObject(cldebug); }
|
||||
for (int b = 0; b < RW_BUF; b++) {
|
||||
if (cldata[b] != nullptr) {
|
||||
clReleaseMemObject(cldata[b]);
|
||||
}
|
||||
}
|
||||
clReleaseCommandQueue(commands);
|
||||
clReleaseContext(context);
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(program);
|
||||
clReleaseDevice(device_id);
|
||||
} // end ~fpgaSolverBackend()
|
||||
|
||||
|
||||
// copy result to host memory
|
||||
// caller must be sure that x is a valid array
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::get_result(double *x_)
|
||||
{
|
||||
double start = 0;
|
||||
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
// apply to results the reordering (stored in toOrder)
|
||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, rx, toOrder, x_);
|
||||
// TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private)
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_postprocess = second() - start;
|
||||
}
|
||||
} // end get_result()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res)
|
||||
{
|
||||
if (initialized == false) {
|
||||
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||
if (!analyse_matrix()) {
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
perf_call.emplace_back();
|
||||
update_system(vals, b);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
solve_system(res);
|
||||
|
||||
if (verbosity >= 1) {
|
||||
std::ostringstream oss;
|
||||
oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \
|
||||
", iterations: " << res.iterations << ", reduction: " << res.reduction << \
|
||||
", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed;
|
||||
OpmLog::info(oss.str());
|
||||
}
|
||||
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
|
||||
{
|
||||
double start = second();
|
||||
this->N = N_;
|
||||
this->nnz = nnz_;
|
||||
this->nnzb = nnz_ / block_size / block_size;
|
||||
Nb = (N + dim - 1) / dim;
|
||||
|
||||
// allocate host memory for matrices and vectors
|
||||
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
|
||||
mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
|
||||
|
||||
std::ostringstream oss;
|
||||
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
|
||||
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
|
||||
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||
OpmLog::info(oss.str());
|
||||
|
||||
rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
|
||||
rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
|
||||
|
||||
perf_total.s_initialization += second() - start;
|
||||
initialized = true;
|
||||
} // end initialize()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FpgaSolverBackend<block_size>::analyse_matrix()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
int err;
|
||||
|
||||
double start = second();
|
||||
bool success = prec->init(mat.get());
|
||||
|
||||
if (!success) {
|
||||
OpmLog::warning("Preconditioner for FPGA solver failed to initialize");
|
||||
return success;
|
||||
}
|
||||
|
||||
toOrder = prec->getToOrder();
|
||||
fromOrder = prec->getFromOrder();
|
||||
rMat = prec->getRMat();
|
||||
processedPointers = prec->getResultPointers();
|
||||
processedSizes = prec->getResultSizes();
|
||||
processedPointers[19] = rb;
|
||||
processedPointers[20] = rx;
|
||||
nnzValArrays_size = static_cast<int*>(processedPointers[5])[0];
|
||||
L_nnzValArrays_size = static_cast<int*>(processedPointers[11])[0];
|
||||
U_nnzValArrays_size = static_cast<int*>(processedPointers[17])[0];
|
||||
// -------------------------------------
|
||||
// FPGA: setup host/device data buffers
|
||||
// -------------------------------------
|
||||
// allocate memory and setup data layout
|
||||
err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits,
|
||||
processedSizes,
|
||||
&setupArray,
|
||||
&nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray,
|
||||
&L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray,
|
||||
&U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray,
|
||||
&BLKDArray, &X1Array, &R1Array,
|
||||
&X2Array, &R2Array,
|
||||
&LresArray, &UresArray,
|
||||
&databufferSize, dataBuffer,
|
||||
result_offsets, 1 /*num_nnz_arrays*/,
|
||||
true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */
|
||||
debugbufferSize);
|
||||
if (err) {
|
||||
oss << "Failed to call fpga_setup_host_datamem (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
|
||||
// results buffers setup
|
||||
if (use_LU_res) {
|
||||
resultsBufferNum = 4;
|
||||
} else {
|
||||
resultsBufferNum = 2;
|
||||
}
|
||||
if (resultsBufferNum > RES_BUF_MAX) {
|
||||
oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
|
||||
resultsNum = processedSizes[0]; // rowSize, invariant between system solves
|
||||
for (int i = 0; i < resultsBufferNum; i++) {
|
||||
resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double);
|
||||
}
|
||||
|
||||
// device data memory setup
|
||||
err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_setup_device_datamem (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
|
||||
// ------------------------------------
|
||||
// FPGA: setup the kernel's parameters
|
||||
// ------------------------------------
|
||||
err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit,
|
||||
debug_sample_rate, tolerance, cldata, cldebug);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_set_kernel_parameters (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
|
||||
perf_total.s_analysis = second() - start;
|
||||
analysis_done = true;
|
||||
|
||||
return success;
|
||||
} // end analyse_matrix()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
bool FpgaSolverBackend<block_size>::create_preconditioner()
|
||||
{
|
||||
double start = 0;
|
||||
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
memset(rx, 0, sizeof(double) * N);
|
||||
bool result = prec->create_preconditioner(mat.get());
|
||||
if (!result) {
|
||||
OpmLog::warning("fpgaSolverBackend: create_preconditioner failed");
|
||||
}
|
||||
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_preconditioner_create = second() - start;
|
||||
}
|
||||
return result;
|
||||
} // end create_preconditioner()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::solve_system(BdaResult &res)
|
||||
{
|
||||
std::ostringstream oss;
|
||||
int err;
|
||||
double start = 0, start_total = 0;
|
||||
|
||||
// ------------------------------------
|
||||
// FPGA: return immediately if FPGA is disabled
|
||||
// ------------------------------------
|
||||
if (fpga_disabled) {
|
||||
res.converged = false;
|
||||
OpmLog::warning("FPGA is disabled, fallback to SW execution");
|
||||
return;
|
||||
}
|
||||
|
||||
fpga_calls++;
|
||||
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
start_total = start;
|
||||
}
|
||||
|
||||
// check if any buffer is larger than the size set in preconditioner->init
|
||||
// TODO: add check for all other buffer sizes that may overflow?
|
||||
err = 0;
|
||||
if ( ((int *)processedPointers[5])[0] > nnzValArrays_size ||
|
||||
((int *)processedPointers[11])[0] > L_nnzValArrays_size ||
|
||||
((int *)processedPointers[17])[0] > U_nnzValArrays_size ) {
|
||||
err = 1;
|
||||
}
|
||||
if (err != 0) {
|
||||
OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)");
|
||||
}
|
||||
|
||||
// ------------------------------------
|
||||
// FPGA: copy input data to host data buffers
|
||||
// ------------------------------------
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
err = fpga_copy_host_datamem(
|
||||
processedPointers, processedSizes, setupArray,
|
||||
nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray,
|
||||
L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray,
|
||||
U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray,
|
||||
BLKDArray, X1Array, R1Array, X2Array, R2Array,
|
||||
use_LU_res, LresArray, UresArray,
|
||||
databufferSize, dataBuffer,
|
||||
1 /* nnzValArrays_num */,
|
||||
reset_data_buffers, fill_results_buffers,
|
||||
dump_data_buffers, fpga_calls);
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_mem_setup = second() - start;
|
||||
}
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// ------------------------------------
|
||||
// FPGA: copy buffers to device
|
||||
// ------------------------------------
|
||||
// copy debug buffer to device
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
err = fpga_copy_to_device_debugbuf(commands,
|
||||
cldebug, debugBuffer, debugbufferSize,
|
||||
debug_outbuf_words);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// copy data buffers to device
|
||||
err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_mem_h2d = second() - start;
|
||||
}
|
||||
// ------------------------------------
|
||||
// FPGA: execute the kernel
|
||||
// ------------------------------------
|
||||
double time_elapsed_ms;
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
err = fpga_kernel_run(commands, kernel, &time_elapsed_ms);
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_kernel_exec = second() - start;
|
||||
}
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_kernel_run (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// ----------------------------------------
|
||||
// FPGA: read back debug buffer from device
|
||||
// ----------------------------------------
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10),
|
||||
commands,
|
||||
debug_outbuf_words, debugbufferSize,
|
||||
cldebug, debugBuffer,
|
||||
abort_cycles,
|
||||
&kernel_cycles, &kernel_iter_run,
|
||||
norms, &last_norm_idx,
|
||||
&kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults,
|
||||
&kernel_wrafterend, &kernel_dbgfifofull);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
if (kernel_wrafterend) {
|
||||
OpmLog::warning("Detected recoverable FPGA error: kernel write after end");
|
||||
}
|
||||
if (kernel_dbgfifofull) {
|
||||
OpmLog::warning("Detected recoverable FPGA error: debug FIFO full");
|
||||
}
|
||||
if (kernel_aborted || kernel_signature || kernel_overflow) {
|
||||
#if defined(FPGA_EXIT_WITH_HW_FAILURE)
|
||||
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
|
||||
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
#else
|
||||
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
|
||||
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n";
|
||||
oss << "Disabling FPGA kernel: execution will continue with SW kernel";
|
||||
OpmLog::warning(oss.str());
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
fpga_disabled = true;
|
||||
#endif
|
||||
}
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().n_kernel_exec_cycles = kernel_cycles;
|
||||
}
|
||||
// copy (back) results only if FPGA is not disabled
|
||||
if (!fpga_disabled) {
|
||||
if (kernel_noresults) {
|
||||
OpmLog::warning("FPGA kernel did not return results because the required precision is already reached");
|
||||
// rx still contains zeros from initial guess
|
||||
} else {
|
||||
// ------------------------------------
|
||||
// FPGA: read back results from device
|
||||
// ------------------------------------
|
||||
err = fpga_map_results(even(kernel_iter_run),
|
||||
use_residuals, use_LU_res, commands,
|
||||
resultsNum, resultsBufferNum, resultsBufferSize,
|
||||
debugbufferSize,
|
||||
cldata, resultsBuffer,
|
||||
result_offsets,
|
||||
dump_results, data_dir, basename, sequence);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_map_results (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
// TODO: copy results buffers to reordering output buffers
|
||||
memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double));
|
||||
err = fpga_unmap_results(even(kernel_iter_run),
|
||||
use_residuals, use_LU_res,
|
||||
commands, cldata, resultsBuffer);
|
||||
if (err != 0) {
|
||||
oss << "Failed to call fpga_unmap_results (" << err << ")";
|
||||
OPM_THROW(std::logic_error, oss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
// set results and update statistics (if enabled)
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_mem_d2h = second() - start;
|
||||
}
|
||||
float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns
|
||||
res.iterations = (int)iter;
|
||||
res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm
|
||||
res.conv_rate = pow(res.reduction, 1.0 / iter);
|
||||
res.elapsed = second() - start_total;
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_solve = res.elapsed;
|
||||
perf_call.back().n_kernel_exec_iters = iter;
|
||||
}
|
||||
// convergence depends on number of iterations reached and hw execution errors
|
||||
res.converged = true;
|
||||
if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) {
|
||||
res.converged = false;
|
||||
if (verbosity >= 1) {
|
||||
oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \
|
||||
", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \
|
||||
", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit);
|
||||
OpmLog::warning(oss.str());
|
||||
oss.str("");
|
||||
oss.clear();
|
||||
}
|
||||
}
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().converged = res.converged;
|
||||
perf_call.back().converged_flags = ((unsigned int)fpga_disabled) +
|
||||
((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) +
|
||||
((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4);
|
||||
}
|
||||
} // end solve_system()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::update_system(double *vals, double *b)
|
||||
{
|
||||
double start = 0;
|
||||
|
||||
mat->nnzValues = vals;
|
||||
// reorder inputs using previously found ordering (stored in fromOrder)
|
||||
if (perf_call_enabled) {
|
||||
start = second();
|
||||
}
|
||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
||||
if (perf_call_enabled) {
|
||||
perf_call.back().s_reorder = second() - start;
|
||||
}
|
||||
} // end update_system()
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void FpgaSolverBackend<block_size>::generate_statistics()
|
||||
{
|
||||
std::ostringstream oss;
|
||||
unsigned int conv_iter = 0, conv_ovf = 0;
|
||||
|
||||
if (!perf_call_enabled || fpga_calls == 0) {
|
||||
OpmLog::warning("FPGA statistics were not collected");
|
||||
return;
|
||||
}
|
||||
std::printf("--- FPGA statistics ---\n");
|
||||
std::printf("total solver calls..........: %u\n", fpga_calls);
|
||||
std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization);
|
||||
std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup);
|
||||
|
||||
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||
// DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file
|
||||
FILE *fout = fopen("fpga_statistics_details.csv", "w");
|
||||
if (fout != nullptr) {
|
||||
std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n");
|
||||
}
|
||||
#endif
|
||||
unsigned int num_data_points = perf_call.size();
|
||||
for (unsigned int i = 0; i < num_data_points; i++) {
|
||||
perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create;
|
||||
if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; }
|
||||
if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; }
|
||||
perf_total.s_analysis += perf_call[i].s_analysis;
|
||||
if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; }
|
||||
if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; }
|
||||
perf_total.s_reorder += perf_call[i].s_reorder;
|
||||
if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; }
|
||||
if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; }
|
||||
perf_total.s_mem_setup += perf_call[i].s_mem_setup;
|
||||
if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; }
|
||||
if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; }
|
||||
perf_total.s_mem_h2d += perf_call[i].s_mem_h2d;
|
||||
if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; }
|
||||
if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; }
|
||||
perf_total.s_kernel_exec += perf_call[i].s_kernel_exec;
|
||||
if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; }
|
||||
if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; }
|
||||
perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles;
|
||||
if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; }
|
||||
if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; }
|
||||
perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters;
|
||||
if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; }
|
||||
if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; }
|
||||
perf_total.s_mem_d2h += perf_call[i].s_mem_d2h;
|
||||
if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; }
|
||||
if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; }
|
||||
perf_total.s_solve += perf_call[i].s_solve;
|
||||
if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; }
|
||||
if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; }
|
||||
perf_total.s_postprocess += perf_call[i].s_postprocess;
|
||||
if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; }
|
||||
if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; }
|
||||
perf_total.n_converged += (unsigned int)perf_call[i].converged;
|
||||
if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; }
|
||||
if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; }
|
||||
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||
if (fout != nullptr) {
|
||||
std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n",
|
||||
i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder,
|
||||
perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles,
|
||||
perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess,
|
||||
(unsigned int)perf_call[i].converged);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||
if (fout != nullptr) {
|
||||
fclose(fout);
|
||||
}
|
||||
#endif
|
||||
perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points;
|
||||
perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points;
|
||||
perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points;
|
||||
perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points;
|
||||
perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points;
|
||||
perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points;
|
||||
perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points;
|
||||
perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points;
|
||||
perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points;
|
||||
perf_total.s_solve_avg = perf_total.s_solve / num_data_points;
|
||||
perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points;
|
||||
std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max);
|
||||
std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max);
|
||||
std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max);
|
||||
std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max);
|
||||
std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max);
|
||||
std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max);
|
||||
std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n",
|
||||
perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max);
|
||||
std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n",
|
||||
perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max);
|
||||
std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max);
|
||||
std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max);
|
||||
std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||
perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max);
|
||||
std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n",
|
||||
perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf);
|
||||
std::printf("-----------------------\n");
|
||||
} //end generate_statistics()
|
||||
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
template FpgaSolverBackend<n>::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||
|
||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||
|
||||
} //namespace bda
|
265
opm/simulators/linalg/bda/FPGASolverBackend.hpp
Normal file
265
opm/simulators/linalg/bda/FPGASolverBackend.hpp
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
|
||||
#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
|
||||
|
||||
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
|
||||
|
||||
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
|
||||
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
|
||||
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
/// This class implements an ilu0-bicgstab solver on FPGA
|
||||
template <unsigned int block_size>
|
||||
class FpgaSolverBackend : public BdaSolver<block_size>
|
||||
{
|
||||
typedef BdaSolver<block_size> Base;
|
||||
typedef FPGABILU0<block_size> Preconditioner;
|
||||
|
||||
using Base::N;
|
||||
using Base::Nb;
|
||||
using Base::nnz;
|
||||
using Base::nnzb;
|
||||
using Base::verbosity;
|
||||
using Base::maxit;
|
||||
using Base::tolerance;
|
||||
using Base::initialized;
|
||||
|
||||
private:
|
||||
double *rx = nullptr; // reordered x
|
||||
double *rb = nullptr; // reordered b
|
||||
int *fromOrder = nullptr, *toOrder = nullptr;
|
||||
bool analysis_done = false;
|
||||
bool level_scheduling = false;
|
||||
|
||||
// LUMat will shallow copy rowPointers and colIndices of mat/rMat
|
||||
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
|
||||
BlockedMatrix<block_size> *rMat = nullptr;
|
||||
std::unique_ptr<Preconditioner> prec = nullptr;
|
||||
|
||||
// vectors with data processed by the preconditioner (input to the kernel)
|
||||
void **processedPointers = nullptr;
|
||||
int *processedSizes = nullptr;
|
||||
|
||||
unsigned int fpga_calls = 0;
|
||||
bool perf_call_enabled = true;
|
||||
|
||||
// per call performance metrics
|
||||
typedef struct {
|
||||
double s_preconditioner_create = 0.0;
|
||||
double s_analysis = 0.0;
|
||||
double s_reorder = 0.0;
|
||||
double s_mem_setup = 0.0;
|
||||
double s_mem_h2d = 0.0;
|
||||
double s_kernel_exec = 0.0;
|
||||
unsigned int n_kernel_exec_cycles = 0;
|
||||
float n_kernel_exec_iters = 0.0;
|
||||
double s_mem_d2h = 0.0;
|
||||
double s_solve = 0.0;
|
||||
double s_postprocess = 0.0;
|
||||
bool converged = false;
|
||||
unsigned int converged_flags = 0;
|
||||
} perf_call_metrics_t;
|
||||
// cumulative performance metrics
|
||||
typedef struct {
|
||||
double s_initialization;
|
||||
double s_preconditioner_setup;
|
||||
double s_preconditioner_create;
|
||||
double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
|
||||
double s_analysis;
|
||||
double s_analysis_min,s_analysis_max,s_analysis_avg;
|
||||
double s_reorder;
|
||||
double s_reorder_min,s_reorder_max,s_reorder_avg;
|
||||
double s_mem_setup;
|
||||
double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
|
||||
double s_mem_h2d;
|
||||
double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
|
||||
double s_kernel_exec;
|
||||
double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
|
||||
unsigned long n_kernel_exec_cycles;
|
||||
unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
|
||||
float n_kernel_exec_iters;
|
||||
float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
|
||||
double s_mem_d2h;
|
||||
double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
|
||||
double s_solve;
|
||||
double s_solve_min,s_solve_max,s_solve_avg;
|
||||
double s_postprocess;
|
||||
double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
|
||||
unsigned int n_converged;
|
||||
} perf_total_metrics_t;
|
||||
std::vector<perf_call_metrics_t> perf_call;
|
||||
perf_total_metrics_t perf_total;
|
||||
// fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
|
||||
// fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
|
||||
unsigned int fpga_config_bits = 0;
|
||||
bool fpga_disabled = false;
|
||||
bool platform_awsf1;
|
||||
unsigned int debugbufferSize;
|
||||
unsigned long int *debugBuffer = nullptr;
|
||||
unsigned int *databufferSize = nullptr;
|
||||
unsigned char *dataBuffer[RW_BUF] = {nullptr};
|
||||
unsigned int debug_outbuf_words;
|
||||
int resultsNum;
|
||||
int resultsBufferNum;
|
||||
unsigned int resultsBufferSize[RES_BUF_MAX];
|
||||
unsigned int result_offsets[6];
|
||||
unsigned int kernel_cycles, kernel_iter_run;
|
||||
double norms[4];
|
||||
unsigned char last_norm_idx;
|
||||
bool kernel_aborted, kernel_signature, kernel_overflow;
|
||||
bool kernel_noresults;
|
||||
bool kernel_wrafterend, kernel_dbgfifofull;
|
||||
bool use_residuals = false;
|
||||
bool use_LU_res = false;
|
||||
int sequence = 0;
|
||||
// TODO: these values may be sent via command line parameters
|
||||
unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
|
||||
unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
|
||||
int nnzValArrays_size = 0;
|
||||
int L_nnzValArrays_size = 0;
|
||||
int U_nnzValArrays_size = 0;
|
||||
// aliases to areas of the host data buffers
|
||||
long unsigned int *setupArray = nullptr;
|
||||
double **nnzValArrays = nullptr;
|
||||
short unsigned int *columnIndexArray = nullptr;
|
||||
unsigned char *newRowOffsetArray = nullptr;
|
||||
unsigned int *PIndexArray = nullptr;
|
||||
unsigned int *colorSizesArray = nullptr;
|
||||
double **L_nnzValArrays = nullptr;
|
||||
short unsigned int *L_columnIndexArray = nullptr;
|
||||
unsigned char *L_newRowOffsetArray = nullptr;
|
||||
unsigned int *L_PIndexArray = nullptr;
|
||||
unsigned int *L_colorSizesArray = nullptr;
|
||||
double **U_nnzValArrays = nullptr;
|
||||
short unsigned int *U_columnIndexArray = nullptr;
|
||||
unsigned char *U_newRowOffsetArray = nullptr;
|
||||
unsigned int *U_PIndexArray = nullptr;
|
||||
unsigned int *U_colorSizesArray = nullptr;
|
||||
double *BLKDArray = nullptr;
|
||||
double *X1Array = nullptr, *X2Array = nullptr;
|
||||
double *R1Array = nullptr, *R2Array = nullptr;
|
||||
double *LresArray = nullptr, *UresArray = nullptr;
|
||||
double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
|
||||
// OpenCL variables
|
||||
cl_device_id device_id;
|
||||
cl_context context;
|
||||
cl_command_queue commands;
|
||||
cl_program program;
|
||||
cl_kernel kernel;
|
||||
cl_mem cldata[RW_BUF] = {nullptr};
|
||||
cl_mem cldebug = nullptr;
|
||||
// HW limits/configuration variables
|
||||
unsigned int hw_x_vector_elem;
|
||||
unsigned int hw_max_row_size;
|
||||
unsigned int hw_max_column_size;
|
||||
unsigned int hw_max_colors_size;
|
||||
unsigned short hw_max_nnzs_per_row;
|
||||
unsigned int hw_max_matrix_size;
|
||||
bool hw_use_uram;
|
||||
bool hw_write_ilu0_results;
|
||||
unsigned short hw_dma_data_width;
|
||||
unsigned char hw_x_vector_latency;
|
||||
unsigned char hw_add_latency;
|
||||
unsigned char hw_mult_latency;
|
||||
unsigned char hw_mult_num;
|
||||
unsigned char hw_num_read_ports;
|
||||
unsigned char hw_num_write_ports;
|
||||
unsigned short hw_reset_cycles;
|
||||
unsigned short hw_reset_settle;
|
||||
// debug
|
||||
bool reset_data_buffers = false;
|
||||
bool fill_results_buffers = false;
|
||||
int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
|
||||
bool dump_results = false;
|
||||
char *data_dir = nullptr;
|
||||
char *basename = nullptr;
|
||||
unsigned short rst_assert_cycles = 0;
|
||||
unsigned short rst_settle_cycles = 0;
|
||||
|
||||
/// Allocate host memory
|
||||
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
|
||||
|
||||
/// Reorder the linear system so it corresponds with the coloring
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vector
|
||||
void update_system(double *vals, double *b);
|
||||
|
||||
/// Analyse sparsity pattern to extract parallelism
|
||||
/// \return true iff analysis was successful
|
||||
bool analyse_matrix();
|
||||
|
||||
/// Perform ilu0-decomposition
|
||||
/// \return true iff decomposition was successful
|
||||
bool create_preconditioner();
|
||||
|
||||
/// Solve linear system
|
||||
/// \param[inout] res summary of solver result
|
||||
void solve_system(BdaResult &res);
|
||||
|
||||
/// Generate FPGA backend statistics
|
||||
void generate_statistics(void);
|
||||
|
||||
public:
|
||||
|
||||
/// Construct an fpgaSolver
|
||||
/// \param[in] fpga_bitstream FPGA bitstream file name
|
||||
/// \param[in] linear_solver_verbosity verbosity of fpgaSolver
|
||||
/// \param[in] maxit maximum number of iterations for fpgaSolver
|
||||
/// \param[in] tolerance required relative tolerance for fpgaSolver
|
||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||
FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
|
||||
|
||||
/// Destroy an fpgaSolver, and free memory
|
||||
~FpgaSolverBackend();
|
||||
|
||||
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||
/// \param[in] dim size of block
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||
/// \param[in] cols array of columnIndices, contains nnz values
|
||||
/// \param[in] b input vector, contains N values
|
||||
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
|
||||
/// \param[inout] res summary of solver result
|
||||
/// \return status code
|
||||
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||
|
||||
/// Get result after linear solve, and peform postprocessing if necessary
|
||||
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||
void get_result(double *x) override;
|
||||
|
||||
}; // end class fpgaSolverBackend
|
||||
|
||||
} //namespace bda
|
||||
|
||||
#endif
|
||||
|
63
opm/simulators/linalg/bda/FPGAUtils.cpp
Normal file
63
opm/simulators/linalg/bda/FPGAUtils.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <fstream>
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
double second(void)
|
||||
{
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, nullptr);
|
||||
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
|
||||
}
|
||||
|
||||
bool even(int n)
|
||||
{
|
||||
if (n % 2 == 0) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int roundUpTo(int i, int n)
|
||||
{
|
||||
if (i % n == 0) {
|
||||
return i;
|
||||
} else {
|
||||
return (i / n + 1) * n;
|
||||
}
|
||||
}
|
||||
|
||||
bool fileExists(const char *filename)
|
||||
{
|
||||
FILE *fin;
|
||||
fin = fopen(filename, "r");
|
||||
if (fin == nullptr) {
|
||||
return false;
|
||||
} else {
|
||||
fclose(fin);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
} //namespace bda
|
39
opm/simulators/linalg/bda/FPGAUtils.hpp
Normal file
39
opm/simulators/linalg/bda/FPGAUtils.hpp
Normal file
@ -0,0 +1,39 @@
|
||||
/*
|
||||
Copyright 2020 Equinor ASA
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef FPGA_UTILS_HEADER_INCLUDED
|
||||
#define FPGA_UTILS_HEADER_INCLUDED
|
||||
|
||||
namespace bda
|
||||
{
|
||||
|
||||
union double2int
|
||||
{
|
||||
unsigned long int int_val;
|
||||
double double_val;
|
||||
};
|
||||
|
||||
double second(void);
|
||||
bool even(int n);
|
||||
int roundUpTo(int i, int n);
|
||||
bool fileExists(const char *filename);
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif // FPGA_UTILS_HEADER_INCLUDED
|
@ -17,12 +17,7 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
#include <algorithm> // for fill()
|
||||
#include <random>
|
||||
#include <limits>
|
||||
#include <sstream>
|
||||
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
|
||||
@ -60,7 +55,7 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
|
||||
std::mt19937 gen(rd());
|
||||
std::uniform_int_distribution<> uniform(0, std::numeric_limits<int>::max());
|
||||
{
|
||||
for(int i = 0; i < rows; ++i){
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
randoms[i] = uniform(gen);
|
||||
}
|
||||
}
|
||||
@ -180,7 +175,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
|
||||
// put thisRow from the old matrix into row i of the new matrix
|
||||
rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
|
||||
for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
|
||||
for (j = 0; j < bs * bs; j++){
|
||||
for (j = 0; j < bs * bs; j++) {
|
||||
rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
|
||||
}
|
||||
rmat->colIndices[rIndex] = mat->colIndices[k];
|
||||
|
@ -20,6 +20,8 @@
|
||||
#ifndef REORDER_HPP
|
||||
#define REORDER_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||
|
||||
namespace bda
|
||||
|
@ -28,15 +28,18 @@
|
||||
namespace Opm
|
||||
{
|
||||
|
||||
WellContributions::WellContributions(std::string gpu_mode){
|
||||
if(gpu_mode.compare("cusparse") == 0){
|
||||
WellContributions::WellContributions(std::string accelerator_mode){
|
||||
if(accelerator_mode.compare("cusparse") == 0){
|
||||
cuda_gpu = true;
|
||||
}
|
||||
else if(gpu_mode.compare("opencl") == 0){
|
||||
else if(accelerator_mode.compare("opencl") == 0){
|
||||
opencl_gpu = true;
|
||||
}
|
||||
else if(accelerator_mode.compare("fpga") == 0){
|
||||
// unused for FPGA, but must be defined to avoid error
|
||||
}
|
||||
else{
|
||||
OPM_THROW(std::logic_error, "Error: invalid GPU mode");
|
||||
OPM_THROW(std::logic_error, "Invalid accelerator mode");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,7 +176,7 @@ public:
|
||||
void alloc();
|
||||
|
||||
/// Create a new WellContributions
|
||||
WellContributions(std::string gpu_mode);
|
||||
WellContributions(std::string accelerator_mode);
|
||||
|
||||
/// Destroy a WellContributions, and free memory
|
||||
~WellContributions();
|
||||
|
Loading…
Reference in New Issue
Block a user