mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #2998 from g-marchiori/fpgasolver-integration
Added fpgaSolver, requires Xilinx Alveo U280 FPGA board
This commit is contained in:
commit
13f62a718b
@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for
|
|||||||
option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
|
option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
|
||||||
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
|
option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
|
||||||
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
|
option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
|
||||||
|
option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
|
||||||
|
|
||||||
if(SIBLING_SEARCH AND NOT opm-common_DIR)
|
if(SIBLING_SEARCH AND NOT opm-common_DIR)
|
||||||
# guess the sibling dir
|
# guess the sibling dir
|
||||||
@ -132,9 +133,72 @@ if(CUDA_FOUND)
|
|||||||
include_directories(${CUDA_INCLUDE_DIRS})
|
include_directories(${CUDA_INCLUDE_DIRS})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
find_package(OpenCL)
|
find_package(OpenCL)
|
||||||
|
|
||||||
|
# include FPGA target only when ENABLE_FPGA is set to ON
|
||||||
|
if(ENABLE_FPGA)
|
||||||
|
# must include opencl.h which is a wrapper for all the OpenCL extensions
|
||||||
|
find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS})
|
||||||
|
|
||||||
|
# FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in <opm-simulators>/../FPGA
|
||||||
|
find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA)
|
||||||
|
|
||||||
|
# this code only runs if the user explicitly enabled the FPGA
|
||||||
|
# hence fatal_error instead of warning
|
||||||
|
if(NOT OPENCL_H OR NOT OpenCL_FOUND)
|
||||||
|
set(HAVE_FPGA 0)
|
||||||
|
message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.")
|
||||||
|
elseif(NOT FPGA_MODULE)
|
||||||
|
set(HAVE_FPGA 0)
|
||||||
|
message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.")
|
||||||
|
elseif(NOT DEFINED ENV{XILINX_XRT})
|
||||||
|
# Xilinx XRT must be installed and properly setup
|
||||||
|
set(HAVE_FPGA 0)
|
||||||
|
message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.")
|
||||||
|
else()
|
||||||
|
set(HAVE_FPGA 1)
|
||||||
|
message(STATUS "FPGA library and kernel integration active.")
|
||||||
|
|
||||||
|
# FIXME: set the correct path to the FPGA module
|
||||||
|
set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA)
|
||||||
|
|
||||||
|
# configuration variables with default values: they can be overridden on the cmake command line
|
||||||
|
# FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream
|
||||||
|
if(NOT FPGA_PORTS_CONFIG)
|
||||||
|
set(FPGA_PORTS_CONFIG 2r_3r3w_ddr)
|
||||||
|
endif()
|
||||||
|
# FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build)
|
||||||
|
if(NOT FPGA_DEBUG_LEVEL)
|
||||||
|
set(FPGA_DEBUG_LEVEL 0)
|
||||||
|
endif()
|
||||||
|
message(STATUS "Using the following settings for the FPGA library compilation: "
|
||||||
|
"FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, "
|
||||||
|
"FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}")
|
||||||
|
add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG})
|
||||||
|
add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL})
|
||||||
|
# include directories for the FPGA library
|
||||||
|
include_directories(${FPGA_SOURCE_DIR})
|
||||||
|
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app)
|
||||||
|
include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common)
|
||||||
|
# add external project to compile the FPGA library
|
||||||
|
include (ExternalProject)
|
||||||
|
ExternalProject_Add(FPGA_library
|
||||||
|
# force the build step to always be run because source dependencies cannot be made explicit
|
||||||
|
BUILD_ALWAYS 1
|
||||||
|
DOWNLOAD_COMMAND ""
|
||||||
|
UPDATE_COMMAND ""
|
||||||
|
CONFIGURE_COMMAND ""
|
||||||
|
BUILD_COMMAND make
|
||||||
|
-f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile
|
||||||
|
SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app
|
||||||
|
PORTS_CONFIG=${FPGA_PORTS_CONFIG}
|
||||||
|
DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}
|
||||||
|
INSTALL_COMMAND ""
|
||||||
|
TEST_COMMAND ""
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
if(OpenCL_FOUND)
|
if(OpenCL_FOUND)
|
||||||
# the current OpenCL implementation relies on cl.hpp, not cl2.hpp
|
# the current OpenCL implementation relies on cl.hpp, not cl2.hpp
|
||||||
# make sure it is available, otherwise disable OpenCL
|
# make sure it is available, otherwise disable OpenCL
|
||||||
@ -454,7 +518,8 @@ endif()
|
|||||||
|
|
||||||
add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
|
add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
|
||||||
|
|
||||||
# must link libraries after target 'flow' has been defined
|
# must link libraries after target 'opmsimulators' has been defined
|
||||||
|
|
||||||
if(CUDA_FOUND)
|
if(CUDA_FOUND)
|
||||||
target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
|
target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
|
||||||
target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
|
target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
|
||||||
@ -463,3 +528,9 @@ endif()
|
|||||||
if(OpenCL_FOUND)
|
if(OpenCL_FOUND)
|
||||||
target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
|
target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(HAVE_FPGA)
|
||||||
|
add_dependencies(opmsimulators FPGA_library)
|
||||||
|
ExternalProject_Get_Property(FPGA_library binary_dir)
|
||||||
|
target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a)
|
||||||
|
endif()
|
||||||
|
@ -66,6 +66,16 @@ if(OPENCL_FOUND)
|
|||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||||
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
if(HAVE_FPGA)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
|
||||||
|
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
|
||||||
|
endif()
|
||||||
|
|
||||||
if(MPI_FOUND)
|
if(MPI_FOUND)
|
||||||
list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
|
list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
|
||||||
opm/simulators/utils/ParallelSerialization.cpp)
|
opm/simulators/utils/ParallelSerialization.cpp)
|
||||||
@ -184,6 +194,10 @@ list (APPEND PUBLIC_HEADER_FILES
|
|||||||
opm/simulators/linalg/bda/cuda_header.hpp
|
opm/simulators/linalg/bda/cuda_header.hpp
|
||||||
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
opm/simulators/linalg/bda/cusparseSolverBackend.hpp
|
||||||
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
opm/simulators/linalg/bda/ChowPatelIlu.hpp
|
||||||
|
opm/simulators/linalg/bda/FPGAMatrix.hpp
|
||||||
|
opm/simulators/linalg/bda/FPGABILU0.hpp
|
||||||
|
opm/simulators/linalg/bda/FPGASolverBackend.hpp
|
||||||
|
opm/simulators/linalg/bda/FPGAUtils.hpp
|
||||||
opm/simulators/linalg/bda/Reorder.hpp
|
opm/simulators/linalg/bda/Reorder.hpp
|
||||||
opm/simulators/linalg/bda/ILUReorder.hpp
|
opm/simulators/linalg/bda/ILUReorder.hpp
|
||||||
opm/simulators/linalg/bda/opencl.hpp
|
opm/simulators/linalg/bda/opencl.hpp
|
||||||
|
@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR
|
|||||||
HAVE_PETSC
|
HAVE_PETSC
|
||||||
HAVE_CUDA
|
HAVE_CUDA
|
||||||
HAVE_OPENCL
|
HAVE_OPENCL
|
||||||
|
HAVE_FPGA
|
||||||
HAVE_SUITESPARSE_UMFPACK_H
|
HAVE_SUITESPARSE_UMFPACK_H
|
||||||
HAVE_DUNE_ISTL
|
HAVE_DUNE_ISTL
|
||||||
DUNE_ISTL_VERSION_MAJOR
|
DUNE_ISTL_VERSION_MAJOR
|
||||||
|
@ -118,7 +118,7 @@ struct Linsolver {
|
|||||||
using type = UndefinedProperty;
|
using type = UndefinedProperty;
|
||||||
};
|
};
|
||||||
template<class TypeTag, class MyTypeTag>
|
template<class TypeTag, class MyTypeTag>
|
||||||
struct GpuMode {
|
struct AcceleratorMode {
|
||||||
using type = UndefinedProperty;
|
using type = UndefinedProperty;
|
||||||
};
|
};
|
||||||
template<class TypeTag, class MyTypeTag>
|
template<class TypeTag, class MyTypeTag>
|
||||||
@ -133,6 +133,10 @@ template<class TypeTag, class MyTypeTag>
|
|||||||
struct OpenclIluReorder {
|
struct OpenclIluReorder {
|
||||||
using type = UndefinedProperty;
|
using type = UndefinedProperty;
|
||||||
};
|
};
|
||||||
|
template<class TypeTag, class MyTypeTag>
|
||||||
|
struct FpgaBitstream {
|
||||||
|
using type = UndefinedProperty;
|
||||||
|
};
|
||||||
|
|
||||||
template<class TypeTag>
|
template<class TypeTag>
|
||||||
struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
|
struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
|
||||||
@ -213,7 +217,7 @@ struct Linsolver<TypeTag, TTag::FlowIstlSolverParams> {
|
|||||||
static constexpr auto value = "ilu0";
|
static constexpr auto value = "ilu0";
|
||||||
};
|
};
|
||||||
template<class TypeTag>
|
template<class TypeTag>
|
||||||
struct GpuMode<TypeTag, TTag::FlowIstlSolverParams> {
|
struct AcceleratorMode<TypeTag, TTag::FlowIstlSolverParams> {
|
||||||
static constexpr auto value = "none";
|
static constexpr auto value = "none";
|
||||||
};
|
};
|
||||||
template<class TypeTag>
|
template<class TypeTag>
|
||||||
@ -226,7 +230,11 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
|
|||||||
};
|
};
|
||||||
template<class TypeTag>
|
template<class TypeTag>
|
||||||
struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
|
struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
|
||||||
static constexpr auto value = "graph_coloring";
|
static constexpr auto value = ""; // note: default value is chosen depending on the solver used
|
||||||
|
};
|
||||||
|
template<class TypeTag>
|
||||||
|
struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
|
||||||
|
static constexpr auto value = "";
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace Opm::Properties
|
} // namespace Opm::Properties
|
||||||
@ -252,12 +260,13 @@ namespace Opm
|
|||||||
bool ignoreConvergenceFailure_;
|
bool ignoreConvergenceFailure_;
|
||||||
bool scale_linear_system_;
|
bool scale_linear_system_;
|
||||||
std::string linsolver_;
|
std::string linsolver_;
|
||||||
std::string gpu_mode_;
|
std::string accelerator_mode_;
|
||||||
int bda_device_id_;
|
int bda_device_id_;
|
||||||
int opencl_platform_id_;
|
int opencl_platform_id_;
|
||||||
int cpr_max_ell_iter_ = 20;
|
int cpr_max_ell_iter_ = 20;
|
||||||
int cpr_reuse_setup_ = 0;
|
int cpr_reuse_setup_ = 0;
|
||||||
std::string opencl_ilu_reorder_;
|
std::string opencl_ilu_reorder_;
|
||||||
|
std::string fpga_bitstream_;
|
||||||
|
|
||||||
template <class TypeTag>
|
template <class TypeTag>
|
||||||
void init()
|
void init()
|
||||||
@ -279,10 +288,11 @@ namespace Opm
|
|||||||
cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
|
cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
|
||||||
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
|
cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
|
||||||
linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
|
linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
|
||||||
gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||||
bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
||||||
opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
||||||
opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||||
|
fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class TypeTag>
|
template <class TypeTag>
|
||||||
@ -304,10 +314,11 @@ namespace Opm
|
|||||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
|
EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
|
EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
|
EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'");
|
EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
|
EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
|
EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
|
||||||
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
|
EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
|
||||||
|
EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
|
||||||
}
|
}
|
||||||
|
|
||||||
FlowLinearSolverParameters() { reset(); }
|
FlowLinearSolverParameters() { reset(); }
|
||||||
@ -327,10 +338,11 @@ namespace Opm
|
|||||||
ilu_milu_ = MILU_VARIANT::ILU;
|
ilu_milu_ = MILU_VARIANT::ILU;
|
||||||
ilu_redblack_ = false;
|
ilu_redblack_ = false;
|
||||||
ilu_reorder_sphere_ = true;
|
ilu_reorder_sphere_ = true;
|
||||||
gpu_mode_ = "none";
|
accelerator_mode_ = "none";
|
||||||
bda_device_id_ = 0;
|
bda_device_id_ = 0;
|
||||||
opencl_platform_id_ = 0;
|
opencl_platform_id_ = 0;
|
||||||
opencl_ilu_reorder_ = "graph_coloring";
|
opencl_ilu_reorder_ = ""; // note: the default value is chosen depending on the solver used
|
||||||
|
fpga_bitstream_ = "";
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -35,7 +35,7 @@
|
|||||||
#include <opm/simulators/linalg/setupPropertyTree.hpp>
|
#include <opm/simulators/linalg/setupPropertyTree.hpp>
|
||||||
|
|
||||||
|
|
||||||
#if HAVE_CUDA || HAVE_OPENCL
|
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||||
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
#include <opm/simulators/linalg/bda/BdaBridge.hpp>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -92,7 +92,7 @@ namespace Opm
|
|||||||
using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
|
using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
|
||||||
using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
|
using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
|
||||||
|
|
||||||
#if HAVE_CUDA || HAVE_OPENCL
|
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||||
static const unsigned int block_size = Matrix::block_type::rows;
|
static const unsigned int block_size = Matrix::block_type::rows;
|
||||||
std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
|
std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
|
||||||
#endif
|
#endif
|
||||||
@ -126,14 +126,14 @@ namespace Opm
|
|||||||
#endif
|
#endif
|
||||||
parameters_.template init<TypeTag>();
|
parameters_.template init<TypeTag>();
|
||||||
prm_ = setupPropertyTree<TypeTag>(parameters_);
|
prm_ = setupPropertyTree<TypeTag>(parameters_);
|
||||||
#if HAVE_CUDA || HAVE_OPENCL
|
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||||
{
|
{
|
||||||
std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||||
if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) {
|
if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
|
||||||
if (on_io_rank) {
|
if (on_io_rank) {
|
||||||
OpmLog::warning("Cannot use GPU with MPI, GPU is disabled");
|
OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled");
|
||||||
}
|
}
|
||||||
gpu_mode = "none";
|
accelerator_mode = "none";
|
||||||
}
|
}
|
||||||
const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
|
||||||
const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
|
||||||
@ -141,11 +141,12 @@ namespace Opm
|
|||||||
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
|
const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
|
||||||
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
|
||||||
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
|
||||||
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
|
||||||
|
bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") {
|
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
|
||||||
OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
|
OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
|
extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
|
||||||
@ -157,6 +158,12 @@ namespace Opm
|
|||||||
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
|
detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
|
||||||
|
|
||||||
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
|
||||||
|
#if HAVE_FPGA
|
||||||
|
// check usage of MatrixAddWellContributions: for FPGA they must be included
|
||||||
|
if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) {
|
||||||
|
OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
|
const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
|
||||||
if (!ownersFirst) {
|
if (!ownersFirst) {
|
||||||
const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
|
const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
|
||||||
@ -242,43 +249,42 @@ namespace Opm
|
|||||||
|
|
||||||
// Solve system.
|
// Solve system.
|
||||||
Dune::InverseOperatorResult result;
|
Dune::InverseOperatorResult result;
|
||||||
bool gpu_was_used = false;
|
bool accelerator_was_used = false;
|
||||||
|
|
||||||
// Use GPU if: available, chosen by user, and successful.
|
// Use GPU if: available, chosen by user, and successful.
|
||||||
#if HAVE_CUDA || HAVE_OPENCL
|
// Use FPGA if: support compiled, chosen by user, and successful.
|
||||||
|
#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
|
||||||
bool use_gpu = bdaBridge->getUseGpu();
|
bool use_gpu = bdaBridge->getUseGpu();
|
||||||
if (use_gpu) {
|
bool use_fpga = bdaBridge->getUseFpga();
|
||||||
const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
|
if (use_gpu || use_fpga) {
|
||||||
WellContributions wellContribs(gpu_mode);
|
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
|
||||||
|
WellContributions wellContribs(accelerator_mode);
|
||||||
bdaBridge->initWellContributions(wellContribs);
|
bdaBridge->initWellContributions(wellContribs);
|
||||||
|
|
||||||
if (!useWellConn_) {
|
if (!useWellConn_) {
|
||||||
simulator_.problem().wellModel().getWellContributions(wellContribs);
|
simulator_.problem().wellModel().getWellContributions(wellContribs);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
|
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
|
||||||
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
|
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
|
||||||
if (result.converged) {
|
if (result.converged) {
|
||||||
// get result vector x from non-Dune backend, iff solve was successful
|
// get result vector x from non-Dune backend, iff solve was successful
|
||||||
bdaBridge->get_result(x);
|
bdaBridge->get_result(x);
|
||||||
gpu_was_used = true;
|
accelerator_was_used = true;
|
||||||
} else {
|
} else {
|
||||||
// CPU fallback
|
// warn about CPU fallback
|
||||||
use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver
|
// BdaBridge might have disabled its BdaSolver for this simulation due to some error
|
||||||
if (use_gpu && simulator_.gridView().comm().rank() == 0) {
|
// in that case the BdaBridge is disabled and flexibleSolver is always used
|
||||||
if (gpu_mode.compare("cusparse") == 0) {
|
// or maybe the BdaSolver did not converge in time, then it will be used next linear solve
|
||||||
OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
|
if (simulator_.gridView().comm().rank() == 0) {
|
||||||
}
|
OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
|
||||||
if (gpu_mode.compare("opencl") == 0) {
|
|
||||||
OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Otherwise, use flexible istl solver.
|
// Otherwise, use flexible istl solver.
|
||||||
if (!gpu_was_used) {
|
if (!accelerator_was_used) {
|
||||||
assert(flexibleSolver_);
|
assert(flexibleSolver_);
|
||||||
flexibleSolver_->apply(x, *rhs_, result);
|
flexibleSolver_->apply(x, *rhs_, result);
|
||||||
}
|
}
|
||||||
|
@ -45,11 +45,6 @@ namespace bda
|
|||||||
BILU0<block_size>::~BILU0()
|
BILU0<block_size>::~BILU0()
|
||||||
{
|
{
|
||||||
delete[] invDiagVals;
|
delete[] invDiagVals;
|
||||||
delete[] diagIndex;
|
|
||||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
|
||||||
delete[] toOrder;
|
|
||||||
delete[] fromOrder;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
@ -68,8 +63,8 @@ namespace bda
|
|||||||
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||||
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
|
||||||
} else {
|
} else {
|
||||||
toOrder = new int[Nb];
|
toOrder.resize(Nb);
|
||||||
fromOrder = new int[Nb];
|
fromOrder.resize(Nb);
|
||||||
CSCRowIndices = new int[nnzbs];
|
CSCRowIndices = new int[nnzbs];
|
||||||
CSCColPointers = new int[Nb + 1];
|
CSCColPointers = new int[Nb + 1];
|
||||||
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||||
@ -88,10 +83,10 @@ namespace bda
|
|||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||||
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
out << "BILU0 reordering strategy: " << "level_scheduling\n";
|
||||||
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||||
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||||
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
out << "BILU0 reordering strategy: " << "graph_coloring\n";
|
||||||
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
|
findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||||
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
} else if (opencl_ilu_reorder == ILUReorder::NONE) {
|
||||||
out << "BILU0 reordering strategy: none\n";
|
out << "BILU0 reordering strategy: none\n";
|
||||||
// numColors = 1;
|
// numColors = 1;
|
||||||
@ -111,12 +106,13 @@ namespace bda
|
|||||||
}
|
}
|
||||||
OpmLog::info(out.str());
|
OpmLog::info(out.str());
|
||||||
|
|
||||||
|
|
||||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
delete[] CSCRowIndices;
|
delete[] CSCRowIndices;
|
||||||
delete[] CSCColPointers;
|
delete[] CSCColPointers;
|
||||||
}
|
}
|
||||||
|
|
||||||
diagIndex = new int[mat->Nb];
|
diagIndex.resize(mat->Nb);
|
||||||
invDiagVals = new double[mat->Nb * bs * bs];
|
invDiagVals = new double[mat->Nb * bs * bs];
|
||||||
|
|
||||||
#if CHOW_PATEL
|
#if CHOW_PATEL
|
||||||
@ -482,7 +478,7 @@ namespace bda
|
|||||||
diagIndex[row] = candidate - LUmat->colIndices;
|
diagIndex[row] = candidate - LUmat->colIndices;
|
||||||
}
|
}
|
||||||
events.resize(8);
|
events.resize(8);
|
||||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]);
|
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
|
||||||
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
|
queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
|
||||||
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
|
queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
|
||||||
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
|
queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
|
||||||
@ -516,7 +512,7 @@ namespace bda
|
|||||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||||
m = rmat.get();
|
m = rmat.get();
|
||||||
Timer t_reorder;
|
Timer t_reorder;
|
||||||
reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
|
reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
|
||||||
|
|
||||||
if (verbosity >= 3){
|
if (verbosity >= 3){
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
@ -556,7 +552,7 @@ namespace bda
|
|||||||
diagIndex[row] = candidate - LUmat->colIndices;
|
diagIndex[row] = candidate - LUmat->colIndices;
|
||||||
}
|
}
|
||||||
events.resize(4);
|
events.resize(4);
|
||||||
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]);
|
queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
|
||||||
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
|
||||||
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
|
||||||
});
|
});
|
||||||
|
@ -61,10 +61,10 @@ namespace bda
|
|||||||
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
|
std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
|
||||||
#endif
|
#endif
|
||||||
double *invDiagVals = nullptr;
|
double *invDiagVals = nullptr;
|
||||||
int *diagIndex = nullptr;
|
std::vector<int> diagIndex;
|
||||||
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
|
||||||
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
|
||||||
int *toOrder = nullptr, *fromOrder = nullptr;
|
std::vector<int> toOrder, fromOrder;
|
||||||
int numColors;
|
int numColors;
|
||||||
int verbosity;
|
int verbosity;
|
||||||
std::once_flag pattern_uploaded;
|
std::once_flag pattern_uploaded;
|
||||||
@ -128,12 +128,12 @@ namespace bda
|
|||||||
|
|
||||||
int* getToOrder()
|
int* getToOrder()
|
||||||
{
|
{
|
||||||
return toOrder;
|
return toOrder.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
int* getFromOrder()
|
int* getFromOrder()
|
||||||
{
|
{
|
||||||
return fromOrder;
|
return fromOrder.data();
|
||||||
}
|
}
|
||||||
|
|
||||||
BlockedMatrix<block_size>* getRMat()
|
BlockedMatrix<block_size>* getRMat()
|
||||||
|
@ -18,8 +18,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <config.h>
|
#include <config.h>
|
||||||
#include <memory>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
#include <opm/common/ErrorMacros.hpp>
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
@ -54,21 +52,23 @@ namespace Opm
|
|||||||
using bda::ILUReorder;
|
using bda::ILUReorder;
|
||||||
|
|
||||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||||
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
|
BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
|
||||||
: gpu_mode(gpu_mode_)
|
: accelerator_mode(accelerator_mode_)
|
||||||
{
|
{
|
||||||
if (gpu_mode.compare("cusparse") == 0) {
|
if (accelerator_mode.compare("cusparse") == 0) {
|
||||||
#if HAVE_CUDA
|
#if HAVE_CUDA
|
||||||
use_gpu = true;
|
use_gpu = true;
|
||||||
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
|
backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
|
||||||
#else
|
#else
|
||||||
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
|
||||||
#endif
|
#endif
|
||||||
} else if (gpu_mode.compare("opencl") == 0) {
|
} else if (accelerator_mode.compare("opencl") == 0) {
|
||||||
#if HAVE_OPENCL
|
#if HAVE_OPENCL
|
||||||
use_gpu = true;
|
use_gpu = true;
|
||||||
ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
ILUReorder ilu_reorder;
|
||||||
if (opencl_ilu_reorder == "level_scheduling") {
|
if (opencl_ilu_reorder == "") {
|
||||||
|
ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user
|
||||||
|
} else if (opencl_ilu_reorder == "level_scheduling") {
|
||||||
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
||||||
} else if (opencl_ilu_reorder == "graph_coloring") {
|
} else if (opencl_ilu_reorder == "graph_coloring") {
|
||||||
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||||
@ -81,10 +81,28 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
|
|||||||
#else
|
#else
|
||||||
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
|
||||||
#endif
|
#endif
|
||||||
} else if (gpu_mode.compare("none") == 0) {
|
} else if (accelerator_mode.compare("fpga") == 0) {
|
||||||
use_gpu = false;
|
#if HAVE_FPGA
|
||||||
|
use_fpga = true;
|
||||||
|
ILUReorder ilu_reorder;
|
||||||
|
if (opencl_ilu_reorder == "") {
|
||||||
|
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user
|
||||||
|
} else if (opencl_ilu_reorder == "level_scheduling") {
|
||||||
|
ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
|
||||||
|
} else if (opencl_ilu_reorder == "graph_coloring") {
|
||||||
|
ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
|
||||||
} else {
|
} else {
|
||||||
OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
|
OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
|
||||||
|
}
|
||||||
|
backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
|
||||||
|
#else
|
||||||
|
OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
|
||||||
|
#endif
|
||||||
|
} else if (accelerator_mode.compare("none") == 0) {
|
||||||
|
use_gpu = false;
|
||||||
|
use_fpga = false;
|
||||||
|
} else {
|
||||||
|
OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -130,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
|
|||||||
|
|
||||||
|
|
||||||
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
// iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
|
||||||
// sparsity pattern should stay the same due to matrix-add-well-contributions
|
// sparsity pattern should stay the same
|
||||||
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
// this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
|
||||||
template <class BridgeMatrix>
|
template <class BridgeMatrix>
|
||||||
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
|
||||||
@ -161,14 +179,16 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
|
|||||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
|
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
|
||||||
{
|
{
|
||||||
|
|
||||||
if (use_gpu) {
|
if (use_gpu || use_fpga) {
|
||||||
BdaResult result;
|
BdaResult result;
|
||||||
result.converged = false;
|
result.converged = false;
|
||||||
static std::vector<int> h_rows;
|
static std::vector<int> h_rows;
|
||||||
static std::vector<int> h_cols;
|
static std::vector<int> h_cols;
|
||||||
const int dim = (*mat)[0][0].N();
|
const int dim = (*mat)[0][0].N();
|
||||||
const int N = mat->N()*dim;
|
const int Nb = mat->N();
|
||||||
const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
|
const int N = Nb * dim;
|
||||||
|
const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
|
||||||
|
const int nnz = nnzb * dim * dim;
|
||||||
|
|
||||||
if (dim != 3) {
|
if (dim != 3) {
|
||||||
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
|
OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
|
||||||
@ -177,8 +197,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (h_rows.capacity() == 0) {
|
if (h_rows.capacity() == 0) {
|
||||||
h_rows.reserve(N+1);
|
h_rows.reserve(Nb+1);
|
||||||
h_cols.reserve(nnz);
|
h_cols.reserve(nnzb);
|
||||||
#if PRINT_TIMERS_BRIDGE
|
#if PRINT_TIMERS_BRIDGE
|
||||||
Dune::Timer t;
|
Dune::Timer t;
|
||||||
#endif
|
#endif
|
||||||
@ -233,14 +253,14 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
|
|||||||
|
|
||||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
|
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
|
||||||
if (use_gpu) {
|
if (use_gpu || use_fpga) {
|
||||||
backend->get_result(static_cast<double*>(&(x[0][0])));
|
backend->get_result(static_cast<double*>(&(x[0][0])));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class BridgeMatrix, class BridgeVector, int block_size>
|
template <class BridgeMatrix, class BridgeVector, int block_size>
|
||||||
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
|
void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
|
||||||
if(gpu_mode.compare("opencl") == 0){
|
if(accelerator_mode.compare("opencl") == 0){
|
||||||
#if HAVE_OPENCL
|
#if HAVE_OPENCL
|
||||||
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
|
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
|
||||||
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
|
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
|
||||||
@ -250,12 +270,12 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(We
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||||
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
||||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||||
n>::BdaBridge \
|
n>::BdaBridge \
|
||||||
(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \
|
(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, \
|
||||||
|
unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \
|
||||||
\
|
\
|
||||||
template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
|
||||||
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
|
||||||
|
@ -30,6 +30,10 @@
|
|||||||
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
#include <opm/simulators/linalg/bda/WellContributions.hpp>
|
||||||
|
|
||||||
|
|
||||||
|
#if HAVE_FPGA
|
||||||
|
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Opm
|
namespace Opm
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -42,19 +46,22 @@ class BdaBridge
|
|||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
bool use_gpu = false;
|
bool use_gpu = false;
|
||||||
std::string gpu_mode;
|
bool use_fpga = false;
|
||||||
|
std::string accelerator_mode;
|
||||||
std::unique_ptr<bda::BdaSolver<block_size> > backend;
|
std::unique_ptr<bda::BdaSolver<block_size> > backend;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/// Construct a BdaBridge
|
/// Construct a BdaBridge
|
||||||
/// \param[in] gpu_mode to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
|
/// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]'
|
||||||
|
/// \param[in] fpga_bitstream FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[<filename>]'
|
||||||
/// \param[in] linear_solver_verbosity verbosity of BdaSolver
|
/// \param[in] linear_solver_verbosity verbosity of BdaSolver
|
||||||
/// \param[in] maxit maximum number of iterations for BdaSolver
|
/// \param[in] maxit maximum number of iterations for BdaSolver
|
||||||
/// \param[in] tolerance required relative tolerance for BdaSolver
|
/// \param[in] tolerance required relative tolerance for BdaSolver
|
||||||
/// \param[in] platformID the OpenCL platform ID to be used
|
/// \param[in] platformID the OpenCL platform ID to be used
|
||||||
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
/// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
|
||||||
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
|
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||||
BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
|
||||||
|
|
||||||
|
|
||||||
/// Solve linear system, A*x = b
|
/// Solve linear system, A*x = b
|
||||||
/// \warning Values of A might get overwritten!
|
/// \warning Values of A might get overwritten!
|
||||||
@ -79,6 +86,16 @@ public:
|
|||||||
/// \param[in] wellContribs container to hold all WellContributions
|
/// \param[in] wellContribs container to hold all WellContributions
|
||||||
void initWellContributions(WellContributions& wellContribs);
|
void initWellContributions(WellContributions& wellContribs);
|
||||||
|
|
||||||
|
/// Return whether the BdaBridge will use the FPGA or not
|
||||||
|
/// return whether the BdaBridge will use the FPGA or not
|
||||||
|
bool getUseFpga(){
|
||||||
|
return use_fpga;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the selected accelerator mode, this is input via the command-line
|
||||||
|
std::string getAccleratorName(){
|
||||||
|
return accelerator_mode;
|
||||||
|
}
|
||||||
|
|
||||||
}; // end class BdaBridge
|
}; // end class BdaBridge
|
||||||
|
|
||||||
|
@ -55,6 +55,8 @@ namespace bda
|
|||||||
int maxit = 200;
|
int maxit = 200;
|
||||||
double tolerance = 1e-2;
|
double tolerance = 1e-2;
|
||||||
|
|
||||||
|
std::string bitstream = "";
|
||||||
|
|
||||||
int N; // number of rows
|
int N; // number of rows
|
||||||
int Nb; // number of blocked rows (Nb*block_size == N)
|
int Nb; // number of blocked rows (Nb*block_size == N)
|
||||||
int nnz; // number of nonzeroes (scalars)
|
int nnz; // number of nonzeroes (scalars)
|
||||||
@ -66,7 +68,8 @@ namespace bda
|
|||||||
bool initialized = false;
|
bool initialized = false;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
/// Construct a BdaSolver, can be cusparseSolver or openclSolver
|
/// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver
|
||||||
|
/// \param[in] fpga_bitstream FPGA bitstream file name (only for fpgaSolver)
|
||||||
/// \param[in] linear_solver_verbosity verbosity of solver
|
/// \param[in] linear_solver_verbosity verbosity of solver
|
||||||
/// \param[in] maxit maximum number of iterations for solver
|
/// \param[in] maxit maximum number of iterations for solver
|
||||||
/// \param[in] tolerance required relative tolerance for solver
|
/// \param[in] tolerance required relative tolerance for solver
|
||||||
@ -74,6 +77,7 @@ namespace bda
|
|||||||
/// \param[in] deviceID the device to be used
|
/// \param[in] deviceID the device to be used
|
||||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
|
||||||
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
|
||||||
|
BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
|
||||||
|
|
||||||
/// Define virtual destructor, so that the derivedclass destructor will be called
|
/// Define virtual destructor, so that the derivedclass destructor will be called
|
||||||
virtual ~BdaSolver() {};
|
virtual ~BdaSolver() {};
|
||||||
|
@ -20,13 +20,19 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
#include <config.h>
|
||||||
|
|
||||||
using bda::BlockedMatrix;
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||||
|
|
||||||
namespace bda
|
namespace bda
|
||||||
{
|
{
|
||||||
|
|
||||||
|
using Opm::OpmLog;
|
||||||
|
|
||||||
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
/*Sort a row of matrix elements from a blocked CSR-format.*/
|
||||||
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// subtract c from b and store in a
|
#if HAVE_FPGA
|
||||||
// a = b - c
|
|
||||||
|
/*Subtract two blocks from one another element by element*/
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void blockSub(double *a, double *b, double *c)
|
void blockSub(double *mat1, double *mat2, double *resMat) {
|
||||||
{
|
|
||||||
for (unsigned int row = 0; row < block_size; row++) {
|
for (unsigned int row = 0; row < block_size; row++) {
|
||||||
for (unsigned int col = 0; col < block_size; col++) {
|
for (unsigned int col = 0; col < block_size; col++) {
|
||||||
a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
|
resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
|
||||||
|
for (unsigned int row = 0; row < block_size; row++) {
|
||||||
|
if (resetRes) {
|
||||||
|
resVect[row] = 0.0;
|
||||||
|
}
|
||||||
|
for (unsigned int col = 0; col < block_size; col++) {
|
||||||
|
resVect[row] += scale * mat[row * block_size + col] * vect[col];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
int BlockedMatrix<block_size>::countUnblockedNnzs() {
|
||||||
|
int numNnzsOverThreshold = 0;
|
||||||
|
int totalNnzs = rowPointers[Nb];
|
||||||
|
for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
|
||||||
|
if (fabs(nnzValues[idx]) > nnzThreshold) {
|
||||||
|
numNnzsOverThreshold++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return numNnzsOverThreshold;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
|
||||||
|
* If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
|
||||||
|
*/
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
int valIndex = 0, nnzsPerRow;
|
||||||
|
|
||||||
|
mat->rowPointers[0] = 0;
|
||||||
|
// go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and
|
||||||
|
// write all non-zero values and corresponding column indices that belong to the same row into the new matrix.
|
||||||
|
for (int row = 0; row < Nb; row++) {
|
||||||
|
for (unsigned int bRow = 0; bRow < bs; bRow++) {
|
||||||
|
nnzsPerRow = 0;
|
||||||
|
for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) {
|
||||||
|
for (unsigned int bCol = 0; bCol < bs; bCol++) {
|
||||||
|
int idx = 0;
|
||||||
|
// If the matrix is the U matrix, store the rows inside a block in reverse order.
|
||||||
|
if (isUMatrix) {
|
||||||
|
idx = col * bs * bs + (bs - bRow - 1) * bs + bCol;
|
||||||
|
} else {
|
||||||
|
idx = col * bs * bs + bRow * bs + bCol;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fabs(nnzValues[idx]) > nnzThreshold) {
|
||||||
|
mat->nnzValues[valIndex] = nnzValues[idx];
|
||||||
|
mat->colIndices[valIndex] = colIndices[col] * bs + bCol;
|
||||||
|
valIndex++;
|
||||||
|
nnzsPerRow++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Update the rowpointers of the new matrix
|
||||||
|
mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*Optimized version*/
|
||||||
|
// ub* prefixes indicate unblocked data
|
||||||
|
template <unsigned int block_size>
|
||||||
|
int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
|
||||||
|
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
|
||||||
|
{
|
||||||
|
int res;
|
||||||
|
int numUnblockedNnzs = countUnblockedNnzs();
|
||||||
|
|
||||||
|
// initialize the non-blocked matrix with the obtained size.
|
||||||
|
std::unique_ptr<Matrix> ubMat = std::make_unique<Matrix>(Nb * block_size, numUnblockedNnzs);
|
||||||
|
|
||||||
|
unblock(ubMat.get(), isUMatrix);
|
||||||
|
|
||||||
|
std::vector<int> ubNodesPerColor(numColors);
|
||||||
|
for (int i = 0; i < numColors; i++) {
|
||||||
|
ubNodesPerColor[i] = nodesPerColor[i] * block_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
*valSize = ubMat->nnzs;
|
||||||
|
|
||||||
|
res = ubMat->toRDF(numColors, ubNodesPerColor,
|
||||||
|
colIndicesInColor, nnzsPerRowLimit,
|
||||||
|
ubNnzValues, ubColIndices, nnzValsSizes,
|
||||||
|
NROffsets, colorSizes);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// coloring is already done, numColors and nodesPerColor are set
|
||||||
|
// [rows|columns]PerColorLimit are already queried from the FPGA
|
||||||
|
// colIndicesInColor, PIndicesAddr and colorSizes are written here
|
||||||
|
// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU
|
||||||
|
// node == row
|
||||||
|
// color == partition
|
||||||
|
// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs
|
||||||
|
// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF().
|
||||||
|
// due to partitioning, lots of columns are removed, this matrix keeps track of the mapping
|
||||||
|
// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
|
||||||
|
// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
|
||||||
|
// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
|
||||||
|
template <unsigned int block_size>
|
||||||
|
int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
|
||||||
|
int rowsPerColorLimit, int columnsPerColorLimit,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
|
||||||
|
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
|
||||||
|
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes)
|
||||||
|
{
|
||||||
|
// Data related to column indices per partition
|
||||||
|
int doneRows = 0;
|
||||||
|
std::vector<bool> isColAccessed(Nb); // std::vector<bool> might have some different optimized implementation, initialize in a loop
|
||||||
|
std::vector<bool> isLColAccessed(Nb);
|
||||||
|
int totalCols = 0; // sum of numColAccesses for each color, blocked
|
||||||
|
int LTotalCols = 0, UTotalCols = 0;
|
||||||
|
int maxCols = 0; // max value of numColAccesses for any color
|
||||||
|
int maxRowsPerColor = 0; // max value of numRows for any color
|
||||||
|
int maxColsPerRow = 0; // max value of colsPerRow for any color
|
||||||
|
// colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates
|
||||||
|
// colsInColor[c][i] contains the ith column that color c accesses
|
||||||
|
// initial size allows for each color to access all columns, with space for padding
|
||||||
|
std::vector<std::vector<int> > colsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||||
|
std::vector<std::vector<int> > LColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||||
|
std::vector<std::vector<int> > UColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
|
||||||
|
|
||||||
|
// find which columns are accessed in each color, as well as how many non-zeroes there are per color.
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
int numRows = 0;
|
||||||
|
// initialize
|
||||||
|
for (int row = 0; row < Nb; row++) {
|
||||||
|
isColAccessed[row] = false;
|
||||||
|
isLColAccessed[row] = false;
|
||||||
|
}
|
||||||
|
if (c > 0) {
|
||||||
|
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
|
||||||
|
isLColAccessed[i] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0; // number of unique accesses, blocked
|
||||||
|
// for every row in this color
|
||||||
|
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
|
||||||
|
int colsPerRow = 0; // number of blocks for this row
|
||||||
|
bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]);
|
||||||
|
for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) {
|
||||||
|
// for every column in the current row, check if that column was accessed before this color
|
||||||
|
int col = colIndices[idx];
|
||||||
|
if (isColAccessed[col] == false) {
|
||||||
|
colsInColor[c][numColAccesses] = col;
|
||||||
|
isColAccessed[col] = true;
|
||||||
|
numColAccesses++;
|
||||||
|
if (col > row) {
|
||||||
|
UColsInColor[numColors - c - 1][UNumColAccesses] = col;
|
||||||
|
UNumColAccesses++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (isLColAccessed[col] == false) {
|
||||||
|
if (col < row) {
|
||||||
|
LColsInColor[c][LNumColAccesses] = col;
|
||||||
|
LNumColAccesses++;
|
||||||
|
isLColAccessed[col] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
colsPerRow++;
|
||||||
|
}
|
||||||
|
if (rowIsEmpty != true) {
|
||||||
|
numRows++;
|
||||||
|
}
|
||||||
|
maxColsPerRow = std::max(maxColsPerRow, colsPerRow);
|
||||||
|
}
|
||||||
|
|
||||||
|
// add columns from previous color into L partition to simplify data forwarding
|
||||||
|
if (c > 0) {
|
||||||
|
for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
|
||||||
|
LColsInColor[c][LNumColAccesses] = i;
|
||||||
|
LNumColAccesses++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
colorSizes[c * 4 + 10] = numColAccesses * block_size;
|
||||||
|
LColorSizes[c * 4 + 10] = LNumColAccesses * block_size;
|
||||||
|
UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size;
|
||||||
|
|
||||||
|
// store mapping
|
||||||
|
for (int col = 0; col < numColAccesses; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int col = 0; col < LNumColAccesses; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int col = 0; col < UNumColAccesses; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline
|
||||||
|
while (numColAccesses % 16 != 0) {
|
||||||
|
colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1];
|
||||||
|
numColAccesses++;
|
||||||
|
}
|
||||||
|
while (LNumColAccesses % 16 != 0) {
|
||||||
|
LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1];
|
||||||
|
LNumColAccesses++;
|
||||||
|
}
|
||||||
|
while (UNumColAccesses % 16 != 0) {
|
||||||
|
UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1];
|
||||||
|
UNumColAccesses++;
|
||||||
|
}
|
||||||
|
maxCols = std::max(numColAccesses, maxCols);
|
||||||
|
totalCols += numColAccesses;
|
||||||
|
LTotalCols += LNumColAccesses;
|
||||||
|
UTotalCols += UNumColAccesses;
|
||||||
|
doneRows = doneRows + nodesPerColor[c];
|
||||||
|
maxRowsPerColor = std::max(numRows, maxRowsPerColor);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxCols * static_cast<int>(block_size) > columnsPerColorLimit) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit;
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
doneRows = 0;
|
||||||
|
int diagValsSize = 0;
|
||||||
|
int maxRows = 0;
|
||||||
|
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
// calculate sizes that include zeropadding
|
||||||
|
diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8);
|
||||||
|
doneRows += nodesPerColor[c];
|
||||||
|
if (nodesPerColor[c] * static_cast<int>(block_size) > maxRows)
|
||||||
|
maxRows = nodesPerColor[c];
|
||||||
|
colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
|
||||||
|
LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
|
||||||
|
UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxRows * static_cast<int>(block_size) > rowsPerColorLimit) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit;
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// create and fill sizes array as far as already possible
|
||||||
|
colorSizes[0] = Nb * block_size;
|
||||||
|
LColorSizes[0] = Nb * block_size;
|
||||||
|
UColorSizes[0] = Nb * block_size;
|
||||||
|
// col_sizes (but the matrix is square)
|
||||||
|
colorSizes[1] = Nb * block_size;
|
||||||
|
LColorSizes[1] = Nb * block_size;
|
||||||
|
UColorSizes[1] = Nb * block_size;
|
||||||
|
colorSizes[2] = totalCols * block_size;
|
||||||
|
LColorSizes[2] = LTotalCols * block_size;
|
||||||
|
UColorSizes[2] = UTotalCols * block_size;
|
||||||
|
// missing val_size, written in Matrix::toRDF()
|
||||||
|
colorSizes[4] = numColors;
|
||||||
|
LColorSizes[4] = numColors;
|
||||||
|
UColorSizes[4] = numColors;
|
||||||
|
// missing NRFlagsSize, written in Matrix::toRDF()
|
||||||
|
colorSizes[6] = diagValsSize;
|
||||||
|
LColorSizes[6] = diagValsSize;
|
||||||
|
UColorSizes[6] = diagValsSize;
|
||||||
|
|
||||||
|
int paddingIdx = numColors;
|
||||||
|
while (paddingIdx % 4 != 0) {
|
||||||
|
for (unsigned int i = 0; i < 4; i++) {
|
||||||
|
colorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||||
|
LColorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||||
|
UColorSizes[paddingIdx * 4 + 8 + i] = 0;
|
||||||
|
}
|
||||||
|
paddingIdx++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = 0, Lindex = 0, Uindex = 0;
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
// for each unique col access
|
||||||
|
for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
PIndicesAddr[index] = colsInColor[c][col] * block_size + i;
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// add padding
|
||||||
|
while (index % 16 != 0) {
|
||||||
|
PIndicesAddr[index] = PIndicesAddr[index - 1];
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i;
|
||||||
|
Lindex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (Lindex % 16 != 0) {
|
||||||
|
LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1];
|
||||||
|
Lindex++;
|
||||||
|
}
|
||||||
|
for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
|
||||||
|
for (unsigned int i = 0; i < block_size; i++) {
|
||||||
|
UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i;
|
||||||
|
Uindex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
while (Uindex % 16 != 0) {
|
||||||
|
UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1];
|
||||||
|
Uindex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag) {
|
||||||
|
const unsigned int block_size = 3;
|
||||||
|
int doneRows = rowSize - 1; // since the rows of U are reversed, the rows of the diag are also reversed
|
||||||
|
int RDFIndex = 0;
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
for (int r = 0; r < rowsPerColor[c]; r++) {
|
||||||
|
|
||||||
|
// the rows in the block are reversed
|
||||||
|
for (int i = static_cast<int>(block_size) - 1; i >= 0; i--) {
|
||||||
|
for (unsigned int j = 0; j < block_size; j++) {
|
||||||
|
RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j];
|
||||||
|
RDFIndex++;
|
||||||
|
}
|
||||||
|
// add 4th column, zeropadding
|
||||||
|
RDFDiag[RDFIndex] = 0.0;
|
||||||
|
RDFIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
doneRows -= rowsPerColor[c];
|
||||||
|
|
||||||
|
// make sure the color completely fills a cacheline
|
||||||
|
// a color with 3 blocks would otherwise leave space
|
||||||
|
while (RDFIndex % 8 != 0) {
|
||||||
|
RDFDiag[RDFIndex] = 0.0;
|
||||||
|
RDFIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(RDFIndex % 8 == 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // HAVE_FPGA
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||||
template void sortBlockedRow<n>(int *, double *, int, int); \
|
template void sortBlockedRow<n>(int *, double *, int, int); \
|
||||||
template void blockMultSub<n>(double *, double *, double *); \
|
template void blockMultSub<n>(double *, double *, double *); \
|
||||||
template void blockMult<n>(double *, double *, double *); \
|
template void blockMult<n>(double *, double *, double *); \
|
||||||
template void blockSub<n>(double *, double *, double *); \
|
|
||||||
|
|
||||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||||
INSTANTIATE_BDA_FUNCTIONS(2);
|
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||||
@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4);
|
|||||||
|
|
||||||
#undef INSTANTIATE_BDA_FUNCTIONS
|
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||||
|
|
||||||
|
#if HAVE_FPGA
|
||||||
|
#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \
|
||||||
|
template void blockSub<n>(double *, double *, double *); \
|
||||||
|
template void blockVectMult<n>(double *, double *, double, double *, bool); \
|
||||||
|
template int BlockedMatrix<n>::toRDF(int, int *, bool, \
|
||||||
|
std::vector<std::vector<int> >& , int, int *, \
|
||||||
|
std::vector<std::vector<double> >&, short int *, unsigned char *, int *, int *); \
|
||||||
|
template int BlockedMatrix<n>::findPartitionColumns(int, int *, \
|
||||||
|
int, int, \
|
||||||
|
std::vector<std::vector<int> >& , int *, int *, \
|
||||||
|
std::vector<std::vector<int> >& , int *, int *, \
|
||||||
|
std::vector<std::vector<int> >& , int *, int *);
|
||||||
|
|
||||||
|
INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
|
||||||
|
INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
|
||||||
|
INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
|
||||||
|
INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
|
||||||
|
|
||||||
|
#undef INSTANTIATE_BDA_FPGA_FUNCTIONS
|
||||||
|
#endif // HAVE_FPGA
|
||||||
|
|
||||||
|
|
||||||
} // end namespace bda
|
} // end namespace bda
|
||||||
|
@ -20,29 +20,40 @@
|
|||||||
#ifndef BLOCKED_MATRIX_HPP
|
#ifndef BLOCKED_MATRIX_HPP
|
||||||
#define BLOCKED_MATRIX_HPP
|
#define BLOCKED_MATRIX_HPP
|
||||||
|
|
||||||
|
#if HAVE_FPGA
|
||||||
|
#include <vector>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||||
|
|
||||||
namespace bda
|
namespace bda
|
||||||
{
|
{
|
||||||
|
|
||||||
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
|
/// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
|
||||||
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||||
template<int BS>
|
template<unsigned int block_size>
|
||||||
struct BlockedMatrix{
|
class BlockedMatrix
|
||||||
|
{
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
/// Allocate BlockedMatrix and data arrays with given sizes
|
/// Allocate BlockedMatrix and data arrays with given sizes
|
||||||
/// \param[in] Nb number of blockrows
|
/// \param[in] Nb number of blockrows
|
||||||
/// \param[in] nnzbs number of nonzero blocks
|
/// \param[in] nnzbs number of nonzero blocks
|
||||||
BlockedMatrix(int Nb_, int nnzbs_)
|
BlockedMatrix(int Nb_, int nnzbs_)
|
||||||
: nnzValues(new double[nnzbs_*BS*BS]),
|
: nnzValues(new double[nnzbs_*block_size*block_size]),
|
||||||
colIndices(new int[nnzbs_*BS*BS]),
|
colIndices(new int[nnzbs_*block_size*block_size]),
|
||||||
rowPointers(new int[Nb_+1]),
|
rowPointers(new int[Nb_+1]),
|
||||||
Nb(Nb_),
|
Nb(Nb_),
|
||||||
nnzbs(nnzbs_),
|
nnzbs(nnzbs_),
|
||||||
deleteNnzs(true),
|
deleteNnzs(true),
|
||||||
deleteSparsity(true)
|
deleteSparsity(true)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
|
/// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
|
||||||
/// \param[in] M matrix to be copied
|
/// \param[in] M matrix to be copied
|
||||||
BlockedMatrix(const BlockedMatrix& M)
|
BlockedMatrix(const BlockedMatrix& M)
|
||||||
: nnzValues(new double[M.nnzbs*BS*BS]),
|
: nnzValues(new double[M.nnzbs*block_size*block_size]),
|
||||||
colIndices(M.colIndices),
|
colIndices(M.colIndices),
|
||||||
rowPointers(M.rowPointers),
|
rowPointers(M.rowPointers),
|
||||||
Nb(M.Nb),
|
Nb(M.Nb),
|
||||||
@ -50,10 +61,11 @@ struct BlockedMatrix{
|
|||||||
deleteNnzs(true),
|
deleteNnzs(true),
|
||||||
deleteSparsity(false)
|
deleteSparsity(false)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
|
/// Allocate BlockedMatrix, but let data arrays point to existing arrays
|
||||||
/// \param[in] Nb number of blockrows
|
/// \param[in] Nb number of blockrows
|
||||||
/// \param[in] nnzbs number of nonzero blocks
|
/// \param[in] nnzbs number of nonzero blocks
|
||||||
/// \param[in] nnzValues array of nonzero values, contains nnzb*BS*BS scalars
|
/// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars
|
||||||
/// \param[in] colIndices array of column indices, contains nnzb entries
|
/// \param[in] colIndices array of column indices, contains nnzb entries
|
||||||
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
|
/// \param[in] rowPointers array of row pointers, contains Nb+1 entries
|
||||||
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
|
||||||
@ -76,6 +88,28 @@ struct BlockedMatrix{
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if HAVE_FPGA
|
||||||
|
constexpr static double nnzThreshold = 1e-80; // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero
|
||||||
|
|
||||||
|
int countUnblockedNnzs();
|
||||||
|
|
||||||
|
void unblock(Matrix *mat, bool isUMatrix);
|
||||||
|
|
||||||
|
/// Converts this matrix to the dataformat used by the FPGA
|
||||||
|
/// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking
|
||||||
|
int toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
|
||||||
|
std::vector<std::vector<double> >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize);
|
||||||
|
|
||||||
|
/// Analyses the sparsity pattern and prepares for toRDF()
|
||||||
|
/// Is only called once
|
||||||
|
int findPartitionColumns(int numColors, int *nodesPerColor,
|
||||||
|
int rowsPerColorLimit, int columnsPerColorLimit,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
|
||||||
|
std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
|
||||||
|
std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes);
|
||||||
|
#endif
|
||||||
|
|
||||||
double *nnzValues;
|
double *nnzValues;
|
||||||
int *colIndices;
|
int *colIndices;
|
||||||
int *rowPointers;
|
int *rowPointers;
|
||||||
@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c);
|
|||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void blockMult(double *mat1, double *mat2, double *resMat);
|
void blockMult(double *mat1, double *mat2, double *resMat);
|
||||||
|
|
||||||
/// Subtract blocks
|
|
||||||
/// a = b - c
|
#if HAVE_FPGA
|
||||||
/// \param[out] a result block
|
|
||||||
/// \param[in] b input block
|
|
||||||
/// \param[in] c input block
|
|
||||||
template <unsigned int block_size>
|
template <unsigned int block_size>
|
||||||
void blockSub(double *a, double *b, double *c);
|
void blockSub(double *mat1, double *mat2, double *resMat);
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
|
||||||
|
|
||||||
|
/// Convert a blocked inverse diagonal to the FPGA format.
|
||||||
|
/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
|
||||||
|
/// Since the rows of U are reversed, the rows of the diag are also reversed.
|
||||||
|
/// The cachelines can hold 8 doubles, a block has 9 doubles.
|
||||||
|
/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows.
|
||||||
|
/// Then 2 blocks (24 doubles) fit on 3 cachelines.
|
||||||
|
/// Example:
|
||||||
|
/// [1 2 3] [1 2 3 0] [1 2 3 0 4 5 6 0]
|
||||||
|
/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1]
|
||||||
|
/// [7 8 9] [7 8 9 0] [block2 row2 block2 row3]
|
||||||
|
void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
|
||||||
|
#endif
|
||||||
|
|
||||||
} // end namespace bda
|
} // end namespace bda
|
||||||
|
|
||||||
|
413
opm/simulators/linalg/bda/FPGABILU0.cpp
Normal file
413
opm/simulators/linalg/bda/FPGABILU0.cpp
Normal file
@ -0,0 +1,413 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
#include <opm/simulators/linalg/MatrixBlock.hpp>
|
||||||
|
#include <dune/common/timer.hh>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
using Opm::OpmLog;
|
||||||
|
using Dune::Timer;
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
|
||||||
|
verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_)
|
||||||
|
{
|
||||||
|
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||||
|
level_scheduling = true;
|
||||||
|
} else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
|
||||||
|
graph_coloring = true;
|
||||||
|
} else {
|
||||||
|
OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
FPGABILU0<block_size>::~FPGABILU0()
|
||||||
|
{
|
||||||
|
delete[] invDiagVals;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
|
||||||
|
{
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
|
||||||
|
resultPointers.resize(numResultPointers, nullptr);
|
||||||
|
resultSizes.resize(numResultSizes);
|
||||||
|
|
||||||
|
// Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
|
||||||
|
const unsigned int nnzSplit = 1;
|
||||||
|
|
||||||
|
this->N = mat->Nb * block_size;
|
||||||
|
this->Nb = mat->Nb;
|
||||||
|
this->nnz = mat->nnzbs * block_size * block_size;
|
||||||
|
this->nnzbs = mat->nnzbs;
|
||||||
|
|
||||||
|
toOrder.resize(Nb);
|
||||||
|
fromOrder.resize(Nb);
|
||||||
|
|
||||||
|
std::vector<int> CSCRowIndices(nnzbs);
|
||||||
|
std::vector<int> CSCColPointers(Nb + 1);
|
||||||
|
|
||||||
|
if (level_scheduling) {
|
||||||
|
Timer t_convert;
|
||||||
|
csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
|
||||||
|
if (verbosity >= 3) {
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Timer t_analysis;
|
||||||
|
rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
|
||||||
|
LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
|
||||||
|
std::ostringstream out;
|
||||||
|
if (level_scheduling) {
|
||||||
|
out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
|
||||||
|
findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||||
|
} else if (graph_coloring) {
|
||||||
|
out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
|
||||||
|
findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numColors > maxNumColors) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbosity >= 3) {
|
||||||
|
out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
|
||||||
|
}
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
|
||||||
|
int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
|
||||||
|
int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
|
||||||
|
int blockDiagSize = 0;
|
||||||
|
// This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
|
||||||
|
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||||
|
int doneRows = 0;
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
|
||||||
|
for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
|
||||||
|
int columnIndex = rMat->colIndices[j];
|
||||||
|
if (columnIndex < i) {
|
||||||
|
LColorRoundedValSize += 9;
|
||||||
|
LNROffsetSize += 9;
|
||||||
|
}
|
||||||
|
if (columnIndex > i) {
|
||||||
|
UColorRoundedValSize += 9;
|
||||||
|
UNROffsetSize += 9;
|
||||||
|
}
|
||||||
|
colorRoundedValSize += 9;
|
||||||
|
NROffsetSize += 9;
|
||||||
|
}
|
||||||
|
blockDiagSize += 12;
|
||||||
|
}
|
||||||
|
// End of color: round all sizes to nearest cacheline
|
||||||
|
colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
|
||||||
|
LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
|
||||||
|
UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
|
||||||
|
NROffsetSize = roundUpTo(NROffsetSize, 64);
|
||||||
|
LNROffsetSize = roundUpTo(LNROffsetSize, 64);
|
||||||
|
UNROffsetSize = roundUpTo(UNROffsetSize, 64);
|
||||||
|
blockDiagSize = roundUpTo(blockDiagSize, 8);
|
||||||
|
|
||||||
|
doneRows += rowsPerColor[c];
|
||||||
|
}
|
||||||
|
int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
|
||||||
|
int worstCaseColumnAccessNum = numColors * maxColsPerColor;
|
||||||
|
|
||||||
|
nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
|
||||||
|
LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
|
||||||
|
UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
|
||||||
|
// initial number of nnz, used to allocate
|
||||||
|
nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
|
||||||
|
LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
|
||||||
|
UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
|
||||||
|
colIndices.resize(colorRoundedValSize);
|
||||||
|
LColIndices.resize(LColorRoundedValSize);
|
||||||
|
UColIndices.resize(UColorRoundedValSize);
|
||||||
|
NROffsets.resize(NROffsetSize);
|
||||||
|
LNROffsets.resize(LNROffsetSize);
|
||||||
|
UNROffsets.resize(UNROffsetSize);
|
||||||
|
PIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||||
|
LPIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||||
|
UPIndicesAddr.resize(worstCaseColumnAccessNum);
|
||||||
|
colorSizes.resize(colorSizesNum);
|
||||||
|
LColorSizes.resize(colorSizesNum);
|
||||||
|
UColorSizes.resize(colorSizesNum);
|
||||||
|
blockDiag.resize(blockDiagSize);
|
||||||
|
colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||||
|
LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||||
|
UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
|
||||||
|
|
||||||
|
int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
|
||||||
|
maxRowsPerColor, maxColsPerColor,
|
||||||
|
colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
|
||||||
|
LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
|
||||||
|
UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
|
||||||
|
if (err != 0) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
diagIndex.resize(mat->Nb, 0);
|
||||||
|
invDiagVals = new double[mat->Nb * bs * bs];
|
||||||
|
LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||||
|
UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||||
|
resultPointers[0] = (void *) colorSizes.data();
|
||||||
|
resultPointers[1] = (void *) PIndicesAddr.data();
|
||||||
|
resultPointers[2] = (void *) nnzValues.data();
|
||||||
|
resultPointers[3] = (void *) colIndices.data();
|
||||||
|
resultPointers[4] = (void *) NROffsets.data();
|
||||||
|
resultPointers[5] = (void *) nnzValsSizes.data();
|
||||||
|
resultPointers[6] = (void *) LColorSizes.data();
|
||||||
|
resultPointers[7] = (void *) LPIndicesAddr.data();
|
||||||
|
resultPointers[8] = (void *) LnnzValues.data();
|
||||||
|
resultPointers[9] = (void *) LColIndices.data();
|
||||||
|
resultPointers[10] = (void *) LNROffsets.data();
|
||||||
|
resultPointers[11] = (void *) LnnzValsSizes.data();
|
||||||
|
resultPointers[12] = (void *) UColorSizes.data();
|
||||||
|
resultPointers[13] = (void *) UPIndicesAddr.data();
|
||||||
|
resultPointers[14] = (void *) UnnzValues.data();
|
||||||
|
resultPointers[15] = (void *) UColIndices.data();
|
||||||
|
resultPointers[16] = (void *) UNROffsets.data();
|
||||||
|
resultPointers[17] = (void *) UnnzValsSizes.data();
|
||||||
|
resultPointers[18] = (void *) blockDiag.data();
|
||||||
|
//resultPointers[19] and [20] are set by the caller
|
||||||
|
resultSizes[0] = mat->Nb * block_size;
|
||||||
|
resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
|
||||||
|
resultSizes[2] = numColors;
|
||||||
|
resultSizes[3] = worstCaseColumnAccessNum; //totalCols
|
||||||
|
resultSizes[4] = NROffsetSize; //NRFlagSize
|
||||||
|
resultSizes[5] = blockDiagSize; //diagValsSize
|
||||||
|
resultSizes[6] = mat->Nb * block_size;
|
||||||
|
resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
|
||||||
|
resultSizes[8] = numColors;
|
||||||
|
resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
|
||||||
|
resultSizes[10] = LNROffsetSize; //LNRFlagSize
|
||||||
|
resultSizes[11] = blockDiagSize; //LDiagValsSize
|
||||||
|
resultSizes[12] = mat->Nb * block_size;
|
||||||
|
resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
|
||||||
|
resultSizes[14] = numColors;
|
||||||
|
resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
|
||||||
|
resultSizes[16] = UNROffsetSize; //UNRFlagSize
|
||||||
|
resultSizes[17] = blockDiagSize; //UDiagValsSize
|
||||||
|
return true;
|
||||||
|
} // end init()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
|
||||||
|
{
|
||||||
|
const unsigned int bs = block_size;
|
||||||
|
Timer t_reorder;
|
||||||
|
reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
|
||||||
|
|
||||||
|
if (verbosity >= 3) {
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
|
||||||
|
Timer t_memcpy;
|
||||||
|
memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
|
||||||
|
|
||||||
|
if (verbosity >= 3) {
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
int i, j, ij, ik, jk;
|
||||||
|
int iRowStart, iRowEnd, jRowEnd;
|
||||||
|
double pivot[bs * bs];
|
||||||
|
int LSize = 0;
|
||||||
|
Opm::Detail::Inverter<bs> inverter; // reuse inverter to invert blocks
|
||||||
|
|
||||||
|
Timer t_decomposition;
|
||||||
|
|
||||||
|
// go through all rows
|
||||||
|
for (i = 0; i < LUMat->Nb; i++) {
|
||||||
|
iRowStart = LUMat->rowPointers[i];
|
||||||
|
iRowEnd = LUMat->rowPointers[i + 1];
|
||||||
|
|
||||||
|
// go through all elements of the row
|
||||||
|
for (ij = iRowStart; ij < iRowEnd; ij++) {
|
||||||
|
j = LUMat->colIndices[ij];
|
||||||
|
// if the element is the diagonal, store the index and go to next row
|
||||||
|
if (j == i) {
|
||||||
|
diagIndex[i] = ij;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// if an element beyond the diagonal is reach, no diagonal was found
|
||||||
|
// throw an error now. TODO: perform reordering earlier to prevent this
|
||||||
|
if (j > i) {
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "BILU0 Error could not find diagonal value in row: " << i;
|
||||||
|
OpmLog::error(out.str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
LSize++;
|
||||||
|
// calculate the pivot of this row
|
||||||
|
blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
|
||||||
|
|
||||||
|
memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
|
||||||
|
|
||||||
|
jRowEnd = LUMat->rowPointers[j + 1];
|
||||||
|
jk = diagIndex[j] + 1;
|
||||||
|
ik = ij + 1;
|
||||||
|
// substract that row scaled by the pivot from this row.
|
||||||
|
while (ik < iRowEnd && jk < jRowEnd) {
|
||||||
|
if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
|
||||||
|
blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
|
||||||
|
ik++;
|
||||||
|
jk++;
|
||||||
|
} else {
|
||||||
|
if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
|
||||||
|
{ ik++; }
|
||||||
|
else
|
||||||
|
{ jk++; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// store the inverse in the diagonal!
|
||||||
|
inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
|
||||||
|
memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
}
|
||||||
|
|
||||||
|
LMat->rowPointers[0] = 0;
|
||||||
|
UMat->rowPointers[0] = 0;
|
||||||
|
|
||||||
|
// Split the LU matrix into two by comparing column indices to diagonal indices
|
||||||
|
for (i = 0; i < LUMat->Nb; i++) {
|
||||||
|
LMat->rowPointers[i + 1] = LMat->rowPointers[i];
|
||||||
|
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
|
||||||
|
if (j < diagIndex[i]) {
|
||||||
|
memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
|
||||||
|
LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Reverse the order or the (blocked) rows for the U matrix,
|
||||||
|
// because the rows are accessed in reverse order when applying the ILU0
|
||||||
|
int URowIndex = 0;
|
||||||
|
for (i = LUMat->Nb - 1; i >= 0; i--) {
|
||||||
|
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
|
||||||
|
for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
|
||||||
|
if (j > diagIndex[i]) {
|
||||||
|
memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
|
||||||
|
UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
|
||||||
|
UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
URowIndex++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbosity >= 3) {
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
|
||||||
|
OpmLog::info(out.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<int> URowsPerColor(numColors);
|
||||||
|
rowSize = block_size * rMat->Nb;
|
||||||
|
LRowSize = block_size * LMat->Nb;
|
||||||
|
URowSize = block_size * UMat->Nb;
|
||||||
|
LNumColors = numColors;
|
||||||
|
UNumColors = numColors;
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
URowsPerColor[numColors - c - 1] = rowsPerColor[c];
|
||||||
|
}
|
||||||
|
int err;
|
||||||
|
err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
|
||||||
|
colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
|
||||||
|
nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
|
||||||
|
if (err != 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
|
||||||
|
LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
|
||||||
|
LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
|
||||||
|
if (err != 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
|
||||||
|
UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
|
||||||
|
UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
|
||||||
|
if (err != 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
|
||||||
|
// resultPointers are set in the init method
|
||||||
|
resultSizes[0] = rowSize;
|
||||||
|
resultSizes[1] = colorSizes[3]; // zeropadded valSize;
|
||||||
|
resultSizes[2] = numColors;
|
||||||
|
resultSizes[3] = colorSizes[2]; //totalCols
|
||||||
|
resultSizes[4] = colorSizes[5]; //NRFlagSize
|
||||||
|
resultSizes[5] = colorSizes[6]; //diagValsSize
|
||||||
|
resultSizes[6] = LRowSize;
|
||||||
|
resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
|
||||||
|
resultSizes[8] = LNumColors;
|
||||||
|
resultSizes[9] = LColorSizes[2]; //LTotalCols
|
||||||
|
resultSizes[10] = LColorSizes[5]; //LNRFlagSize
|
||||||
|
resultSizes[11] = LColorSizes[6]; //LDiagValsSize
|
||||||
|
resultSizes[12] = URowSize;
|
||||||
|
resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
|
||||||
|
resultSizes[14] = UNumColors;
|
||||||
|
resultSizes[15] = UColorSizes[2]; //UTotalCols
|
||||||
|
resultSizes[16] = UColorSizes[5]; //UNRFlagSize
|
||||||
|
resultSizes[17] = UColorSizes[6]; //UDiagValsSize
|
||||||
|
return true;
|
||||||
|
} // end create_preconditioner()
|
||||||
|
|
||||||
|
|
||||||
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||||
|
template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int); \
|
||||||
|
template FPGABILU0<n>::~FPGABILU0(); \
|
||||||
|
template bool FPGABILU0<n>::init(BlockedMatrix<n> *); \
|
||||||
|
template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *); \
|
||||||
|
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||||
|
|
||||||
|
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||||
|
|
||||||
|
} //namespace bda
|
117
opm/simulators/linalg/bda/FPGABILU0.hpp
Normal file
117
opm/simulators/linalg/bda/FPGABILU0.hpp
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef FPGA_BILU0_HEADER_INCLUDED
|
||||||
|
#define FPGA_BILU0_HEADER_INCLUDED
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This class implements a Blocked ILU0 preconditioner, with output data
|
||||||
|
* specifically formatted for the FPGA.
|
||||||
|
* The decomposition and reorders of the rows of the matrix are done on CPU.
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
class FPGABILU0
|
||||||
|
{
|
||||||
|
|
||||||
|
private:
|
||||||
|
int N; // number of rows of the matrix
|
||||||
|
int Nb; // number of blockrows of the matrix
|
||||||
|
int nnz; // number of nonzeroes of the matrix (scalar)
|
||||||
|
int nnzbs; // number of blocks of the matrix
|
||||||
|
std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
|
||||||
|
std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
|
||||||
|
double *invDiagVals = nullptr;
|
||||||
|
std::vector<int> diagIndex;
|
||||||
|
std::vector<int> toOrder, fromOrder;
|
||||||
|
std::vector<int> rowsPerColor;
|
||||||
|
int numColors;
|
||||||
|
int verbosity;
|
||||||
|
|
||||||
|
// sizes and arrays used during RDF generation
|
||||||
|
std::vector<std::vector<double> > nnzValues, LnnzValues, UnnzValues;
|
||||||
|
std::vector<short int> colIndices, LColIndices, UColIndices;
|
||||||
|
std::vector<unsigned char> NROffsets, LNROffsets, UNROffsets;
|
||||||
|
std::vector<int> PIndicesAddr, LPIndicesAddr, UPIndicesAddr;
|
||||||
|
std::vector<int> colorSizes, LColorSizes, UColorSizes;
|
||||||
|
std::vector<int> nnzValsSizes, LnnzValsSizes, UnnzValsSizes;
|
||||||
|
std::vector<std::vector<int> > colIndicesInColor, LColIndicesInColor, UColIndicesInColor;
|
||||||
|
|
||||||
|
int rowSize, valSize;
|
||||||
|
int LRowSize, LValSize, LNumColors;
|
||||||
|
int URowSize, UValSize, UNumColors;
|
||||||
|
std::vector<double> blockDiag;
|
||||||
|
ILUReorder opencl_ilu_reorder;
|
||||||
|
bool level_scheduling = false, graph_coloring = false;
|
||||||
|
int numResultPointers = 21;
|
||||||
|
std::vector<void *> resultPointers;
|
||||||
|
int numResultSizes = 18;
|
||||||
|
std::vector<int> resultSizes;
|
||||||
|
int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors);
|
||||||
|
|
||||||
|
~FPGABILU0();
|
||||||
|
|
||||||
|
// analysis (optional)
|
||||||
|
bool init(BlockedMatrix<block_size> *mat);
|
||||||
|
|
||||||
|
// ilu_decomposition
|
||||||
|
bool create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||||
|
|
||||||
|
int* getToOrder()
|
||||||
|
{
|
||||||
|
return toOrder.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
int* getFromOrder()
|
||||||
|
{
|
||||||
|
return fromOrder.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
BlockedMatrix<block_size>* getRMat()
|
||||||
|
{
|
||||||
|
return rMat.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
void **getResultPointers()
|
||||||
|
{
|
||||||
|
return resultPointers.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
int *getResultSizes()
|
||||||
|
{
|
||||||
|
return resultSizes.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
} //namespace bda
|
||||||
|
|
||||||
|
#endif // FPGA_BILU0_HEADER_INCLUDED
|
249
opm/simulators/linalg/bda/FPGAMatrix.cpp
Normal file
249
opm/simulators/linalg/bda/FPGAMatrix.cpp
Normal file
@ -0,0 +1,249 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
/*Sort a row of matrix elements from a CSR-format.*/
|
||||||
|
void sortRow(int *colIndices, double *data, int left, int right) {
|
||||||
|
int l = left;
|
||||||
|
int r = right;
|
||||||
|
int middle = colIndices[(l + r) >> 1];
|
||||||
|
do {
|
||||||
|
while (colIndices[l] < middle)
|
||||||
|
l++;
|
||||||
|
while (colIndices[r] > middle)
|
||||||
|
r--;
|
||||||
|
if (l <= r) {
|
||||||
|
int lColIndex = colIndices[l];
|
||||||
|
colIndices[l] = colIndices[r];
|
||||||
|
colIndices[r] = lColIndex;
|
||||||
|
double lDatum = data[l];
|
||||||
|
data[l] = data[r];
|
||||||
|
data[r] = lDatum;
|
||||||
|
|
||||||
|
l++;
|
||||||
|
r--;
|
||||||
|
}
|
||||||
|
} while (l < r);
|
||||||
|
if (left < r)
|
||||||
|
sortRow(colIndices, data, left, r);
|
||||||
|
if (right > l)
|
||||||
|
sortRow(colIndices, data, l, right);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
|
||||||
|
* - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
|
||||||
|
* and colors, and the size, in elements, of the NROffsets array, followed by:
|
||||||
|
* the number of rows (rounded to the nearest 32), the number of rows (not rounded),
|
||||||
|
* the number of columns (not rounded) and the number of non-zero values
|
||||||
|
* (rounded to the nearest 32) for every partition.
|
||||||
|
* This array is zero padded up to the nearest 64-byte cacheline.
|
||||||
|
* - The "colIndicesInColor" array, which contains for every partition, from which elements
|
||||||
|
* in the global X vector the elements of that X vector partition came.
|
||||||
|
* For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then
|
||||||
|
* that X vector partition will only have three elements, and the color_col_indices array
|
||||||
|
* will contain 1, 3 and 6 for that partition.
|
||||||
|
* This array is zero padded up to the nearest 64-byte cacheline for every partition.
|
||||||
|
* - The "nnzValues" array contains all non-zero values of each partition of the matrix.
|
||||||
|
* This array is zero-padded so that each color has a multiple of 32 elements (to have the
|
||||||
|
* same number of elements per partition as the column indices array).
|
||||||
|
* - The "colIndices" array contains all column indices of each partition of the matrix.
|
||||||
|
* These column indices are the local indices for that partition, so to be used, first a
|
||||||
|
* local X vector partition needs to be loaded into some local memory (this is done using
|
||||||
|
* data from the _color_col_indices array), before these column indices can be used as
|
||||||
|
* addresses to that local memory to read the desired X vector values.
|
||||||
|
* This array is zero-padded so that data for every partition fills up a number of complete
|
||||||
|
* cachelines (this means every color has a multiple of 32 elements).
|
||||||
|
* - "NROffsets" is the name of the array that contains the new row offsets for
|
||||||
|
* all elements of every partition of the matrix. New row offsets are 8-bit values which
|
||||||
|
* are 0 if that element is not the first element in a row, or which, if that element is
|
||||||
|
* the first element of a row) is equal to the amount of empty rows between that new row
|
||||||
|
* and the row before it plus 1. This array is zero-padded so that data for every partition
|
||||||
|
* fills up a number of complete cachelines (this means every color has a multiple of 64 elements).
|
||||||
|
*/
|
||||||
|
int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int nnzsThisRowLimit,
|
||||||
|
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes)
|
||||||
|
{
|
||||||
|
auto mat = this;
|
||||||
|
|
||||||
|
int doneRows = 0;
|
||||||
|
int totalRowNum = 0; // total number of non-empty rows
|
||||||
|
int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color
|
||||||
|
int maxNNZsPerColor = 0; // max of nnzsPerColor
|
||||||
|
|
||||||
|
int totalValSize = 0; // sum of nnzsPerColor, padded
|
||||||
|
|
||||||
|
std::vector<int> nnzRowsPerColor(numColors);
|
||||||
|
|
||||||
|
// find number of nnzs per color and number of non-empty rows
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
int numRows = 0;
|
||||||
|
nnzRowsPerColor[c] = 0;
|
||||||
|
int firstNnzOfColor = mat->rowPointers[doneRows];
|
||||||
|
int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]];
|
||||||
|
nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices
|
||||||
|
totalValSize += nnzsPerColor;
|
||||||
|
maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor);
|
||||||
|
int row = doneRows;
|
||||||
|
for (; row < doneRows + nodesPerColor[c]; row++) {
|
||||||
|
if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) {
|
||||||
|
numRows++;
|
||||||
|
nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
doneRows = row;
|
||||||
|
totalRowNum += numRows;
|
||||||
|
}
|
||||||
|
|
||||||
|
int conseqZeroRows = 0; // number of consecutive empty rows
|
||||||
|
int maxConseqZeroRows = 0;
|
||||||
|
int numEmptyRows = 0; // total number of empty rows
|
||||||
|
std::vector<int> rowOffsets(totalRowNum);
|
||||||
|
std::vector<int> nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows
|
||||||
|
std::vector<int> colorValPointers(numColors + 1); // points to first nnz of first row of each color
|
||||||
|
std::vector<int> colorValZeroPointers(numColors); // points to first padded zero for each color
|
||||||
|
|
||||||
|
int nonEmptyRowIdx = 0; // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen
|
||||||
|
doneRows = 0;
|
||||||
|
|
||||||
|
int totalPaddingSize = 0; // number of padded zeros from previous colors
|
||||||
|
int NROffsetSize = 0; // number of NROffsets entries, padded to multiple of 64 for each color
|
||||||
|
int maxRows = 0;
|
||||||
|
int maxNNZsPerRow = 0;
|
||||||
|
|
||||||
|
// determine the row offset of each row (amount of zero rows between it and the previous non-zero row)
|
||||||
|
// this is later converted to rowOffset for each nnz
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
conseqZeroRows = 0;
|
||||||
|
for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
|
||||||
|
int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row];
|
||||||
|
if (nnzsThisRow == 0) {
|
||||||
|
conseqZeroRows++;
|
||||||
|
numEmptyRows++;
|
||||||
|
} else {
|
||||||
|
maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow);
|
||||||
|
nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1];
|
||||||
|
rowOffsets[nonEmptyRowIdx] = conseqZeroRows;
|
||||||
|
maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows);
|
||||||
|
conseqZeroRows = 0;
|
||||||
|
nonEmptyRowIdx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// calculate sizes that include zeropadding
|
||||||
|
colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize;
|
||||||
|
colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32);
|
||||||
|
totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c];
|
||||||
|
NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64);
|
||||||
|
|
||||||
|
doneRows += nodesPerColor[c];
|
||||||
|
maxRows = std::max(nodesPerColor[c], maxRows);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (maxNNZsPerRow > nnzsThisRowLimit) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit;
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// create and fill RDF arrays
|
||||||
|
colorSizes[3] = colorValPointers[numColors]; // total number of nnzs the FPGA has to process, including zeropadding
|
||||||
|
colorSizes[5] = NROffsetSize;
|
||||||
|
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
colorSizes[c * 4 + 8] = nnzRowsPerColor[c];
|
||||||
|
colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c];
|
||||||
|
}
|
||||||
|
|
||||||
|
int rowIndex = 0; // keep track of where to read/write
|
||||||
|
int valIndex = 0;
|
||||||
|
int NRIndex = 0;
|
||||||
|
int halfwayPoint = colorValPointers[numColors] / 2;
|
||||||
|
nnzValsSizes[0] = colorValPointers[numColors];
|
||||||
|
|
||||||
|
colorSizes[7] = halfwayPoint;
|
||||||
|
|
||||||
|
for (int c = 0; c < numColors; c++) {
|
||||||
|
int nnzsThisRow;
|
||||||
|
// make sure 32 values are written in batches (pad with zeros if needed)
|
||||||
|
for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) {
|
||||||
|
for (int vb = 0; vb < 32; vb++) {
|
||||||
|
|
||||||
|
// if there are enough values for the whole cacheline
|
||||||
|
if (v + vb < colorValZeroPointers[c]) {
|
||||||
|
ubNnzValues[0][v + vb] = mat->nnzValues[valIndex];
|
||||||
|
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex]]);
|
||||||
|
|
||||||
|
// if this val is the first of a row
|
||||||
|
if (nnzRowPointers[rowIndex] == valIndex) {
|
||||||
|
|
||||||
|
if (rowOffsets[rowIndex] + 1 >= 255) {
|
||||||
|
std::ostringstream errorstring;
|
||||||
|
errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1;
|
||||||
|
OPM_THROW(std::logic_error, errorstring.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
NROffsets[NRIndex] = static_cast<unsigned char>(rowOffsets[rowIndex] + 1);
|
||||||
|
|
||||||
|
// skip all empty rows
|
||||||
|
while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) {
|
||||||
|
rowIndex++;
|
||||||
|
nnzsThisRow = 0;
|
||||||
|
}
|
||||||
|
nnzsThisRow++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
NROffsets[NRIndex] = (unsigned char) 0;
|
||||||
|
nnzsThisRow++;
|
||||||
|
}
|
||||||
|
valIndex++;
|
||||||
|
}
|
||||||
|
else // zeropadding is needed
|
||||||
|
{
|
||||||
|
ubNnzValues[0][v + vb] = 0.0;
|
||||||
|
ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex - 1]]);
|
||||||
|
NROffsets[NRIndex] = 0;
|
||||||
|
}
|
||||||
|
NRIndex++;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// zeropad the NROffsets file
|
||||||
|
while (NRIndex % 64 != 0) {
|
||||||
|
NROffsets[NRIndex] = 0;
|
||||||
|
NRIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // end namespace bda
|
84
opm/simulators/linalg/bda/FPGAMatrix.hpp
Normal file
84
opm/simulators/linalg/bda/FPGAMatrix.hpp
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef FPGA_MATRIX_HEADER_INCLUDED
|
||||||
|
#define FPGA_MATRIX_HEADER_INCLUDED
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
/// This struct resembles a csr matrix, only doubles are supported
|
||||||
|
/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
|
||||||
|
class Matrix {
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
/// Allocate Matrix and data arrays with given sizes
|
||||||
|
/// \param[in] N number of rows
|
||||||
|
/// \param[in] nnzs number of nonzeros
|
||||||
|
Matrix(int N_, int nnzs_)
|
||||||
|
: nnzValues(new double[nnzs_]),
|
||||||
|
colIndices(new int[nnzs_]),
|
||||||
|
rowPointers(new int[N_+1]),
|
||||||
|
N(N_),
|
||||||
|
nnzs(nnzs_)
|
||||||
|
{}
|
||||||
|
|
||||||
|
/// All constructors allocate new memory, so always delete here
|
||||||
|
~Matrix(){
|
||||||
|
delete[] nnzValues;
|
||||||
|
delete[] colIndices;
|
||||||
|
delete[] rowPointers;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Converts this matrix to the dataformat used by the FPGA.
|
||||||
|
/// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
|
||||||
|
/// The purpose of this format is to allow the data to be streamable.
|
||||||
|
/// The rowPointers array has an unpredictable reading pattern/timing,
|
||||||
|
/// it also needs a extra work if a row is shorter than a cacheline.
|
||||||
|
/// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets.
|
||||||
|
/// The value of this offset is 0, unless the corresponding nnz is the first of a row,
|
||||||
|
/// in that case it is 'the number of empty rows preceeding it + 1'.
|
||||||
|
/// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx.
|
||||||
|
/// Example:
|
||||||
|
/// [1 0 0 3 0] nnzValues [1 3 2 2 1 4 3 4 1]
|
||||||
|
/// [0 2 2 0 1] colIndices [0 3 1 2 4 0 1 2 4]
|
||||||
|
/// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9]
|
||||||
|
/// [0 0 0 0 0] rowOffsets [1 0 1 0 0 1 2 0 0]
|
||||||
|
/// [0 3 4 0 1]
|
||||||
|
/// The rowOffset is stored in 1 byte, meaning the maximum value is 255.
|
||||||
|
int toRDF(int numColors, std::vector<int>& nodesPerColor,
|
||||||
|
std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit,
|
||||||
|
std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
|
||||||
|
|
||||||
|
double *nnzValues;
|
||||||
|
int *colIndices;
|
||||||
|
int *rowPointers;
|
||||||
|
int N;
|
||||||
|
int nnzs;
|
||||||
|
};
|
||||||
|
|
||||||
|
void sortRow(int *colIndices, double *data, int left, int right);
|
||||||
|
|
||||||
|
} // end namespace bda
|
||||||
|
|
||||||
|
#endif // FPGA_MATRIX_HEADER_INCLUDED
|
732
opm/simulators/linalg/bda/FPGASolverBackend.cpp
Normal file
732
opm/simulators/linalg/bda/FPGASolverBackend.cpp
Normal file
@ -0,0 +1,732 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <config.h>
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||||
|
#include <opm/material/common/Unused.hpp>
|
||||||
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/Reorder.hpp>
|
||||||
|
|
||||||
|
// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA
|
||||||
|
// kernel will be disabled and execution will continue using DUNE
|
||||||
|
#define FPGA_EXIT_WITH_HW_FAILURE
|
||||||
|
//#undef FPGA_EXIT_WITH_HW_FAILURE
|
||||||
|
|
||||||
|
// if defined, the function generate_statistics will create a CSV-formatted file
|
||||||
|
// with detailed statistics about the FPGA backend performance
|
||||||
|
//#define FPGA_STATISTICS_FILE_ENABLED
|
||||||
|
#undef FPGA_STATISTICS_FILE_ENABLED
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
using Opm::OpmLog;
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
FpgaSolverBackend<block_size>::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(fpga_bitstream, verbosity_, maxit_, tolerance_)
|
||||||
|
{
|
||||||
|
int err;
|
||||||
|
std::ostringstream oss;
|
||||||
|
double start = second();
|
||||||
|
|
||||||
|
// currently, only block size == 3 is supported by the FPGA backend
|
||||||
|
assert(block_size == 3);
|
||||||
|
|
||||||
|
if (verbosity < 1) {
|
||||||
|
perf_call_enabled = false;
|
||||||
|
}
|
||||||
|
// setup bitstream name and other parameters
|
||||||
|
if (fpga_bitstream.compare("") == 0) {
|
||||||
|
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified");
|
||||||
|
}
|
||||||
|
if (!fileExists(fpga_bitstream.c_str())) {
|
||||||
|
OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable");
|
||||||
|
}
|
||||||
|
// -----------------------------
|
||||||
|
// FPGA: setup the OpenCL platform
|
||||||
|
// -----------------------------
|
||||||
|
std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp
|
||||||
|
// auto-select the proper FPGA device and create context and other CL objects
|
||||||
|
err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to setup the OpenCL device (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
oss << "Detected FPGA platform type is ";
|
||||||
|
if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; }
|
||||||
|
OpmLog::info(oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
// -----------------------------
|
||||||
|
// FPGA: setup the debug buffer
|
||||||
|
// -----------------------------
|
||||||
|
// set kernel debug lines depending on an environment variable
|
||||||
|
const char *xem = getenv("XCL_EMULATION_MODE");
|
||||||
|
if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) {
|
||||||
|
debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU;
|
||||||
|
oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n";
|
||||||
|
OpmLog::info(oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
} else {
|
||||||
|
// set to 2 to reduce overhead in reading back and interpreting the debug lines;
|
||||||
|
// increase to get more debug info from the kernel
|
||||||
|
// range is 2..DEBUG_OUTBUF_WORDS_MAX-1
|
||||||
|
debug_outbuf_words = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
// host debug buffer setup
|
||||||
|
err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// device debug buffer setup
|
||||||
|
err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// copy debug buffer to device
|
||||||
|
err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// ------------------------------------------------
|
||||||
|
// FPGA: query the kernel for limits/configuration
|
||||||
|
// ------------------------------------------------
|
||||||
|
err = fpga_kernel_query(context, commands, kernel, cldebug,
|
||||||
|
debugBuffer, debug_outbuf_words,
|
||||||
|
rst_assert_cycles, rst_settle_cycles,
|
||||||
|
&hw_x_vector_elem, &hw_max_row_size,
|
||||||
|
&hw_max_column_size, &hw_max_colors_size,
|
||||||
|
&hw_max_nnzs_per_row, &hw_max_matrix_size,
|
||||||
|
&hw_use_uram, &hw_write_ilu0_results,
|
||||||
|
&hw_dma_data_width, &hw_mult_num,
|
||||||
|
&hw_x_vector_latency, &hw_add_latency, &hw_mult_latency,
|
||||||
|
&hw_num_read_ports, &hw_num_write_ports,
|
||||||
|
&hw_reset_cycles, &hw_reset_settle);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_kernel_query (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbosity >= 1) {
|
||||||
|
oss << "FPGA kernel limits/configuration:\n";
|
||||||
|
oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n";
|
||||||
|
oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n";
|
||||||
|
oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n";
|
||||||
|
oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n";
|
||||||
|
oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n";
|
||||||
|
oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n";
|
||||||
|
oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n";
|
||||||
|
oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle;
|
||||||
|
OpmLog::info(oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
// check that LU results are generated by the kernel
|
||||||
|
if (use_LU_res && !hw_write_ilu0_results) {
|
||||||
|
OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage");
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
use_LU_res = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// setup preconditioner
|
||||||
|
double start_prec = second();
|
||||||
|
prec = std::make_unique<Preconditioner>(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size);
|
||||||
|
perf_total.s_preconditioner_setup = second() - start_prec;
|
||||||
|
|
||||||
|
if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
|
||||||
|
level_scheduling = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
perf_total.s_initialization = second() - start;
|
||||||
|
} // end fpgaSolverBackend
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
FpgaSolverBackend<block_size>::~FpgaSolverBackend()
|
||||||
|
{
|
||||||
|
if (verbosity >= 1) {
|
||||||
|
generate_statistics();
|
||||||
|
}
|
||||||
|
delete[] rx;
|
||||||
|
delete[] rb;
|
||||||
|
if (nnzValArrays != nullptr) { free(nnzValArrays); }
|
||||||
|
if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); }
|
||||||
|
if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); }
|
||||||
|
// FPGA: buffers
|
||||||
|
free(debugBuffer);
|
||||||
|
for (int b = 0; b < RW_BUF; b++) {
|
||||||
|
free(dataBuffer[b]);
|
||||||
|
}
|
||||||
|
free(databufferSize);
|
||||||
|
// FPGA: OpenCL objects
|
||||||
|
if (cldebug != nullptr) { clReleaseMemObject(cldebug); }
|
||||||
|
for (int b = 0; b < RW_BUF; b++) {
|
||||||
|
if (cldata[b] != nullptr) {
|
||||||
|
clReleaseMemObject(cldata[b]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
clReleaseCommandQueue(commands);
|
||||||
|
clReleaseContext(context);
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
clReleaseProgram(program);
|
||||||
|
clReleaseDevice(device_id);
|
||||||
|
} // end ~fpgaSolverBackend()
|
||||||
|
|
||||||
|
|
||||||
|
// copy result to host memory
|
||||||
|
// caller must be sure that x is a valid array
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void FpgaSolverBackend<block_size>::get_result(double *x_)
|
||||||
|
{
|
||||||
|
double start = 0;
|
||||||
|
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
// apply to results the reordering (stored in toOrder)
|
||||||
|
reorderBlockedVectorByPattern<block_size>(mat->Nb, rx, toOrder, x_);
|
||||||
|
// TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private)
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_postprocess = second() - start;
|
||||||
|
}
|
||||||
|
} // end get_result()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res)
|
||||||
|
{
|
||||||
|
if (initialized == false) {
|
||||||
|
initialize(N_, nnz_, dim, vals, rows, cols);
|
||||||
|
if (!analyse_matrix()) {
|
||||||
|
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
perf_call.emplace_back();
|
||||||
|
update_system(vals, b);
|
||||||
|
if (!create_preconditioner()) {
|
||||||
|
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||||
|
}
|
||||||
|
solve_system(res);
|
||||||
|
|
||||||
|
if (verbosity >= 1) {
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \
|
||||||
|
", iterations: " << res.iterations << ", reduction: " << res.reduction << \
|
||||||
|
", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed;
|
||||||
|
OpmLog::info(oss.str());
|
||||||
|
}
|
||||||
|
return SolverStatus::BDA_SOLVER_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
|
||||||
|
{
|
||||||
|
double start = second();
|
||||||
|
this->N = N_;
|
||||||
|
this->nnz = nnz_;
|
||||||
|
this->nnzb = nnz_ / block_size / block_size;
|
||||||
|
Nb = (N + dim - 1) / dim;
|
||||||
|
|
||||||
|
// allocate host memory for matrices and vectors
|
||||||
|
// actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
|
||||||
|
mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
|
||||||
|
|
||||||
|
std::ostringstream oss;
|
||||||
|
oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
|
||||||
|
"block size: " << dim << ", total nnz: " << this->nnz << "\n";
|
||||||
|
oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
|
||||||
|
OpmLog::info(oss.str());
|
||||||
|
|
||||||
|
rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
|
||||||
|
rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
|
||||||
|
|
||||||
|
perf_total.s_initialization += second() - start;
|
||||||
|
initialized = true;
|
||||||
|
} // end initialize()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
bool FpgaSolverBackend<block_size>::analyse_matrix()
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
double start = second();
|
||||||
|
bool success = prec->init(mat.get());
|
||||||
|
|
||||||
|
if (!success) {
|
||||||
|
OpmLog::warning("Preconditioner for FPGA solver failed to initialize");
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
|
toOrder = prec->getToOrder();
|
||||||
|
fromOrder = prec->getFromOrder();
|
||||||
|
rMat = prec->getRMat();
|
||||||
|
processedPointers = prec->getResultPointers();
|
||||||
|
processedSizes = prec->getResultSizes();
|
||||||
|
processedPointers[19] = rb;
|
||||||
|
processedPointers[20] = rx;
|
||||||
|
nnzValArrays_size = static_cast<int*>(processedPointers[5])[0];
|
||||||
|
L_nnzValArrays_size = static_cast<int*>(processedPointers[11])[0];
|
||||||
|
U_nnzValArrays_size = static_cast<int*>(processedPointers[17])[0];
|
||||||
|
// -------------------------------------
|
||||||
|
// FPGA: setup host/device data buffers
|
||||||
|
// -------------------------------------
|
||||||
|
// allocate memory and setup data layout
|
||||||
|
err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits,
|
||||||
|
processedSizes,
|
||||||
|
&setupArray,
|
||||||
|
&nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray,
|
||||||
|
&L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray,
|
||||||
|
&U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray,
|
||||||
|
&BLKDArray, &X1Array, &R1Array,
|
||||||
|
&X2Array, &R2Array,
|
||||||
|
&LresArray, &UresArray,
|
||||||
|
&databufferSize, dataBuffer,
|
||||||
|
result_offsets, 1 /*num_nnz_arrays*/,
|
||||||
|
true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */
|
||||||
|
debugbufferSize);
|
||||||
|
if (err) {
|
||||||
|
oss << "Failed to call fpga_setup_host_datamem (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// results buffers setup
|
||||||
|
if (use_LU_res) {
|
||||||
|
resultsBufferNum = 4;
|
||||||
|
} else {
|
||||||
|
resultsBufferNum = 2;
|
||||||
|
}
|
||||||
|
if (resultsBufferNum > RES_BUF_MAX) {
|
||||||
|
oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
resultsNum = processedSizes[0]; // rowSize, invariant between system solves
|
||||||
|
for (int i = 0; i < resultsBufferNum; i++) {
|
||||||
|
resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double);
|
||||||
|
}
|
||||||
|
|
||||||
|
// device data memory setup
|
||||||
|
err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_setup_device_datamem (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: setup the kernel's parameters
|
||||||
|
// ------------------------------------
|
||||||
|
err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit,
|
||||||
|
debug_sample_rate, tolerance, cldata, cldebug);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_set_kernel_parameters (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
perf_total.s_analysis = second() - start;
|
||||||
|
analysis_done = true;
|
||||||
|
|
||||||
|
return success;
|
||||||
|
} // end analyse_matrix()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
bool FpgaSolverBackend<block_size>::create_preconditioner()
|
||||||
|
{
|
||||||
|
double start = 0;
|
||||||
|
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
memset(rx, 0, sizeof(double) * N);
|
||||||
|
bool result = prec->create_preconditioner(mat.get());
|
||||||
|
if (!result) {
|
||||||
|
OpmLog::warning("fpgaSolverBackend: create_preconditioner failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_preconditioner_create = second() - start;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
} // end create_preconditioner()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void FpgaSolverBackend<block_size>::solve_system(BdaResult &res)
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
int err;
|
||||||
|
double start = 0, start_total = 0;
|
||||||
|
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: return immediately if FPGA is disabled
|
||||||
|
// ------------------------------------
|
||||||
|
if (fpga_disabled) {
|
||||||
|
res.converged = false;
|
||||||
|
OpmLog::warning("FPGA is disabled, fallback to SW execution");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
fpga_calls++;
|
||||||
|
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
start_total = start;
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if any buffer is larger than the size set in preconditioner->init
|
||||||
|
// TODO: add check for all other buffer sizes that may overflow?
|
||||||
|
err = 0;
|
||||||
|
if ( ((int *)processedPointers[5])[0] > nnzValArrays_size ||
|
||||||
|
((int *)processedPointers[11])[0] > L_nnzValArrays_size ||
|
||||||
|
((int *)processedPointers[17])[0] > U_nnzValArrays_size ) {
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
if (err != 0) {
|
||||||
|
OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)");
|
||||||
|
}
|
||||||
|
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: copy input data to host data buffers
|
||||||
|
// ------------------------------------
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
err = fpga_copy_host_datamem(
|
||||||
|
processedPointers, processedSizes, setupArray,
|
||||||
|
nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray,
|
||||||
|
L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray,
|
||||||
|
U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray,
|
||||||
|
BLKDArray, X1Array, R1Array, X2Array, R2Array,
|
||||||
|
use_LU_res, LresArray, UresArray,
|
||||||
|
databufferSize, dataBuffer,
|
||||||
|
1 /* nnzValArrays_num */,
|
||||||
|
reset_data_buffers, fill_results_buffers,
|
||||||
|
dump_data_buffers, fpga_calls);
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_mem_setup = second() - start;
|
||||||
|
}
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: copy buffers to device
|
||||||
|
// ------------------------------------
|
||||||
|
// copy debug buffer to device
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
err = fpga_copy_to_device_debugbuf(commands,
|
||||||
|
cldebug, debugBuffer, debugbufferSize,
|
||||||
|
debug_outbuf_words);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// copy data buffers to device
|
||||||
|
err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_mem_h2d = second() - start;
|
||||||
|
}
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: execute the kernel
|
||||||
|
// ------------------------------------
|
||||||
|
double time_elapsed_ms;
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
err = fpga_kernel_run(commands, kernel, &time_elapsed_ms);
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_kernel_exec = second() - start;
|
||||||
|
}
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_kernel_run (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// ----------------------------------------
|
||||||
|
// FPGA: read back debug buffer from device
|
||||||
|
// ----------------------------------------
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10),
|
||||||
|
commands,
|
||||||
|
debug_outbuf_words, debugbufferSize,
|
||||||
|
cldebug, debugBuffer,
|
||||||
|
abort_cycles,
|
||||||
|
&kernel_cycles, &kernel_iter_run,
|
||||||
|
norms, &last_norm_idx,
|
||||||
|
&kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults,
|
||||||
|
&kernel_wrafterend, &kernel_dbgfifofull);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
if (kernel_wrafterend) {
|
||||||
|
OpmLog::warning("Detected recoverable FPGA error: kernel write after end");
|
||||||
|
}
|
||||||
|
if (kernel_dbgfifofull) {
|
||||||
|
OpmLog::warning("Detected recoverable FPGA error: debug FIFO full");
|
||||||
|
}
|
||||||
|
if (kernel_aborted || kernel_signature || kernel_overflow) {
|
||||||
|
#if defined(FPGA_EXIT_WITH_HW_FAILURE)
|
||||||
|
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
|
||||||
|
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
#else
|
||||||
|
oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
|
||||||
|
",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n";
|
||||||
|
oss << "Disabling FPGA kernel: execution will continue with SW kernel";
|
||||||
|
OpmLog::warning(oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
fpga_disabled = true;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().n_kernel_exec_cycles = kernel_cycles;
|
||||||
|
}
|
||||||
|
// copy (back) results only if FPGA is not disabled
|
||||||
|
if (!fpga_disabled) {
|
||||||
|
if (kernel_noresults) {
|
||||||
|
OpmLog::warning("FPGA kernel did not return results because the required precision is already reached");
|
||||||
|
// rx still contains zeros from initial guess
|
||||||
|
} else {
|
||||||
|
// ------------------------------------
|
||||||
|
// FPGA: read back results from device
|
||||||
|
// ------------------------------------
|
||||||
|
err = fpga_map_results(even(kernel_iter_run),
|
||||||
|
use_residuals, use_LU_res, commands,
|
||||||
|
resultsNum, resultsBufferNum, resultsBufferSize,
|
||||||
|
debugbufferSize,
|
||||||
|
cldata, resultsBuffer,
|
||||||
|
result_offsets,
|
||||||
|
dump_results, data_dir, basename, sequence);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_map_results (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
// TODO: copy results buffers to reordering output buffers
|
||||||
|
memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double));
|
||||||
|
err = fpga_unmap_results(even(kernel_iter_run),
|
||||||
|
use_residuals, use_LU_res,
|
||||||
|
commands, cldata, resultsBuffer);
|
||||||
|
if (err != 0) {
|
||||||
|
oss << "Failed to call fpga_unmap_results (" << err << ")";
|
||||||
|
OPM_THROW(std::logic_error, oss.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// set results and update statistics (if enabled)
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_mem_d2h = second() - start;
|
||||||
|
}
|
||||||
|
float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns
|
||||||
|
res.iterations = (int)iter;
|
||||||
|
res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm
|
||||||
|
res.conv_rate = pow(res.reduction, 1.0 / iter);
|
||||||
|
res.elapsed = second() - start_total;
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_solve = res.elapsed;
|
||||||
|
perf_call.back().n_kernel_exec_iters = iter;
|
||||||
|
}
|
||||||
|
// convergence depends on number of iterations reached and hw execution errors
|
||||||
|
res.converged = true;
|
||||||
|
if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) {
|
||||||
|
res.converged = false;
|
||||||
|
if (verbosity >= 1) {
|
||||||
|
oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \
|
||||||
|
", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \
|
||||||
|
", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit);
|
||||||
|
OpmLog::warning(oss.str());
|
||||||
|
oss.str("");
|
||||||
|
oss.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().converged = res.converged;
|
||||||
|
perf_call.back().converged_flags = ((unsigned int)fpga_disabled) +
|
||||||
|
((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) +
|
||||||
|
((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4);
|
||||||
|
}
|
||||||
|
} // end solve_system()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void FpgaSolverBackend<block_size>::update_system(double *vals, double *b)
|
||||||
|
{
|
||||||
|
double start = 0;
|
||||||
|
|
||||||
|
mat->nnzValues = vals;
|
||||||
|
// reorder inputs using previously found ordering (stored in fromOrder)
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
start = second();
|
||||||
|
}
|
||||||
|
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
||||||
|
if (perf_call_enabled) {
|
||||||
|
perf_call.back().s_reorder = second() - start;
|
||||||
|
}
|
||||||
|
} // end update_system()
|
||||||
|
|
||||||
|
|
||||||
|
template <unsigned int block_size>
|
||||||
|
void FpgaSolverBackend<block_size>::generate_statistics()
|
||||||
|
{
|
||||||
|
std::ostringstream oss;
|
||||||
|
unsigned int conv_iter = 0, conv_ovf = 0;
|
||||||
|
|
||||||
|
if (!perf_call_enabled || fpga_calls == 0) {
|
||||||
|
OpmLog::warning("FPGA statistics were not collected");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
std::printf("--- FPGA statistics ---\n");
|
||||||
|
std::printf("total solver calls..........: %u\n", fpga_calls);
|
||||||
|
std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization);
|
||||||
|
std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup);
|
||||||
|
|
||||||
|
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||||
|
// DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file
|
||||||
|
FILE *fout = fopen("fpga_statistics_details.csv", "w");
|
||||||
|
if (fout != nullptr) {
|
||||||
|
std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
unsigned int num_data_points = perf_call.size();
|
||||||
|
for (unsigned int i = 0; i < num_data_points; i++) {
|
||||||
|
perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create;
|
||||||
|
if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; }
|
||||||
|
if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; }
|
||||||
|
perf_total.s_analysis += perf_call[i].s_analysis;
|
||||||
|
if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; }
|
||||||
|
if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; }
|
||||||
|
perf_total.s_reorder += perf_call[i].s_reorder;
|
||||||
|
if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; }
|
||||||
|
if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; }
|
||||||
|
perf_total.s_mem_setup += perf_call[i].s_mem_setup;
|
||||||
|
if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; }
|
||||||
|
if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; }
|
||||||
|
perf_total.s_mem_h2d += perf_call[i].s_mem_h2d;
|
||||||
|
if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; }
|
||||||
|
if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; }
|
||||||
|
perf_total.s_kernel_exec += perf_call[i].s_kernel_exec;
|
||||||
|
if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; }
|
||||||
|
if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; }
|
||||||
|
perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles;
|
||||||
|
if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; }
|
||||||
|
if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; }
|
||||||
|
perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters;
|
||||||
|
if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; }
|
||||||
|
if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; }
|
||||||
|
perf_total.s_mem_d2h += perf_call[i].s_mem_d2h;
|
||||||
|
if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; }
|
||||||
|
if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; }
|
||||||
|
perf_total.s_solve += perf_call[i].s_solve;
|
||||||
|
if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; }
|
||||||
|
if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; }
|
||||||
|
perf_total.s_postprocess += perf_call[i].s_postprocess;
|
||||||
|
if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; }
|
||||||
|
if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; }
|
||||||
|
perf_total.n_converged += (unsigned int)perf_call[i].converged;
|
||||||
|
if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; }
|
||||||
|
if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; }
|
||||||
|
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||||
|
if (fout != nullptr) {
|
||||||
|
std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n",
|
||||||
|
i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder,
|
||||||
|
perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles,
|
||||||
|
perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess,
|
||||||
|
(unsigned int)perf_call[i].converged);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#if defined(FPGA_STATISTICS_FILE_ENABLED)
|
||||||
|
if (fout != nullptr) {
|
||||||
|
fclose(fout);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points;
|
||||||
|
perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points;
|
||||||
|
perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points;
|
||||||
|
perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points;
|
||||||
|
perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points;
|
||||||
|
perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points;
|
||||||
|
perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points;
|
||||||
|
perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points;
|
||||||
|
perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points;
|
||||||
|
perf_total.s_solve_avg = perf_total.s_solve / num_data_points;
|
||||||
|
perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points;
|
||||||
|
std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max);
|
||||||
|
std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max);
|
||||||
|
std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max);
|
||||||
|
std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max);
|
||||||
|
std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max);
|
||||||
|
std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max);
|
||||||
|
std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n",
|
||||||
|
perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max);
|
||||||
|
std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n",
|
||||||
|
perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max);
|
||||||
|
std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max);
|
||||||
|
std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max);
|
||||||
|
std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
|
||||||
|
perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max);
|
||||||
|
std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n",
|
||||||
|
perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf);
|
||||||
|
std::printf("-----------------------\n");
|
||||||
|
} //end generate_statistics()
|
||||||
|
|
||||||
|
|
||||||
|
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||||
|
template FpgaSolverBackend<n>::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \
|
||||||
|
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(2);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(3);
|
||||||
|
INSTANTIATE_BDA_FUNCTIONS(4);
|
||||||
|
|
||||||
|
#undef INSTANTIATE_BDA_FUNCTIONS
|
||||||
|
|
||||||
|
} //namespace bda
|
265
opm/simulators/linalg/bda/FPGASolverBackend.hpp
Normal file
265
opm/simulators/linalg/bda/FPGASolverBackend.hpp
Normal file
@ -0,0 +1,265 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
|
||||||
|
#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
|
||||||
|
|
||||||
|
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
|
||||||
|
#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
|
||||||
|
|
||||||
|
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
|
||||||
|
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
|
||||||
|
#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
/// This class implements an ilu0-bicgstab solver on FPGA
|
||||||
|
template <unsigned int block_size>
|
||||||
|
class FpgaSolverBackend : public BdaSolver<block_size>
|
||||||
|
{
|
||||||
|
typedef BdaSolver<block_size> Base;
|
||||||
|
typedef FPGABILU0<block_size> Preconditioner;
|
||||||
|
|
||||||
|
using Base::N;
|
||||||
|
using Base::Nb;
|
||||||
|
using Base::nnz;
|
||||||
|
using Base::nnzb;
|
||||||
|
using Base::verbosity;
|
||||||
|
using Base::maxit;
|
||||||
|
using Base::tolerance;
|
||||||
|
using Base::initialized;
|
||||||
|
|
||||||
|
private:
|
||||||
|
double *rx = nullptr; // reordered x
|
||||||
|
double *rb = nullptr; // reordered b
|
||||||
|
int *fromOrder = nullptr, *toOrder = nullptr;
|
||||||
|
bool analysis_done = false;
|
||||||
|
bool level_scheduling = false;
|
||||||
|
|
||||||
|
// LUMat will shallow copy rowPointers and colIndices of mat/rMat
|
||||||
|
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
|
||||||
|
BlockedMatrix<block_size> *rMat = nullptr;
|
||||||
|
std::unique_ptr<Preconditioner> prec = nullptr;
|
||||||
|
|
||||||
|
// vectors with data processed by the preconditioner (input to the kernel)
|
||||||
|
void **processedPointers = nullptr;
|
||||||
|
int *processedSizes = nullptr;
|
||||||
|
|
||||||
|
unsigned int fpga_calls = 0;
|
||||||
|
bool perf_call_enabled = true;
|
||||||
|
|
||||||
|
// per call performance metrics
|
||||||
|
typedef struct {
|
||||||
|
double s_preconditioner_create = 0.0;
|
||||||
|
double s_analysis = 0.0;
|
||||||
|
double s_reorder = 0.0;
|
||||||
|
double s_mem_setup = 0.0;
|
||||||
|
double s_mem_h2d = 0.0;
|
||||||
|
double s_kernel_exec = 0.0;
|
||||||
|
unsigned int n_kernel_exec_cycles = 0;
|
||||||
|
float n_kernel_exec_iters = 0.0;
|
||||||
|
double s_mem_d2h = 0.0;
|
||||||
|
double s_solve = 0.0;
|
||||||
|
double s_postprocess = 0.0;
|
||||||
|
bool converged = false;
|
||||||
|
unsigned int converged_flags = 0;
|
||||||
|
} perf_call_metrics_t;
|
||||||
|
// cumulative performance metrics
|
||||||
|
typedef struct {
|
||||||
|
double s_initialization;
|
||||||
|
double s_preconditioner_setup;
|
||||||
|
double s_preconditioner_create;
|
||||||
|
double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
|
||||||
|
double s_analysis;
|
||||||
|
double s_analysis_min,s_analysis_max,s_analysis_avg;
|
||||||
|
double s_reorder;
|
||||||
|
double s_reorder_min,s_reorder_max,s_reorder_avg;
|
||||||
|
double s_mem_setup;
|
||||||
|
double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
|
||||||
|
double s_mem_h2d;
|
||||||
|
double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
|
||||||
|
double s_kernel_exec;
|
||||||
|
double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
|
||||||
|
unsigned long n_kernel_exec_cycles;
|
||||||
|
unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
|
||||||
|
float n_kernel_exec_iters;
|
||||||
|
float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
|
||||||
|
double s_mem_d2h;
|
||||||
|
double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
|
||||||
|
double s_solve;
|
||||||
|
double s_solve_min,s_solve_max,s_solve_avg;
|
||||||
|
double s_postprocess;
|
||||||
|
double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
|
||||||
|
unsigned int n_converged;
|
||||||
|
} perf_total_metrics_t;
|
||||||
|
std::vector<perf_call_metrics_t> perf_call;
|
||||||
|
perf_total_metrics_t perf_total;
|
||||||
|
// fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
|
||||||
|
// fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
|
||||||
|
unsigned int fpga_config_bits = 0;
|
||||||
|
bool fpga_disabled = false;
|
||||||
|
bool platform_awsf1;
|
||||||
|
unsigned int debugbufferSize;
|
||||||
|
unsigned long int *debugBuffer = nullptr;
|
||||||
|
unsigned int *databufferSize = nullptr;
|
||||||
|
unsigned char *dataBuffer[RW_BUF] = {nullptr};
|
||||||
|
unsigned int debug_outbuf_words;
|
||||||
|
int resultsNum;
|
||||||
|
int resultsBufferNum;
|
||||||
|
unsigned int resultsBufferSize[RES_BUF_MAX];
|
||||||
|
unsigned int result_offsets[6];
|
||||||
|
unsigned int kernel_cycles, kernel_iter_run;
|
||||||
|
double norms[4];
|
||||||
|
unsigned char last_norm_idx;
|
||||||
|
bool kernel_aborted, kernel_signature, kernel_overflow;
|
||||||
|
bool kernel_noresults;
|
||||||
|
bool kernel_wrafterend, kernel_dbgfifofull;
|
||||||
|
bool use_residuals = false;
|
||||||
|
bool use_LU_res = false;
|
||||||
|
int sequence = 0;
|
||||||
|
// TODO: these values may be sent via command line parameters
|
||||||
|
unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
|
||||||
|
unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
|
||||||
|
int nnzValArrays_size = 0;
|
||||||
|
int L_nnzValArrays_size = 0;
|
||||||
|
int U_nnzValArrays_size = 0;
|
||||||
|
// aliases to areas of the host data buffers
|
||||||
|
long unsigned int *setupArray = nullptr;
|
||||||
|
double **nnzValArrays = nullptr;
|
||||||
|
short unsigned int *columnIndexArray = nullptr;
|
||||||
|
unsigned char *newRowOffsetArray = nullptr;
|
||||||
|
unsigned int *PIndexArray = nullptr;
|
||||||
|
unsigned int *colorSizesArray = nullptr;
|
||||||
|
double **L_nnzValArrays = nullptr;
|
||||||
|
short unsigned int *L_columnIndexArray = nullptr;
|
||||||
|
unsigned char *L_newRowOffsetArray = nullptr;
|
||||||
|
unsigned int *L_PIndexArray = nullptr;
|
||||||
|
unsigned int *L_colorSizesArray = nullptr;
|
||||||
|
double **U_nnzValArrays = nullptr;
|
||||||
|
short unsigned int *U_columnIndexArray = nullptr;
|
||||||
|
unsigned char *U_newRowOffsetArray = nullptr;
|
||||||
|
unsigned int *U_PIndexArray = nullptr;
|
||||||
|
unsigned int *U_colorSizesArray = nullptr;
|
||||||
|
double *BLKDArray = nullptr;
|
||||||
|
double *X1Array = nullptr, *X2Array = nullptr;
|
||||||
|
double *R1Array = nullptr, *R2Array = nullptr;
|
||||||
|
double *LresArray = nullptr, *UresArray = nullptr;
|
||||||
|
double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
|
||||||
|
// OpenCL variables
|
||||||
|
cl_device_id device_id;
|
||||||
|
cl_context context;
|
||||||
|
cl_command_queue commands;
|
||||||
|
cl_program program;
|
||||||
|
cl_kernel kernel;
|
||||||
|
cl_mem cldata[RW_BUF] = {nullptr};
|
||||||
|
cl_mem cldebug = nullptr;
|
||||||
|
// HW limits/configuration variables
|
||||||
|
unsigned int hw_x_vector_elem;
|
||||||
|
unsigned int hw_max_row_size;
|
||||||
|
unsigned int hw_max_column_size;
|
||||||
|
unsigned int hw_max_colors_size;
|
||||||
|
unsigned short hw_max_nnzs_per_row;
|
||||||
|
unsigned int hw_max_matrix_size;
|
||||||
|
bool hw_use_uram;
|
||||||
|
bool hw_write_ilu0_results;
|
||||||
|
unsigned short hw_dma_data_width;
|
||||||
|
unsigned char hw_x_vector_latency;
|
||||||
|
unsigned char hw_add_latency;
|
||||||
|
unsigned char hw_mult_latency;
|
||||||
|
unsigned char hw_mult_num;
|
||||||
|
unsigned char hw_num_read_ports;
|
||||||
|
unsigned char hw_num_write_ports;
|
||||||
|
unsigned short hw_reset_cycles;
|
||||||
|
unsigned short hw_reset_settle;
|
||||||
|
// debug
|
||||||
|
bool reset_data_buffers = false;
|
||||||
|
bool fill_results_buffers = false;
|
||||||
|
int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
|
||||||
|
bool dump_results = false;
|
||||||
|
char *data_dir = nullptr;
|
||||||
|
char *basename = nullptr;
|
||||||
|
unsigned short rst_assert_cycles = 0;
|
||||||
|
unsigned short rst_settle_cycles = 0;
|
||||||
|
|
||||||
|
/// Allocate host memory
|
||||||
|
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
|
||||||
|
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||||
|
/// \param[in] dim size of block
|
||||||
|
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||||
|
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||||
|
/// \param[in] cols array of columnIndices, contains nnz values
|
||||||
|
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
|
||||||
|
|
||||||
|
/// Reorder the linear system so it corresponds with the coloring
|
||||||
|
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||||
|
/// \param[in] b input vector
|
||||||
|
void update_system(double *vals, double *b);
|
||||||
|
|
||||||
|
/// Analyse sparsity pattern to extract parallelism
|
||||||
|
/// \return true iff analysis was successful
|
||||||
|
bool analyse_matrix();
|
||||||
|
|
||||||
|
/// Perform ilu0-decomposition
|
||||||
|
/// \return true iff decomposition was successful
|
||||||
|
bool create_preconditioner();
|
||||||
|
|
||||||
|
/// Solve linear system
|
||||||
|
/// \param[inout] res summary of solver result
|
||||||
|
void solve_system(BdaResult &res);
|
||||||
|
|
||||||
|
/// Generate FPGA backend statistics
|
||||||
|
void generate_statistics(void);
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
/// Construct an fpgaSolver
|
||||||
|
/// \param[in] fpga_bitstream FPGA bitstream file name
|
||||||
|
/// \param[in] linear_solver_verbosity verbosity of fpgaSolver
|
||||||
|
/// \param[in] maxit maximum number of iterations for fpgaSolver
|
||||||
|
/// \param[in] tolerance required relative tolerance for fpgaSolver
|
||||||
|
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
|
||||||
|
FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
|
||||||
|
|
||||||
|
/// Destroy an fpgaSolver, and free memory
|
||||||
|
~FpgaSolverBackend();
|
||||||
|
|
||||||
|
/// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
|
||||||
|
/// \param[in] N number of rows, divide by dim to get number of blockrows
|
||||||
|
/// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks
|
||||||
|
/// \param[in] dim size of block
|
||||||
|
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||||
|
/// \param[in] rows array of rowPointers, contains N/dim+1 values
|
||||||
|
/// \param[in] cols array of columnIndices, contains nnz values
|
||||||
|
/// \param[in] b input vector, contains N values
|
||||||
|
/// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A
|
||||||
|
/// \param[inout] res summary of solver result
|
||||||
|
/// \return status code
|
||||||
|
SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
|
||||||
|
|
||||||
|
/// Get result after linear solve, and peform postprocessing if necessary
|
||||||
|
/// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array
|
||||||
|
void get_result(double *x) override;
|
||||||
|
|
||||||
|
}; // end class fpgaSolverBackend
|
||||||
|
|
||||||
|
} //namespace bda
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
63
opm/simulators/linalg/bda/FPGAUtils.cpp
Normal file
63
opm/simulators/linalg/bda/FPGAUtils.cpp
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
double second(void)
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, nullptr);
|
||||||
|
return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool even(int n)
|
||||||
|
{
|
||||||
|
if (n % 2 == 0) {
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int roundUpTo(int i, int n)
|
||||||
|
{
|
||||||
|
if (i % n == 0) {
|
||||||
|
return i;
|
||||||
|
} else {
|
||||||
|
return (i / n + 1) * n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool fileExists(const char *filename)
|
||||||
|
{
|
||||||
|
FILE *fin;
|
||||||
|
fin = fopen(filename, "r");
|
||||||
|
if (fin == nullptr) {
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
fclose(fin);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} //namespace bda
|
39
opm/simulators/linalg/bda/FPGAUtils.hpp
Normal file
39
opm/simulators/linalg/bda/FPGAUtils.hpp
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2020 Equinor ASA
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef FPGA_UTILS_HEADER_INCLUDED
|
||||||
|
#define FPGA_UTILS_HEADER_INCLUDED
|
||||||
|
|
||||||
|
namespace bda
|
||||||
|
{
|
||||||
|
|
||||||
|
union double2int
|
||||||
|
{
|
||||||
|
unsigned long int int_val;
|
||||||
|
double double_val;
|
||||||
|
};
|
||||||
|
|
||||||
|
double second(void);
|
||||||
|
bool even(int n);
|
||||||
|
int roundUpTo(int i, int n);
|
||||||
|
bool fileExists(const char *filename);
|
||||||
|
|
||||||
|
} // end namespace bda
|
||||||
|
|
||||||
|
#endif // FPGA_UTILS_HEADER_INCLUDED
|
@ -17,12 +17,7 @@
|
|||||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <vector>
|
|
||||||
#include <cstring>
|
|
||||||
#include <algorithm> // for fill()
|
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <limits>
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include <opm/common/ErrorMacros.hpp>
|
#include <opm/common/ErrorMacros.hpp>
|
||||||
|
|
||||||
|
@ -20,6 +20,8 @@
|
|||||||
#ifndef REORDER_HPP
|
#ifndef REORDER_HPP
|
||||||
#define REORDER_HPP
|
#define REORDER_HPP
|
||||||
|
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
|
||||||
|
|
||||||
namespace bda
|
namespace bda
|
||||||
|
@ -28,15 +28,18 @@
|
|||||||
namespace Opm
|
namespace Opm
|
||||||
{
|
{
|
||||||
|
|
||||||
WellContributions::WellContributions(std::string gpu_mode){
|
WellContributions::WellContributions(std::string accelerator_mode){
|
||||||
if(gpu_mode.compare("cusparse") == 0){
|
if(accelerator_mode.compare("cusparse") == 0){
|
||||||
cuda_gpu = true;
|
cuda_gpu = true;
|
||||||
}
|
}
|
||||||
else if(gpu_mode.compare("opencl") == 0){
|
else if(accelerator_mode.compare("opencl") == 0){
|
||||||
opencl_gpu = true;
|
opencl_gpu = true;
|
||||||
}
|
}
|
||||||
|
else if(accelerator_mode.compare("fpga") == 0){
|
||||||
|
// unused for FPGA, but must be defined to avoid error
|
||||||
|
}
|
||||||
else{
|
else{
|
||||||
OPM_THROW(std::logic_error, "Error: invalid GPU mode");
|
OPM_THROW(std::logic_error, "Invalid accelerator mode");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -176,7 +176,7 @@ public:
|
|||||||
void alloc();
|
void alloc();
|
||||||
|
|
||||||
/// Create a new WellContributions
|
/// Create a new WellContributions
|
||||||
WellContributions(std::string gpu_mode);
|
WellContributions(std::string accelerator_mode);
|
||||||
|
|
||||||
/// Destroy a WellContributions, and free memory
|
/// Destroy a WellContributions, and free memory
|
||||||
~WellContributions();
|
~WellContributions();
|
||||||
|
Loading…
Reference in New Issue
Block a user