From b4aa28771f63a52c8a9fef58f3f569871666eab2 Mon Sep 17 00:00:00 2001 From: Giacomo Marchiori Date: Tue, 22 Dec 2020 12:57:01 +0100 Subject: [PATCH 1/4] Added fpgaSolver --- CMakeLists.txt | 74 +- CMakeLists_files.cmake | 18 + opm-simulators-prereqs.cmake | 1 + .../linalg/FlowLinearSolverParameters.hpp | 30 +- opm/simulators/linalg/ISTLSolverEbos.hpp | 55 +- opm/simulators/linalg/bda/BILU0.cpp | 96 ++- opm/simulators/linalg/bda/BILU0.hpp | 10 +- opm/simulators/linalg/bda/BdaBridge.cpp | 52 +- opm/simulators/linalg/bda/BdaBridge.hpp | 20 +- opm/simulators/linalg/bda/BdaSolver.hpp | 6 +- opm/simulators/linalg/bda/BlockedMatrix.cpp | 409 +++++++++- opm/simulators/linalg/bda/BlockedMatrix.hpp | 71 +- opm/simulators/linalg/bda/FPGABILU0.cpp | 413 ++++++++++ opm/simulators/linalg/bda/FPGABILU0.hpp | 117 +++ opm/simulators/linalg/bda/FPGAMatrix.cpp | 249 ++++++ opm/simulators/linalg/bda/FPGAMatrix.hpp | 84 ++ .../linalg/bda/FPGASolverBackend.cpp | 732 ++++++++++++++++++ .../linalg/bda/FPGASolverBackend.hpp | 265 +++++++ opm/simulators/linalg/bda/FPGAUtils.cpp | 63 ++ opm/simulators/linalg/bda/FPGAUtils.hpp | 39 + opm/simulators/linalg/bda/Reorder.cpp | 21 +- opm/simulators/linalg/bda/Reorder.hpp | 2 + .../linalg/bda/WellContributions.cpp | 11 +- .../linalg/bda/WellContributions.hpp | 2 +- 24 files changed, 2689 insertions(+), 151 deletions(-) create mode 100644 opm/simulators/linalg/bda/FPGABILU0.cpp create mode 100644 opm/simulators/linalg/bda/FPGABILU0.hpp create mode 100644 opm/simulators/linalg/bda/FPGAMatrix.cpp create mode 100644 opm/simulators/linalg/bda/FPGAMatrix.hpp create mode 100644 opm/simulators/linalg/bda/FPGASolverBackend.cpp create mode 100644 opm/simulators/linalg/bda/FPGASolverBackend.hpp create mode 100644 opm/simulators/linalg/bda/FPGAUtils.cpp create mode 100644 opm/simulators/linalg/bda/FPGAUtils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 678fb41ca..77dc99efc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,9 +132,72 @@ if(CUDA_FOUND) include_directories(${CUDA_INCLUDE_DIRS}) endif() - find_package(OpenCL) +# include FPGA target only when ENABLE_FPGA is set to ON +if(ENABLE_FPGA) + # must include opencl.h which is a wrapper for all the OpenCL extensions + find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS}) + + # FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in /../FPGA + find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA) + + # this code only runs if the user explicitly enabled the FPGA + # hence fatal_error instead of warning + if(NOT OPENCL_H OR NOT OpenCL_FOUND) + set(HAVE_FPGA 0) + message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.") + elseif(NOT FPGA_MODULE) + set(HAVE_FPGA 0) + message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.") + elseif(NOT DEFINED ENV{XILINX_XRT}) + # Xilinx XRT must be installed and properly setup + set(HAVE_FPGA 0) + message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.") + else() + set(HAVE_FPGA 1) + message(STATUS "FPGA library and kernel integration active.") + + # FIXME: set the correct path to the FPGA module + set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA) + + # configuration variables with default values: they can be overridden on the cmake command line + # FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream + if(NOT FPGA_PORTS_CONFIG) + set(FPGA_PORTS_CONFIG 2r_3r3w_ddr) + endif() + # FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build) + if(NOT FPGA_DEBUG_LEVEL) + set(FPGA_DEBUG_LEVEL 0) + endif() + message(STATUS "Using the following settings for the FPGA library compilation: " + "FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, " + "FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}") + add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG}) + add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}) + # include directories for the FPGA library + include_directories(${FPGA_SOURCE_DIR}) + include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app) + include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common) + # add external project to compile the FPGA library + include (ExternalProject) + ExternalProject_Add(FPGA_library + # force the build step to always be run because source dependencies cannot be made explicit + BUILD_ALWAYS 1 + DOWNLOAD_COMMAND "" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND make + -f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile + SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app + PORTS_CONFIG=${FPGA_PORTS_CONFIG} + DEBUG_LEVEL=${FPGA_DEBUG_LEVEL} + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + endif() +endif() + if(OpenCL_FOUND) # the current OpenCL implementation relies on cl.hpp, not cl2.hpp # make sure it is available, otherwise disable OpenCL @@ -466,7 +529,8 @@ endif() add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests) -# must link libraries after target 'flow' has been defined +# must link libraries after target 'opmsimulators' has been defined + if(CUDA_FOUND) target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} ) target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} ) @@ -475,3 +539,9 @@ endif() if(OpenCL_FOUND) target_link_libraries( opmsimulators ${OpenCL_LIBRARIES} ) endif() + +if(HAVE_FPGA) + add_dependencies(opmsimulators FPGA_library) + ExternalProject_Get_Property(FPGA_library binary_dir) + target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a) +endif() diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 500c31102..6b3675c4d 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -66,6 +66,16 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) endif() +if(CMAKE_ENABLE_FPGA) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) + list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) +endif() + if(MPI_FOUND) list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp opm/simulators/utils/ParallelSerialization.cpp) @@ -171,6 +181,14 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/cuda_header.hpp opm/simulators/linalg/bda/cusparseSolverBackend.hpp opm/simulators/linalg/bda/ChowPatelIlu.hpp + opm/simulators/linalg/bda/FPGAMatrix.hpp + opm/simulators/linalg/bda/FPGABILU0.hpp + opm/simulators/linalg/bda/FPGASolverBackend.hpp + opm/simulators/linalg/bda/fpga/src/sda_app/bicgstab_solver_config.hpp + opm/simulators/linalg/bda/fpga/src/sda_app/common/dev_config.hpp + opm/simulators/linalg/bda/fpga/src/sda_app/common/fpga_functions_bicgstab.hpp + opm/simulators/linalg/bda/fpga/src/sda_app/common/opencl_lib.hpp + opm/simulators/linalg/bda/FPGAUtils.hpp opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/ILUReorder.hpp opm/simulators/linalg/bda/opencl.hpp diff --git a/opm-simulators-prereqs.cmake b/opm-simulators-prereqs.cmake index 330dc3ea4..e66fddaa8 100644 --- a/opm-simulators-prereqs.cmake +++ b/opm-simulators-prereqs.cmake @@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR HAVE_PETSC HAVE_CUDA HAVE_OPENCL + HAVE_FPGA HAVE_SUITESPARSE_UMFPACK_H HAVE_DUNE_ISTL DUNE_ISTL_VERSION_MAJOR diff --git a/opm/simulators/linalg/FlowLinearSolverParameters.hpp b/opm/simulators/linalg/FlowLinearSolverParameters.hpp index ae14f9ac0..3244d4d40 100644 --- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp +++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp @@ -118,7 +118,7 @@ struct Linsolver { using type = UndefinedProperty; }; template -struct GpuMode { +struct AcceleratorMode { using type = UndefinedProperty; }; template @@ -133,6 +133,10 @@ template struct OpenclIluReorder { using type = UndefinedProperty; }; +template +struct FpgaBitstream { + using type = UndefinedProperty; +}; template struct LinearSolverReduction { @@ -213,7 +217,7 @@ struct Linsolver { static constexpr auto value = "ilu0"; }; template -struct GpuMode { +struct AcceleratorMode { static constexpr auto value = "none"; }; template @@ -226,7 +230,11 @@ struct OpenclPlatformId { }; template struct OpenclIluReorder { - static constexpr auto value = "graph_coloring"; + static constexpr auto value = ""; // note: default value is chosen depending on the solver used +}; +template +struct FpgaBitstream { + static constexpr auto value = ""; }; } // namespace Opm::Properties @@ -252,12 +260,13 @@ namespace Opm bool ignoreConvergenceFailure_; bool scale_linear_system_; std::string linsolver_; - std::string gpu_mode_; + std::string accelerator_mode_; int bda_device_id_; int opencl_platform_id_; int cpr_max_ell_iter_ = 20; int cpr_reuse_setup_ = 0; std::string opencl_ilu_reorder_; + std::string fpga_bitstream_; template void init() @@ -279,10 +288,11 @@ namespace Opm cpr_max_ell_iter_ = EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter); cpr_reuse_setup_ = EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup); linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver); - gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); + accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); + fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); } template @@ -304,10 +314,11 @@ namespace Opm EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver"); EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate"); EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'"); - EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'"); + EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'"); EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs"); - EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly."); + EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly."); + EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream='"); } FlowLinearSolverParameters() { reset(); } @@ -327,10 +338,11 @@ namespace Opm ilu_milu_ = MILU_VARIANT::ILU; ilu_redblack_ = false; ilu_reorder_sphere_ = true; - gpu_mode_ = "none"; + accelerator_mode_ = "none"; bda_device_id_ = 0; opencl_platform_id_ = 0; - opencl_ilu_reorder_ = "graph_coloring"; + opencl_ilu_reorder_ = ""; // note: the default value is chosen depending on the solver used + fpga_bitstream_ = ""; } }; diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index eb83b7d3f..e18687702 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -35,7 +35,7 @@ #include -#if HAVE_CUDA || HAVE_OPENCL +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA #include #endif @@ -92,7 +92,7 @@ namespace Opm using WellModelOperator = WellModelAsLinearOperator; using ElementMapper = GetPropType; -#if HAVE_CUDA || HAVE_OPENCL +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA static const unsigned int block_size = Matrix::block_type::rows; std::unique_ptr> bdaBridge; #endif @@ -126,14 +126,14 @@ namespace Opm #endif parameters_.template init(); prm_ = setupPropertyTree(parameters_); -#if HAVE_CUDA || HAVE_OPENCL +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA { - std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); - if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) { + std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); + if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) { if (on_io_rank) { - OpmLog::warning("Cannot use GPU with MPI, GPU is disabled"); + OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled"); } - gpu_mode = "none"; + accelerator_mode = "none"; } const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); @@ -141,11 +141,12 @@ namespace Opm const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder); const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; - bdaBridge.reset(new BdaBridge(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder)); + std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); + bdaBridge.reset(new BdaBridge(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder)); } #else - if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") { - OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake"); + if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") { + OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled"); } #endif extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_); @@ -157,6 +158,12 @@ namespace Opm detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_); useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); +#if HAVE_FPGA + // check usage of MatrixAddWellContributions: for FPGA they must be included + if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) { + OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true"); + } +#endif const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst); if (!ownersFirst) { const std::string msg = "The linear solver no longer supports --owner-cells-first=false."; @@ -242,43 +249,49 @@ namespace Opm // Solve system. Dune::InverseOperatorResult result; - bool gpu_was_used = false; + bool accelerator_was_used = false; // Use GPU if: available, chosen by user, and successful. -#if HAVE_CUDA || HAVE_OPENCL + // Use FPGA if: support compiled, chosen by user, and successful. +#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA bool use_gpu = bdaBridge->getUseGpu(); - if (use_gpu) { - const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode); - WellContributions wellContribs(gpu_mode); - + bool use_fpga = bdaBridge->getUseFpga(); + if (use_gpu || use_fpga) { + const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); + WellContributions wellContribs(accelerator_mode); bdaBridge->initWellContributions(wellContribs); if (!useWellConn_) { simulator_.problem().wellModel().getWellContributions(wellContribs); } + // Const_cast needed since the CUDA stuff overwrites values for better matrix condition.. bdaBridge->solve_system(const_cast(&getMatrix()), *rhs_, wellContribs, result); if (result.converged) { // get result vector x from non-Dune backend, iff solve was successful bdaBridge->get_result(x); - gpu_was_used = true; + accelerator_was_used = true; } else { // CPU fallback use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver - if (use_gpu && simulator_.gridView().comm().rank() == 0) { - if (gpu_mode.compare("cusparse") == 0) { + use_fpga = bdaBridge->getUseFpga(); + if (simulator_.gridView().comm().rank() == 0) { + if (use_gpu && accelerator_mode.compare("cusparse") == 0) { OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system..."); } - if (gpu_mode.compare("opencl") == 0) { + if (use_gpu && accelerator_mode.compare("opencl") == 0) { OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system..."); } + if (use_fpga && accelerator_mode.compare("fpga") == 0) { + OpmLog::warning("fpgaSolver did not converge, now trying Dune to solve current linear system..."); + } } } } #endif // Otherwise, use flexible istl solver. - if (!gpu_was_used) { + if (!accelerator_was_used) { assert(flexibleSolver_); flexibleSolver_->apply(x, *rhs_, result); } diff --git a/opm/simulators/linalg/bda/BILU0.cpp b/opm/simulators/linalg/bda/BILU0.cpp index 6dc037468..013b04b93 100644 --- a/opm/simulators/linalg/bda/BILU0.cpp +++ b/opm/simulators/linalg/bda/BILU0.cpp @@ -33,24 +33,19 @@ namespace bda { - using Opm::OpmLog; - using Dune::Timer; +using Opm::OpmLog; +using Dune::Timer; - template - BILU0::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) : - verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_) - {} +template +BILU0::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) : + verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_) +{} - template - BILU0::~BILU0() - { - delete[] invDiagVals; - delete[] diagIndex; - if (opencl_ilu_reorder != ILUReorder::NONE) { - delete[] toOrder; - delete[] fromOrder; - } - } +template +BILU0::~BILU0() +{ + delete[] invDiagVals; +} template bool BILU0::init(BlockedMatrix *mat) @@ -68,8 +63,8 @@ namespace bda if (opencl_ilu_reorder == ILUReorder::NONE) { LUmat = std::make_unique >(*mat); } else { - toOrder = new int[Nb]; - fromOrder = new int[Nb]; + toOrder.resize(Nb); + fromOrder.resize(Nb); CSCRowIndices = new int[nnzbs]; CSCColPointers = new int[Nb + 1]; rmat = std::make_shared >(mat->Nb, mat->nnzbs); @@ -88,10 +83,10 @@ namespace bda std::ostringstream out; if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { out << "BILU0 reordering strategy: " << "level_scheduling\n"; - findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); + findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { out << "BILU0 reordering strategy: " << "graph_coloring\n"; - findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor); + findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); } else if (opencl_ilu_reorder == ILUReorder::NONE) { out << "BILU0 reordering strategy: none\n"; // numColors = 1; @@ -111,12 +106,13 @@ namespace bda } OpmLog::info(out.str()); + if (opencl_ilu_reorder != ILUReorder::NONE) { delete[] CSCRowIndices; delete[] CSCColPointers; } - diagIndex = new int[mat->Nb]; + diagIndex.resize(mat->Nb); invDiagVals = new double[mat->Nb * bs * bs]; #if CHOW_PATEL @@ -158,8 +154,8 @@ namespace bda OPM_THROW(std::logic_error, "BILU0 OpenCL enqueueWriteBuffer error"); } - return true; - } // end init() + return true; +} // end init() // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015 @@ -482,7 +478,7 @@ namespace bda diagIndex[row] = candidate - LUmat->colIndices; } events.resize(8); - queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]); queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]); queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]); queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]); @@ -516,7 +512,7 @@ namespace bda if (opencl_ilu_reorder != ILUReorder::NONE) { m = rmat.get(); Timer t_reorder; - reorderBlockedMatrixByPattern(mat, toOrder, fromOrder, rmat.get()); + reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rmat.get()); if (verbosity >= 3){ std::ostringstream out; @@ -556,7 +552,7 @@ namespace bda diagIndex[row] = candidate - LUmat->colIndices; } events.resize(4); - queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]); + queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]); queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]); queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]); }); @@ -637,30 +633,30 @@ namespace bda } - template - void BILU0::setOpenCLContext(cl::Context *context_){ - this->context = context_; - } - template - void BILU0::setOpenCLQueue(cl::CommandQueue *queue_){ - this->queue = queue_; - } - template - void BILU0::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_){ - this->work_group_size = work_group_size_; - this->total_work_items = total_work_items_; - this->lmem_per_work_group = lmem_per_work_group_; - } - template - void BILU0::setKernels( - cl::make_kernel *ILU_apply1_, - cl::make_kernel *ILU_apply2_, - cl::make_kernel *ilu_decomp_k_ - ){ - this->ILU_apply1 = ILU_apply1_; - this->ILU_apply2 = ILU_apply2_; - this->ilu_decomp_k = ilu_decomp_k_; - } +template +void BILU0::setOpenCLContext(cl::Context *context_) { + this->context = context_; +} +template +void BILU0::setOpenCLQueue(cl::CommandQueue *queue_) { + this->queue = queue_; +} +template +void BILU0::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) { + this->work_group_size = work_group_size_; + this->total_work_items = total_work_items_; + this->lmem_per_work_group = lmem_per_work_group_; +} +template +void BILU0::setKernels( + cl::make_kernel *ILU_apply1_, + cl::make_kernel *ILU_apply2_, + cl::make_kernel *ilu_decomp_k_ +){ + this->ILU_apply1 = ILU_apply1_; + this->ILU_apply2 = ILU_apply2_; + this->ilu_decomp_k = ilu_decomp_k_; +} #define INSTANTIATE_BDA_FUNCTIONS(n) \ diff --git a/opm/simulators/linalg/bda/BILU0.hpp b/opm/simulators/linalg/bda/BILU0.hpp index aeca4817e..33c5966bc 100644 --- a/opm/simulators/linalg/bda/BILU0.hpp +++ b/opm/simulators/linalg/bda/BILU0.hpp @@ -61,10 +61,10 @@ namespace bda std::unique_ptr > Lmat = nullptr, Umat = nullptr; #endif double *invDiagVals = nullptr; - int *diagIndex = nullptr; - std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel + std::vector diagIndex; + std::vector rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel std::vector rowsPerColorPrefix; // the prefix sum of rowsPerColor - int *toOrder = nullptr, *fromOrder = nullptr; + std::vector toOrder, fromOrder; int numColors; int verbosity; std::once_flag pattern_uploaded; @@ -128,12 +128,12 @@ namespace bda int* getToOrder() { - return toOrder; + return toOrder.data(); } int* getFromOrder() { - return fromOrder; + return fromOrder.data(); } BlockedMatrix* getRMat() diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 9f1bbca16..293994122 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -18,8 +18,6 @@ */ #include -#include -#include #include #include @@ -54,21 +52,23 @@ namespace Opm using bda::ILUReorder; template -BdaBridge::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) -: gpu_mode(gpu_mode_) +BdaBridge::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED) +: accelerator_mode(accelerator_mode_) { - if (gpu_mode.compare("cusparse") == 0) { + if (accelerator_mode.compare("cusparse") == 0) { #if HAVE_CUDA use_gpu = true; backend.reset(new bda::cusparseSolverBackend(linear_solver_verbosity, maxit, tolerance, deviceID)); #else OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake"); #endif - } else if (gpu_mode.compare("opencl") == 0) { + } else if (accelerator_mode.compare("opencl") == 0) { #if HAVE_OPENCL use_gpu = true; - ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING; - if (opencl_ilu_reorder == "level_scheduling") { + ILUReorder ilu_reorder; + if (opencl_ilu_reorder == "") { + ilu_reorder = bda::ILUReorder::GRAPH_COLORING; // default when not selected by user + } else if (opencl_ilu_reorder == "level_scheduling") { ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; } else if (opencl_ilu_reorder == "graph_coloring") { ilu_reorder = bda::ILUReorder::GRAPH_COLORING; @@ -81,10 +81,28 @@ BdaBridge::BdaBridge(std::string gpu_mod #else OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); #endif - } else if (gpu_mode.compare("none") == 0) { + } else if (accelerator_mode.compare("fpga") == 0) { +#if HAVE_FPGA + use_fpga = true; + ILUReorder ilu_reorder; + if (opencl_ilu_reorder == "") { + ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; // default when not selected by user + } else if (opencl_ilu_reorder == "level_scheduling") { + ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING; + } else if (opencl_ilu_reorder == "graph_coloring") { + ilu_reorder = bda::ILUReorder::GRAPH_COLORING; + } else { + OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'"); + } + backend.reset(new bda::FpgaSolverBackend(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder)); +#else + OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake"); +#endif + } else if (accelerator_mode.compare("none") == 0) { use_gpu = false; + use_fpga = false; } else { - OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]"); + OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]"); } } @@ -161,7 +179,7 @@ template void BdaBridge::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED) { - if (use_gpu) { + if (use_gpu || use_fpga) { BdaResult result; result.converged = false; static std::vector h_rows; @@ -225,7 +243,7 @@ void BdaBridge::solve_system(BridgeMatri res.converged = result.converged; res.conv_rate = result.conv_rate; res.elapsed = result.elapsed; - }else{ + } else { res.converged = false; } } @@ -233,14 +251,14 @@ void BdaBridge::solve_system(BridgeMatri template void BdaBridge::get_result(BridgeVector &x OPM_UNUSED) { - if (use_gpu) { + if (use_gpu || use_fpga) { backend->get_result(static_cast(&(x[0][0]))); } } template void BdaBridge::initWellContributions(WellContributions& wellContribs) { - if(gpu_mode.compare("opencl") == 0){ + if(accelerator_mode.compare("opencl") == 0){ #if HAVE_OPENCL const auto openclBackend = static_cast*>(backend.get()); wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get()); @@ -250,12 +268,12 @@ void BdaBridge::initWellContributions(We } } - -#define INSTANTIATE_BDA_FUNCTIONS(n) \ +#define INSTANTIATE_BDA_FUNCTIONS(n) \ template BdaBridge, std::allocator > >, \ Dune::BlockVector, std::allocator > >, \ n>::BdaBridge \ -(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \ +(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, \ +unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); \ \ template void BdaBridge, std::allocator > >, \ Dune::BlockVector, std::allocator > >, \ diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index a8d3bcc1a..5954de4c8 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -30,6 +30,10 @@ #include +#if HAVE_FPGA +#include +#endif + namespace Opm { @@ -42,19 +46,22 @@ class BdaBridge { private: bool use_gpu = false; - std::string gpu_mode; + bool use_fpga = false; + std::string accelerator_mode; std::unique_ptr > backend; public: /// Construct a BdaBridge - /// \param[in] gpu_mode to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]' + /// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]' + /// \param[in] fpga_bitstream FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[]' /// \param[in] linear_solver_verbosity verbosity of BdaSolver /// \param[in] maxit maximum number of iterations for BdaSolver /// \param[in] tolerance required relative tolerance for BdaSolver /// \param[in] platformID the OpenCL platform ID to be used /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors - /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation - BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); + /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation + BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder); + /// Solve linear system, A*x = b /// \warning Values of A might get overwritten! @@ -79,6 +86,11 @@ public: /// \param[in] wellContribs container to hold all WellContributions void initWellContributions(WellContributions& wellContribs); + /// Return whether the BdaBridge will use the FPGA or not + /// return whether the BdaBridge will use the FPGA or not + bool getUseFpga(){ + return use_fpga; + } }; // end class BdaBridge diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index cef79b5e1..2548693d9 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -55,6 +55,8 @@ namespace bda int maxit = 200; double tolerance = 1e-2; + std::string bitstream = ""; + int N; // number of rows int Nb; // number of blocked rows (Nb*block_size == N) int nnz; // number of nonzeroes (scalars) @@ -66,7 +68,8 @@ namespace bda bool initialized = false; public: - /// Construct a BdaSolver, can be cusparseSolver or openclSolver + /// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver + /// \param[in] fpga_bitstream FPGA bitstream file name (only for fpgaSolver) /// \param[in] linear_solver_verbosity verbosity of solver /// \param[in] maxit maximum number of iterations for solver /// \param[in] tolerance required relative tolerance for solver @@ -74,6 +77,7 @@ namespace bda /// \param[in] deviceID the device to be used BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {}; + BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {}; /// Define virtual destructor, so that the derivedclass destructor will be called virtual ~BdaSolver() {}; diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp index 050a17a95..811d1b484 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.cpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp @@ -20,13 +20,19 @@ #include #include -#include +#include -using bda::BlockedMatrix; +#include +#include + +#include +#include namespace bda { +using Opm::OpmLog; + /*Sort a row of matrix elements from a blocked CSR-format.*/ template @@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) { } } -// subtract c from b and store in a -// a = b - c +#if HAVE_FPGA + +/*Subtract two blocks from one another element by element*/ template -void blockSub(double *a, double *b, double *c) -{ +void blockSub(double *mat1, double *mat2, double *resMat) { for (unsigned int row = 0; row < block_size; row++) { for (unsigned int col = 0; col < block_size; col++) { - a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col]; + resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col]; } } } -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template void sortBlockedRow(int *, double *, int, int); \ -template void blockMultSub(double *, double *, double *); \ -template void blockMult(double *, double *, double *); \ -template void blockSub(double *, double *, double *); \ +/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/ +template +void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) { + for (unsigned int row = 0; row < block_size; row++) { + if (resetRes) { + resVect[row] = 0.0; + } + for (unsigned int col = 0; col < block_size; col++) { + resVect[row] += scale * mat[row * block_size + col] * vect[col]; + } + } +} + + + +template +int BlockedMatrix::countUnblockedNnzs() { + int numNnzsOverThreshold = 0; + int totalNnzs = rowPointers[Nb]; + for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) { + if (fabs(nnzValues[idx]) > nnzThreshold) { + numNnzsOverThreshold++; + } + } + return numNnzsOverThreshold; +} + +/* + * Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks. + * If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order. +*/ +template +void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) { + const unsigned int bs = block_size; + int valIndex = 0, nnzsPerRow; + + mat->rowPointers[0] = 0; + // go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and + // write all non-zero values and corresponding column indices that belong to the same row into the new matrix. + for (int row = 0; row < Nb; row++) { + for (unsigned int bRow = 0; bRow < bs; bRow++) { + nnzsPerRow = 0; + for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) { + for (unsigned int bCol = 0; bCol < bs; bCol++) { + int idx = 0; + // If the matrix is the U matrix, store the rows inside a block in reverse order. + if (isUMatrix) { + idx = col * bs * bs + (bs - bRow - 1) * bs + bCol; + } else { + idx = col * bs * bs + bRow * bs + bCol; + } + + if (fabs(nnzValues[idx]) > nnzThreshold) { + mat->nnzValues[valIndex] = nnzValues[idx]; + mat->colIndices[valIndex] = colIndices[col] * bs + bCol; + valIndex++; + nnzsPerRow++; + } + } + } + // Update the rowpointers of the new matrix + mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow; + } + } +} + + + +/*Optimized version*/ +// ub* prefixes indicate unblocked data +template +int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix, + std::vector >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes, + std::vector >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize) +{ + int res; + int numUnblockedNnzs = countUnblockedNnzs(); + + // initialize the non-blocked matrix with the obtained size. + std::unique_ptr ubMat = std::make_unique(Nb * block_size, numUnblockedNnzs); + + unblock(ubMat.get(), isUMatrix); + + std::vector ubNodesPerColor(numColors); + for (int i = 0; i < numColors; i++) { + ubNodesPerColor[i] = nodesPerColor[i] * block_size; + } + + *valSize = ubMat->nnzs; + + res = ubMat->toRDF(numColors, ubNodesPerColor, + colIndicesInColor, nnzsPerRowLimit, + ubNnzValues, ubColIndices, nnzValsSizes, + NROffsets, colorSizes); + return res; +} + + +// coloring is already done, numColors and nodesPerColor are set +// [rows|columns]PerColorLimit are already queried from the FPGA +// colIndicesInColor, PIndicesAddr and colorSizes are written here +// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU +// node == row +// color == partition +// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs +// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF(). +// due to partitioning, lots of columns are removed, this matrix keeps track of the mapping +// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked +// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0 +// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA +template +int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor, + int rowsPerColorLimit, int columnsPerColorLimit, + std::vector >& colIndicesInColor, int *PIndicesAddr, int *colorSizes, + std::vector >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes, + std::vector >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes) +{ + // Data related to column indices per partition + int doneRows = 0; + std::vector isColAccessed(Nb); // std::vector might have some different optimized implementation, initialize in a loop + std::vector isLColAccessed(Nb); + int totalCols = 0; // sum of numColAccesses for each color, blocked + int LTotalCols = 0, UTotalCols = 0; + int maxCols = 0; // max value of numColAccesses for any color + int maxRowsPerColor = 0; // max value of numRows for any color + int maxColsPerRow = 0; // max value of colsPerRow for any color + // colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates + // colsInColor[c][i] contains the ith column that color c accesses + // initial size allows for each color to access all columns, with space for padding + std::vector > colsInColor(numColors, std::vector(roundUpTo(Nb, 16))); + std::vector > LColsInColor(numColors, std::vector(roundUpTo(Nb, 16))); + std::vector > UColsInColor(numColors, std::vector(roundUpTo(Nb, 16))); + + // find which columns are accessed in each color, as well as how many non-zeroes there are per color. + for (int c = 0; c < numColors; c++) { + int numRows = 0; + // initialize + for (int row = 0; row < Nb; row++) { + isColAccessed[row] = false; + isLColAccessed[row] = false; + } + if (c > 0) { + for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) { + isLColAccessed[i] = true; + } + } + int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0; // number of unique accesses, blocked + // for every row in this color + for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) { + int colsPerRow = 0; // number of blocks for this row + bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]); + for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) { + // for every column in the current row, check if that column was accessed before this color + int col = colIndices[idx]; + if (isColAccessed[col] == false) { + colsInColor[c][numColAccesses] = col; + isColAccessed[col] = true; + numColAccesses++; + if (col > row) { + UColsInColor[numColors - c - 1][UNumColAccesses] = col; + UNumColAccesses++; + } + } + if (isLColAccessed[col] == false) { + if (col < row) { + LColsInColor[c][LNumColAccesses] = col; + LNumColAccesses++; + isLColAccessed[col] = true; + } + } + colsPerRow++; + } + if (rowIsEmpty != true) { + numRows++; + } + maxColsPerRow = std::max(maxColsPerRow, colsPerRow); + } + + // add columns from previous color into L partition to simplify data forwarding + if (c > 0) { + for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) { + LColsInColor[c][LNumColAccesses] = i; + LNumColAccesses++; + } + } + + colorSizes[c * 4 + 10] = numColAccesses * block_size; + LColorSizes[c * 4 + 10] = LNumColAccesses * block_size; + UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size; + + // store mapping + for (int col = 0; col < numColAccesses; col++) { + for (unsigned int i = 0; i < block_size; i++) { + colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i; + } + } + for (int col = 0; col < LNumColAccesses; col++) { + for (unsigned int i = 0; i < block_size; i++) { + LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i; + } + } + for (int col = 0; col < UNumColAccesses; col++) { + for (unsigned int i = 0; i < block_size; i++) { + UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i; + } + } + + // zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline + while (numColAccesses % 16 != 0) { + colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1]; + numColAccesses++; + } + while (LNumColAccesses % 16 != 0) { + LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1]; + LNumColAccesses++; + } + while (UNumColAccesses % 16 != 0) { + UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1]; + UNumColAccesses++; + } + maxCols = std::max(numColAccesses, maxCols); + totalCols += numColAccesses; + LTotalCols += LNumColAccesses; + UTotalCols += UNumColAccesses; + doneRows = doneRows + nodesPerColor[c]; + maxRowsPerColor = std::max(numRows, maxRowsPerColor); + } + + if (maxCols * static_cast(block_size) > columnsPerColorLimit) { + std::ostringstream errorstring; + errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit; + OPM_THROW(std::logic_error, errorstring.str()); + } + + doneRows = 0; + int diagValsSize = 0; + int maxRows = 0; + + for (int c = 0; c < numColors; c++) { + // calculate sizes that include zeropadding + diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8); + doneRows += nodesPerColor[c]; + if (nodesPerColor[c] * static_cast(block_size) > maxRows) + maxRows = nodesPerColor[c]; + colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size; + LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size; + UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size; + } + + if (maxRows * static_cast(block_size) > rowsPerColorLimit) { + std::ostringstream errorstring; + errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit; + OPM_THROW(std::logic_error, errorstring.str()); + } + + // create and fill sizes array as far as already possible + colorSizes[0] = Nb * block_size; + LColorSizes[0] = Nb * block_size; + UColorSizes[0] = Nb * block_size; + // col_sizes (but the matrix is square) + colorSizes[1] = Nb * block_size; + LColorSizes[1] = Nb * block_size; + UColorSizes[1] = Nb * block_size; + colorSizes[2] = totalCols * block_size; + LColorSizes[2] = LTotalCols * block_size; + UColorSizes[2] = UTotalCols * block_size; + // missing val_size, written in Matrix::toRDF() + colorSizes[4] = numColors; + LColorSizes[4] = numColors; + UColorSizes[4] = numColors; + // missing NRFlagsSize, written in Matrix::toRDF() + colorSizes[6] = diagValsSize; + LColorSizes[6] = diagValsSize; + UColorSizes[6] = diagValsSize; + + int paddingIdx = numColors; + while (paddingIdx % 4 != 0) { + for (unsigned int i = 0; i < 4; i++) { + colorSizes[paddingIdx * 4 + 8 + i] = 0; + LColorSizes[paddingIdx * 4 + 8 + i] = 0; + UColorSizes[paddingIdx * 4 + 8 + i] = 0; + } + paddingIdx++; + } + + int index = 0, Lindex = 0, Uindex = 0; + for (int c = 0; c < numColors; c++) { + // for each unique col access + for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { + for (unsigned int i = 0; i < block_size; i++) { + PIndicesAddr[index] = colsInColor[c][col] * block_size + i; + index++; + } + } + // add padding + while (index % 16 != 0) { + PIndicesAddr[index] = PIndicesAddr[index - 1]; + index++; + } + for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { + for (unsigned int i = 0; i < block_size; i++) { + LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i; + Lindex++; + } + } + while (Lindex % 16 != 0) { + LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1]; + Lindex++; + } + for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { + for (unsigned int i = 0; i < block_size; i++) { + UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i; + Uindex++; + } + } + while (Uindex % 16 != 0) { + UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1]; + Uindex++; + } + } + return 0; +} + + +void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector& rowsPerColor, double *RDFDiag) { + const unsigned int block_size = 3; + int doneRows = rowSize - 1; // since the rows of U are reversed, the rows of the diag are also reversed + int RDFIndex = 0; + for (int c = 0; c < numColors; c++) { + for (int r = 0; r < rowsPerColor[c]; r++) { + + // the rows in the block are reversed + for (int i = static_cast(block_size) - 1; i >= 0; i--) { + for (unsigned int j = 0; j < block_size; j++) { + RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j]; + RDFIndex++; + } + // add 4th column, zeropadding + RDFDiag[RDFIndex] = 0.0; + RDFIndex++; + } + } + doneRows -= rowsPerColor[c]; + + // make sure the color completely fills a cacheline + // a color with 3 blocks would otherwise leave space + while (RDFIndex % 8 != 0) { + RDFDiag[RDFIndex] = 0.0; + RDFIndex++; + } + } + assert(RDFIndex % 8 == 0); +} + +#endif // HAVE_FPGA + + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template void sortBlockedRow(int *, double *, int, int); \ +template void blockMultSub(double *, double *, double *); \ +template void blockMult(double *, double *, double *); \ INSTANTIATE_BDA_FUNCTIONS(1); INSTANTIATE_BDA_FUNCTIONS(2); @@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4); #undef INSTANTIATE_BDA_FUNCTIONS +#if HAVE_FPGA +#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n) \ +template void blockSub(double *, double *, double *); \ +template void blockVectMult(double *, double *, double, double *, bool); \ +template int BlockedMatrix::toRDF(int, int *, bool, \ + std::vector >& , int, int *, \ + std::vector >&, short int *, unsigned char *, int *, int *); \ +template int BlockedMatrix::findPartitionColumns(int, int *, \ + int, int, \ + std::vector >& , int *, int *, \ + std::vector >& , int *, int *, \ + std::vector >& , int *, int *); + +INSTANTIATE_BDA_FPGA_FUNCTIONS(1); +INSTANTIATE_BDA_FPGA_FUNCTIONS(2); +INSTANTIATE_BDA_FPGA_FUNCTIONS(3); +INSTANTIATE_BDA_FPGA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FPGA_FUNCTIONS +#endif // HAVE_FPGA + + } // end namespace bda diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp index 80488a8dc..1d45a11b6 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.hpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp @@ -20,29 +20,40 @@ #ifndef BLOCKED_MATRIX_HPP #define BLOCKED_MATRIX_HPP +#if HAVE_FPGA +#include +#endif + +#include + namespace bda { /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix. /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer. -template -struct BlockedMatrix{ +template +class BlockedMatrix +{ + +public: + /// Allocate BlockedMatrix and data arrays with given sizes /// \param[in] Nb number of blockrows /// \param[in] nnzbs number of nonzero blocks BlockedMatrix(int Nb_, int nnzbs_) - : nnzValues(new double[nnzbs_*BS*BS]), - colIndices(new int[nnzbs_*BS*BS]), + : nnzValues(new double[nnzbs_*block_size*block_size]), + colIndices(new int[nnzbs_*block_size*block_size]), rowPointers(new int[Nb_+1]), Nb(Nb_), nnzbs(nnzbs_), deleteNnzs(true), deleteSparsity(true) {} + /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory /// \param[in] M matrix to be copied BlockedMatrix(const BlockedMatrix& M) - : nnzValues(new double[M.nnzbs*BS*BS]), + : nnzValues(new double[M.nnzbs*block_size*block_size]), colIndices(M.colIndices), rowPointers(M.rowPointers), Nb(M.Nb), @@ -50,10 +61,11 @@ struct BlockedMatrix{ deleteNnzs(true), deleteSparsity(false) {} + /// Allocate BlockedMatrix, but let data arrays point to existing arrays /// \param[in] Nb number of blockrows /// \param[in] nnzbs number of nonzero blocks - /// \param[in] nnzValues array of nonzero values, contains nnzb*BS*BS scalars + /// \param[in] nnzValues array of nonzero values, contains nnzb*block_size*block_size scalars /// \param[in] colIndices array of column indices, contains nnzb entries /// \param[in] rowPointers array of row pointers, contains Nb+1 entries BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_) @@ -76,6 +88,28 @@ struct BlockedMatrix{ } } +#if HAVE_FPGA + constexpr static double nnzThreshold = 1e-80; // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero + + int countUnblockedNnzs(); + + void unblock(Matrix *mat, bool isUMatrix); + + /// Converts this matrix to the dataformat used by the FPGA + /// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking + int toRDF(int numColors, int *nodesPerColor, bool isUMatrix, + std::vector >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes, + std::vector >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize); + + /// Analyses the sparsity pattern and prepares for toRDF() + /// Is only called once + int findPartitionColumns(int numColors, int *nodesPerColor, + int rowsPerColorLimit, int columnsPerColorLimit, + std::vector >& colIndicesInColor, int *PIndicesAddr, int *colorSizes, + std::vector >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes, + std::vector >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes); +#endif + double *nnzValues; int *colIndices; int *rowPointers; @@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c); template void blockMult(double *mat1, double *mat2, double *resMat); -/// Subtract blocks -/// a = b - c -/// \param[out] a result block -/// \param[in] b input block -/// \param[in] c input block + +#if HAVE_FPGA template -void blockSub(double *a, double *b, double *c); +void blockSub(double *mat1, double *mat2, double *resMat); + +template +void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes); + +/// Convert a blocked inverse diagonal to the FPGA format. +/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U. +/// Since the rows of U are reversed, the rows of the diag are also reversed. +/// The cachelines can hold 8 doubles, a block has 9 doubles. +/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows. +/// Then 2 blocks (24 doubles) fit on 3 cachelines. +/// Example: +/// [1 2 3] [1 2 3 0] [1 2 3 0 4 5 6 0] +/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1] +/// [7 8 9] [7 8 9 0] [block2 row2 block2 row3] +void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector& rowsPerColor, double *RDFDiag); +#endif } // end namespace bda diff --git a/opm/simulators/linalg/bda/FPGABILU0.cpp b/opm/simulators/linalg/bda/FPGABILU0.cpp new file mode 100644 index 000000000..6246140e4 --- /dev/null +++ b/opm/simulators/linalg/bda/FPGABILU0.cpp @@ -0,0 +1,413 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace bda +{ + +using Opm::OpmLog; +using Dune::Timer; + +template +FPGABILU0::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) : + verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_) +{ + if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { + level_scheduling = true; + } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { + graph_coloring = true; + } else { + OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); + } +} + + +template +FPGABILU0::~FPGABILU0() +{ + delete[] invDiagVals; +} + + +template +bool FPGABILU0::init(BlockedMatrix *mat) +{ + const unsigned int bs = block_size; + + resultPointers.resize(numResultPointers, nullptr); + resultSizes.resize(numResultSizes); + + // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added. + const unsigned int nnzSplit = 1; + + this->N = mat->Nb * block_size; + this->Nb = mat->Nb; + this->nnz = mat->nnzbs * block_size * block_size; + this->nnzbs = mat->nnzbs; + + toOrder.resize(Nb); + fromOrder.resize(Nb); + + std::vector CSCRowIndices(nnzbs); + std::vector CSCColPointers(Nb + 1); + + if (level_scheduling) { + Timer t_convert; + csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb); + if (verbosity >= 3) { + std::ostringstream out; + out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s"; + OpmLog::info(out.str()); + } + } + + Timer t_analysis; + rMat = std::make_shared >(mat->Nb, mat->nnzbs); + LUMat = std::make_unique >(*rMat); + std::ostringstream out; + if (level_scheduling) { + out << "FPGABILU0 reordering strategy: " << "level_scheduling\n"; + findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + } else if (graph_coloring) { + out << "FPGABILU0 reordering strategy: " << "graph_coloring\n"; + findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); + } + + if (numColors > maxNumColors) { + std::ostringstream errorstring; + errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n"; + OPM_THROW(std::logic_error, errorstring.str()); + } + + if (verbosity >= 3) { + out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors"; + } + OpmLog::info(out.str()); + + int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0; + int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0; + int blockDiagSize = 0; + // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes + reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get()); + int doneRows = 0; + for (int c = 0; c < numColors; c++) { + for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) { + for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) { + int columnIndex = rMat->colIndices[j]; + if (columnIndex < i) { + LColorRoundedValSize += 9; + LNROffsetSize += 9; + } + if (columnIndex > i) { + UColorRoundedValSize += 9; + UNROffsetSize += 9; + } + colorRoundedValSize += 9; + NROffsetSize += 9; + } + blockDiagSize += 12; + } + // End of color: round all sizes to nearest cacheline + colorRoundedValSize = roundUpTo(colorRoundedValSize, 32); + LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32); + UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32); + NROffsetSize = roundUpTo(NROffsetSize, 64); + LNROffsetSize = roundUpTo(LNROffsetSize, 64); + UNROffsetSize = roundUpTo(UNROffsetSize, 64); + blockDiagSize = roundUpTo(blockDiagSize, 8); + + doneRows += rowsPerColor[c]; + } + int colorSizesNum = 8 + roundUpTo(4 * numColors, 16); + int worstCaseColumnAccessNum = numColors * maxColsPerColor; + + nnzValues.resize(nnzSplit, std::vector(colorRoundedValSize)); + LnnzValues.resize(nnzSplit, std::vector(LColorRoundedValSize)); + UnnzValues.resize(nnzSplit, std::vector(UColorRoundedValSize)); + // initial number of nnz, used to allocate + nnzValsSizes.resize(nnzSplit, colorRoundedValSize); + LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize); + UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize); + colIndices.resize(colorRoundedValSize); + LColIndices.resize(LColorRoundedValSize); + UColIndices.resize(UColorRoundedValSize); + NROffsets.resize(NROffsetSize); + LNROffsets.resize(LNROffsetSize); + UNROffsets.resize(UNROffsetSize); + PIndicesAddr.resize(worstCaseColumnAccessNum); + LPIndicesAddr.resize(worstCaseColumnAccessNum); + UPIndicesAddr.resize(worstCaseColumnAccessNum); + colorSizes.resize(colorSizesNum); + LColorSizes.resize(colorSizesNum); + UColorSizes.resize(colorSizesNum); + blockDiag.resize(blockDiagSize); + colIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); + LColIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); + UColIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); + + int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(), + maxRowsPerColor, maxColsPerColor, + colIndicesInColor, PIndicesAddr.data(), colorSizes.data(), + LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(), + UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data()); + if (err != 0) { + std::ostringstream errorstring; + errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n"; + OPM_THROW(std::logic_error, errorstring.str()); + } + + diagIndex.resize(mat->Nb, 0); + invDiagVals = new double[mat->Nb * bs * bs]; + LMat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); + UMat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); + resultPointers[0] = (void *) colorSizes.data(); + resultPointers[1] = (void *) PIndicesAddr.data(); + resultPointers[2] = (void *) nnzValues.data(); + resultPointers[3] = (void *) colIndices.data(); + resultPointers[4] = (void *) NROffsets.data(); + resultPointers[5] = (void *) nnzValsSizes.data(); + resultPointers[6] = (void *) LColorSizes.data(); + resultPointers[7] = (void *) LPIndicesAddr.data(); + resultPointers[8] = (void *) LnnzValues.data(); + resultPointers[9] = (void *) LColIndices.data(); + resultPointers[10] = (void *) LNROffsets.data(); + resultPointers[11] = (void *) LnnzValsSizes.data(); + resultPointers[12] = (void *) UColorSizes.data(); + resultPointers[13] = (void *) UPIndicesAddr.data(); + resultPointers[14] = (void *) UnnzValues.data(); + resultPointers[15] = (void *) UColIndices.data(); + resultPointers[16] = (void *) UNROffsets.data(); + resultPointers[17] = (void *) UnnzValsSizes.data(); + resultPointers[18] = (void *) blockDiag.data(); + //resultPointers[19] and [20] are set by the caller + resultSizes[0] = mat->Nb * block_size; + resultSizes[1] = colorRoundedValSize; // zeropadded valSize; + resultSizes[2] = numColors; + resultSizes[3] = worstCaseColumnAccessNum; //totalCols + resultSizes[4] = NROffsetSize; //NRFlagSize + resultSizes[5] = blockDiagSize; //diagValsSize + resultSizes[6] = mat->Nb * block_size; + resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize; + resultSizes[8] = numColors; + resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols + resultSizes[10] = LNROffsetSize; //LNRFlagSize + resultSizes[11] = blockDiagSize; //LDiagValsSize + resultSizes[12] = mat->Nb * block_size; + resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize; + resultSizes[14] = numColors; + resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols + resultSizes[16] = UNROffsetSize; //UNRFlagSize + resultSizes[17] = blockDiagSize; //UDiagValsSize + return true; +} // end init() + + +template +bool FPGABILU0::create_preconditioner(BlockedMatrix *mat) +{ + const unsigned int bs = block_size; + Timer t_reorder; + reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get()); + + if (verbosity >= 3) { + std::ostringstream out; + out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s"; + OpmLog::info(out.str()); + } + + // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp + Timer t_memcpy; + memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs); + + if (verbosity >= 3) { + std::ostringstream out; + out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s"; + OpmLog::info(out.str()); + } + + int i, j, ij, ik, jk; + int iRowStart, iRowEnd, jRowEnd; + double pivot[bs * bs]; + int LSize = 0; + Opm::Detail::Inverter inverter; // reuse inverter to invert blocks + + Timer t_decomposition; + + // go through all rows + for (i = 0; i < LUMat->Nb; i++) { + iRowStart = LUMat->rowPointers[i]; + iRowEnd = LUMat->rowPointers[i + 1]; + + // go through all elements of the row + for (ij = iRowStart; ij < iRowEnd; ij++) { + j = LUMat->colIndices[ij]; + // if the element is the diagonal, store the index and go to next row + if (j == i) { + diagIndex[i] = ij; + break; + } + // if an element beyond the diagonal is reach, no diagonal was found + // throw an error now. TODO: perform reordering earlier to prevent this + if (j > i) { + std::ostringstream out; + out << "BILU0 Error could not find diagonal value in row: " << i; + OpmLog::error(out.str()); + return false; + } + + LSize++; + // calculate the pivot of this row + blockMult(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]); + + memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs); + + jRowEnd = LUMat->rowPointers[j + 1]; + jk = diagIndex[j] + 1; + ik = ij + 1; + // substract that row scaled by the pivot from this row. + while (ik < iRowEnd && jk < jRowEnd) { + if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) { + blockMultSub(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs); + ik++; + jk++; + } else { + if (LUMat->colIndices[ik] < LUMat->colIndices[jk]) + { ik++; } + else + { jk++; } + } + } + } + // store the inverse in the diagonal! + inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); + memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs); + } + + LMat->rowPointers[0] = 0; + UMat->rowPointers[0] = 0; + + // Split the LU matrix into two by comparing column indices to diagonal indices + for (i = 0; i < LUMat->Nb; i++) { + LMat->rowPointers[i + 1] = LMat->rowPointers[i]; + for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) { + if (j < diagIndex[i]) { + memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs); + LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j]; + LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1; + } + } + } + // Reverse the order or the (blocked) rows for the U matrix, + // because the rows are accessed in reverse order when applying the ILU0 + int URowIndex = 0; + for (i = LUMat->Nb - 1; i >= 0; i--) { + UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex]; + for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) { + if (j > diagIndex[i]) { + memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs); + UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j]; + UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1; + } + } + URowIndex++; + } + + if (verbosity >= 3) { + std::ostringstream out; + out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s"; + OpmLog::info(out.str()); + } + + std::vector URowsPerColor(numColors); + rowSize = block_size * rMat->Nb; + LRowSize = block_size * LMat->Nb; + URowSize = block_size * UMat->Nb; + LNumColors = numColors; + UNumColors = numColors; + for (int c = 0; c < numColors; c++) { + URowsPerColor[numColors - c - 1] = rowsPerColor[c]; + } + int err; + err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false, + colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(), + nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize); + if (err != 0) { + return false; + } + err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false, + LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(), + LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize); + if (err != 0) { + return false; + } + err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true, + UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(), + UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize); + if (err != 0) { + return false; + } + blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data()); + // resultPointers are set in the init method + resultSizes[0] = rowSize; + resultSizes[1] = colorSizes[3]; // zeropadded valSize; + resultSizes[2] = numColors; + resultSizes[3] = colorSizes[2]; //totalCols + resultSizes[4] = colorSizes[5]; //NRFlagSize + resultSizes[5] = colorSizes[6]; //diagValsSize + resultSizes[6] = LRowSize; + resultSizes[7] = LColorSizes[3]; // zeropadded LValSize; + resultSizes[8] = LNumColors; + resultSizes[9] = LColorSizes[2]; //LTotalCols + resultSizes[10] = LColorSizes[5]; //LNRFlagSize + resultSizes[11] = LColorSizes[6]; //LDiagValsSize + resultSizes[12] = URowSize; + resultSizes[13] = UColorSizes[3]; // zeropadded UValSize; + resultSizes[14] = UNumColors; + resultSizes[15] = UColorSizes[2]; //UTotalCols + resultSizes[16] = UColorSizes[5]; //UNRFlagSize + resultSizes[17] = UColorSizes[6]; //UDiagValsSize + return true; +} // end create_preconditioner() + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template FPGABILU0::FPGABILU0(ILUReorder, int, int, int, int, int); \ +template FPGABILU0::~FPGABILU0(); \ +template bool FPGABILU0::init(BlockedMatrix *); \ +template bool FPGABILU0::create_preconditioner(BlockedMatrix *); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} //namespace bda diff --git a/opm/simulators/linalg/bda/FPGABILU0.hpp b/opm/simulators/linalg/bda/FPGABILU0.hpp new file mode 100644 index 000000000..2c3e6c61e --- /dev/null +++ b/opm/simulators/linalg/bda/FPGABILU0.hpp @@ -0,0 +1,117 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef FPGA_BILU0_HEADER_INCLUDED +#define FPGA_BILU0_HEADER_INCLUDED + +#include + +#include +#include + +namespace bda +{ + +/* + * This class implements a Blocked ILU0 preconditioner, with output data + * specifically formatted for the FPGA. + * The decomposition and reorders of the rows of the matrix are done on CPU. + */ + +template +class FPGABILU0 +{ + +private: + int N; // number of rows of the matrix + int Nb; // number of blockrows of the matrix + int nnz; // number of nonzeroes of the matrix (scalar) + int nnzbs; // number of blocks of the matrix + std::unique_ptr > LMat = nullptr, UMat = nullptr, LUMat = nullptr; + std::shared_ptr > rMat = nullptr; // reordered mat + double *invDiagVals = nullptr; + std::vector diagIndex; + std::vector toOrder, fromOrder; + std::vector rowsPerColor; + int numColors; + int verbosity; + + // sizes and arrays used during RDF generation + std::vector > nnzValues, LnnzValues, UnnzValues; + std::vector colIndices, LColIndices, UColIndices; + std::vector NROffsets, LNROffsets, UNROffsets; + std::vector PIndicesAddr, LPIndicesAddr, UPIndicesAddr; + std::vector colorSizes, LColorSizes, UColorSizes; + std::vector nnzValsSizes, LnnzValsSizes, UnnzValsSizes; + std::vector > colIndicesInColor, LColIndicesInColor, UColIndicesInColor; + + int rowSize, valSize; + int LRowSize, LValSize, LNumColors; + int URowSize, UValSize, UNumColors; + std::vector blockDiag; + ILUReorder opencl_ilu_reorder; + bool level_scheduling = false, graph_coloring = false; + int numResultPointers = 21; + std::vector resultPointers; + int numResultSizes = 18; + std::vector resultSizes; + int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor + +public: + + FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors); + + ~FPGABILU0(); + + // analysis (optional) + bool init(BlockedMatrix *mat); + + // ilu_decomposition + bool create_preconditioner(BlockedMatrix *mat); + + int* getToOrder() + { + return toOrder.data(); + } + + int* getFromOrder() + { + return fromOrder.data(); + } + + BlockedMatrix* getRMat() + { + return rMat.get(); + } + + void **getResultPointers() + { + return resultPointers.data(); + } + + int *getResultSizes() + { + return resultSizes.data(); + } + +}; + +} //namespace bda + +#endif // FPGA_BILU0_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/FPGAMatrix.cpp b/opm/simulators/linalg/bda/FPGAMatrix.cpp new file mode 100644 index 000000000..e1817d93d --- /dev/null +++ b/opm/simulators/linalg/bda/FPGAMatrix.cpp @@ -0,0 +1,249 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include + +#include +#include + +namespace bda +{ + +/*Sort a row of matrix elements from a CSR-format.*/ +void sortRow(int *colIndices, double *data, int left, int right) { + int l = left; + int r = right; + int middle = colIndices[(l + r) >> 1]; + do { + while (colIndices[l] < middle) + l++; + while (colIndices[r] > middle) + r--; + if (l <= r) { + int lColIndex = colIndices[l]; + colIndices[l] = colIndices[r]; + colIndices[r] = lColIndex; + double lDatum = data[l]; + data[l] = data[r]; + data[r] = lDatum; + + l++; + r--; + } + } while (l < r); + if (left < r) + sortRow(colIndices, data, left, r); + if (right > l) + sortRow(colIndices, data, l, right); + +} + +/* + * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows: + * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values + * and colors, and the size, in elements, of the NROffsets array, followed by: + * the number of rows (rounded to the nearest 32), the number of rows (not rounded), + * the number of columns (not rounded) and the number of non-zero values + * (rounded to the nearest 32) for every partition. + * This array is zero padded up to the nearest 64-byte cacheline. + * - The "colIndicesInColor" array, which contains for every partition, from which elements + * in the global X vector the elements of that X vector partition came. + * For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then + * that X vector partition will only have three elements, and the color_col_indices array + * will contain 1, 3 and 6 for that partition. + * This array is zero padded up to the nearest 64-byte cacheline for every partition. + * - The "nnzValues" array contains all non-zero values of each partition of the matrix. + * This array is zero-padded so that each color has a multiple of 32 elements (to have the + * same number of elements per partition as the column indices array). + * - The "colIndices" array contains all column indices of each partition of the matrix. + * These column indices are the local indices for that partition, so to be used, first a + * local X vector partition needs to be loaded into some local memory (this is done using + * data from the _color_col_indices array), before these column indices can be used as + * addresses to that local memory to read the desired X vector values. + * This array is zero-padded so that data for every partition fills up a number of complete + * cachelines (this means every color has a multiple of 32 elements). + * - "NROffsets" is the name of the array that contains the new row offsets for + * all elements of every partition of the matrix. New row offsets are 8-bit values which + * are 0 if that element is not the first element in a row, or which, if that element is + * the first element of a row) is equal to the amount of empty rows between that new row + * and the row before it plus 1. This array is zero-padded so that data for every partition + * fills up a number of complete cachelines (this means every color has a multiple of 64 elements). + */ +int Matrix::toRDF(int numColors, std::vector& nodesPerColor, + std::vector >& colIndicesInColor, int nnzsThisRowLimit, + std::vector >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes) +{ + auto mat = this; + + int doneRows = 0; + int totalRowNum = 0; // total number of non-empty rows + int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color + int maxNNZsPerColor = 0; // max of nnzsPerColor + + int totalValSize = 0; // sum of nnzsPerColor, padded + + std::vector nnzRowsPerColor(numColors); + + // find number of nnzs per color and number of non-empty rows + for (int c = 0; c < numColors; c++) { + int numRows = 0; + nnzRowsPerColor[c] = 0; + int firstNnzOfColor = mat->rowPointers[doneRows]; + int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]]; + nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices + totalValSize += nnzsPerColor; + maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor); + int row = doneRows; + for (; row < doneRows + nodesPerColor[c]; row++) { + if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) { + numRows++; + nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1; + } + } + + doneRows = row; + totalRowNum += numRows; + } + + int conseqZeroRows = 0; // number of consecutive empty rows + int maxConseqZeroRows = 0; + int numEmptyRows = 0; // total number of empty rows + std::vector rowOffsets(totalRowNum); + std::vector nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows + std::vector colorValPointers(numColors + 1); // points to first nnz of first row of each color + std::vector colorValZeroPointers(numColors); // points to first padded zero for each color + + int nonEmptyRowIdx = 0; // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen + doneRows = 0; + + int totalPaddingSize = 0; // number of padded zeros from previous colors + int NROffsetSize = 0; // number of NROffsets entries, padded to multiple of 64 for each color + int maxRows = 0; + int maxNNZsPerRow = 0; + + // determine the row offset of each row (amount of zero rows between it and the previous non-zero row) + // this is later converted to rowOffset for each nnz + for (int c = 0; c < numColors; c++) { + conseqZeroRows = 0; + for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) { + int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row]; + if (nnzsThisRow == 0) { + conseqZeroRows++; + numEmptyRows++; + } else { + maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow); + nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1]; + rowOffsets[nonEmptyRowIdx] = conseqZeroRows; + maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows); + conseqZeroRows = 0; + nonEmptyRowIdx++; + } + } + // calculate sizes that include zeropadding + colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize; + colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32); + totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c]; + NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64); + + doneRows += nodesPerColor[c]; + maxRows = std::max(nodesPerColor[c], maxRows); + } + + if (maxNNZsPerRow > nnzsThisRowLimit) { + std::ostringstream errorstring; + errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit; + OPM_THROW(std::logic_error, errorstring.str()); + } + + // create and fill RDF arrays + colorSizes[3] = colorValPointers[numColors]; // total number of nnzs the FPGA has to process, including zeropadding + colorSizes[5] = NROffsetSize; + + for (int c = 0; c < numColors; c++) { + colorSizes[c * 4 + 8] = nnzRowsPerColor[c]; + colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c]; + } + + int rowIndex = 0; // keep track of where to read/write + int valIndex = 0; + int NRIndex = 0; + int halfwayPoint = colorValPointers[numColors] / 2; + nnzValsSizes[0] = colorValPointers[numColors]; + + colorSizes[7] = halfwayPoint; + + for (int c = 0; c < numColors; c++) { + int nnzsThisRow; + // make sure 32 values are written in batches (pad with zeros if needed) + for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) { + for (int vb = 0; vb < 32; vb++) { + + // if there are enough values for the whole cacheline + if (v + vb < colorValZeroPointers[c]) { + ubNnzValues[0][v + vb] = mat->nnzValues[valIndex]; + ubColIndices[v + vb] = static_cast(colIndicesInColor[c][mat->colIndices[valIndex]]); + + // if this val is the first of a row + if (nnzRowPointers[rowIndex] == valIndex) { + + if (rowOffsets[rowIndex] + 1 >= 255) { + std::ostringstream errorstring; + errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1; + OPM_THROW(std::logic_error, errorstring.str()); + } + + NROffsets[NRIndex] = static_cast(rowOffsets[rowIndex] + 1); + + // skip all empty rows + while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) { + rowIndex++; + nnzsThisRow = 0; + } + nnzsThisRow++; + } + else + { + NROffsets[NRIndex] = (unsigned char) 0; + nnzsThisRow++; + } + valIndex++; + } + else // zeropadding is needed + { + ubNnzValues[0][v + vb] = 0.0; + ubColIndices[v + vb] = static_cast(colIndicesInColor[c][mat->colIndices[valIndex - 1]]); + NROffsets[NRIndex] = 0; + } + NRIndex++; + + } + } + + // zeropad the NROffsets file + while (NRIndex % 64 != 0) { + NROffsets[NRIndex] = 0; + NRIndex++; + } + } + + return 0; +} + +} // end namespace bda diff --git a/opm/simulators/linalg/bda/FPGAMatrix.hpp b/opm/simulators/linalg/bda/FPGAMatrix.hpp new file mode 100644 index 000000000..34eedc02e --- /dev/null +++ b/opm/simulators/linalg/bda/FPGAMatrix.hpp @@ -0,0 +1,84 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef FPGA_MATRIX_HEADER_INCLUDED +#define FPGA_MATRIX_HEADER_INCLUDED + +#include + +namespace bda +{ + +/// This struct resembles a csr matrix, only doubles are supported +/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer. +class Matrix { + +public: + + /// Allocate Matrix and data arrays with given sizes + /// \param[in] N number of rows + /// \param[in] nnzs number of nonzeros + Matrix(int N_, int nnzs_) + : nnzValues(new double[nnzs_]), + colIndices(new int[nnzs_]), + rowPointers(new int[N_+1]), + N(N_), + nnzs(nnzs_) + {} + + /// All constructors allocate new memory, so always delete here + ~Matrix(){ + delete[] nnzValues; + delete[] colIndices; + delete[] rowPointers; + } + + + /// Converts this matrix to the dataformat used by the FPGA. + /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset). + /// The purpose of this format is to allow the data to be streamable. + /// The rowPointers array has an unpredictable reading pattern/timing, + /// it also needs a extra work if a row is shorter than a cacheline. + /// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets. + /// The value of this offset is 0, unless the corresponding nnz is the first of a row, + /// in that case it is 'the number of empty rows preceeding it + 1'. + /// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx. + /// Example: + /// [1 0 0 3 0] nnzValues [1 3 2 2 1 4 3 4 1] + /// [0 2 2 0 1] colIndices [0 3 1 2 4 0 1 2 4] + /// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9] + /// [0 0 0 0 0] rowOffsets [1 0 1 0 0 1 2 0 0] + /// [0 3 4 0 1] + /// The rowOffset is stored in 1 byte, meaning the maximum value is 255. + int toRDF(int numColors, std::vector& nodesPerColor, + std::vector >& colIndicesInColor, int nnzsPerRowLimit, + std::vector >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes); + + double *nnzValues; + int *colIndices; + int *rowPointers; + int N; + int nnzs; +}; + +void sortRow(int *colIndices, double *data, int left, int right); + +} // end namespace bda + +#endif // FPGA_MATRIX_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp new file mode 100644 index 000000000..9de0b8fef --- /dev/null +++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp @@ -0,0 +1,732 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include + +#include + +#include +#include +#include + +#include +#include +#include + +// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA +// kernel will be disabled and execution will continue using DUNE +#define FPGA_EXIT_WITH_HW_FAILURE +//#undef FPGA_EXIT_WITH_HW_FAILURE + +// if defined, the function generate_statistics will create a CSV-formatted file +// with detailed statistics about the FPGA backend performance +//#define FPGA_STATISTICS_FILE_ENABLED +#undef FPGA_STATISTICS_FILE_ENABLED + +namespace bda +{ + +using Opm::OpmLog; + +template +FpgaSolverBackend::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver(fpga_bitstream, verbosity_, maxit_, tolerance_) +{ + int err; + std::ostringstream oss; + double start = second(); + + // currently, only block size == 3 is supported by the FPGA backend + assert(block_size == 3); + + if (verbosity < 1) { + perf_call_enabled = false; + } + // setup bitstream name and other parameters + if (fpga_bitstream.compare("") == 0) { + OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified"); + } + if (!fileExists(fpga_bitstream.c_str())) { + OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable"); + } + // ----------------------------- + // FPGA: setup the OpenCL platform + // ----------------------------- + std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp + // auto-select the proper FPGA device and create context and other CL objects + err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1); + if (err != 0) { + oss << "Failed to setup the OpenCL device (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + oss << "Detected FPGA platform type is "; + if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; } + OpmLog::info(oss.str()); + oss.str(""); + oss.clear(); + // ----------------------------- + // FPGA: setup the debug buffer + // ----------------------------- + // set kernel debug lines depending on an environment variable + const char *xem = getenv("XCL_EMULATION_MODE"); + if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) { + debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU; + oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n"; + OpmLog::info(oss.str()); + oss.str(""); + oss.clear(); + } else { + // set to 2 to reduce overhead in reading back and interpreting the debug lines; + // increase to get more debug info from the kernel + // range is 2..DEBUG_OUTBUF_WORDS_MAX-1 + debug_outbuf_words = 2; + } + + // host debug buffer setup + err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize); + if (err != 0) { + oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + // device debug buffer setup + err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize); + if (err != 0) { + oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n"; + OPM_THROW(std::logic_error, oss.str()); + } + // copy debug buffer to device + err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words); + if (err != 0) { + oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n"; + OPM_THROW(std::logic_error, oss.str()); + } + // ------------------------------------------------ + // FPGA: query the kernel for limits/configuration + // ------------------------------------------------ + err = fpga_kernel_query(context, commands, kernel, cldebug, + debugBuffer, debug_outbuf_words, + rst_assert_cycles, rst_settle_cycles, + &hw_x_vector_elem, &hw_max_row_size, + &hw_max_column_size, &hw_max_colors_size, + &hw_max_nnzs_per_row, &hw_max_matrix_size, + &hw_use_uram, &hw_write_ilu0_results, + &hw_dma_data_width, &hw_mult_num, + &hw_x_vector_latency, &hw_add_latency, &hw_mult_latency, + &hw_num_read_ports, &hw_num_write_ports, + &hw_reset_cycles, &hw_reset_settle); + if (err != 0) { + oss << "Failed to call fpga_kernel_query (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + + if (verbosity >= 1) { + oss << "FPGA kernel limits/configuration:\n"; + oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n"; + oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n"; + oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n"; + oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n"; + oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n"; + oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n"; + oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n"; + oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle; + OpmLog::info(oss.str()); + oss.str(""); + oss.clear(); + } + + // check that LU results are generated by the kernel + if (use_LU_res && !hw_write_ilu0_results) { + OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage"); + oss.str(""); + oss.clear(); + use_LU_res = false; + } + + // setup preconditioner + double start_prec = second(); + prec = std::make_unique(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size); + perf_total.s_preconditioner_setup = second() - start_prec; + + if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { + level_scheduling = true; + } + + perf_total.s_initialization = second() - start; +} // end fpgaSolverBackend + + +template +FpgaSolverBackend::~FpgaSolverBackend() +{ + if (verbosity >= 1) { + generate_statistics(); + } + delete[] rx; + delete[] rb; + if (nnzValArrays != nullptr) { free(nnzValArrays); } + if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); } + if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); } + // FPGA: buffers + free(debugBuffer); + for (int b = 0; b < RW_BUF; b++) { + free(dataBuffer[b]); + } + free(databufferSize); + // FPGA: OpenCL objects + if (cldebug != nullptr) { clReleaseMemObject(cldebug); } + for (int b = 0; b < RW_BUF; b++) { + if (cldata[b] != nullptr) { + clReleaseMemObject(cldata[b]); + } + } + clReleaseCommandQueue(commands); + clReleaseContext(context); + clReleaseKernel(kernel); + clReleaseProgram(program); + clReleaseDevice(device_id); +} // end ~fpgaSolverBackend() + + +// copy result to host memory +// caller must be sure that x is a valid array +template +void FpgaSolverBackend::get_result(double *x_) +{ + double start = 0; + + if (perf_call_enabled) { + start = second(); + } + // apply to results the reordering (stored in toOrder) + reorderBlockedVectorByPattern(mat->Nb, rx, toOrder, x_); + // TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private) + if (perf_call_enabled) { + perf_call.back().s_postprocess = second() - start; + } +} // end get_result() + + +template +SolverStatus FpgaSolverBackend::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res) +{ + if (initialized == false) { + initialize(N_, nnz_, dim, vals, rows, cols); + if (!analyse_matrix()) { + return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; + } + } + perf_call.emplace_back(); + update_system(vals, b); + if (!create_preconditioner()) { + return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; + } + solve_system(res); + + if (verbosity >= 1) { + std::ostringstream oss; + oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \ + ", iterations: " << res.iterations << ", reduction: " << res.reduction << \ + ", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed; + OpmLog::info(oss.str()); + } + return SolverStatus::BDA_SOLVER_SUCCESS; +} + + +template +void FpgaSolverBackend::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) +{ + double start = second(); + this->N = N_; + this->nnz = nnz_; + this->nnzb = nnz_ / block_size / block_size; + Nb = (N + dim - 1) / dim; + + // allocate host memory for matrices and vectors + // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here + mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows)); + + std::ostringstream oss; + oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \ + "block size: " << dim << ", total nnz: " << this->nnz << "\n"; + oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; + OpmLog::info(oss.str()); + + rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; + rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; + + perf_total.s_initialization += second() - start; + initialized = true; +} // end initialize() + + +template +bool FpgaSolverBackend::analyse_matrix() +{ + std::ostringstream oss; + int err; + + double start = second(); + bool success = prec->init(mat.get()); + + if (!success) { + OpmLog::warning("Preconditioner for FPGA solver failed to initialize"); + return success; + } + + toOrder = prec->getToOrder(); + fromOrder = prec->getFromOrder(); + rMat = prec->getRMat(); + processedPointers = prec->getResultPointers(); + processedSizes = prec->getResultSizes(); + processedPointers[19] = rb; + processedPointers[20] = rx; + nnzValArrays_size = static_cast(processedPointers[5])[0]; + L_nnzValArrays_size = static_cast(processedPointers[11])[0]; + U_nnzValArrays_size = static_cast(processedPointers[17])[0]; + // ------------------------------------- + // FPGA: setup host/device data buffers + // ------------------------------------- + // allocate memory and setup data layout + err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits, + processedSizes, + &setupArray, + &nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray, + &L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray, + &U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray, + &BLKDArray, &X1Array, &R1Array, + &X2Array, &R2Array, + &LresArray, &UresArray, + &databufferSize, dataBuffer, + result_offsets, 1 /*num_nnz_arrays*/, + true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */ + debugbufferSize); + if (err) { + oss << "Failed to call fpga_setup_host_datamem (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + + // results buffers setup + if (use_LU_res) { + resultsBufferNum = 4; + } else { + resultsBufferNum = 2; + } + if (resultsBufferNum > RES_BUF_MAX) { + oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + + resultsNum = processedSizes[0]; // rowSize, invariant between system solves + for (int i = 0; i < resultsBufferNum; i++) { + resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double); + } + + // device data memory setup + err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata); + if (err != 0) { + oss << "Failed to call fpga_setup_device_datamem (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + + // ------------------------------------ + // FPGA: setup the kernel's parameters + // ------------------------------------ + err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit, + debug_sample_rate, tolerance, cldata, cldebug); + if (err != 0) { + oss << "Failed to call fpga_set_kernel_parameters (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + + perf_total.s_analysis = second() - start; + analysis_done = true; + + return success; +} // end analyse_matrix() + + +template +bool FpgaSolverBackend::create_preconditioner() +{ + double start = 0; + + if (perf_call_enabled) { + start = second(); + } + memset(rx, 0, sizeof(double) * N); + bool result = prec->create_preconditioner(mat.get()); + if (!result) { + OpmLog::warning("fpgaSolverBackend: create_preconditioner failed"); + } + + if (perf_call_enabled) { + perf_call.back().s_preconditioner_create = second() - start; + } + return result; +} // end create_preconditioner() + + +template +void FpgaSolverBackend::solve_system(BdaResult &res) +{ + std::ostringstream oss; + int err; + double start = 0, start_total = 0; + + // ------------------------------------ + // FPGA: return immediately if FPGA is disabled + // ------------------------------------ + if (fpga_disabled) { + res.converged = false; + OpmLog::warning("FPGA is disabled, fallback to SW execution"); + return; + } + + fpga_calls++; + + if (perf_call_enabled) { + start = second(); + start_total = start; + } + + // check if any buffer is larger than the size set in preconditioner->init + // TODO: add check for all other buffer sizes that may overflow? + err = 0; + if ( ((int *)processedPointers[5])[0] > nnzValArrays_size || + ((int *)processedPointers[11])[0] > L_nnzValArrays_size || + ((int *)processedPointers[17])[0] > U_nnzValArrays_size ) { + err = 1; + } + if (err != 0) { + OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)"); + } + + // ------------------------------------ + // FPGA: copy input data to host data buffers + // ------------------------------------ + if (perf_call_enabled) { + start = second(); + } + err = fpga_copy_host_datamem( + processedPointers, processedSizes, setupArray, + nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray, + L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray, + U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray, + BLKDArray, X1Array, R1Array, X2Array, R2Array, + use_LU_res, LresArray, UresArray, + databufferSize, dataBuffer, + 1 /* nnzValArrays_num */, + reset_data_buffers, fill_results_buffers, + dump_data_buffers, fpga_calls); + if (perf_call_enabled) { + perf_call.back().s_mem_setup = second() - start; + } + if (err != 0) { + oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + // ------------------------------------ + // FPGA: copy buffers to device + // ------------------------------------ + // copy debug buffer to device + if (perf_call_enabled) { + start = second(); + } + err = fpga_copy_to_device_debugbuf(commands, + cldebug, debugBuffer, debugbufferSize, + debug_outbuf_words); + if (err != 0) { + oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + // copy data buffers to device + err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata); + if (err != 0) { + oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + if (perf_call_enabled) { + perf_call.back().s_mem_h2d = second() - start; + } + // ------------------------------------ + // FPGA: execute the kernel + // ------------------------------------ + double time_elapsed_ms; + if (perf_call_enabled) { + start = second(); + } + err = fpga_kernel_run(commands, kernel, &time_elapsed_ms); + if (perf_call_enabled) { + perf_call.back().s_kernel_exec = second() - start; + } + if (err != 0) { + oss << "Failed to call fpga_kernel_run (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + // ---------------------------------------- + // FPGA: read back debug buffer from device + // ---------------------------------------- + if (perf_call_enabled) { + start = second(); + } + err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10), + commands, + debug_outbuf_words, debugbufferSize, + cldebug, debugBuffer, + abort_cycles, + &kernel_cycles, &kernel_iter_run, + norms, &last_norm_idx, + &kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults, + &kernel_wrafterend, &kernel_dbgfifofull); + if (err != 0) { + oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + if (kernel_wrafterend) { + OpmLog::warning("Detected recoverable FPGA error: kernel write after end"); + } + if (kernel_dbgfifofull) { + OpmLog::warning("Detected recoverable FPGA error: debug FIFO full"); + } + if (kernel_aborted || kernel_signature || kernel_overflow) { +#if defined(FPGA_EXIT_WITH_HW_FAILURE) + oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ + ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")"; + OPM_THROW(std::logic_error, oss.str()); +#else + oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ + ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n"; + oss << "Disabling FPGA kernel: execution will continue with SW kernel"; + OpmLog::warning(oss.str()); + oss.str(""); + oss.clear(); + fpga_disabled = true; +#endif + } + if (perf_call_enabled) { + perf_call.back().n_kernel_exec_cycles = kernel_cycles; + } + // copy (back) results only if FPGA is not disabled + if (!fpga_disabled) { + if (kernel_noresults) { + OpmLog::warning("FPGA kernel did not return results because the required precision is already reached"); + // rx still contains zeros from initial guess + } else { + // ------------------------------------ + // FPGA: read back results from device + // ------------------------------------ + err = fpga_map_results(even(kernel_iter_run), + use_residuals, use_LU_res, commands, + resultsNum, resultsBufferNum, resultsBufferSize, + debugbufferSize, + cldata, resultsBuffer, + result_offsets, + dump_results, data_dir, basename, sequence); + if (err != 0) { + oss << "Failed to call fpga_map_results (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + // TODO: copy results buffers to reordering output buffers + memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double)); + err = fpga_unmap_results(even(kernel_iter_run), + use_residuals, use_LU_res, + commands, cldata, resultsBuffer); + if (err != 0) { + oss << "Failed to call fpga_unmap_results (" << err << ")"; + OPM_THROW(std::logic_error, oss.str()); + } + } + } + // set results and update statistics (if enabled) + if (perf_call_enabled) { + perf_call.back().s_mem_d2h = second() - start; + } + float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns + res.iterations = (int)iter; + res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm + res.conv_rate = pow(res.reduction, 1.0 / iter); + res.elapsed = second() - start_total; + if (perf_call_enabled) { + perf_call.back().s_solve = res.elapsed; + perf_call.back().n_kernel_exec_iters = iter; + } + // convergence depends on number of iterations reached and hw execution errors + res.converged = true; + if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) { + res.converged = false; + if (verbosity >= 1) { + oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \ + ", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \ + ", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit); + OpmLog::warning(oss.str()); + oss.str(""); + oss.clear(); + } + } + if (perf_call_enabled) { + perf_call.back().converged = res.converged; + perf_call.back().converged_flags = ((unsigned int)fpga_disabled) + + ((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) + + ((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4); + } +} // end solve_system() + + +template +void FpgaSolverBackend::update_system(double *vals, double *b) +{ + double start = 0; + + mat->nnzValues = vals; + // reorder inputs using previously found ordering (stored in fromOrder) + if (perf_call_enabled) { + start = second(); + } + reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); + if (perf_call_enabled) { + perf_call.back().s_reorder = second() - start; + } +} // end update_system() + + +template +void FpgaSolverBackend::generate_statistics() +{ + std::ostringstream oss; + unsigned int conv_iter = 0, conv_ovf = 0; + + if (!perf_call_enabled || fpga_calls == 0) { + OpmLog::warning("FPGA statistics were not collected"); + return; + } + std::printf("--- FPGA statistics ---\n"); + std::printf("total solver calls..........: %u\n", fpga_calls); + std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization); + std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup); + +#if defined(FPGA_STATISTICS_FILE_ENABLED) + // DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file + FILE *fout = fopen("fpga_statistics_details.csv", "w"); + if (fout != nullptr) { + std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n"); + } +#endif + unsigned int num_data_points = perf_call.size(); + for (unsigned int i = 0; i < num_data_points; i++) { + perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create; + if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; } + if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; } + perf_total.s_analysis += perf_call[i].s_analysis; + if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; } + if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; } + perf_total.s_reorder += perf_call[i].s_reorder; + if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; } + if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; } + perf_total.s_mem_setup += perf_call[i].s_mem_setup; + if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; } + if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; } + perf_total.s_mem_h2d += perf_call[i].s_mem_h2d; + if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; } + if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; } + perf_total.s_kernel_exec += perf_call[i].s_kernel_exec; + if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; } + if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; } + perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles; + if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; } + if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; } + perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters; + if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; } + if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; } + perf_total.s_mem_d2h += perf_call[i].s_mem_d2h; + if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; } + if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; } + perf_total.s_solve += perf_call[i].s_solve; + if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; } + if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; } + perf_total.s_postprocess += perf_call[i].s_postprocess; + if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; } + if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; } + perf_total.n_converged += (unsigned int)perf_call[i].converged; + if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; } + if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; } +#if defined(FPGA_STATISTICS_FILE_ENABLED) + if (fout != nullptr) { + std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n", + i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder, + perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles, + perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess, + (unsigned int)perf_call[i].converged); + } +#endif + } +#if defined(FPGA_STATISTICS_FILE_ENABLED) + if (fout != nullptr) { + fclose(fout); + } +#endif + perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points; + perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points; + perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points; + perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points; + perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points; + perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points; + perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points; + perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points; + perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points; + perf_total.s_solve_avg = perf_total.s_solve / num_data_points; + perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points; + std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max); + std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max); + std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max); + std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max); + std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max); + std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max); + std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n", + perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max); + std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n", + perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max); + std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max); + std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max); + std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", + perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max); + std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n", + perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf); + std::printf("-----------------------\n"); +} //end generate_statistics() + + +#define INSTANTIATE_BDA_FUNCTIONS(n) \ +template FpgaSolverBackend::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \ + +INSTANTIATE_BDA_FUNCTIONS(1); +INSTANTIATE_BDA_FUNCTIONS(2); +INSTANTIATE_BDA_FUNCTIONS(3); +INSTANTIATE_BDA_FUNCTIONS(4); + +#undef INSTANTIATE_BDA_FUNCTIONS + +} //namespace bda diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp new file mode 100644 index 000000000..f78e97e2a --- /dev/null +++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp @@ -0,0 +1,265 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED +#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED + +#include +#include + +#include +#include +#include + +namespace bda +{ + +/// This class implements an ilu0-bicgstab solver on FPGA +template +class FpgaSolverBackend : public BdaSolver +{ + typedef BdaSolver Base; + typedef FPGABILU0 Preconditioner; + + using Base::N; + using Base::Nb; + using Base::nnz; + using Base::nnzb; + using Base::verbosity; + using Base::maxit; + using Base::tolerance; + using Base::initialized; + +private: + double *rx = nullptr; // reordered x + double *rb = nullptr; // reordered b + int *fromOrder = nullptr, *toOrder = nullptr; + bool analysis_done = false; + bool level_scheduling = false; + + // LUMat will shallow copy rowPointers and colIndices of mat/rMat + std::unique_ptr > mat = nullptr; + BlockedMatrix *rMat = nullptr; + std::unique_ptr prec = nullptr; + + // vectors with data processed by the preconditioner (input to the kernel) + void **processedPointers = nullptr; + int *processedSizes = nullptr; + + unsigned int fpga_calls = 0; + bool perf_call_enabled = true; + + // per call performance metrics + typedef struct { + double s_preconditioner_create = 0.0; + double s_analysis = 0.0; + double s_reorder = 0.0; + double s_mem_setup = 0.0; + double s_mem_h2d = 0.0; + double s_kernel_exec = 0.0; + unsigned int n_kernel_exec_cycles = 0; + float n_kernel_exec_iters = 0.0; + double s_mem_d2h = 0.0; + double s_solve = 0.0; + double s_postprocess = 0.0; + bool converged = false; + unsigned int converged_flags = 0; + } perf_call_metrics_t; + // cumulative performance metrics + typedef struct { + double s_initialization; + double s_preconditioner_setup; + double s_preconditioner_create; + double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg; + double s_analysis; + double s_analysis_min,s_analysis_max,s_analysis_avg; + double s_reorder; + double s_reorder_min,s_reorder_max,s_reorder_avg; + double s_mem_setup; + double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg; + double s_mem_h2d; + double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg; + double s_kernel_exec; + double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg; + unsigned long n_kernel_exec_cycles; + unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg; + float n_kernel_exec_iters; + float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg; + double s_mem_d2h; + double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg; + double s_solve; + double s_solve_min,s_solve_max,s_solve_avg; + double s_postprocess; + double s_postprocess_min,s_postprocess_max,s_postprocess_avg; + unsigned int n_converged; + } perf_total_metrics_t; + std::vector perf_call; + perf_total_metrics_t perf_total; + // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky + // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental + unsigned int fpga_config_bits = 0; + bool fpga_disabled = false; + bool platform_awsf1; + unsigned int debugbufferSize; + unsigned long int *debugBuffer = nullptr; + unsigned int *databufferSize = nullptr; + unsigned char *dataBuffer[RW_BUF] = {nullptr}; + unsigned int debug_outbuf_words; + int resultsNum; + int resultsBufferNum; + unsigned int resultsBufferSize[RES_BUF_MAX]; + unsigned int result_offsets[6]; + unsigned int kernel_cycles, kernel_iter_run; + double norms[4]; + unsigned char last_norm_idx; + bool kernel_aborted, kernel_signature, kernel_overflow; + bool kernel_noresults; + bool kernel_wrafterend, kernel_dbgfifofull; + bool use_residuals = false; + bool use_LU_res = false; + int sequence = 0; + // TODO: these values may be sent via command line parameters + unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s + unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump + int nnzValArrays_size = 0; + int L_nnzValArrays_size = 0; + int U_nnzValArrays_size = 0; + // aliases to areas of the host data buffers + long unsigned int *setupArray = nullptr; + double **nnzValArrays = nullptr; + short unsigned int *columnIndexArray = nullptr; + unsigned char *newRowOffsetArray = nullptr; + unsigned int *PIndexArray = nullptr; + unsigned int *colorSizesArray = nullptr; + double **L_nnzValArrays = nullptr; + short unsigned int *L_columnIndexArray = nullptr; + unsigned char *L_newRowOffsetArray = nullptr; + unsigned int *L_PIndexArray = nullptr; + unsigned int *L_colorSizesArray = nullptr; + double **U_nnzValArrays = nullptr; + short unsigned int *U_columnIndexArray = nullptr; + unsigned char *U_newRowOffsetArray = nullptr; + unsigned int *U_PIndexArray = nullptr; + unsigned int *U_colorSizesArray = nullptr; + double *BLKDArray = nullptr; + double *X1Array = nullptr, *X2Array = nullptr; + double *R1Array = nullptr, *R2Array = nullptr; + double *LresArray = nullptr, *UresArray = nullptr; + double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region + // OpenCL variables + cl_device_id device_id; + cl_context context; + cl_command_queue commands; + cl_program program; + cl_kernel kernel; + cl_mem cldata[RW_BUF] = {nullptr}; + cl_mem cldebug = nullptr; + // HW limits/configuration variables + unsigned int hw_x_vector_elem; + unsigned int hw_max_row_size; + unsigned int hw_max_column_size; + unsigned int hw_max_colors_size; + unsigned short hw_max_nnzs_per_row; + unsigned int hw_max_matrix_size; + bool hw_use_uram; + bool hw_write_ilu0_results; + unsigned short hw_dma_data_width; + unsigned char hw_x_vector_latency; + unsigned char hw_add_latency; + unsigned char hw_mult_latency; + unsigned char hw_mult_num; + unsigned char hw_num_read_ports; + unsigned char hw_num_write_ports; + unsigned short hw_reset_cycles; + unsigned short hw_reset_settle; + // debug + bool reset_data_buffers = false; + bool fill_results_buffers = false; + int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format + bool dump_results = false; + char *data_dir = nullptr; + char *basename = nullptr; + unsigned short rst_assert_cycles = 0; + unsigned short rst_settle_cycles = 0; + + /// Allocate host memory + /// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] dim size of block + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values + /// \param[in] cols array of columnIndices, contains nnz values + void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols); + + /// Reorder the linear system so it corresponds with the coloring + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] b input vector + void update_system(double *vals, double *b); + + /// Analyse sparsity pattern to extract parallelism + /// \return true iff analysis was successful + bool analyse_matrix(); + + /// Perform ilu0-decomposition + /// \return true iff decomposition was successful + bool create_preconditioner(); + + /// Solve linear system + /// \param[inout] res summary of solver result + void solve_system(BdaResult &res); + + /// Generate FPGA backend statistics + void generate_statistics(void); + +public: + + /// Construct an fpgaSolver + /// \param[in] fpga_bitstream FPGA bitstream file name + /// \param[in] linear_solver_verbosity verbosity of fpgaSolver + /// \param[in] maxit maximum number of iterations for fpgaSolver + /// \param[in] tolerance required relative tolerance for fpgaSolver + /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation + FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); + + /// Destroy an fpgaSolver, and free memory + ~FpgaSolverBackend(); + + /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format + /// \param[in] N number of rows, divide by dim to get number of blockrows + /// \param[in] nnz number of nonzeroes, divide by dim*dim to get number of blocks + /// \param[in] dim size of block + /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values + /// \param[in] rows array of rowPointers, contains N/dim+1 values + /// \param[in] cols array of columnIndices, contains nnz values + /// \param[in] b input vector, contains N values + /// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A + /// \param[inout] res summary of solver result + /// \return status code + SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override; + + /// Get result after linear solve, and peform postprocessing if necessary + /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array + void get_result(double *x) override; + +}; // end class fpgaSolverBackend + +} //namespace bda + +#endif + diff --git a/opm/simulators/linalg/bda/FPGAUtils.cpp b/opm/simulators/linalg/bda/FPGAUtils.cpp new file mode 100644 index 000000000..624e2836e --- /dev/null +++ b/opm/simulators/linalg/bda/FPGAUtils.cpp @@ -0,0 +1,63 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#include +#include + +namespace bda +{ + +double second(void) +{ + struct timeval tv; + gettimeofday(&tv, nullptr); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +bool even(int n) +{ + if (n % 2 == 0) { + return true; + } else { + return false; + } +} + +int roundUpTo(int i, int n) +{ + if (i % n == 0) { + return i; + } else { + return (i / n + 1) * n; + } +} + +bool fileExists(const char *filename) +{ + FILE *fin; + fin = fopen(filename, "r"); + if (fin == nullptr) { + return false; + } else { + fclose(fin); + return true; + } +} + +} //namespace bda diff --git a/opm/simulators/linalg/bda/FPGAUtils.hpp b/opm/simulators/linalg/bda/FPGAUtils.hpp new file mode 100644 index 000000000..3558f7bba --- /dev/null +++ b/opm/simulators/linalg/bda/FPGAUtils.hpp @@ -0,0 +1,39 @@ +/* + Copyright 2020 Equinor ASA + + This file is part of the Open Porous Media project (OPM). + + OPM is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OPM is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with OPM. If not, see . +*/ + +#ifndef FPGA_UTILS_HEADER_INCLUDED +#define FPGA_UTILS_HEADER_INCLUDED + +namespace bda +{ + +union double2int +{ + unsigned long int int_val; + double double_val; +}; + +double second(void); +bool even(int n); +int roundUpTo(int i, int n); +bool fileExists(const char *filename); + +} // end namespace bda + +#endif // FPGA_UTILS_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/Reorder.cpp b/opm/simulators/linalg/bda/Reorder.cpp index cae3b35f9..4b02bc8bd 100644 --- a/opm/simulators/linalg/bda/Reorder.cpp +++ b/opm/simulators/linalg/bda/Reorder.cpp @@ -17,12 +17,7 @@ along with OPM. If not, see . */ -#include -#include -#include // for fill() #include -#include -#include #include @@ -35,7 +30,7 @@ namespace bda /* Give every node in the matrix (of which only the sparsity pattern in the * form of row pointers and column indices arrays are in the input), a color - * in the colors array. Also return the amount of colors in the return integer. + * in the colors array. Also return the amount of colors in the return integer. * This graph-coloring algorithm is based on the Jones-Plassmann algorithm, proposed in: * "A Parallel Graph Coloring Heuristic" by M.T. Jones and P.E. Plassmann in SIAM Journal of Scientific Computing 14 (1993) */ @@ -60,7 +55,7 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi std::mt19937 gen(rd()); std::uniform_int_distribution<> uniform(0, std::numeric_limits::max()); { - for(int i = 0; i < rows; ++i){ + for (int i = 0; i < rows; ++i) { randoms[i] = uniform(gen); } } @@ -88,12 +83,12 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi if (((jc != -1) && (jc != c)) || (i == j)) { continue; } - // node i is not in the current color if one of its neighbours shares this color, + // node i is not in the current color if one of its neighbours shares this color, if (jc == c) { iMax = false; break; } - // or if one of its uncolored neighbours has a higher random value + // or if one of its uncolored neighbours has a higher random value int jr = randoms[j]; if (ir <= jr) { iMax = false; @@ -108,12 +103,12 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi if (((jc != -1) && (jc != c)) || (i == j)) { continue; } - // node i is not in the current color if one of its neighbours shares this color, + // node i is not in the current color if one of its neighbours shares this color, if (jc == c) { iMax = false; break; } - // or if one of its uncolored neighbours has a higher random value + // or if one of its uncolored neighbours has a higher random value int jr = randoms[j]; if (ir <= jr) { iMax = false; @@ -180,7 +175,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix *mat, int *toOrder, // put thisRow from the old matrix into row i of the new matrix rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow]; for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) { - for (j = 0; j < bs * bs; j++){ + for (j = 0; j < bs * bs; j++) { rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j]; } rmat->colIndices[rIndex] = mat->colIndices[k]; @@ -251,7 +246,7 @@ bool canBeStarted(const int rowIndex, const int *rowPointers, const int *colIndi /* * The level scheduling of a non-symmetric, blocked matrix requires access to a CSC encoding and a CSR encoding of the sparsity pattern of the input matrix. * This function is based on a standard level scheduling algorithm, like the one described in: - * "Iterative methods for Sparse Linear Systems" by Yousef Saad in section 11.6.3 + * "Iterative methods for Sparse Linear Systems" by Yousef Saad in section 11.6.3 */ void findLevelScheduling(int *CSRColIndices, int *CSRRowPointers, int *CSCRowIndices, int *CSCColPointers, int Nb, int *numColors, int *toOrder, int* fromOrder, std::vector& rowsPerColor) { diff --git a/opm/simulators/linalg/bda/Reorder.hpp b/opm/simulators/linalg/bda/Reorder.hpp index 08c8716ff..76347a01e 100644 --- a/opm/simulators/linalg/bda/Reorder.hpp +++ b/opm/simulators/linalg/bda/Reorder.hpp @@ -20,6 +20,8 @@ #ifndef REORDER_HPP #define REORDER_HPP +#include + #include namespace bda diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index 7d9c4dc0b..b6dc2b075 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -28,15 +28,18 @@ namespace Opm { -WellContributions::WellContributions(std::string gpu_mode){ - if(gpu_mode.compare("cusparse") == 0){ +WellContributions::WellContributions(std::string accelerator_mode){ + if(accelerator_mode.compare("cusparse") == 0){ cuda_gpu = true; } - else if(gpu_mode.compare("opencl") == 0){ + else if(accelerator_mode.compare("opencl") == 0){ opencl_gpu = true; } + else if(accelerator_mode.compare("fpga") == 0){ + // unused for FPGA, but must be defined to avoid error + } else{ - OPM_THROW(std::logic_error, "Error: invalid GPU mode"); + OPM_THROW(std::logic_error, "Invalid accelerator mode"); } } diff --git a/opm/simulators/linalg/bda/WellContributions.hpp b/opm/simulators/linalg/bda/WellContributions.hpp index 00addf546..5c93748c7 100644 --- a/opm/simulators/linalg/bda/WellContributions.hpp +++ b/opm/simulators/linalg/bda/WellContributions.hpp @@ -176,7 +176,7 @@ public: void alloc(); /// Create a new WellContributions - WellContributions(std::string gpu_mode); + WellContributions(std::string accelerator_mode); /// Destroy a WellContributions, and free memory ~WellContributions(); From a933d4da1089a2b0a5b857a29f6556c16d2c0749 Mon Sep 17 00:00:00 2001 From: Giacomo Marchiori Date: Fri, 15 Jan 2021 14:13:18 +0100 Subject: [PATCH 2/4] Modified CMakeLists.txt after comments by blattms. Also, tentative changes to compile the FPGA library from a different module: this part needs to be revised because it assumes a fixed path for the OPM/FPGA module. Modified CMakeLists_files.cmake to remove files moved to the OPM/FPGA module. --- CMakeLists.txt | 1 + CMakeLists_files.cmake | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77dc99efc..c160fdb4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF) option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF) option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON) +option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF) if(SIBLING_SEARCH AND NOT opm-common_DIR) # guess the sibling dir diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index 6b3675c4d..822654edc 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -66,7 +66,7 @@ if(OPENCL_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp) endif() -if(CMAKE_ENABLE_FPGA) +if(HAVE_FPGA) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp) @@ -184,10 +184,6 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/FPGAMatrix.hpp opm/simulators/linalg/bda/FPGABILU0.hpp opm/simulators/linalg/bda/FPGASolverBackend.hpp - opm/simulators/linalg/bda/fpga/src/sda_app/bicgstab_solver_config.hpp - opm/simulators/linalg/bda/fpga/src/sda_app/common/dev_config.hpp - opm/simulators/linalg/bda/fpga/src/sda_app/common/fpga_functions_bicgstab.hpp - opm/simulators/linalg/bda/fpga/src/sda_app/common/opencl_lib.hpp opm/simulators/linalg/bda/FPGAUtils.hpp opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/ILUReorder.hpp From 81c0a3d9f99ba3bb390d66699502d9c9ac5b72c3 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Thu, 4 Mar 2021 10:58:41 +0100 Subject: [PATCH 3/4] Simplified CPU fallback warnings --- opm/simulators/linalg/ISTLSolverEbos.hpp | 17 +++++------------ opm/simulators/linalg/bda/BdaBridge.hpp | 5 +++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index e18687702..752c7941d 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -272,19 +272,12 @@ namespace Opm bdaBridge->get_result(x); accelerator_was_used = true; } else { - // CPU fallback - use_gpu = bdaBridge->getUseGpu(); // update value, BdaBridge might have disabled cusparseSolver - use_fpga = bdaBridge->getUseFpga(); + // warn about CPU fallback + // BdaBridge might have disabled its BdaSolver for this simulation due to some error + // in that case the BdaBridge is disabled and flexibleSolver is always used + // or maybe the BdaSolver did not converge in time, then it will be used next linear solve if (simulator_.gridView().comm().rank() == 0) { - if (use_gpu && accelerator_mode.compare("cusparse") == 0) { - OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system..."); - } - if (use_gpu && accelerator_mode.compare("opencl") == 0) { - OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system..."); - } - if (use_fpga && accelerator_mode.compare("fpga") == 0) { - OpmLog::warning("fpgaSolver did not converge, now trying Dune to solve current linear system..."); - } + OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system..."); } } } diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 5954de4c8..c3e91db47 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -92,6 +92,11 @@ public: return use_fpga; } + /// Return the selected accelerator mode, this is input via the command-line + std::string getAccleratorName(){ + return accelerator_mode; + } + }; // end class BdaBridge } From 8ea19c66aa86bcc8e5d8bed3a680e9f60fcfd893 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Thu, 4 Mar 2021 11:49:29 +0100 Subject: [PATCH 4/4] Reduced code duplication in BdaBridge --- opm/simulators/linalg/bda/BdaBridge.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 293994122..6d42bf287 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -148,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) { // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays -// sparsity pattern should stay the same due to matrix-add-well-contributions +// sparsity pattern should stay the same // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers template void getSparsityPattern(BridgeMatrix& mat, std::vector &h_rows, std::vector &h_cols) { @@ -185,8 +185,10 @@ void BdaBridge::solve_system(BridgeMatri static std::vector h_rows; static std::vector h_cols; const int dim = (*mat)[0][0].N(); - const int N = mat->N()*dim; - const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim; + const int Nb = mat->N(); + const int N = Nb * dim; + const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back(); + const int nnz = nnzb * dim * dim; if (dim != 3) { OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); @@ -195,8 +197,8 @@ void BdaBridge::solve_system(BridgeMatri } if (h_rows.capacity() == 0) { - h_rows.reserve(N+1); - h_cols.reserve(nnz); + h_rows.reserve(Nb+1); + h_cols.reserve(nnzb); #if PRINT_TIMERS_BRIDGE Dune::Timer t; #endif