diff --git a/CMakeLists.txt b/CMakeLists.txt index f7b74f01f..98acb7b3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,6 @@ option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF) option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF) option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON) option(OPM_INSTALL_PYTHON "Install python bindings?" ON) -option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF) option(USE_CHOW_PATEL_ILU "Use the iterative ILU by Chow and Patel?" OFF) option(USE_CHOW_PATEL_ILU_GPU "Run iterative ILU decomposition on GPU? Requires USE_CHOW_PATEL_ILU" OFF) option(USE_CHOW_PATEL_ILU_GPU_PARALLEL "Try to use more parallelism on the GPU during the iterative ILU decomposition? Requires USE_CHOW_PATEL_ILU_GPU" OFF) @@ -168,70 +167,6 @@ endif() find_package(OpenCL) -# include FPGA target only when ENABLE_FPGA is set to ON -if(ENABLE_FPGA) - # must include opencl.h which is a wrapper for all the OpenCL extensions - find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS}) - - # FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in /../FPGA - find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA) - - # this code only runs if the user explicitly enabled the FPGA - # hence fatal_error instead of warning - if(NOT OPENCL_H OR NOT OpenCL_FOUND) - set(HAVE_FPGA 0) - message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.") - elseif(NOT FPGA_MODULE) - set(HAVE_FPGA 0) - message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.") - elseif(NOT DEFINED ENV{XILINX_XRT}) - # Xilinx XRT must be installed and properly setup - set(HAVE_FPGA 0) - message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.") - else() - set(HAVE_FPGA 1) - message(STATUS "FPGA library and kernel integration active.") - - # FIXME: set the correct path to the FPGA module - set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA) - - # configuration variables with default values: they can be overridden on the cmake command line - # FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream - if(NOT FPGA_PORTS_CONFIG) - set(FPGA_PORTS_CONFIG 2r_3r3w_ddr) - endif() - # FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build) - if(NOT FPGA_DEBUG_LEVEL) - set(FPGA_DEBUG_LEVEL 0) - endif() - message(STATUS "Using the following settings for the FPGA library compilation: " - "FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, " - "FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}") - add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG}) - add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}) - # include directories for the FPGA library - include_directories(${FPGA_SOURCE_DIR}) - include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app) - include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common) - # add external project to compile the FPGA library - include (ExternalProject) - ExternalProject_Add(FPGA_library - # force the build step to always be run because source dependencies cannot be made explicit - BUILD_ALWAYS 1 - DOWNLOAD_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND make - -f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile - SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app - PORTS_CONFIG=${FPGA_PORTS_CONFIG} - DEBUG_LEVEL=${FPGA_DEBUG_LEVEL} - INSTALL_COMMAND "" - TEST_COMMAND "" - ) - endif() -endif() - if(OpenCL_FOUND) # the current OpenCL implementation relies on cl2.hpp, not cl.hpp # make sure it is available, otherwise disable OpenCL @@ -553,11 +488,7 @@ endif() if(VexCL_FOUND) target_link_libraries( opmsimulators PUBLIC OPM::VexCL::OpenCL ) endif() -if(HAVE_FPGA) - add_dependencies(opmsimulators FPGA_library) - ExternalProject_Get_Property(FPGA_library binary_dir) - target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a) -endif() + if(Damaris_FOUND) target_link_libraries(opmsimulators PUBLIC damaris) endif() diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index b05bfe65c..d5e536d49 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -149,11 +149,6 @@ endif() if(CUDA_FOUND OR OPENCL_FOUND OR HAVE_FPGA OR amgcl_FOUND OR ROCALUTION_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp) endif() -if(HAVE_FPGA) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp) - list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp) -endif() if(amgcl_FOUND) list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/amgclSolverBackend.cpp) if(CUDA_FOUND) @@ -304,9 +299,6 @@ list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/bda/cuda/cusparseSolverBackend.hpp opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp opm/simulators/linalg/bda/opencl/BISAI.hpp - opm/simulators/linalg/bda/FPGABILU0.hpp - opm/simulators/linalg/bda/FPGASolverBackend.hpp - opm/simulators/linalg/bda/FPGAUtils.hpp opm/simulators/linalg/bda/Reorder.hpp opm/simulators/linalg/bda/opencl/opencl.hpp opm/simulators/linalg/bda/opencl/openclKernels.hpp diff --git a/opm-simulators-prereqs.cmake b/opm-simulators-prereqs.cmake index 468f7ed06..741b470ab 100644 --- a/opm-simulators-prereqs.cmake +++ b/opm-simulators-prereqs.cmake @@ -8,7 +8,6 @@ set (opm-simulators_CONFIG_VAR HAVE_CUDA HAVE_OPENCL HAVE_OPENCL_HPP - HAVE_FPGA HAVE_AMGCL HAVE_VEXCL HAVE_ROCALUTION diff --git a/opm/simulators/linalg/FlowLinearSolverParameters.hpp b/opm/simulators/linalg/FlowLinearSolverParameters.hpp index ec57870b5..4cfc20bec 100644 --- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp +++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp @@ -128,10 +128,6 @@ template struct OpenclIluParallel { using type = UndefinedProperty; }; -template -struct FpgaBitstream { - using type = UndefinedProperty; -}; template struct LinearSolverReduction { using type = GetPropType; @@ -226,10 +222,6 @@ template struct OpenclIluParallel { static constexpr bool value = true; // note: false should only be used in debug }; -template -struct FpgaBitstream { - static constexpr auto value = ""; -}; } // namespace Opm::Properties @@ -261,7 +253,6 @@ namespace Opm int cpr_reuse_setup_; int cpr_reuse_interval_; bool opencl_ilu_parallel_; - std::string fpga_bitstream_; template void init() @@ -287,7 +278,6 @@ namespace Opm bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId); opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId); opencl_ilu_parallel_ = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel); - fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); } template @@ -309,11 +299,10 @@ namespace Opm EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate, 4: recreated every CprReuseInterval"); EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseInterval, "Reuse preconditioner interval. Used when CprReuseSetup is set to 4, then the preconditioner will be fully recreated instead of reused every N linear solve, where N is this parameter."); EWOMS_REGISTER_PARAM(TypeTag, std::string, LinearSolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'"); - EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl]'"); + EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Choose a linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]'"); EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs"); EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs"); EWOMS_REGISTER_PARAM(TypeTag, bool, OpenclIluParallel, "Parallelize ILU decomposition and application on GPU"); - EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream='"); } FlowLinearSolverParameters() { reset(); } @@ -337,7 +326,6 @@ namespace Opm bda_device_id_ = 0; opencl_platform_id_ = 0; opencl_ilu_parallel_ = true; - fpga_bitstream_ = ""; } }; diff --git a/opm/simulators/linalg/ISTLSolverEbos.cpp b/opm/simulators/linalg/ISTLSolverEbos.cpp index 1a15bed8f..beee1906f 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.cpp +++ b/opm/simulators/linalg/ISTLSolverEbos.cpp @@ -31,7 +31,7 @@ #include #include -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION #include #include #include @@ -176,11 +176,10 @@ void FlexibleSolverInfo::create(const Matrix& matrix, } } -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION template BdaSolverInfo:: BdaSolverInfo(const std::string& accelerator_mode, - const std::string& fpga_bitstream, const int linear_solver_verbosity, const int maxit, const double tolerance, @@ -188,7 +187,7 @@ BdaSolverInfo(const std::string& accelerator_mode, const int deviceID, const bool opencl_ilu_parallel, const std::string& linsolver) - : bridge_(std::make_unique(accelerator_mode, fpga_bitstream, + : bridge_(std::make_unique(accelerator_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver)) @@ -229,12 +228,11 @@ apply(Vector& rhs, Dune::InverseOperatorResult& result) { bool use_gpu = bridge_->getUseGpu(); - bool use_fpga = bridge_->getUseFpga(); - if (use_gpu || use_fpga) { + if (use_gpu) { auto wellContribs = WellContributions::create(accelerator_mode_, useWellConn); bridge_->initWellContributions(*wellContribs, x.N() * x[0].N()); - // the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA, amgcl or rocalution + // the WellContributions can only be applied separately with CUDA or OpenCL, not with amgcl or rocalution #if HAVE_CUDA || HAVE_OPENCL if (!useWellConn) { getContribs(*wellContribs); @@ -350,7 +348,7 @@ using CommunicationType = Dune::CollectiveCommunication; template void makeOverlapRowsInvalid>(BM&, const std::vector&); \ template struct FlexibleSolverInfo,BV,CommunicationType>; -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION #define INSTANCE_GRID(Dim, Grid) \ template void BdaSolverInfo,BV>:: \ diff --git a/opm/simulators/linalg/ISTLSolverEbos.hpp b/opm/simulators/linalg/ISTLSolverEbos.hpp index a49c633af..958729fc7 100644 --- a/opm/simulators/linalg/ISTLSolverEbos.hpp +++ b/opm/simulators/linalg/ISTLSolverEbos.hpp @@ -84,7 +84,7 @@ public: namespace Opm { -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION template class BdaBridge; class WellContributions; #endif @@ -113,7 +113,7 @@ struct FlexibleSolverInfo size_t interiorCellNum_ = 0; }; -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION template struct BdaSolverInfo { @@ -121,7 +121,6 @@ struct BdaSolverInfo using Bridge = BdaBridge; BdaSolverInfo(const std::string& accelerator_mode, - const std::string& fpga_bitstream, const int linear_solver_verbosity, const int maxit, const double tolerance, @@ -246,12 +245,12 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, EWOMS_PARAM_IS_SET(TypeTag, int, LinearSolverMaxIter), EWOMS_PARAM_IS_SET(TypeTag, int, CprMaxEllIter)); -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION { std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode); if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) { if (on_io_rank) { - OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled"); + OpmLog::warning("Cannot use GPU with MPI, GPU are disabled"); } accelerator_mode = "none"; } @@ -261,10 +260,8 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction); const bool opencl_ilu_parallel = EWOMS_GET_PARAM(TypeTag, bool, OpenclIluParallel); const int linear_solver_verbosity = parameters_.linear_solver_verbosity_; - std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream); std::string linsolver = EWOMS_GET_PARAM(TypeTag, std::string, LinearSolver); bdaBridge = std::make_unique>(accelerator_mode, - fpga_bitstream, linear_solver_verbosity, maxit, tolerance, @@ -275,7 +272,7 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, } #else if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") { - OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake and FPGA was not enabled"); + OPM_THROW(std::logic_error,"Cannot use accelerated solver since CUDA, OpenCL and amgcl were not found by cmake"); } #endif extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_); @@ -286,12 +283,6 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, ElementMapper elemMapper(simulator_.vanguard().gridView(), Dune::mcmgElementLayout()); detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_); useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions); -#if HAVE_FPGA - // check usage of MatrixAddWellContributions: for FPGA they must be included - if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) { - OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true"); - } -#endif const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst); if (!ownersFirst) { const std::string msg = "The linear solver no longer supports --owner-cells-first=false."; @@ -392,9 +383,7 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, // Solve system. Dune::InverseOperatorResult result; - // Use GPU if: available, chosen by user, and successful. - // Use FPGA if: support compiled, chosen by user, and successful. -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION std::function getContribs = [this](WellContributions& w) { @@ -566,7 +555,7 @@ std::unique_ptr blockJacobiAdjacency(const Grid& grid, Matrix* matrix_; Vector *rhs_; -#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA || HAVE_AMGCL || HAVE_ROCALUTION +#if HAVE_CUDA || HAVE_OPENCL || HAVE_AMGCL || HAVE_ROCALUTION std::unique_ptr> bdaBridge; #endif diff --git a/opm/simulators/linalg/bda/BdaBridge.cpp b/opm/simulators/linalg/bda/BdaBridge.cpp index 24e123b90..0d1a7e18c 100644 --- a/opm/simulators/linalg/bda/BdaBridge.cpp +++ b/opm/simulators/linalg/bda/BdaBridge.cpp @@ -37,10 +37,6 @@ #include #endif -#if HAVE_FPGA -#include -#endif - #if HAVE_AMGCL #include #endif @@ -60,7 +56,6 @@ namespace Opm template BdaBridge::BdaBridge(std::string accelerator_mode_, - [[maybe_unused]] std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, [[maybe_unused]] unsigned int platformID, @@ -82,13 +77,6 @@ BdaBridge::BdaBridge(std::string acceler backend.reset(new Opm::Accelerator::openclSolverBackend(linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_parallel, linsolver)); #else OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake"); -#endif - } else if (accelerator_mode.compare("fpga") == 0) { -#if HAVE_FPGA - use_fpga = true; - backend.reset(new Opm::Accelerator::FpgaSolverBackend(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, opencl_ilu_parallel)); -#else - OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake"); #endif } else if (accelerator_mode.compare("amgcl") == 0) { #if HAVE_AMGCL @@ -106,9 +94,8 @@ BdaBridge::BdaBridge(std::string acceler #endif } else if (accelerator_mode.compare("none") == 0) { use_gpu = false; - use_fpga = false; } else { - OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga|amgcl|rocalution]"); + OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]"); } } @@ -208,7 +195,7 @@ void BdaBridge::solve_system(BridgeMatri WellContributions& wellContribs, InverseOperatorResult& res) { - if (use_gpu || use_fpga) { + if (use_gpu) { BdaResult result; result.converged = false; const int dim = (*bridgeMat)[0][0].N(); @@ -217,7 +204,7 @@ void BdaBridge::solve_system(BridgeMatri if (dim != 3) { OpmLog::warning("BdaSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program"); - use_gpu = use_fpga = false; + use_gpu = false; return; } @@ -289,7 +276,7 @@ void BdaBridge::solve_system(BridgeMatri template void BdaBridge::get_result([[maybe_unused]] BridgeVector& x) { - if (use_gpu || use_fpga) { + if (use_gpu) { backend->get_result(static_cast(&(x[0][0]))); } } diff --git a/opm/simulators/linalg/bda/BdaBridge.hpp b/opm/simulators/linalg/bda/BdaBridge.hpp index 33609d3a4..8001b9552 100644 --- a/opm/simulators/linalg/bda/BdaBridge.hpp +++ b/opm/simulators/linalg/bda/BdaBridge.hpp @@ -38,7 +38,6 @@ class BdaBridge private: int verbosity = 0; bool use_gpu = false; - bool use_fpga = false; std::string accelerator_mode; std::unique_ptr > backend; std::shared_ptr matrix; // 'stores' matrix, actually points to h_rows, h_cols and the received BridgeMatrix for the nonzeroes @@ -50,8 +49,7 @@ private: public: /// Construct a BdaBridge - /// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]' - /// \param[in] fpga_bitstream FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[]' + /// \param[in] accelerator_mode to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|amgcl|rocalution]' /// \param[in] linear_solver_verbosity verbosity of BdaSolver /// \param[in] maxit maximum number of iterations for BdaSolver /// \param[in] tolerance required relative tolerance for BdaSolver @@ -59,7 +57,7 @@ public: /// \param[in] deviceID the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors /// \param[in] opencl_ilu_parallel whether to parallelize the ILU decomposition and application in OpenCL with level_scheduling /// \param[in] linsolver indicating the preconditioner, equal to the --linear-solver cmdline argument - BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, + BdaBridge(std::string accelerator_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, bool opencl_ilu_parallel, std::string linsolver); @@ -95,12 +93,6 @@ public: /// \param[in] N number of rows in scalar vector that wellContribs will be applied on void initWellContributions(WellContributions& wellContribs, unsigned N); - /// Return whether the BdaBridge will use the FPGA or not - /// return whether the BdaBridge will use the FPGA or not - bool getUseFpga(){ - return use_fpga; - } - /// Return the selected accelerator mode, this is input via the command-line std::string getAccleratorName(){ return accelerator_mode; diff --git a/opm/simulators/linalg/bda/BdaSolver.hpp b/opm/simulators/linalg/bda/BdaSolver.hpp index dd5b97094..586a7b9f6 100644 --- a/opm/simulators/linalg/bda/BdaSolver.hpp +++ b/opm/simulators/linalg/bda/BdaSolver.hpp @@ -57,8 +57,6 @@ namespace Accelerator { int maxit = 200; double tolerance = 1e-2; - std::string bitstream; - int N; // number of rows int Nb; // number of blocked rows (Nb*block_size == N) int nnz; // number of nonzeroes (scalars) @@ -70,8 +68,7 @@ namespace Accelerator { bool initialized = false; public: - /// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver - /// \param[in] fpga_bitstream FPGA bitstream file name (only for fpgaSolver) + /// Construct a BdaSolver /// \param[in] linear_solver_verbosity verbosity of solver /// \param[in] maxit maximum number of iterations for solver /// \param[in] tolerance required relative tolerance for solver @@ -80,7 +77,6 @@ namespace Accelerator { BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {}; BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {}; - BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {}; /// Define virtual destructor, so that the derivedclass destructor will be called virtual ~BdaSolver() {}; diff --git a/opm/simulators/linalg/bda/BlockedMatrix.cpp b/opm/simulators/linalg/bda/BlockedMatrix.cpp index c5b2074e7..20f055cb1 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.cpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp @@ -27,7 +27,6 @@ #include #include -#include #include namespace Opm @@ -98,368 +97,5 @@ void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_si } } -#if HAVE_FPGA - -/*Subtract two blocks from one another element by element*/ -void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size) { - for (unsigned int row = 0; row < block_size; row++) { - for (unsigned int col = 0; col < block_size; col++) { - resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col]; - } - } -} - -/*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/ -void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size) { - for (unsigned int row = 0; row < block_size; row++) { - if (resetRes) { - resVect[row] = 0.0; - } - for (unsigned int col = 0; col < block_size; col++) { - resVect[row] += scale * mat[row * block_size + col] * vect[col]; - } - } -} - - - -int BlockedMatrix::countUnblockedNnzs() { - int numNnzsOverThreshold = 0; - int totalNnzs = rowPointers[Nb]; - for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) { - if (fabs(nnzValues[idx]) > nnzThreshold) { - numNnzsOverThreshold++; - } - } - return numNnzsOverThreshold; -} - -/* - * Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks. - * If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order. -*/ -void BlockedMatrix::unblock(Matrix *mat, bool isUMatrix) { - const unsigned int bs = block_size; - int valIndex = 0, nnzsPerRow; - - mat->rowPointers[0] = 0; - // go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and - // write all non-zero values and corresponding column indices that belong to the same row into the new matrix. - for (int row = 0; row < Nb; row++) { - for (unsigned int bRow = 0; bRow < bs; bRow++) { - nnzsPerRow = 0; - for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) { - for (unsigned int bCol = 0; bCol < bs; bCol++) { - int idx = 0; - // If the matrix is the U matrix, store the rows inside a block in reverse order. - if (isUMatrix) { - idx = col * bs * bs + (bs - bRow - 1) * bs + bCol; - } else { - idx = col * bs * bs + bRow * bs + bCol; - } - - if (fabs(nnzValues[idx]) > nnzThreshold) { - mat->nnzValues[valIndex] = nnzValues[idx]; - mat->colIndices[valIndex] = colIndices[col] * bs + bCol; - valIndex++; - nnzsPerRow++; - } - } - } - // Update the rowpointers of the new matrix - mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow; - } - } -} - - - -/*Optimized version*/ -// ub* prefixes indicate unblocked data -int BlockedMatrix::toRDF(int numColors, int *nodesPerColor, bool isUMatrix, - std::vector >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes, - std::vector >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize) -{ - int res; - int numUnblockedNnzs = countUnblockedNnzs(); - - // initialize the non-blocked matrix with the obtained size. - std::unique_ptr ubMat = std::make_unique(Nb * block_size, numUnblockedNnzs); - - unblock(ubMat.get(), isUMatrix); - - std::vector ubNodesPerColor(numColors); - for (int i = 0; i < numColors; i++) { - ubNodesPerColor[i] = nodesPerColor[i] * block_size; - } - - *valSize = ubMat->nnzs; - - res = ubMat->toRDF(numColors, ubNodesPerColor, - colIndicesInColor, nnzsPerRowLimit, - ubNnzValues, ubColIndices, nnzValsSizes, - NROffsets, colorSizes); - return res; -} - - -// coloring is already done, numColors and nodesPerColor are set -// [rows|columns]PerColorLimit are already queried from the FPGA -// colIndicesInColor, PIndicesAddr and colorSizes are written here -// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU -// node == row -// color == partition -// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs -// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF(). -// due to partitioning, lots of columns are removed, this matrix keeps track of the mapping -// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked -// if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0 -// then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA -int BlockedMatrix::findPartitionColumns(int numColors, int *nodesPerColor, - int rowsPerColorLimit, int columnsPerColorLimit, - std::vector >& colIndicesInColor, int *PIndicesAddr, int *colorSizes, - std::vector >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes, - std::vector >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes) -{ - // Data related to column indices per partition - int doneRows = 0; - std::vector isColAccessed(Nb); // std::vector might have some different optimized implementation, initialize in a loop - std::vector isLColAccessed(Nb); - int totalCols = 0; // sum of numColAccesses for each color, blocked - int LTotalCols = 0, UTotalCols = 0; - int maxCols = 0; // max value of numColAccesses for any color - int maxRowsPerColor = 0; // max value of numRows for any color - int maxColsPerRow = 0; // max value of colsPerRow for any color - // colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates - // colsInColor[c][i] contains the ith column that color c accesses - // initial size allows for each color to access all columns, with space for padding - std::vector > colsInColor(numColors, std::vector(roundUpTo(Nb, 16))); - std::vector > LColsInColor(numColors, std::vector(roundUpTo(Nb, 16))); - std::vector > UColsInColor(numColors, std::vector(roundUpTo(Nb, 16))); - - // find which columns are accessed in each color, as well as how many non-zeroes there are per color. - for (int c = 0; c < numColors; c++) { - int numRows = 0; - // initialize - for (int row = 0; row < Nb; row++) { - isColAccessed[row] = false; - isLColAccessed[row] = false; - } - if (c > 0) { - for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) { - isLColAccessed[i] = true; - } - } - int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0; // number of unique accesses, blocked - // for every row in this color - for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) { - int colsPerRow = 0; // number of blocks for this row - bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]); - for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) { - // for every column in the current row, check if that column was accessed before this color - int col = colIndices[idx]; - if (isColAccessed[col] == false) { - colsInColor[c][numColAccesses] = col; - isColAccessed[col] = true; - numColAccesses++; - if (col > row) { - UColsInColor[numColors - c - 1][UNumColAccesses] = col; - UNumColAccesses++; - } - } - if (isLColAccessed[col] == false) { - if (col < row) { - LColsInColor[c][LNumColAccesses] = col; - LNumColAccesses++; - isLColAccessed[col] = true; - } - } - colsPerRow++; - } - if (rowIsEmpty != true) { - numRows++; - } - maxColsPerRow = std::max(maxColsPerRow, colsPerRow); - } - - // add columns from previous color into L partition to simplify data forwarding - if (c > 0) { - for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) { - LColsInColor[c][LNumColAccesses] = i; - LNumColAccesses++; - } - } - - colorSizes[c * 4 + 10] = numColAccesses * block_size; - LColorSizes[c * 4 + 10] = LNumColAccesses * block_size; - UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size; - - // store mapping - for (int col = 0; col < numColAccesses; col++) { - for (unsigned int i = 0; i < block_size; i++) { - colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i; - } - } - for (int col = 0; col < LNumColAccesses; col++) { - for (unsigned int i = 0; i < block_size; i++) { - LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i; - } - } - for (int col = 0; col < UNumColAccesses; col++) { - for (unsigned int i = 0; i < block_size; i++) { - UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i; - } - } - - // zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline - while (numColAccesses % 16 != 0) { - colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1]; - numColAccesses++; - } - while (LNumColAccesses % 16 != 0) { - LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1]; - LNumColAccesses++; - } - while (UNumColAccesses % 16 != 0) { - UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1]; - UNumColAccesses++; - } - maxCols = std::max(numColAccesses, maxCols); - totalCols += numColAccesses; - LTotalCols += LNumColAccesses; - UTotalCols += UNumColAccesses; - doneRows = doneRows + nodesPerColor[c]; - maxRowsPerColor = std::max(numRows, maxRowsPerColor); - } - - if (maxCols * static_cast(block_size) > columnsPerColorLimit) { - std::ostringstream errorstring; - errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit; - OPM_THROW(std::logic_error, errorstring.str()); - } - - doneRows = 0; - int diagValsSize = 0; - int maxRows = 0; - - for (int c = 0; c < numColors; c++) { - // calculate sizes that include zeropadding - diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8); - doneRows += nodesPerColor[c]; - if (nodesPerColor[c] * static_cast(block_size) > maxRows) - maxRows = nodesPerColor[c]; - colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size; - LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size; - UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size; - } - - if (maxRows * static_cast(block_size) > rowsPerColorLimit) { - std::ostringstream errorstring; - errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit; - OPM_THROW(std::logic_error, errorstring.str()); - } - - // create and fill sizes array as far as already possible - colorSizes[0] = Nb * block_size; - LColorSizes[0] = Nb * block_size; - UColorSizes[0] = Nb * block_size; - // col_sizes (but the matrix is square) - colorSizes[1] = Nb * block_size; - LColorSizes[1] = Nb * block_size; - UColorSizes[1] = Nb * block_size; - colorSizes[2] = totalCols * block_size; - LColorSizes[2] = LTotalCols * block_size; - UColorSizes[2] = UTotalCols * block_size; - // missing val_size, written in Matrix::toRDF() - colorSizes[4] = numColors; - LColorSizes[4] = numColors; - UColorSizes[4] = numColors; - // missing NRFlagsSize, written in Matrix::toRDF() - colorSizes[6] = diagValsSize; - LColorSizes[6] = diagValsSize; - UColorSizes[6] = diagValsSize; - - int paddingIdx = numColors; - while (paddingIdx % 4 != 0) { - for (unsigned int i = 0; i < 4; i++) { - colorSizes[paddingIdx * 4 + 8 + i] = 0; - LColorSizes[paddingIdx * 4 + 8 + i] = 0; - UColorSizes[paddingIdx * 4 + 8 + i] = 0; - } - paddingIdx++; - } - - int index = 0, Lindex = 0, Uindex = 0; - for (int c = 0; c < numColors; c++) { - // for each unique col access - for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { - for (unsigned int i = 0; i < block_size; i++) { - PIndicesAddr[index] = colsInColor[c][col] * block_size + i; - index++; - } - } - // add padding - while (index % 16 != 0) { - PIndicesAddr[index] = PIndicesAddr[index - 1]; - index++; - } - for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { - for (unsigned int i = 0; i < block_size; i++) { - LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i; - Lindex++; - } - } - while (Lindex % 16 != 0) { - LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1]; - Lindex++; - } - for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast(block_size) ; col++) { - for (unsigned int i = 0; i < block_size; i++) { - UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i; - Uindex++; - } - } - while (Uindex % 16 != 0) { - UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1]; - Uindex++; - } - } - return 0; -} - - -void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector& rowsPerColor, double *RDFDiag) { - const unsigned int block_size = 3; - int doneRows = rowSize - 1; // since the rows of U are reversed, the rows of the diag are also reversed - int RDFIndex = 0; - for (int c = 0; c < numColors; c++) { - for (int r = 0; r < rowsPerColor[c]; r++) { - - // the rows in the block are reversed - for (int i = static_cast(block_size) - 1; i >= 0; i--) { - for (unsigned int j = 0; j < block_size; j++) { - RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j]; - RDFIndex++; - } - // add 4th column, zeropadding - RDFDiag[RDFIndex] = 0.0; - RDFIndex++; - } - } - doneRows -= rowsPerColor[c]; - - // make sure the color completely fills a cacheline - // a color with 3 blocks would otherwise leave space - while (RDFIndex % 8 != 0) { - RDFDiag[RDFIndex] = 0.0; - RDFIndex++; - } - } - assert(RDFIndex % 8 == 0); -} - -#endif // HAVE_FPGA - - } // namespace Accelerator } // namespace Opm diff --git a/opm/simulators/linalg/bda/BlockedMatrix.hpp b/opm/simulators/linalg/bda/BlockedMatrix.hpp index ba120e523..d146938b6 100644 --- a/opm/simulators/linalg/bda/BlockedMatrix.hpp +++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp @@ -20,12 +20,6 @@ #ifndef OPM_BLOCKED_MATRIX_HPP #define OPM_BLOCKED_MATRIX_HPP -#if HAVE_FPGA -#include -#include -#endif - - namespace Opm { namespace Accelerator @@ -94,27 +88,6 @@ public: } } -#if HAVE_FPGA - constexpr static double nnzThreshold = 1e-80; // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero - - int countUnblockedNnzs(); - - void unblock(Matrix *mat, bool isUMatrix); - - /// Converts this matrix to the dataformat used by the FPGA - /// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking - int toRDF(int numColors, int *nodesPerColor, bool isUMatrix, - std::vector >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes, - std::vector >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize); - - /// Analyses the sparsity pattern and prepares for toRDF() - /// Is only called once - int findPartitionColumns(int numColors, int *nodesPerColor, - int rowsPerColorLimit, int columnsPerColorLimit, - std::vector >& colIndicesInColor, int *PIndicesAddr, int *colorSizes, - std::vector >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes, - std::vector >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes); -#endif double *nnzValues; int *colIndices; @@ -151,40 +124,6 @@ void blockMultSub(double *a, double *b, double *c, unsigned int block_size); /// \param[in] block_size size of block void blockMult(double *mat1, double *mat2, double *resMat, unsigned int block_size); - -#if HAVE_FPGA -/// Perform a matrix-matrix subtraction on two blocks, element-wise -/// resMat = mat1 - mat2 -/// \param[in] mat1 input block 1 -/// \param[in] mat2 input block 2 -/// \param[out] resMat output block -/// \param[in] block_size size of block -void blockSub(double *mat1, double *mat2, double *resMat, unsigned int block_size); - -/// Perform a matrix-vector multiplication -/// resVect = mat * vect -/// resVect += mat * vect -/// \param[in] mat input matrix -/// \param[in] vect input vector -/// \param[in] scale multiply output with this factor -/// \param[inout] resVect output vector -/// \param[in] resetRes if true, overwrite resVect, otherwise add to it -/// \param[in] block_size size of block -void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes, unsigned int block_size); - -/// Convert a blocked inverse diagonal to the FPGA format. -/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U. -/// Since the rows of U are reversed, the rows of the diag are also reversed. -/// The cachelines can hold 8 doubles, a block has 9 doubles. -/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows. -/// Then 2 blocks (24 doubles) fit on 3 cachelines. -/// Example: -/// [1 2 3] [1 2 3 0] [1 2 3 0 4 5 6 0] -/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1] -/// [7 8 9] [7 8 9 0] [block2 row2 block2 row3] -void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector& rowsPerColor, double *RDFDiag); -#endif - } // namespace Accelerator } // namespace Opm diff --git a/opm/simulators/linalg/bda/FPGABILU0.cpp b/opm/simulators/linalg/bda/FPGABILU0.cpp deleted file mode 100644 index 013fa406d..000000000 --- a/opm/simulators/linalg/bda/FPGABILU0.cpp +++ /dev/null @@ -1,418 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace Opm -{ -namespace Accelerator -{ - -using Opm::OpmLog; -using Dune::Timer; - -template -FPGABILU0::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) : - verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_) -{ - if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { - level_scheduling = true; - } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) { - graph_coloring = true; - } else { - OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n"); - } -} - - -template -FPGABILU0::~FPGABILU0() -{ - delete[] invDiagVals; -} - - -template -bool FPGABILU0::init(BlockedMatrix *mat) -{ - const unsigned int bs = block_size; - - resultPointers.resize(numResultPointers, nullptr); - resultSizes.resize(numResultSizes); - - // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added. - const unsigned int nnzSplit = 1; - - this->N = mat->Nb * block_size; - this->Nb = mat->Nb; - this->nnz = mat->nnzbs * block_size * block_size; - this->nnzbs = mat->nnzbs; - - toOrder.resize(Nb); - fromOrder.resize(Nb); - - std::vector CSCRowIndices(nnzbs); - std::vector CSCColPointers(Nb + 1); - - if (level_scheduling) { - Timer t_convert; - csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb); - if (verbosity >= 3) { - std::ostringstream out; - out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s"; - OpmLog::info(out.str()); - } - } - - Timer t_analysis; - rMat = std::make_shared(mat->Nb, mat->nnzbs, block_size); - LUMat = std::make_unique(*rMat); - std::ostringstream out; - if (level_scheduling) { - out << "FPGABILU0 reordering strategy: " << "level_scheduling\n"; - findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); - } else if (graph_coloring) { - out << "FPGABILU0 reordering strategy: " << "graph_coloring\n"; - findGraphColoring(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor); - } - - if (numColors > maxNumColors) { - std::ostringstream errorstring; - errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n"; - OPM_THROW(std::logic_error, errorstring.str()); - } - - if (verbosity >= 3) { - out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors"; - } - OpmLog::info(out.str()); - - int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0; - int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0; - int blockDiagSize = 0; - // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes - reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get()); - int doneRows = 0; - for (int c = 0; c < numColors; c++) { - for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) { - for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) { - int columnIndex = rMat->colIndices[j]; - if (columnIndex < i) { - LColorRoundedValSize += 9; - LNROffsetSize += 9; - } - if (columnIndex > i) { - UColorRoundedValSize += 9; - UNROffsetSize += 9; - } - colorRoundedValSize += 9; - NROffsetSize += 9; - } - blockDiagSize += 12; - } - // End of color: round all sizes to nearest cacheline - colorRoundedValSize = roundUpTo(colorRoundedValSize, 32); - LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32); - UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32); - NROffsetSize = roundUpTo(NROffsetSize, 64); - LNROffsetSize = roundUpTo(LNROffsetSize, 64); - UNROffsetSize = roundUpTo(UNROffsetSize, 64); - blockDiagSize = roundUpTo(blockDiagSize, 8); - - doneRows += rowsPerColor[c]; - } - int colorSizesNum = 8 + roundUpTo(4 * numColors, 16); - int worstCaseColumnAccessNum = numColors * maxColsPerColor; - - nnzValues.resize(nnzSplit, std::vector(colorRoundedValSize)); - LnnzValues.resize(nnzSplit, std::vector(LColorRoundedValSize)); - UnnzValues.resize(nnzSplit, std::vector(UColorRoundedValSize)); - // initial number of nnz, used to allocate - nnzValsSizes.resize(nnzSplit, colorRoundedValSize); - LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize); - UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize); - colIndices.resize(colorRoundedValSize); - LColIndices.resize(LColorRoundedValSize); - UColIndices.resize(UColorRoundedValSize); - NROffsets.resize(NROffsetSize); - LNROffsets.resize(LNROffsetSize); - UNROffsets.resize(UNROffsetSize); - PIndicesAddr.resize(worstCaseColumnAccessNum); - LPIndicesAddr.resize(worstCaseColumnAccessNum); - UPIndicesAddr.resize(worstCaseColumnAccessNum); - colorSizes.resize(colorSizesNum); - LColorSizes.resize(colorSizesNum); - UColorSizes.resize(colorSizesNum); - blockDiag.resize(blockDiagSize); - colIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); - LColIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); - UColIndicesInColor.resize(numColors, std::vector(rMat->Nb * block_size, 0)); - - int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(), - maxRowsPerColor, maxColsPerColor, - colIndicesInColor, PIndicesAddr.data(), colorSizes.data(), - LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(), - UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data()); - if (err != 0) { - std::ostringstream errorstring; - errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n"; - OPM_THROW(std::logic_error, errorstring.str()); - } - - diagIndex.resize(mat->Nb, 0); - invDiagVals = new double[mat->Nb * bs * bs]; - LMat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size); - UMat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size); - resultPointers[0] = (void *) colorSizes.data(); - resultPointers[1] = (void *) PIndicesAddr.data(); - resultPointers[2] = (void *) nnzValues.data(); - resultPointers[3] = (void *) colIndices.data(); - resultPointers[4] = (void *) NROffsets.data(); - resultPointers[5] = (void *) nnzValsSizes.data(); - resultPointers[6] = (void *) LColorSizes.data(); - resultPointers[7] = (void *) LPIndicesAddr.data(); - resultPointers[8] = (void *) LnnzValues.data(); - resultPointers[9] = (void *) LColIndices.data(); - resultPointers[10] = (void *) LNROffsets.data(); - resultPointers[11] = (void *) LnnzValsSizes.data(); - resultPointers[12] = (void *) UColorSizes.data(); - resultPointers[13] = (void *) UPIndicesAddr.data(); - resultPointers[14] = (void *) UnnzValues.data(); - resultPointers[15] = (void *) UColIndices.data(); - resultPointers[16] = (void *) UNROffsets.data(); - resultPointers[17] = (void *) UnnzValsSizes.data(); - resultPointers[18] = (void *) blockDiag.data(); - //resultPointers[19] and [20] are set by the caller - resultSizes[0] = mat->Nb * block_size; - resultSizes[1] = colorRoundedValSize; // zeropadded valSize; - resultSizes[2] = numColors; - resultSizes[3] = worstCaseColumnAccessNum; //totalCols - resultSizes[4] = NROffsetSize; //NRFlagSize - resultSizes[5] = blockDiagSize; //diagValsSize - resultSizes[6] = mat->Nb * block_size; - resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize; - resultSizes[8] = numColors; - resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols - resultSizes[10] = LNROffsetSize; //LNRFlagSize - resultSizes[11] = blockDiagSize; //LDiagValsSize - resultSizes[12] = mat->Nb * block_size; - resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize; - resultSizes[14] = numColors; - resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols - resultSizes[16] = UNROffsetSize; //UNRFlagSize - resultSizes[17] = blockDiagSize; //UDiagValsSize - return true; -} // end init() - - -template -bool FPGABILU0::create_preconditioner(BlockedMatrix *mat) -{ - const unsigned int bs = block_size; - Timer t_reorder; - reorderBlockedMatrixByPattern(mat, toOrder.data(), fromOrder.data(), rMat.get()); - - if (verbosity >= 3) { - std::ostringstream out; - out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s"; - OpmLog::info(out.str()); - } - - // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp - Timer t_memcpy; - memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs); - - if (verbosity >= 3) { - std::ostringstream out; - out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s"; - OpmLog::info(out.str()); - } - - int i, j, ij, ik, jk; - int iRowStart, iRowEnd, jRowEnd; - double pivot[bs * bs]; - int LSize = 0; - Opm::Detail::Inverter inverter; // reuse inverter to invert blocks - - Timer t_decomposition; - - // go through all rows - for (i = 0; i < LUMat->Nb; i++) { - iRowStart = LUMat->rowPointers[i]; - iRowEnd = LUMat->rowPointers[i + 1]; - - // go through all elements of the row - for (ij = iRowStart; ij < iRowEnd; ij++) { - j = LUMat->colIndices[ij]; - // if the element is the diagonal, store the index and go to next row - if (j == i) { - diagIndex[i] = ij; - break; - } - // if an element beyond the diagonal is reach, no diagonal was found - // throw an error now. TODO: perform reordering earlier to prevent this - if (j > i) { - std::ostringstream out; - out << "BILU0 Error could not find diagonal value in row: " << i; - OpmLog::error(out.str()); - return false; - } - - LSize++; - // calculate the pivot of this row - blockMult(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0], bs); - - memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs); - - jRowEnd = LUMat->rowPointers[j + 1]; - jk = diagIndex[j] + 1; - ik = ij + 1; - // subtract that row scaled by the pivot from this row. - while (ik < iRowEnd && jk < jRowEnd) { - if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) { - blockMultSub(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs, bs); - ik++; - jk++; - } else { - if (LUMat->colIndices[ik] < LUMat->colIndices[jk]) - { ik++; } - else - { jk++; } - } - } - } - // store the inverse in the diagonal! - inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs); - memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs); - } - - LMat->rowPointers[0] = 0; - UMat->rowPointers[0] = 0; - - // Split the LU matrix into two by comparing column indices to diagonal indices - for (i = 0; i < LUMat->Nb; i++) { - LMat->rowPointers[i + 1] = LMat->rowPointers[i]; - for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) { - if (j < diagIndex[i]) { - memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs); - LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j]; - LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1; - } - } - } - // Reverse the order or the (blocked) rows for the U matrix, - // because the rows are accessed in reverse order when applying the ILU0 - int URowIndex = 0; - for (i = LUMat->Nb - 1; i >= 0; i--) { - UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex]; - for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) { - if (j > diagIndex[i]) { - memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs); - UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j]; - UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1; - } - } - URowIndex++; - } - - if (verbosity >= 3) { - std::ostringstream out; - out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s"; - OpmLog::info(out.str()); - } - - std::vector URowsPerColor(numColors); - rowSize = block_size * rMat->Nb; - LRowSize = block_size * LMat->Nb; - URowSize = block_size * UMat->Nb; - LNumColors = numColors; - UNumColors = numColors; - for (int c = 0; c < numColors; c++) { - URowsPerColor[numColors - c - 1] = rowsPerColor[c]; - } - int err; - err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false, - colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(), - nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize); - if (err != 0) { - return false; - } - err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false, - LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(), - LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize); - if (err != 0) { - return false; - } - err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true, - UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(), - UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize); - if (err != 0) { - return false; - } - blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data()); - // resultPointers are set in the init method - resultSizes[0] = rowSize; - resultSizes[1] = colorSizes[3]; // zeropadded valSize; - resultSizes[2] = numColors; - resultSizes[3] = colorSizes[2]; //totalCols - resultSizes[4] = colorSizes[5]; //NRFlagSize - resultSizes[5] = colorSizes[6]; //diagValsSize - resultSizes[6] = LRowSize; - resultSizes[7] = LColorSizes[3]; // zeropadded LValSize; - resultSizes[8] = LNumColors; - resultSizes[9] = LColorSizes[2]; //LTotalCols - resultSizes[10] = LColorSizes[5]; //LNRFlagSize - resultSizes[11] = LColorSizes[6]; //LDiagValsSize - resultSizes[12] = URowSize; - resultSizes[13] = UColorSizes[3]; // zeropadded UValSize; - resultSizes[14] = UNumColors; - resultSizes[15] = UColorSizes[2]; //UTotalCols - resultSizes[16] = UColorSizes[5]; //UNRFlagSize - resultSizes[17] = UColorSizes[6]; //UDiagValsSize - return true; -} // end create_preconditioner() - - -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template FPGABILU0::FPGABILU0(ILUReorder, int, int, int, int, int); \ -template FPGABILU0::~FPGABILU0(); \ -template bool FPGABILU0::init(BlockedMatrix*); \ -template bool FPGABILU0::create_preconditioner(BlockedMatrix *); - -INSTANTIATE_BDA_FUNCTIONS(1); -INSTANTIATE_BDA_FUNCTIONS(2); -INSTANTIATE_BDA_FUNCTIONS(3); -INSTANTIATE_BDA_FUNCTIONS(4); -INSTANTIATE_BDA_FUNCTIONS(5); -INSTANTIATE_BDA_FUNCTIONS(6); - -#undef INSTANTIATE_BDA_FUNCTIONS - -} // namespace Accelerator -} // namespace Opm diff --git a/opm/simulators/linalg/bda/FPGABILU0.hpp b/opm/simulators/linalg/bda/FPGABILU0.hpp deleted file mode 100644 index 423b7077e..000000000 --- a/opm/simulators/linalg/bda/FPGABILU0.hpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#ifndef FPGA_BILU0_HEADER_INCLUDED -#define FPGA_BILU0_HEADER_INCLUDED - -#include - -#include -#include - -namespace Opm -{ -namespace Accelerator -{ - -/* - * This class implements a Blocked ILU0 preconditioner, with output data - * specifically formatted for the FPGA. - * The decomposition and reorders of the rows of the matrix are done on CPU. - */ - -template -class FPGABILU0 -{ - -private: - int N; // number of rows of the matrix - int Nb; // number of blockrows of the matrix - int nnz; // number of nonzeroes of the matrix (scalar) - int nnzbs; // number of blocks of the matrix - std::unique_ptr LMat = nullptr, UMat = nullptr, LUMat = nullptr; - std::shared_ptr rMat = nullptr; // reordered mat - double *invDiagVals = nullptr; - std::vector diagIndex; - std::vector toOrder, fromOrder; - std::vector rowsPerColor; - int numColors; - int verbosity; - - // sizes and arrays used during RDF generation - std::vector > nnzValues, LnnzValues, UnnzValues; - std::vector colIndices, LColIndices, UColIndices; - std::vector NROffsets, LNROffsets, UNROffsets; - std::vector PIndicesAddr, LPIndicesAddr, UPIndicesAddr; - std::vector colorSizes, LColorSizes, UColorSizes; - std::vector nnzValsSizes, LnnzValsSizes, UnnzValsSizes; - std::vector > colIndicesInColor, LColIndicesInColor, UColIndicesInColor; - - int rowSize, valSize; - int LRowSize, LValSize, LNumColors; - int URowSize, UValSize, UNumColors; - std::vector blockDiag; - ILUReorder opencl_ilu_reorder; - bool level_scheduling = false, graph_coloring = false; - int numResultPointers = 21; - std::vector resultPointers; - int numResultSizes = 18; - std::vector resultSizes; - int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor - -public: - - FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors); - - ~FPGABILU0(); - - // analysis (optional) - bool init(BlockedMatrix *mat); - - // ilu_decomposition - bool create_preconditioner(BlockedMatrix *mat); - - int* getToOrder() - { - return toOrder.data(); - } - - int* getFromOrder() - { - return fromOrder.data(); - } - - BlockedMatrix* getRMat() - { - return rMat.get(); - } - - void **getResultPointers() - { - return resultPointers.data(); - } - - int *getResultSizes() - { - return resultSizes.data(); - } - -}; - -} // namespace Accelerator -} // namespace Opm - -#endif // FPGA_BILU0_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.cpp b/opm/simulators/linalg/bda/FPGASolverBackend.cpp deleted file mode 100644 index 122331e9f..000000000 --- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp +++ /dev/null @@ -1,741 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#include - -#include -#include - -#include -#include - -#include -#include -#include - -// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA -// kernel will be disabled and execution will continue using DUNE -#define FPGA_EXIT_WITH_HW_FAILURE -//#undef FPGA_EXIT_WITH_HW_FAILURE - -// if defined, the function generate_statistics will create a CSV-formatted file -// with detailed statistics about the FPGA backend performance -//#define FPGA_STATISTICS_FILE_ENABLED -#undef FPGA_STATISTICS_FILE_ENABLED - -namespace Opm -{ -namespace Accelerator -{ - -using Opm::OpmLog; - -template -FpgaSolverBackend::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver(fpga_bitstream, verbosity_, maxit_, tolerance_) -{ - int err; - std::ostringstream oss; - double start = second(); - - // currently, only block size == 3 is supported by the FPGA backend - assert(block_size == 3); - - if (verbosity < 1) { - perf_call_enabled = false; - } - // setup bitstream name and other parameters - if (fpga_bitstream.compare("") == 0) { - OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified"); - } - if (!fileExists(fpga_bitstream.c_str())) { - OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable"); - } - // ----------------------------- - // FPGA: setup the OpenCL platform - // ----------------------------- - std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp - // auto-select the proper FPGA device and create context and other CL objects - err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1); - if (err != 0) { - oss << "Failed to setup the OpenCL device (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - oss << "Detected FPGA platform type is "; - if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; } - OpmLog::info(oss.str()); - oss.str(""); - oss.clear(); - // ----------------------------- - // FPGA: setup the debug buffer - // ----------------------------- - // set kernel debug lines depending on an environment variable - const char *xem = getenv("XCL_EMULATION_MODE"); - if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) { - debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU; - oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n"; - OpmLog::info(oss.str()); - oss.str(""); - oss.clear(); - } else { - // set to 2 to reduce overhead in reading back and interpreting the debug lines; - // increase to get more debug info from the kernel - // range is 2..DEBUG_OUTBUF_WORDS_MAX-1 - debug_outbuf_words = 2; - } - - // host debug buffer setup - err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize); - if (err != 0) { - oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - // device debug buffer setup - err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize); - if (err != 0) { - oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n"; - OPM_THROW(std::logic_error, oss.str()); - } - // copy debug buffer to device - err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words); - if (err != 0) { - oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n"; - OPM_THROW(std::logic_error, oss.str()); - } - // ------------------------------------------------ - // FPGA: query the kernel for limits/configuration - // ------------------------------------------------ - err = fpga_kernel_query(context, commands, kernel, cldebug, - debugBuffer, debug_outbuf_words, - rst_assert_cycles, rst_settle_cycles, - &hw_x_vector_elem, &hw_max_row_size, - &hw_max_column_size, &hw_max_colors_size, - &hw_max_nnzs_per_row, &hw_max_matrix_size, - &hw_use_uram, &hw_write_ilu0_results, - &hw_dma_data_width, &hw_mult_num, - &hw_x_vector_latency, &hw_add_latency, &hw_mult_latency, - &hw_num_read_ports, &hw_num_write_ports, - &hw_reset_cycles, &hw_reset_settle); - if (err != 0) { - oss << "Failed to call fpga_kernel_query (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - - if (verbosity >= 1) { - oss << "FPGA kernel limits/configuration:\n"; - oss << " x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n"; - oss << " max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n"; - oss << " use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n"; - oss << " dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n"; - oss << " x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n"; - oss << " add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n"; - oss << " num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n"; - oss << " reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle; - OpmLog::info(oss.str()); - oss.str(""); - oss.clear(); - } - - // check that LU results are generated by the kernel - if (use_LU_res && !hw_write_ilu0_results) { - OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage"); - oss.str(""); - oss.clear(); - use_LU_res = false; - } - - // setup preconditioner - double start_prec = second(); - prec = std::make_unique(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size); - perf_total.s_preconditioner_setup = second() - start_prec; - - if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) { - level_scheduling = true; - } - - perf_total.s_initialization = second() - start; -} // end fpgaSolverBackend - - -template -FpgaSolverBackend::~FpgaSolverBackend() -{ - if (verbosity >= 1) { - generate_statistics(); - } - delete[] rx; - delete[] rb; - if (nnzValArrays != nullptr) { free(nnzValArrays); } - if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); } - if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); } - // FPGA: buffers - free(debugBuffer); - for (int b = 0; b < RW_BUF; b++) { - free(dataBuffer[b]); - } - free(databufferSize); - // FPGA: OpenCL objects - if (cldebug != nullptr) { clReleaseMemObject(cldebug); } - for (int b = 0; b < RW_BUF; b++) { - if (cldata[b] != nullptr) { - clReleaseMemObject(cldata[b]); - } - } - clReleaseCommandQueue(commands); - clReleaseContext(context); - clReleaseKernel(kernel); - clReleaseProgram(program); - clReleaseDevice(device_id); -} // end ~fpgaSolverBackend() - - -// copy result to host memory -// caller must be sure that x is a valid array -template -void FpgaSolverBackend::get_result(double *x_) -{ - double start = 0; - - if (perf_call_enabled) { - start = second(); - } - // apply to results the reordering (stored in toOrder) - reorderBlockedVectorByPattern(mat->Nb, rx, toOrder, x_); - // TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private) - if (perf_call_enabled) { - perf_call.back().s_postprocess = second() - start; - } -} // end get_result() - - -template -SolverStatus FpgaSolverBackend::solve_system(std::shared_ptr matrix, - double *b, - [[maybe_unused]] std::shared_ptr jacMatrix, - WellContributions&, - BdaResult &res) -{ - if (initialized == false) { - initialize(matrix->Nb, matrix->nnzbzs, matrix->nnzValues, matrix->rowPointers, matrix->colIndices); - if (!analyse_matrix()) { - return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED; - } - } - perf_call.emplace_back(); - update_system(matrix->nnzValues, b); - if (!create_preconditioner()) { - return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED; - } - solve_system(res); - - if (verbosity >= 1) { - std::ostringstream oss; - oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \ - ", iterations: " << res.iterations << ", reduction: " << res.reduction << \ - ", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed; - OpmLog::info(oss.str()); - } - return SolverStatus::BDA_SOLVER_SUCCESS; -} - - -template -void FpgaSolverBackend::initialize(int Nb_, int nnzbs, double *vals, int *rows, int *cols) -{ - double start = second(); - this->Nb = Nb_; - this->N = Nb * block_size; - this->nnzb = nnzbs; - this->nnz = nnzbs * block_size * block_size; - - // allocate host memory for matrices and vectors - // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here - mat.reset(new BlockedMatrix(N_ / block_size, nnz_ / block_size / block_size, block_size, vals, cols, rows)); - - std::ostringstream oss; - oss << "Initializing FPGA data, matrix size: " << this->Nb << " blockrows, nnz: " << this->nnzb << " blocks, " << \ - "block size: " << block_size << ", total nnz: " << this->nnz << "\n"; - oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance; - OpmLog::info(oss.str()); - - rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; - rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))]; - - perf_total.s_initialization += second() - start; - initialized = true; -} // end initialize() - - -template -bool FpgaSolverBackend::analyse_matrix() -{ - std::ostringstream oss; - int err; - - double start = second(); - bool success = prec->init(mat.get()); - - if (!success) { - OpmLog::warning("Preconditioner for FPGA solver failed to initialize"); - return success; - } - - toOrder = prec->getToOrder(); - fromOrder = prec->getFromOrder(); - rMat = prec->getRMat(); - processedPointers = prec->getResultPointers(); - processedSizes = prec->getResultSizes(); - processedPointers[19] = rb; - processedPointers[20] = rx; - nnzValArrays_size = static_cast(processedPointers[5])[0]; - L_nnzValArrays_size = static_cast(processedPointers[11])[0]; - U_nnzValArrays_size = static_cast(processedPointers[17])[0]; - // ------------------------------------- - // FPGA: setup host/device data buffers - // ------------------------------------- - // allocate memory and setup data layout - err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits, - processedSizes, - &setupArray, - &nnzValArrays, &nnzValArrays_size, &columnIndexArray, &newRowOffsetArray, &PIndexArray, &colorSizesArray, - &L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray, - &U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray, - &BLKDArray, &X1Array, &R1Array, - &X2Array, &R2Array, - &LresArray, &UresArray, - &databufferSize, dataBuffer, - result_offsets, 1 /*num_nnz_arrays*/, - true /*reset_data_buffers*/, /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */ - debugbufferSize); - if (err) { - oss << "Failed to call fpga_setup_host_datamem (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - - // results buffers setup - if (use_LU_res) { - resultsBufferNum = 4; - } else { - resultsBufferNum = 2; - } - if (resultsBufferNum > RES_BUF_MAX) { - oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - - resultsNum = processedSizes[0]; // rowSize, invariant between system solves - for (int i = 0; i < resultsBufferNum; i++) { - resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double); - } - - // device data memory setup - err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata); - if (err != 0) { - oss << "Failed to call fpga_setup_device_datamem (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - - // ------------------------------------ - // FPGA: setup the kernel's parameters - // ------------------------------------ - err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit, - debug_sample_rate, tolerance, cldata, cldebug); - if (err != 0) { - oss << "Failed to call fpga_set_kernel_parameters (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - - perf_total.s_analysis = second() - start; - analysis_done = true; - - return success; -} // end analyse_matrix() - - -template -bool FpgaSolverBackend::create_preconditioner() -{ - double start = 0; - - if (perf_call_enabled) { - start = second(); - } - memset(rx, 0, sizeof(double) * N); - bool result = prec->create_preconditioner(mat.get()); - if (!result) { - OpmLog::warning("fpgaSolverBackend: create_preconditioner failed"); - } - - if (perf_call_enabled) { - perf_call.back().s_preconditioner_create = second() - start; - } - return result; -} // end create_preconditioner() - - -template -void FpgaSolverBackend::solve_system(BdaResult &res) -{ - std::ostringstream oss; - int err; - double start = 0, start_total = 0; - - // ------------------------------------ - // FPGA: return immediately if FPGA is disabled - // ------------------------------------ - if (fpga_disabled) { - res.converged = false; - OpmLog::warning("FPGA is disabled, fallback to SW execution"); - return; - } - - fpga_calls++; - - if (perf_call_enabled) { - start = second(); - start_total = start; - } - - // check if any buffer is larger than the size set in preconditioner->init - // TODO: add check for all other buffer sizes that may overflow? - err = 0; - if ( ((int *)processedPointers[5])[0] > nnzValArrays_size || - ((int *)processedPointers[11])[0] > L_nnzValArrays_size || - ((int *)processedPointers[17])[0] > U_nnzValArrays_size ) { - err = 1; - } - if (err != 0) { - OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)"); - } - - // ------------------------------------ - // FPGA: copy input data to host data buffers - // ------------------------------------ - if (perf_call_enabled) { - start = second(); - } - err = fpga_copy_host_datamem( - processedPointers, processedSizes, setupArray, - nnzValArrays, &nnzValArrays_size, columnIndexArray, newRowOffsetArray, PIndexArray, colorSizesArray, - L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray, - U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray, - BLKDArray, X1Array, R1Array, X2Array, R2Array, - use_LU_res, LresArray, UresArray, - databufferSize, dataBuffer, - 1 /* nnzValArrays_num */, - reset_data_buffers, fill_results_buffers, - dump_data_buffers, fpga_calls); - if (perf_call_enabled) { - perf_call.back().s_mem_setup = second() - start; - } - if (err != 0) { - oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - // ------------------------------------ - // FPGA: copy buffers to device - // ------------------------------------ - // copy debug buffer to device - if (perf_call_enabled) { - start = second(); - } - err = fpga_copy_to_device_debugbuf(commands, - cldebug, debugBuffer, debugbufferSize, - debug_outbuf_words); - if (err != 0) { - oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - // copy data buffers to device - err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata); - if (err != 0) { - oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - if (perf_call_enabled) { - perf_call.back().s_mem_h2d = second() - start; - } - // ------------------------------------ - // FPGA: execute the kernel - // ------------------------------------ - double time_elapsed_ms; - if (perf_call_enabled) { - start = second(); - } - err = fpga_kernel_run(commands, kernel, &time_elapsed_ms); - if (perf_call_enabled) { - perf_call.back().s_kernel_exec = second() - start; - } - if (err != 0) { - oss << "Failed to call fpga_kernel_run (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - // ---------------------------------------- - // FPGA: read back debug buffer from device - // ---------------------------------------- - if (perf_call_enabled) { - start = second(); - } - err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10), - commands, - debug_outbuf_words, debugbufferSize, - cldebug, debugBuffer, - abort_cycles, - &kernel_cycles, &kernel_iter_run, - norms, &last_norm_idx, - &kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults, - &kernel_wrafterend, &kernel_dbgfifofull); - if (err != 0) { - oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - if (kernel_wrafterend) { - OpmLog::warning("Detected recoverable FPGA error: kernel write after end"); - } - if (kernel_dbgfifofull) { - OpmLog::warning("Detected recoverable FPGA error: debug FIFO full"); - } - if (kernel_aborted || kernel_signature || kernel_overflow) { -#if defined(FPGA_EXIT_WITH_HW_FAILURE) - oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ - ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")"; - OPM_THROW(std::logic_error, oss.str()); -#else - oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \ - ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n"; - oss << "Disabling FPGA kernel: execution will continue with SW kernel"; - OpmLog::warning(oss.str()); - oss.str(""); - oss.clear(); - fpga_disabled = true; -#endif - } - if (perf_call_enabled) { - perf_call.back().n_kernel_exec_cycles = kernel_cycles; - } - // copy (back) results only if FPGA is not disabled - if (!fpga_disabled) { - if (kernel_noresults) { - OpmLog::warning("FPGA kernel did not return results because the required precision is already reached"); - // rx still contains zeros from initial guess - } else { - // ------------------------------------ - // FPGA: read back results from device - // ------------------------------------ - err = fpga_map_results(even(kernel_iter_run), - use_residuals, use_LU_res, commands, - resultsNum, resultsBufferNum, resultsBufferSize, - debugbufferSize, - cldata, resultsBuffer, - result_offsets, - dump_results, data_dir, basename, sequence); - if (err != 0) { - oss << "Failed to call fpga_map_results (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - // TODO: copy results buffers to reordering output buffers - memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double)); - err = fpga_unmap_results(even(kernel_iter_run), - use_residuals, use_LU_res, - commands, cldata, resultsBuffer); - if (err != 0) { - oss << "Failed to call fpga_unmap_results (" << err << ")"; - OPM_THROW(std::logic_error, oss.str()); - } - } - } - // set results and update statistics (if enabled) - if (perf_call_enabled) { - perf_call.back().s_mem_d2h = second() - start; - } - float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns - res.iterations = (int)iter; - res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm - res.conv_rate = pow(res.reduction, 1.0 / iter); - res.elapsed = second() - start_total; - if (perf_call_enabled) { - perf_call.back().s_solve = res.elapsed; - perf_call.back().n_kernel_exec_iters = iter; - } - // convergence depends on number of iterations reached and hw execution errors - res.converged = true; - if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) { - res.converged = false; - if (verbosity >= 1) { - oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \ - ", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \ - ", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit); - OpmLog::warning(oss.str()); - oss.str(""); - oss.clear(); - } - } - if (perf_call_enabled) { - perf_call.back().converged = res.converged; - perf_call.back().converged_flags = ((unsigned int)fpga_disabled) + - ((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) + - ((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4); - } -} // end solve_system() - - -template -void FpgaSolverBackend::update_system(double *vals, double *b) -{ - double start = 0; - - mat->nnzValues = vals; - // reorder inputs using previously found ordering (stored in fromOrder) - if (perf_call_enabled) { - start = second(); - } - reorderBlockedVectorByPattern(mat->Nb, b, fromOrder, rb); - if (perf_call_enabled) { - perf_call.back().s_reorder = second() - start; - } -} // end update_system() - - -template -void FpgaSolverBackend::generate_statistics() -{ - std::ostringstream oss; - unsigned int conv_iter = 0, conv_ovf = 0; - - if (!perf_call_enabled || fpga_calls == 0) { - OpmLog::warning("FPGA statistics were not collected"); - return; - } - std::printf("--- FPGA statistics ---\n"); - std::printf("total solver calls..........: %u\n", fpga_calls); - std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization); - std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup); - -#if defined(FPGA_STATISTICS_FILE_ENABLED) - // DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file - FILE *fout = fopen("fpga_statistics_details.csv", "w"); - if (fout != nullptr) { - std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n"); - } -#endif - unsigned int num_data_points = perf_call.size(); - for (unsigned int i = 0; i < num_data_points; i++) { - perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create; - if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; } - if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; } - perf_total.s_analysis += perf_call[i].s_analysis; - if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; } - if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; } - perf_total.s_reorder += perf_call[i].s_reorder; - if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; } - if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; } - perf_total.s_mem_setup += perf_call[i].s_mem_setup; - if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; } - if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; } - perf_total.s_mem_h2d += perf_call[i].s_mem_h2d; - if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; } - if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; } - perf_total.s_kernel_exec += perf_call[i].s_kernel_exec; - if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; } - if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; } - perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles; - if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; } - if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; } - perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters; - if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; } - if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; } - perf_total.s_mem_d2h += perf_call[i].s_mem_d2h; - if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; } - if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; } - perf_total.s_solve += perf_call[i].s_solve; - if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; } - if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; } - perf_total.s_postprocess += perf_call[i].s_postprocess; - if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; } - if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; } - perf_total.n_converged += (unsigned int)perf_call[i].converged; - if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; } - if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; } -#if defined(FPGA_STATISTICS_FILE_ENABLED) - if (fout != nullptr) { - std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n", - i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder, - perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles, - perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess, - (unsigned int)perf_call[i].converged); - } -#endif - } -#if defined(FPGA_STATISTICS_FILE_ENABLED) - if (fout != nullptr) { - fclose(fout); - } -#endif - perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points; - perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points; - perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points; - perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points; - perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points; - perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points; - perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points; - perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points; - perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points; - perf_total.s_solve_avg = perf_total.s_solve / num_data_points; - perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points; - std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max); - std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max); - std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max); - std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max); - std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max); - std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max); - std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n", - perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max); - std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n", - perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max); - std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max); - std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max); - std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n", - perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max); - std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n", - perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf); - std::printf("-----------------------\n"); -} //end generate_statistics() - - -#define INSTANTIATE_BDA_FUNCTIONS(n) \ -template FpgaSolverBackend::FpgaSolverBackend(std::string, int, int, double, ILUReorder); \ - -INSTANTIATE_BDA_FUNCTIONS(1); -INSTANTIATE_BDA_FUNCTIONS(2); -INSTANTIATE_BDA_FUNCTIONS(3); -INSTANTIATE_BDA_FUNCTIONS(4); -INSTANTIATE_BDA_FUNCTIONS(5); -INSTANTIATE_BDA_FUNCTIONS(6); - -#undef INSTANTIATE_BDA_FUNCTIONS - -} // namespace Accelerator -} // namespace Opm diff --git a/opm/simulators/linalg/bda/FPGASolverBackend.hpp b/opm/simulators/linalg/bda/FPGASolverBackend.hpp deleted file mode 100644 index 05e7c6d33..000000000 --- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp +++ /dev/null @@ -1,264 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED -#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED - -#include -#include - -#include -#include -#include - -namespace Opm -{ -namespace Accelerator -{ - -/// This class implements an ilu0-bicgstab solver on FPGA -template -class FpgaSolverBackend : public BdaSolver -{ - typedef BdaSolver Base; - typedef FPGABILU0 Preconditioner; - - using Base::N; - using Base::Nb; - using Base::nnz; - using Base::nnzb; - using Base::verbosity; - using Base::maxit; - using Base::tolerance; - using Base::initialized; - -private: - double *rx = nullptr; // reordered x - double *rb = nullptr; // reordered b - int *fromOrder = nullptr, *toOrder = nullptr; - bool analysis_done = false; - bool level_scheduling = false; - - // LUMat will shallow copy rowPointers and colIndices of mat/rMat - std::unique_ptr mat = nullptr; - BlockedMatrix *rMat = nullptr; - std::unique_ptr prec = nullptr; - - // vectors with data processed by the preconditioner (input to the kernel) - void **processedPointers = nullptr; - int *processedSizes = nullptr; - - unsigned int fpga_calls = 0; - bool perf_call_enabled = true; - - // per call performance metrics - typedef struct { - double s_preconditioner_create = 0.0; - double s_analysis = 0.0; - double s_reorder = 0.0; - double s_mem_setup = 0.0; - double s_mem_h2d = 0.0; - double s_kernel_exec = 0.0; - unsigned int n_kernel_exec_cycles = 0; - float n_kernel_exec_iters = 0.0; - double s_mem_d2h = 0.0; - double s_solve = 0.0; - double s_postprocess = 0.0; - bool converged = false; - unsigned int converged_flags = 0; - } perf_call_metrics_t; - // cumulative performance metrics - typedef struct { - double s_initialization; - double s_preconditioner_setup; - double s_preconditioner_create; - double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg; - double s_analysis; - double s_analysis_min,s_analysis_max,s_analysis_avg; - double s_reorder; - double s_reorder_min,s_reorder_max,s_reorder_avg; - double s_mem_setup; - double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg; - double s_mem_h2d; - double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg; - double s_kernel_exec; - double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg; - unsigned long n_kernel_exec_cycles; - unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg; - float n_kernel_exec_iters; - float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg; - double s_mem_d2h; - double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg; - double s_solve; - double s_solve_min,s_solve_max,s_solve_avg; - double s_postprocess; - double s_postprocess_min,s_postprocess_max,s_postprocess_avg; - unsigned int n_converged; - } perf_total_metrics_t; - std::vector perf_call; - perf_total_metrics_t perf_total; - // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky - // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental - unsigned int fpga_config_bits = 0; - bool fpga_disabled = false; - bool platform_awsf1; - unsigned int debugbufferSize; - unsigned long int *debugBuffer = nullptr; - unsigned int *databufferSize = nullptr; - unsigned char *dataBuffer[RW_BUF] = {nullptr}; - unsigned int debug_outbuf_words; - int resultsNum; - int resultsBufferNum; - unsigned int resultsBufferSize[RES_BUF_MAX]; - unsigned int result_offsets[6]; - unsigned int kernel_cycles, kernel_iter_run; - double norms[4]; - unsigned char last_norm_idx; - bool kernel_aborted, kernel_signature, kernel_overflow; - bool kernel_noresults; - bool kernel_wrafterend, kernel_dbgfifofull; - bool use_residuals = false; - bool use_LU_res = false; - int sequence = 0; - // TODO: these values may be sent via command line parameters - unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s - unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump - int nnzValArrays_size = 0; - int L_nnzValArrays_size = 0; - int U_nnzValArrays_size = 0; - // aliases to areas of the host data buffers - long unsigned int *setupArray = nullptr; - double **nnzValArrays = nullptr; - short unsigned int *columnIndexArray = nullptr; - unsigned char *newRowOffsetArray = nullptr; - unsigned int *PIndexArray = nullptr; - unsigned int *colorSizesArray = nullptr; - double **L_nnzValArrays = nullptr; - short unsigned int *L_columnIndexArray = nullptr; - unsigned char *L_newRowOffsetArray = nullptr; - unsigned int *L_PIndexArray = nullptr; - unsigned int *L_colorSizesArray = nullptr; - double **U_nnzValArrays = nullptr; - short unsigned int *U_columnIndexArray = nullptr; - unsigned char *U_newRowOffsetArray = nullptr; - unsigned int *U_PIndexArray = nullptr; - unsigned int *U_colorSizesArray = nullptr; - double *BLKDArray = nullptr; - double *X1Array = nullptr, *X2Array = nullptr; - double *R1Array = nullptr, *R2Array = nullptr; - double *LresArray = nullptr, *UresArray = nullptr; - double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region - // OpenCL variables - cl_device_id device_id; - cl_context context; - cl_command_queue commands; - cl_program program; - cl_kernel kernel; - cl_mem cldata[RW_BUF] = {nullptr}; - cl_mem cldebug = nullptr; - // HW limits/configuration variables - unsigned int hw_x_vector_elem; - unsigned int hw_max_row_size; - unsigned int hw_max_column_size; - unsigned int hw_max_colors_size; - unsigned short hw_max_nnzs_per_row; - unsigned int hw_max_matrix_size; - bool hw_use_uram; - bool hw_write_ilu0_results; - unsigned short hw_dma_data_width; - unsigned char hw_x_vector_latency; - unsigned char hw_add_latency; - unsigned char hw_mult_latency; - unsigned char hw_mult_num; - unsigned char hw_num_read_ports; - unsigned char hw_num_write_ports; - unsigned short hw_reset_cycles; - unsigned short hw_reset_settle; - // debug - bool reset_data_buffers = false; - bool fill_results_buffers = false; - int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format - bool dump_results = false; - char *data_dir = nullptr; - char *basename = nullptr; - unsigned short rst_assert_cycles = 0; - unsigned short rst_settle_cycles = 0; - - /// Allocate host memory - /// \param[in] Nb number of blockrows - /// \param[in] nnzbs number of blocks - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] rows array of rowPointers, contains N/dim+1 values - /// \param[in] cols array of columnIndices, contains nnz values - void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols); - - /// Reorder the linear system so it corresponds with the coloring - /// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values - /// \param[in] b input vector - void update_system(double *vals, double *b); - - /// Analyse sparsity pattern to extract parallelism - /// \return true iff analysis was successful - bool analyse_matrix(); - - /// Perform ilu0-decomposition - /// \return true iff decomposition was successful - bool create_preconditioner(); - - /// Solve linear system - /// \param[inout] res summary of solver result - void solve_system(BdaResult &res); - - /// Generate FPGA backend statistics - void generate_statistics(void); - -public: - - /// Construct an fpgaSolver - /// \param[in] fpga_bitstream FPGA bitstream file name - /// \param[in] linear_solver_verbosity verbosity of fpgaSolver - /// \param[in] maxit maximum number of iterations for fpgaSolver - /// \param[in] tolerance required relative tolerance for fpgaSolver - /// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation - FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder); - - /// Destroy an fpgaSolver, and free memory - ~FpgaSolverBackend(); - - /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format - /// \param[in] matrix matrix A - /// \param[in] b input vector, contains N values - /// \param[in] jacMatrix matrix for preconditioner - /// \param[in] wellContribs WellContributions, not used in FPGA solver because it requires them already added to matrix A - /// \param[inout] res summary of solver result - /// \return status code - SolverStatus solve_system(std::shared_ptr matrix, double *b, - std::shared_ptr jacMatrix, WellContributions& wellContribs, BdaResult &res) override; - - /// Get result after linear solve, and peform postprocessing if necessary - /// \param[inout] x resulting x vector, caller must guarantee that x points to a valid array - void get_result(double *x) override; - -}; // end class fpgaSolverBackend - -} // namespace Accelerator -} // namespace Opm - -#endif - diff --git a/opm/simulators/linalg/bda/FPGAUtils.cpp b/opm/simulators/linalg/bda/FPGAUtils.cpp deleted file mode 100644 index 667a87e0b..000000000 --- a/opm/simulators/linalg/bda/FPGAUtils.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#if HAVE_CONFIG_H -#include "config.h" -#endif // HAVE_CONFIG_H - -#include -#include - -namespace Opm -{ -namespace Accelerator -{ - -double second(void) -{ - struct timeval tv; - gettimeofday(&tv, nullptr); - return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; -} - -bool even(int n) -{ - if (n % 2 == 0) { - return true; - } else { - return false; - } -} - -int roundUpTo(int i, int n) -{ - if (i % n == 0) { - return i; - } else { - return (i / n + 1) * n; - } -} - -bool fileExists(const char *filename) -{ - FILE *fin; - fin = fopen(filename, "r"); - if (fin == nullptr) { - return false; - } else { - fclose(fin); - return true; - } -} - -} // namespace Accelerator -} // namespace Opm diff --git a/opm/simulators/linalg/bda/FPGAUtils.hpp b/opm/simulators/linalg/bda/FPGAUtils.hpp deleted file mode 100644 index 1214baf82..000000000 --- a/opm/simulators/linalg/bda/FPGAUtils.hpp +++ /dev/null @@ -1,42 +0,0 @@ -/* - Copyright 2020 Equinor ASA - - This file is part of the Open Porous Media project (OPM). - - OPM is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - OPM is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with OPM. If not, see . -*/ - -#ifndef FPGA_UTILS_HEADER_INCLUDED -#define FPGA_UTILS_HEADER_INCLUDED - -namespace Opm -{ -namespace Accelerator -{ - -union double2int -{ - unsigned long int int_val; - double double_val; -}; - -double second(void); -bool even(int n); -int roundUpTo(int i, int n); -bool fileExists(const char *filename); - -} // namespace Accelerator -} // namespace Opm - -#endif // FPGA_UTILS_HEADER_INCLUDED diff --git a/opm/simulators/linalg/bda/Matrix.cpp b/opm/simulators/linalg/bda/Matrix.cpp index 0edd51c1f..b0f8ae3e4 100644 --- a/opm/simulators/linalg/bda/Matrix.cpp +++ b/opm/simulators/linalg/bda/Matrix.cpp @@ -24,7 +24,6 @@ #include #include -#include namespace Opm { @@ -60,199 +59,5 @@ void sortRow(int *colIndices, double *data, int left, int right) { } -#if HAVE_FPGA -/* - * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows: - * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values - * and colors, and the size, in elements, of the NROffsets array, followed by: - * the number of rows (rounded to the nearest 32), the number of rows (not rounded), - * the number of columns (not rounded) and the number of non-zero values - * (rounded to the nearest 32) for every partition. - * This array is zero padded up to the nearest 64-byte cacheline. - * - The "colIndicesInColor" array, which contains for every partition, from which elements - * in the global X vector the elements of that X vector partition came. - * For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then - * that X vector partition will only have three elements, and the color_col_indices array - * will contain 1, 3 and 6 for that partition. - * This array is zero padded up to the nearest 64-byte cacheline for every partition. - * - The "nnzValues" array contains all non-zero values of each partition of the matrix. - * This array is zero-padded so that each color has a multiple of 32 elements (to have the - * same number of elements per partition as the column indices array). - * - The "colIndices" array contains all column indices of each partition of the matrix. - * These column indices are the local indices for that partition, so to be used, first a - * local X vector partition needs to be loaded into some local memory (this is done using - * data from the _color_col_indices array), before these column indices can be used as - * addresses to that local memory to read the desired X vector values. - * This array is zero-padded so that data for every partition fills up a number of complete - * cachelines (this means every color has a multiple of 32 elements). - * - "NROffsets" is the name of the array that contains the new row offsets for - * all elements of every partition of the matrix. New row offsets are 8-bit values which - * are 0 if that element is not the first element in a row, or which, if that element is - * the first element of a row) is equal to the amount of empty rows between that new row - * and the row before it plus 1. This array is zero-padded so that data for every partition - * fills up a number of complete cachelines (this means every color has a multiple of 64 elements). - */ -int Matrix::toRDF(int numColors, std::vector& nodesPerColor, - std::vector >& colIndicesInColor, int nnzsThisRowLimit, - std::vector >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes) -{ - auto mat = this; - - int doneRows = 0; - int totalRowNum = 0; // total number of non-empty rows - int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color - int maxNNZsPerColor = 0; // max of nnzsPerColor - - int totalValSize = 0; // sum of nnzsPerColor, padded - - std::vector nnzRowsPerColor(numColors); - - // find number of nnzs per color and number of non-empty rows - for (int c = 0; c < numColors; c++) { - int numRows = 0; - nnzRowsPerColor[c] = 0; - int firstNnzOfColor = mat->rowPointers[doneRows]; - int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]]; - nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices - totalValSize += nnzsPerColor; - maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor); - int row = doneRows; - for (; row < doneRows + nodesPerColor[c]; row++) { - if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) { - numRows++; - nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1; - } - } - - doneRows = row; - totalRowNum += numRows; - } - - int conseqZeroRows = 0; // number of consecutive empty rows - int maxConseqZeroRows = 0; - int numEmptyRows = 0; // total number of empty rows - std::vector rowOffsets(totalRowNum); - std::vector nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows - std::vector colorValPointers(numColors + 1); // points to first nnz of first row of each color - std::vector colorValZeroPointers(numColors); // points to first padded zero for each color - - int nonEmptyRowIdx = 0; // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen - doneRows = 0; - - int totalPaddingSize = 0; // number of padded zeros from previous colors - int NROffsetSize = 0; // number of NROffsets entries, padded to multiple of 64 for each color - int maxRows = 0; - int maxNNZsPerRow = 0; - - // determine the row offset of each row (amount of zero rows between it and the previous non-zero row) - // this is later converted to rowOffset for each nnz - for (int c = 0; c < numColors; c++) { - conseqZeroRows = 0; - for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) { - int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row]; - if (nnzsThisRow == 0) { - conseqZeroRows++; - numEmptyRows++; - } else { - maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow); - nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1]; - rowOffsets[nonEmptyRowIdx] = conseqZeroRows; - maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows); - conseqZeroRows = 0; - nonEmptyRowIdx++; - } - } - // calculate sizes that include zeropadding - colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize; - colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32); - totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c]; - NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64); - - doneRows += nodesPerColor[c]; - maxRows = std::max(nodesPerColor[c], maxRows); - } - - if (maxNNZsPerRow > nnzsThisRowLimit) { - std::ostringstream errorstring; - errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit; - OPM_THROW(std::logic_error, errorstring.str()); - } - - // create and fill RDF arrays - colorSizes[3] = colorValPointers[numColors]; // total number of nnzs the FPGA has to process, including zeropadding - colorSizes[5] = NROffsetSize; - - for (int c = 0; c < numColors; c++) { - colorSizes[c * 4 + 8] = nnzRowsPerColor[c]; - colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c]; - } - - int rowIndex = 0; // keep track of where to read/write - int valIndex = 0; - int NRIndex = 0; - int halfwayPoint = colorValPointers[numColors] / 2; - nnzValsSizes[0] = colorValPointers[numColors]; - - colorSizes[7] = halfwayPoint; - - for (int c = 0; c < numColors; c++) { - int nnzsThisRow; - // make sure 32 values are written in batches (pad with zeros if needed) - for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) { - for (int vb = 0; vb < 32; vb++) { - - // if there are enough values for the whole cacheline - if (v + vb < colorValZeroPointers[c]) { - ubNnzValues[0][v + vb] = mat->nnzValues[valIndex]; - ubColIndices[v + vb] = static_cast(colIndicesInColor[c][mat->colIndices[valIndex]]); - - // if this val is the first of a row - if (nnzRowPointers[rowIndex] == valIndex) { - - if (rowOffsets[rowIndex] + 1 >= 255) { - std::ostringstream errorstring; - errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1; - OPM_THROW(std::logic_error, errorstring.str()); - } - - NROffsets[NRIndex] = static_cast(rowOffsets[rowIndex] + 1); - - // skip all empty rows - while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) { - rowIndex++; - nnzsThisRow = 0; - } - nnzsThisRow++; - } - else - { - NROffsets[NRIndex] = (unsigned char) 0; - nnzsThisRow++; - } - valIndex++; - } - else // zeropadding is needed - { - ubNnzValues[0][v + vb] = 0.0; - ubColIndices[v + vb] = static_cast(colIndicesInColor[c][mat->colIndices[valIndex - 1]]); - NROffsets[NRIndex] = 0; - } - NRIndex++; - - } - } - - // zeropad the NROffsets file - while (NRIndex % 64 != 0) { - NROffsets[NRIndex] = 0; - NRIndex++; - } - } - - return 0; -} -#endif - - } // namespace Accelerator } // namespace Opm diff --git a/opm/simulators/linalg/bda/Matrix.hpp b/opm/simulators/linalg/bda/Matrix.hpp index d06ee57e6..ded6331c4 100644 --- a/opm/simulators/linalg/bda/Matrix.hpp +++ b/opm/simulators/linalg/bda/Matrix.hpp @@ -56,28 +56,6 @@ public: M = M_; } -#if HAVE_FPGA - /// Converts this matrix to the dataformat used by the FPGA. - /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset). - /// The purpose of this format is to allow the data to be streamable. - /// The rowPointers array has an unpredictable reading pattern/timing, - /// it also needs a extra work if a row is shorter than a cacheline. - /// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets. - /// The value of this offset is 0, unless the corresponding nnz is the first of a row, - /// in that case it is 'the number of empty rows preceeding it + 1'. - /// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx. - /// Example: - /// [1 0 0 3 0] nnzValues [1 3 2 2 1 4 3 4 1] - /// [0 2 2 0 1] colIndices [0 3 1 2 4 0 1 2 4] - /// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9] - /// [0 0 0 0 0] rowOffsets [1 0 1 0 0 1 2 0 0] - /// [0 3 4 0 1] - /// The rowOffset is stored in 1 byte, meaning the maximum value is 255. - int toRDF(int numColors, std::vector& nodesPerColor, - std::vector >& colIndicesInColor, int nnzsPerRowLimit, - std::vector >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes); -#endif - std::vector nnzValues; std::vector colIndices; std::vector rowPointers; diff --git a/opm/simulators/linalg/bda/WellContributions.cpp b/opm/simulators/linalg/bda/WellContributions.cpp index 58cfda8c5..3be495600 100644 --- a/opm/simulators/linalg/bda/WellContributions.cpp +++ b/opm/simulators/linalg/bda/WellContributions.cpp @@ -53,10 +53,6 @@ WellContributions::create(const std::string& accelerator_mode, bool useWellConn) OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled"); #endif } - else if(accelerator_mode.compare("fpga") == 0){ - // unused for FPGA, but must be defined to avoid error - return std::make_unique(); - } else if(accelerator_mode.compare("amgcl") == 0){ if (!useWellConn) { OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true"); diff --git a/tests/test_cusparseSolver.cpp b/tests/test_cusparseSolver.cpp index 322132509..1d08ed008 100644 --- a/tests/test_cusparseSolver.cpp +++ b/tests/test_cusparseSolver.cpp @@ -98,7 +98,6 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix& matrix, V const int platformID = 0; // unused const int deviceID = 0; const std::string accelerator_mode("cusparse"); - const std::string fpga_bitstream("empty"); // unused const std::string linsolver("ilu0"); Dune::InverseOperatorResult result; @@ -108,7 +107,6 @@ testCusparseSolver(const boost::property_tree::ptree& prm, Matrix& matrix, V std::unique_ptr, Vector, bz> > bridge; try { bridge = std::make_unique, Vector, bz> >(accelerator_mode, - fpga_bitstream, linear_solver_verbosity, maxit, tolerance, diff --git a/tests/test_openclSolver.cpp b/tests/test_openclSolver.cpp index 16163d30c..c9abfb6c1 100644 --- a/tests/test_openclSolver.cpp +++ b/tests/test_openclSolver.cpp @@ -97,12 +97,10 @@ createBridge(const boost::property_tree::ptree& prm, std::unique_ptr, Vector, bz> >(accelerator_mode, - fpga_bitstream, linear_solver_verbosity, maxit, tolerance, diff --git a/tests/test_rocalutionSolver.cpp b/tests/test_rocalutionSolver.cpp index e158c9658..ef48f9dfa 100644 --- a/tests/test_rocalutionSolver.cpp +++ b/tests/test_rocalutionSolver.cpp @@ -92,7 +92,6 @@ testRocalutionSolver(const boost::property_tree::ptree& prm, Matrix& matrix, const int platformID = 0; const int deviceID = 0; const std::string accelerator_mode("rocalution"); - const std::string fpga_bitstream("empty"); // unused const std::string linsolver("ilu0"); Dune::InverseOperatorResult result; @@ -101,7 +100,6 @@ testRocalutionSolver(const boost::property_tree::ptree& prm, Matrix& matrix, std::unique_ptr, Vector, bz> > bridge; try { bridge = std::make_unique, Vector, bz> >(accelerator_mode, - fpga_bitstream, linear_solver_verbosity, maxit, tolerance,