Merge pull request #2998 from g-marchiori/fpgasolver-integration

Added fpgaSolver, requires Xilinx Alveo U280 FPGA board
2025-02-25 18:55:30 -06:00 · 2021-04-15 11:21:39 +02:00 · 2021-04-15 11:21:39 +02:00 · 13f62a718b
commit 13f62a718b
parent 481c72a0fc 8ea19c66aa
24 changed files with 2697 additions and 162 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for
 option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
 option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
 option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
 option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)
 if(SIBLING_SEARCH AND NOT opm-common_DIR)
  # guess the sibling dir
@ -132,9 +133,72 @@ if(CUDA_FOUND)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()
 find_package(OpenCL)
 # include FPGA target only when ENABLE_FPGA is set to ON
 if(ENABLE_FPGA)
  # must include opencl.h which is a wrapper for all the OpenCL extensions
  find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS})
  # FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in <opm-simulators>/../FPGA
  find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA)
  # this code only runs if the user explicitly enabled the FPGA
  # hence fatal_error instead of warning
  if(NOT OPENCL_H OR NOT OpenCL_FOUND)
    set(HAVE_FPGA 0)
    message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.")
  elseif(NOT FPGA_MODULE)
    set(HAVE_FPGA 0)
    message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.")
  elseif(NOT DEFINED ENV{XILINX_XRT})
    # Xilinx XRT must be installed and properly setup
    set(HAVE_FPGA 0)
    message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.")
  else()
    set(HAVE_FPGA 1)
    message(STATUS "FPGA library and kernel integration active.")
    # FIXME: set the correct path to the FPGA module
    set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA)
    # configuration variables with default values: they can be overridden on the cmake command line
    # FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream
    if(NOT FPGA_PORTS_CONFIG)
      set(FPGA_PORTS_CONFIG 2r_3r3w_ddr)
    endif()
    # FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build)
    if(NOT FPGA_DEBUG_LEVEL)
      set(FPGA_DEBUG_LEVEL 0)
    endif()
    message(STATUS "Using the following settings for the FPGA library compilation: "
      "FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, "
      "FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}")
    add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG})
    add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL})
    # include directories for the FPGA library
    include_directories(${FPGA_SOURCE_DIR})
    include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app)
    include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common)
    # add external project to compile the FPGA library
    include (ExternalProject)
    ExternalProject_Add(FPGA_library
      # force the build step to always be run because source dependencies cannot be made explicit
      BUILD_ALWAYS 1
      DOWNLOAD_COMMAND ""
      UPDATE_COMMAND ""
      CONFIGURE_COMMAND ""
      BUILD_COMMAND make
        -f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile
        SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app
        PORTS_CONFIG=${FPGA_PORTS_CONFIG}
        DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}
      INSTALL_COMMAND ""
      TEST_COMMAND ""
    )
  endif()
 endif()
 if(OpenCL_FOUND)
  # the current OpenCL implementation relies on cl.hpp, not cl2.hpp
  # make sure it is available, otherwise disable OpenCL
@ -454,7 +518,8 @@ endif()
 add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)
-# must link libraries after target 'flow' has been defined
+# must link libraries after target 'opmsimulators' has been defined
 if(CUDA_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
@ -463,3 +528,9 @@ endif()
 if(OpenCL_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
 endif()
 if(HAVE_FPGA)
  add_dependencies(opmsimulators FPGA_library)
  ExternalProject_Get_Property(FPGA_library binary_dir)
  target_link_libraries(opmsimulators PUBLIC ${binary_dir}/fpga_lib_alveo_u280.a)
 endif()
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -66,6 +66,16 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
 if(HAVE_FPGA)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAMatrix.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGABILU0.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGASolverBackend.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/FPGAUtils.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
 endif()
 if(MPI_FOUND)
  list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
                                opm/simulators/utils/ParallelSerialization.cpp)
@ -184,6 +194,10 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/cuda_header.hpp
  opm/simulators/linalg/bda/cusparseSolverBackend.hpp
  opm/simulators/linalg/bda/ChowPatelIlu.hpp
  opm/simulators/linalg/bda/FPGAMatrix.hpp
  opm/simulators/linalg/bda/FPGABILU0.hpp
  opm/simulators/linalg/bda/FPGASolverBackend.hpp
  opm/simulators/linalg/bda/FPGAUtils.hpp
  opm/simulators/linalg/bda/Reorder.hpp
  opm/simulators/linalg/bda/ILUReorder.hpp
  opm/simulators/linalg/bda/opencl.hpp
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_PETSC
  HAVE_CUDA
  HAVE_OPENCL
  HAVE_FPGA
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -118,7 +118,7 @@ struct Linsolver {
    using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
-struct GpuMode {
+struct AcceleratorMode {
    using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
@ -133,6 +133,10 @@ template<class TypeTag, class MyTypeTag>
 struct OpenclIluReorder {
    using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
 struct FpgaBitstream {
    using type = UndefinedProperty;
 };
 template<class TypeTag>
 struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
@ -213,7 +217,7 @@ struct Linsolver<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr auto value = "ilu0";
 };
 template<class TypeTag>
-struct GpuMode<TypeTag, TTag::FlowIstlSolverParams> {
+struct AcceleratorMode<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr auto value = "none";
 };
 template<class TypeTag>
@ -226,7 +230,11 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
 };
 template<class TypeTag>
 struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
-    static constexpr auto value = "graph_coloring";
+    static constexpr auto value = ""; // note: default value is chosen depending on the solver used
 };
 template<class TypeTag>
 struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr auto value = "";
 };
 } // namespace Opm::Properties
@ -252,12 +260,13 @@ namespace Opm
        bool   ignoreConvergenceFailure_;
        bool scale_linear_system_;
        std::string linsolver_;
-        std::string gpu_mode_;
+        std::string accelerator_mode_;
        int bda_device_id_;
        int opencl_platform_id_;
        int cpr_max_ell_iter_ = 20;
        int cpr_reuse_setup_ = 0;
        std::string opencl_ilu_reorder_;
        std::string fpga_bitstream_;
        template <class TypeTag>
        void init()
@ -279,10 +288,11 @@ namespace Opm
            cpr_max_ell_iter_  =  EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
            cpr_reuse_setup_  =  EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
            linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
-            gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
            bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
            opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
            opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
            fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
        }
        template <class TypeTag>
@ -304,10 +314,11 @@ namespace Opm
            EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
            EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
            EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
            EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
        }
        FlowLinearSolverParameters() { reset(); }
@ -327,10 +338,11 @@ namespace Opm
            ilu_milu_                 = MILU_VARIANT::ILU;
            ilu_redblack_             = false;
            ilu_reorder_sphere_       = true;
-            gpu_mode_                 = "none";
+            accelerator_mode_         = "none";
            bda_device_id_            = 0;
            opencl_platform_id_       = 0;
-            opencl_ilu_reorder_       = "graph_coloring";
+            opencl_ilu_reorder_       = "";  // note: the default value is chosen depending on the solver used
            fpga_bitstream_           = "";
        }
    };
--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -35,7 +35,7 @@
 #include <opm/simulators/linalg/setupPropertyTree.hpp>
-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
 #endif
@ -92,7 +92,7 @@ namespace Opm
        using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
        using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;
-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
        static const unsigned int block_size = Matrix::block_type::rows;
        std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
 #endif
@ -126,14 +126,14 @@ namespace Opm
 #endif
            parameters_.template init<TypeTag>();
            prm_ = setupPropertyTree<TypeTag>(parameters_);
-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
            {
-                std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+                std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-                if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) {
+                if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
                    if (on_io_rank) {
-                        OpmLog::warning("Cannot use GPU with MPI, GPU is disabled");
+                        OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled");
                    }
-                    gpu_mode = "none";
+                    accelerator_mode = "none";
                }
                const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
                const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
@ -141,11 +141,12 @@ namespace Opm
                const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
                const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
                const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
+                std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
            }
 #else
-            if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") {
+            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
+                OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
            }
 #endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@ -157,6 +158,12 @@ namespace Opm
            detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);
            useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
 #if HAVE_FPGA
            // check usage of MatrixAddWellContributions: for FPGA they must be included
            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) {
                OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true");
            }
 #endif
            const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
            if (!ownersFirst) {
                const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
@ -242,43 +249,42 @@ namespace Opm
            // Solve system.
            Dune::InverseOperatorResult result;
-            bool gpu_was_used = false;
+            bool accelerator_was_used = false;
            // Use GPU if: available, chosen by user, and successful.
-#if HAVE_CUDA || HAVE_OPENCL
+            // Use FPGA if: support compiled, chosen by user, and successful.
 #if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
            bool use_gpu = bdaBridge->getUseGpu();
-            if (use_gpu) {
+            bool use_fpga = bdaBridge->getUseFpga();
-                const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            if (use_gpu || use_fpga) {
-                WellContributions wellContribs(gpu_mode);
+                const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
-
+                WellContributions wellContribs(accelerator_mode);
                bdaBridge->initWellContributions(wellContribs);
                if (!useWellConn_) {
                    simulator_.problem().wellModel().getWellContributions(wellContribs);
                }
                // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
                if (result.converged) {
                    // get result vector x from non-Dune backend, iff solve was successful
                    bdaBridge->get_result(x);
-                    gpu_was_used = true;
+                    accelerator_was_used = true;
                } else {
-                    // CPU fallback
+                    // warn about CPU fallback
-                    use_gpu = bdaBridge->getUseGpu();  // update value, BdaBridge might have disabled cusparseSolver
+                    // BdaBridge might have disabled its BdaSolver for this simulation due to some error
-                    if (use_gpu && simulator_.gridView().comm().rank() == 0) {
+                    // in that case the BdaBridge is disabled and flexibleSolver is always used
-                        if (gpu_mode.compare("cusparse") == 0) {
+                    // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
-                            OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
+                    if (simulator_.gridView().comm().rank() == 0) {
-                        }
+                        OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
                        if (gpu_mode.compare("opencl") == 0) {
                            OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
                        }
                    }
                }
            }
 #endif
            // Otherwise, use flexible istl solver.
-            if (!gpu_was_used) {
+            if (!accelerator_was_used) {
                assert(flexibleSolver_);
                flexibleSolver_->apply(x, *rhs_, result);
            }
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -45,11 +45,6 @@ namespace bda
 BILU0<block_size>::~BILU0()
 {
    delete[] invDiagVals;
        delete[] diagIndex;
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            delete[] toOrder;
            delete[] fromOrder;
        }
 }
    template <unsigned int block_size>
@ -68,8 +63,8 @@ namespace bda
        if (opencl_ilu_reorder == ILUReorder::NONE) {
            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
        } else {
-            toOrder = new int[Nb];
+            toOrder.resize(Nb);
-            fromOrder = new int[Nb];
+            fromOrder.resize(Nb);
            CSCRowIndices = new int[nnzbs];
            CSCColPointers = new int[Nb + 1];
            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
@ -88,10 +83,10 @@ namespace bda
        std::ostringstream out;
        if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
            out << "BILU0 reordering strategy: " << "level_scheduling\n";
-            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
        } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
            out << "BILU0 reordering strategy: " << "graph_coloring\n";
-            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
            out << "BILU0 reordering strategy: none\n";
            // numColors = 1;
@ -111,12 +106,13 @@ namespace bda
        }
        OpmLog::info(out.str());
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            delete[] CSCRowIndices;
            delete[] CSCColPointers;
        }
-        diagIndex = new int[mat->Nb];
+        diagIndex.resize(mat->Nb);
        invDiagVals = new double[mat->Nb * bs * bs];
 #if CHOW_PATEL
@ -482,7 +478,7 @@ namespace bda
                diagIndex[row] = candidate - LUmat->colIndices;
            }
            events.resize(8);
-            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]);
+            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
            queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
            queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
            queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
@ -516,7 +512,7 @@ namespace bda
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            m = rmat.get();
            Timer t_reorder;
-            reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
+            reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());
            if (verbosity >= 3){
                std::ostringstream out;
@ -556,7 +552,7 @@ namespace bda
                diagIndex[row] = candidate - LUmat->colIndices;
            }
            events.resize(4);
-            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]);
+            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
            queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
            queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
        });
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -61,10 +61,10 @@ namespace bda
        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
 #endif
        double *invDiagVals = nullptr;
-        int *diagIndex = nullptr;
+        std::vector<int> diagIndex;
        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
        std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
-        int *toOrder = nullptr, *fromOrder = nullptr;
+        std::vector<int> toOrder, fromOrder;
        int numColors;
        int verbosity;
        std::once_flag pattern_uploaded;
@ -128,12 +128,12 @@ namespace bda
        int* getToOrder()
        {
-            return toOrder;
+            return toOrder.data();
        }
        int* getFromOrder()
        {
-            return fromOrder;
+            return fromOrder.data();
        }
        BlockedMatrix<block_size>* getRMat()
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -18,8 +18,6 @@
 */
 #include <config.h>
 #include <memory>
 #include <sstream>
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@ -54,21 +52,23 @@ namespace Opm
    using bda::ILUReorder;
 template <class BridgeMatrix, class BridgeVector, int block_size>
-BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
+BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
-: gpu_mode(gpu_mode_)
+: accelerator_mode(accelerator_mode_)
 {
-    if (gpu_mode.compare("cusparse") == 0) {
+    if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
        use_gpu = true;
        backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
 #else
        OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
-    } else if (gpu_mode.compare("opencl") == 0) {
+    } else if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
        use_gpu = true;
-        ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
+        ILUReorder ilu_reorder;
-        if (opencl_ilu_reorder == "level_scheduling") {
+        if (opencl_ilu_reorder == "") {
            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;  // default when not selected by user
        } else if (opencl_ilu_reorder == "level_scheduling") {
            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
        } else if (opencl_ilu_reorder == "graph_coloring") {
            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
@ -81,10 +81,28 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
-    } else if (gpu_mode.compare("none") == 0) {
+    } else if (accelerator_mode.compare("fpga") == 0) {
-        use_gpu = false;
+#if HAVE_FPGA
        use_fpga = true;
        ILUReorder ilu_reorder;
        if (opencl_ilu_reorder == "") {
            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;  // default when not selected by user
        } else if (opencl_ilu_reorder == "level_scheduling") {
            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
        } else if (opencl_ilu_reorder == "graph_coloring") {
            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
        } else {
-        OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
+            OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
        }
        backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
 #else
        OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
 #endif
    } else if (accelerator_mode.compare("none") == 0) {
        use_gpu = false;
        use_fpga = false;
    } else {
        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
    }
 }
@ -130,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) {
 // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
-// sparsity pattern should stay the same due to matrix-add-well-contributions
+// sparsity pattern should stay the same
 // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
 template <class BridgeMatrix>
 void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
@ -161,14 +179,16 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {
-    if (use_gpu) {
+    if (use_gpu || use_fpga) {
        BdaResult result;
        result.converged = false;
        static std::vector<int> h_rows;
        static std::vector<int> h_cols;
        const int dim = (*mat)[0][0].N();
-        const int N = mat->N()*dim;
+        const int Nb = mat->N();
-        const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
+        const int N = Nb * dim;
        const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
        const int nnz = nnzb * dim * dim;
        if (dim != 3) {
            OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@ -177,8 +197,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
        }
        if (h_rows.capacity() == 0) {
-            h_rows.reserve(N+1);
+            h_rows.reserve(Nb+1);
-            h_cols.reserve(nnz);
+            h_cols.reserve(nnzb);
 #if PRINT_TIMERS_BRIDGE
            Dune::Timer t;
 #endif
@ -233,14 +253,14 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
-    if (use_gpu) {
+    if (use_gpu || use_fpga) {
        backend->get_result(static_cast<double*>(&(x[0][0])));
    }
 }
 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
-    if(gpu_mode.compare("opencl") == 0){
+    if(accelerator_mode.compare("opencl") == 0){
 #if HAVE_OPENCL
        const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
        wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
@ -250,12 +270,12 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(We
    }
 }
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                \
 template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,              \
 Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
 n>::BdaBridge                                                                                                                       \
-(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);  \
+(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,               \
 unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);                                                    \
                                                                                                                                    \
 template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,         \
 Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -30,6 +30,10 @@
 #include <opm/simulators/linalg/bda/WellContributions.hpp>
 #if HAVE_FPGA
 #include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
 #endif
 namespace Opm
 {
@ -42,19 +46,22 @@ class BdaBridge
 {
 private:
    bool use_gpu = false;
-    std::string gpu_mode;
+    bool use_fpga = false;
    std::string accelerator_mode;
    std::unique_ptr<bda::BdaSolver<block_size> > backend;
 public:
    /// Construct a BdaBridge
-    /// \param[in] gpu_mode                   to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
+    /// \param[in] accelerator_mode           to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]'
    /// \param[in] fpga_bitstream             FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[<filename>]'
    /// \param[in] linear_solver_verbosity    verbosity of BdaSolver
    /// \param[in] maxit                      maximum number of iterations for BdaSolver
    /// \param[in] tolerance                  required relative tolerance for BdaSolver
    /// \param[in] platformID                 the OpenCL platform ID to be used
    /// \param[in] deviceID                   the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
-    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
+    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
-    BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
+    BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
    /// Solve linear system, A*x = b
    /// \warning Values of A might get overwritten!
@ -79,6 +86,16 @@ public:
    /// \param[in] wellContribs   container to hold all WellContributions
    void initWellContributions(WellContributions& wellContribs);
    /// Return whether the BdaBridge will use the FPGA or not
    /// return whether the BdaBridge will use the FPGA or not
    bool getUseFpga(){
        return use_fpga;
    }
    /// Return the selected accelerator mode, this is input via the command-line
    std::string getAccleratorName(){
        return accelerator_mode;
    }
 }; // end class BdaBridge
--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -55,6 +55,8 @@ namespace bda
        int maxit = 200;
        double tolerance = 1e-2;
        std::string bitstream = "";
        int N;           // number of rows
        int Nb;          // number of blocked rows (Nb*block_size == N)
        int nnz;         // number of nonzeroes (scalars)
@ -66,7 +68,8 @@ namespace bda
        bool initialized = false;
    public:
-        /// Construct a BdaSolver, can be cusparseSolver or openclSolver
+        /// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver
        /// \param[in] fpga_bitstream             FPGA bitstream file name (only for fpgaSolver)
        /// \param[in] linear_solver_verbosity    verbosity of solver
        /// \param[in] maxit                      maximum number of iterations for solver
        /// \param[in] tolerance                  required relative tolerance for solver
@ -74,6 +77,7 @@ namespace bda
        /// \param[in] deviceID                   the device to be used
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
        BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), bitstream(fpga_bitstream) {};
        /// Define virtual destructor, so that the derivedclass destructor will be called
        virtual ~BdaSolver() {};
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -20,13 +20,19 @@
 #include <cstring>
 #include <cmath>
-#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <config.h>
-using bda::BlockedMatrix;
+#include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>
 namespace bda
 {
 using Opm::OpmLog;
 /*Sort a row of matrix elements from a blocked CSR-format.*/
 template <unsigned int block_size>
@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
    }
 }
-// subtract c from b and store in a
+#if HAVE_FPGA
-// a = b - c
+
 /*Subtract two blocks from one another element by element*/
 template <unsigned int block_size>
-void blockSub(double *a, double *b, double *c)
+void blockSub(double *mat1, double *mat2, double *resMat) {
 {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
-            a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
+            resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
        }
    }
 }
 /*Multiply a block with a vector block, and add the result, scaled by a constant, to the result vector*/
 template <unsigned int block_size>
 void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
    for (unsigned int row = 0; row < block_size; row++) {
        if (resetRes) {
            resVect[row] = 0.0;
        }
        for (unsigned int col = 0; col < block_size; col++) {
            resVect[row] += scale * mat[row * block_size + col] * vect[col];
        }
    }
 }
 template <unsigned int block_size>
 int BlockedMatrix<block_size>::countUnblockedNnzs() {
    int numNnzsOverThreshold = 0;
    int totalNnzs = rowPointers[Nb];
    for (unsigned int idx = 0; idx < totalNnzs * block_size * block_size; idx++) {
        if (fabs(nnzValues[idx]) > nnzThreshold) {
            numNnzsOverThreshold++;
        }
    }
    return numNnzsOverThreshold;
 }
 /*
 * Unblock the blocked matrix. Input the blocked matrix and output a CSR matrix without blocks.
 * If unblocking the U matrix, the rows in all blocks need to written to the new matrix in reverse order.
 */
 template <unsigned int block_size>
 void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
    const unsigned int bs = block_size;
    int valIndex = 0, nnzsPerRow;
    mat->rowPointers[0] = 0;
    // go through the blocked matrix row-by row of blocks, and then row-by-row inside the block, and
    // write all non-zero values and corresponding column indices that belong to the same row into the new matrix.
    for (int row = 0; row < Nb; row++) {
        for (unsigned int bRow = 0; bRow < bs; bRow++) {
            nnzsPerRow = 0;
            for (int col = rowPointers[row]; col < rowPointers[row + 1]; col++) {
                for (unsigned int bCol = 0; bCol < bs; bCol++) {
                    int idx = 0;
                    // If the matrix is the U matrix, store the rows inside a block in reverse order.
                    if (isUMatrix) {
                        idx = col * bs * bs + (bs - bRow - 1) * bs + bCol;
                    } else {
                        idx = col * bs * bs + bRow * bs + bCol;
                    }
                    if (fabs(nnzValues[idx]) > nnzThreshold) {
                        mat->nnzValues[valIndex] = nnzValues[idx];
                        mat->colIndices[valIndex] = colIndices[col] * bs + bCol;
                        valIndex++;
                        nnzsPerRow++;
                    }
                }
            }
            // Update the rowpointers of the new matrix
            mat->rowPointers[row * bs + bRow + 1] = mat->rowPointers[row * bs + bRow] + nnzsPerRow;
        }
    }
 }
 /*Optimized version*/
 // ub* prefixes indicate unblocked data
 template <unsigned int block_size>
 int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
                                     std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
                                     std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
 {
    int res;
    int numUnblockedNnzs = countUnblockedNnzs();
    // initialize the non-blocked matrix with the obtained size.
    std::unique_ptr<Matrix> ubMat = std::make_unique<Matrix>(Nb * block_size, numUnblockedNnzs);
    unblock(ubMat.get(), isUMatrix);
    std::vector<int> ubNodesPerColor(numColors);
    for (int i = 0; i < numColors; i++) {
        ubNodesPerColor[i] = nodesPerColor[i] * block_size;
    }
    *valSize = ubMat->nnzs;
    res = ubMat->toRDF(numColors, ubNodesPerColor,
                       colIndicesInColor, nnzsPerRowLimit,
                       ubNnzValues, ubColIndices, nnzValsSizes,
                       NROffsets, colorSizes);
    return res;
 }
 // coloring is already done, numColors and nodesPerColor are set
 // [rows|columns]PerColorLimit are already queried from the FPGA
 // colIndicesInColor, PIndicesAddr and colorSizes are written here
 // There are 3 matrices analysed: the full matrix for spmv, L and U for ILU
 // node == row
 // color == partition
 // colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs
 // colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF().
 //                    due to partitioning, lots of columns are removed, this matrix keeps track of the mapping
 // PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
 //               if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
 //               then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
 template <unsigned int block_size>
 int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
        int rowsPerColorLimit, int columnsPerColorLimit,
        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
        std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes)
 {
    // Data related to column indices per partition
    int doneRows = 0;
    std::vector<bool> isColAccessed(Nb); // std::vector<bool> might have some different optimized implementation, initialize in a loop
    std::vector<bool> isLColAccessed(Nb);
    int totalCols = 0;    // sum of numColAccesses for each color, blocked
    int LTotalCols = 0, UTotalCols = 0;
    int maxCols = 0;         // max value of numColAccesses for any color
    int maxRowsPerColor = 0; // max value of numRows for any color
    int maxColsPerRow = 0;   // max value of colsPerRow for any color
    // colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates
    // colsInColor[c][i] contains the ith column that color c accesses
    // initial size allows for each color to access all columns, with space for padding
    std::vector<std::vector<int> > colsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
    std::vector<std::vector<int> > LColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
    std::vector<std::vector<int> > UColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
    // find which columns are accessed in each color, as well as how many non-zeroes there are per color.
    for (int c = 0; c < numColors; c++) {
        int numRows = 0;
        // initialize
        for (int row = 0; row < Nb; row++) {
            isColAccessed[row] = false;
            isLColAccessed[row] = false;
        }
        if (c > 0) {
            for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
                isLColAccessed[i] = true;
            }
        }
        int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0;   // number of unique accesses, blocked
        // for every row in this color
        for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
            int colsPerRow = 0;    // number of blocks for this row
            bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]);
            for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) {
                // for every column in the current row, check if that column was accessed before this color
                int col = colIndices[idx];
                if (isColAccessed[col] == false) {
                    colsInColor[c][numColAccesses] = col;
                    isColAccessed[col] = true;
                    numColAccesses++;
                    if (col > row) {
                        UColsInColor[numColors - c - 1][UNumColAccesses] = col;
                        UNumColAccesses++;
                    }
                }
                if (isLColAccessed[col] == false) {
                    if (col < row) {
                        LColsInColor[c][LNumColAccesses] = col;
                        LNumColAccesses++;
                        isLColAccessed[col] = true;
                    }
                }
                colsPerRow++;
            }
            if (rowIsEmpty != true) {
                numRows++;
            }
            maxColsPerRow = std::max(maxColsPerRow, colsPerRow);
        }
        // add columns from previous color into L partition to simplify data forwarding
        if (c > 0) {
            for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
                LColsInColor[c][LNumColAccesses] = i;
                LNumColAccesses++;
            }
        }
        colorSizes[c * 4 + 10] = numColAccesses * block_size;
        LColorSizes[c * 4 + 10] = LNumColAccesses * block_size;
        UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size;
        // store mapping
        for (int col = 0; col < numColAccesses; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i;
            }
        }
        for (int col = 0; col < LNumColAccesses; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i;
            }
        }
        for (int col = 0; col < UNumColAccesses; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i;
            }
        }
        // zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline
        while (numColAccesses % 16 != 0) {
            colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1];
            numColAccesses++;
        }
        while (LNumColAccesses % 16 != 0) {
            LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1];
            LNumColAccesses++;
        }
        while (UNumColAccesses % 16 != 0) {
            UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1];
            UNumColAccesses++;
        }
        maxCols = std::max(numColAccesses, maxCols);
        totalCols += numColAccesses;
        LTotalCols += LNumColAccesses;
        UTotalCols += UNumColAccesses;
        doneRows = doneRows + nodesPerColor[c];
        maxRowsPerColor = std::max(numRows, maxRowsPerColor);
    }
    if (maxCols * static_cast<int>(block_size) > columnsPerColorLimit) {
        std::ostringstream errorstring;
        errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit;
        OPM_THROW(std::logic_error, errorstring.str());
    }
    doneRows = 0;
    int diagValsSize = 0;
    int maxRows = 0;
    for (int c = 0; c < numColors; c++) {
        // calculate sizes that include zeropadding
        diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8);
        doneRows += nodesPerColor[c];
        if (nodesPerColor[c] * static_cast<int>(block_size) > maxRows)
            maxRows = nodesPerColor[c];
        colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
        LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
        UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size;
    }
    if (maxRows * static_cast<int>(block_size) > rowsPerColorLimit) {
        std::ostringstream errorstring;
        errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit;
        OPM_THROW(std::logic_error, errorstring.str());
    }
    // create and fill sizes array as far as already possible
    colorSizes[0] = Nb * block_size;
    LColorSizes[0] = Nb * block_size;
    UColorSizes[0] = Nb * block_size;
    // col_sizes (but the matrix is square)
    colorSizes[1] = Nb * block_size;
    LColorSizes[1] = Nb * block_size;
    UColorSizes[1] = Nb * block_size;
    colorSizes[2] = totalCols * block_size;
    LColorSizes[2] = LTotalCols * block_size;
    UColorSizes[2] = UTotalCols * block_size;
    // missing val_size, written in Matrix::toRDF()
    colorSizes[4] = numColors;
    LColorSizes[4] = numColors;
    UColorSizes[4] = numColors;
    // missing NRFlagsSize, written in Matrix::toRDF()
    colorSizes[6] = diagValsSize;
    LColorSizes[6] = diagValsSize;
    UColorSizes[6] = diagValsSize;
    int paddingIdx = numColors;
    while (paddingIdx % 4 != 0) {
        for (unsigned int i = 0; i < 4; i++) {
            colorSizes[paddingIdx * 4 + 8 + i] = 0;
            LColorSizes[paddingIdx * 4 + 8 + i] = 0;
            UColorSizes[paddingIdx * 4 + 8 + i] = 0;
        }
        paddingIdx++;
    }
    int index = 0, Lindex = 0, Uindex = 0;
    for (int c = 0; c < numColors; c++) {
        // for each unique col access
        for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                PIndicesAddr[index] = colsInColor[c][col] * block_size + i;
                index++;
            }
        }
        // add padding
        while (index % 16 != 0) {
            PIndicesAddr[index] = PIndicesAddr[index - 1];
            index++;
        }
        for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i;
                Lindex++;
            }
        }
        while (Lindex % 16 != 0) {
            LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1];
            Lindex++;
        }
        for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
            for (unsigned int i = 0; i < block_size; i++) {
                UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i;
                Uindex++;
            }
        }
        while (Uindex % 16 != 0) {
            UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1];
            Uindex++;
        }
    }
    return 0;
 }
 void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag) {
    const unsigned int block_size = 3;
    int doneRows = rowSize - 1;    // since the rows of U are reversed, the rows of the diag are also reversed
    int RDFIndex = 0;
    for (int c = 0; c < numColors; c++) {
        for (int r = 0; r < rowsPerColor[c]; r++) {
            // the rows in the block are reversed
            for (int i = static_cast<int>(block_size) - 1; i >= 0; i--) {
                for (unsigned int j = 0; j < block_size; j++) {
                    RDFDiag[RDFIndex] = blockedDiagVals[(doneRows - r) * block_size * block_size + i * block_size + j];
                    RDFIndex++;
                }
                // add 4th column, zeropadding
                RDFDiag[RDFIndex] = 0.0;
                RDFIndex++;
            }
        }
        doneRows -= rowsPerColor[c];
        // make sure the color completely fills a cacheline
        // a color with 3 blocks would otherwise leave space
        while (RDFIndex % 8 != 0) {
            RDFDiag[RDFIndex] = 0.0;
            RDFIndex++;
        }
    }
    assert(RDFIndex % 8 == 0);
 }
 #endif // HAVE_FPGA
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
 template void sortBlockedRow<n>(int *, double *, int, int);                 \
 template void blockMultSub<n>(double *, double *, double *);                \
 template void blockMult<n>(double *, double *, double *);                   \
 template void blockSub<n>(double *, double *, double *);             \
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4);
 #undef INSTANTIATE_BDA_FUNCTIONS
 #if HAVE_FPGA
 #define INSTANTIATE_BDA_FPGA_FUNCTIONS(n)                                             \
 template void blockSub<n>(double *, double *, double *);                              \
 template void blockVectMult<n>(double *, double *, double, double *, bool);           \
 template int BlockedMatrix<n>::toRDF(int, int *, bool,                                \
    std::vector<std::vector<int> >& , int, int *,                                     \
    std::vector<std::vector<double> >&, short int *, unsigned char *, int *,  int *); \
 template int BlockedMatrix<n>::findPartitionColumns(int, int *,                       \
        int, int,                                                                     \
        std::vector<std::vector<int> >& , int *, int *,                               \
        std::vector<std::vector<int> >& , int *, int *,                               \
        std::vector<std::vector<int> >& , int *, int *);
 INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
 INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
 INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
 INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
 #undef INSTANTIATE_BDA_FPGA_FUNCTIONS
 #endif // HAVE_FPGA
 } // end namespace bda
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -20,29 +20,40 @@
 #ifndef BLOCKED_MATRIX_HPP
 #define BLOCKED_MATRIX_HPP
 #if HAVE_FPGA
 #include <vector>
 #endif
 #include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
 namespace bda
 {
 /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
-template<int BS>
+template<unsigned int block_size>
-struct BlockedMatrix{
+class BlockedMatrix
 {
 public:
    /// Allocate BlockedMatrix and data arrays with given sizes
    /// \param[in] Nb               number of blockrows
    /// \param[in] nnzbs            number of nonzero blocks
    BlockedMatrix(int Nb_, int nnzbs_)
-    : nnzValues(new double[nnzbs_*BS*BS]),
+    : nnzValues(new double[nnzbs_*block_size*block_size]),
-      colIndices(new int[nnzbs_*BS*BS]),
+      colIndices(new int[nnzbs_*block_size*block_size]),
      rowPointers(new int[Nb_+1]),
      Nb(Nb_),
      nnzbs(nnzbs_),
      deleteNnzs(true),
      deleteSparsity(true)
    {}
    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
    /// \param[in] M              matrix to be copied
    BlockedMatrix(const BlockedMatrix& M)
-    : nnzValues(new double[M.nnzbs*BS*BS]),
+    : nnzValues(new double[M.nnzbs*block_size*block_size]),
      colIndices(M.colIndices),
      rowPointers(M.rowPointers),
      Nb(M.Nb),
@ -50,10 +61,11 @@ struct BlockedMatrix{
      deleteNnzs(true),
      deleteSparsity(false)
    {}
    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
    /// \param[in] Nb             number of blockrows
    /// \param[in] nnzbs          number of nonzero blocks
-    /// \param[in] nnzValues      array of nonzero values, contains nnzb*BS*BS scalars
+    /// \param[in] nnzValues      array of nonzero values, contains nnzb*block_size*block_size scalars
    /// \param[in] colIndices     array of column indices, contains nnzb entries
    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
    BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
@ -76,6 +88,28 @@ struct BlockedMatrix{
        }
    }
 #if HAVE_FPGA
    constexpr static double nnzThreshold = 1e-80;  // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero
    int countUnblockedNnzs();
    void unblock(Matrix *mat, bool isUMatrix);
    /// Converts this matrix to the dataformat used by the FPGA
    /// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking
    int toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
        std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
        std::vector<std::vector<double> >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize);
    /// Analyses the sparsity pattern and prepares for toRDF()
    /// Is only called once
    int findPartitionColumns(int numColors, int *nodesPerColor,
        int rowsPerColorLimit, int columnsPerColorLimit,
        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
        std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes);
 #endif
    double *nnzValues;
    int *colIndices;
    int *rowPointers;
@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c);
 template <unsigned int block_size>
 void blockMult(double *mat1, double *mat2, double *resMat);
-/// Subtract blocks
+
-/// a = b - c
+#if HAVE_FPGA
 /// \param[out] a                result block
 /// \param[in] b                 input block
 /// \param[in] c                 input block
 template <unsigned int block_size>
-void blockSub(double *a, double *b, double *c);
+void blockSub(double *mat1, double *mat2, double *resMat);
 template <unsigned int block_size>
 void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);
 /// Convert a blocked inverse diagonal to the FPGA format.
 /// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
 /// Since the rows of U are reversed, the rows of the diag are also reversed.
 /// The cachelines can hold 8 doubles, a block has 9 doubles.
 /// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows.
 /// Then 2 blocks (24 doubles) fit on 3 cachelines.
 /// Example:
 /// [1 2 3]    [1 2 3 0]              [1 2 3 0 4 5 6 0]
 /// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1]
 /// [7 8 9]    [7 8 9 0]              [block2 row2 block2 row3]
 void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
 #endif
 } // end namespace bda
--- a/opm/simulators/linalg/bda/FPGABILU0.cpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.cpp
@ -0,0 +1,413 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <config.h>
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/MatrixBlock.hpp>
 #include <dune/common/timer.hh>
 #include <opm/simulators/linalg/bda/FPGABILU0.hpp>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>
 namespace bda
 {
 using Opm::OpmLog;
 using Dune::Timer;
 template <unsigned int block_size>
 FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
    verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_), maxRowsPerColor(maxRowsPerColor_), maxColsPerColor(maxColsPerColor_), maxNNZsPerRow(maxNNZsPerRow_), maxNumColors(maxNumColors_)
 {
    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
        level_scheduling = true;
    } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
        graph_coloring = true;
    } else {
        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
    }
 }
 template <unsigned int block_size>
 FPGABILU0<block_size>::~FPGABILU0()
 {
    delete[] invDiagVals;
 }
 template <unsigned int block_size>
 bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
 {
    const unsigned int bs = block_size;
    resultPointers.resize(numResultPointers, nullptr);
    resultSizes.resize(numResultSizes);
    // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
    const unsigned int nnzSplit = 1;
    this->N = mat->Nb * block_size;
    this->Nb = mat->Nb;
    this->nnz = mat->nnzbs * block_size * block_size;
    this->nnzbs = mat->nnzbs;
    toOrder.resize(Nb);
    fromOrder.resize(Nb);
    std::vector<int> CSCRowIndices(nnzbs);
    std::vector<int> CSCColPointers(Nb + 1);
    if (level_scheduling) {
        Timer t_convert;
        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
        if (verbosity >= 3) {
            std::ostringstream out;
            out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
            OpmLog::info(out.str());
        }
    }
    Timer t_analysis;
    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
    std::ostringstream out;
    if (level_scheduling) {
        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    } else if (graph_coloring) {
        out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
        findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
    }
    if (numColors > maxNumColors) {
        std::ostringstream errorstring;
        errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }
    if (verbosity >= 3) {
        out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
    }
    OpmLog::info(out.str());
    int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
    int blockDiagSize = 0;
    // This reordering is needed here only to te result can be used to calculate worst-case scenario array sizes
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
    int doneRows = 0;
    for (int c = 0; c < numColors; c++) {
        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
            for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
                int columnIndex = rMat->colIndices[j];
                if (columnIndex < i) {
                    LColorRoundedValSize += 9;
                    LNROffsetSize += 9;
                }
                if (columnIndex > i) {
                    UColorRoundedValSize += 9;
                    UNROffsetSize += 9;
                }
                colorRoundedValSize += 9;
                NROffsetSize += 9;
            }
            blockDiagSize += 12;
        }
        // End of color: round all sizes to nearest cacheline
        colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
        LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
        UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
        NROffsetSize = roundUpTo(NROffsetSize, 64);
        LNROffsetSize = roundUpTo(LNROffsetSize, 64);
        UNROffsetSize = roundUpTo(UNROffsetSize, 64);
        blockDiagSize = roundUpTo(blockDiagSize, 8);
        doneRows += rowsPerColor[c];
    }
    int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
    int worstCaseColumnAccessNum = numColors * maxColsPerColor;
    nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
    LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
    UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
    // initial number of nnz, used to allocate
    nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
    LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
    UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
    colIndices.resize(colorRoundedValSize);
    LColIndices.resize(LColorRoundedValSize);
    UColIndices.resize(UColorRoundedValSize);
    NROffsets.resize(NROffsetSize);
    LNROffsets.resize(LNROffsetSize);
    UNROffsets.resize(UNROffsetSize);
    PIndicesAddr.resize(worstCaseColumnAccessNum);
    LPIndicesAddr.resize(worstCaseColumnAccessNum);
    UPIndicesAddr.resize(worstCaseColumnAccessNum);
    colorSizes.resize(colorSizesNum);
    LColorSizes.resize(colorSizesNum);
    UColorSizes.resize(colorSizesNum);
    blockDiag.resize(blockDiagSize);
    colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
    int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
                                         maxRowsPerColor, maxColsPerColor,
                                         colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
                                         LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
                                         UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
    if (err != 0) {
        std::ostringstream errorstring;
        errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
        OPM_THROW(std::logic_error, errorstring.str());
    }
    diagIndex.resize(mat->Nb, 0);
    invDiagVals = new double[mat->Nb * bs * bs];
    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
    resultPointers[0] = (void *) colorSizes.data();
    resultPointers[1] = (void *) PIndicesAddr.data();
    resultPointers[2] = (void *) nnzValues.data();
    resultPointers[3] = (void *) colIndices.data();
    resultPointers[4] = (void *) NROffsets.data();
    resultPointers[5] = (void *) nnzValsSizes.data();
    resultPointers[6] = (void *) LColorSizes.data();
    resultPointers[7] = (void *) LPIndicesAddr.data();
    resultPointers[8] = (void *) LnnzValues.data();
    resultPointers[9] = (void *) LColIndices.data();
    resultPointers[10] = (void *) LNROffsets.data();
    resultPointers[11] = (void *) LnnzValsSizes.data();
    resultPointers[12] = (void *) UColorSizes.data();
    resultPointers[13] = (void *) UPIndicesAddr.data();
    resultPointers[14] = (void *) UnnzValues.data();
    resultPointers[15] = (void *) UColIndices.data();
    resultPointers[16] = (void *) UNROffsets.data();
    resultPointers[17] = (void *) UnnzValsSizes.data();
    resultPointers[18] = (void *) blockDiag.data();
    //resultPointers[19] and [20] are set by the caller
    resultSizes[0] = mat->Nb * block_size;
    resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = worstCaseColumnAccessNum; //totalCols
    resultSizes[4] = NROffsetSize; //NRFlagSize
    resultSizes[5] = blockDiagSize; //diagValsSize
    resultSizes[6] = mat->Nb * block_size;
    resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
    resultSizes[8] = numColors;
    resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
    resultSizes[10] = LNROffsetSize; //LNRFlagSize
    resultSizes[11] = blockDiagSize; //LDiagValsSize
    resultSizes[12] = mat->Nb * block_size;
    resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
    resultSizes[14] = numColors;
    resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
    resultSizes[16] = UNROffsetSize; //UNRFlagSize
    resultSizes[17] = blockDiagSize; //UDiagValsSize
    return true;
 } // end init()
 template <unsigned int block_size>
 bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
 {
    const unsigned int bs = block_size;
    Timer t_reorder;
    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
        OpmLog::info(out.str());
    }
    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
    Timer t_memcpy;
    memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
        OpmLog::info(out.str());
    }
    int i, j, ij, ik, jk;
    int iRowStart, iRowEnd, jRowEnd;
    double pivot[bs * bs];
    int LSize = 0;
    Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks
    Timer t_decomposition;
    // go through all rows
    for (i = 0; i < LUMat->Nb; i++) {
        iRowStart = LUMat->rowPointers[i];
        iRowEnd = LUMat->rowPointers[i + 1];
        // go through all elements of the row
        for (ij = iRowStart; ij < iRowEnd; ij++) {
            j = LUMat->colIndices[ij];
            // if the element is the diagonal, store the index and go to next row
            if (j == i) {
                diagIndex[i] = ij;
                break;
            }
            // if an element beyond the diagonal is reach, no diagonal was found
            // throw an error now. TODO: perform reordering earlier to prevent this
            if (j > i) {
                std::ostringstream out;
                out << "BILU0 Error could not find diagonal value in row: " << i;
                OpmLog::error(out.str());
                return false;
            }
            LSize++;
            // calculate the pivot of this row
            blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
            memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
            jRowEnd = LUMat->rowPointers[j + 1];
            jk = diagIndex[j] + 1;
            ik = ij + 1;
            // substract that row scaled by the pivot from this row.
            while (ik < iRowEnd && jk < jRowEnd) {
                if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
                    blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
                    ik++;
                    jk++;
                } else {
                    if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
                    { ik++; }
                    else
                    { jk++; }
                }
            }
        }
        // store the inverse in the diagonal!
        inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
        memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
    }
    LMat->rowPointers[0] = 0;
    UMat->rowPointers[0] = 0;
    // Split the LU matrix into two by comparing column indices to diagonal indices
    for (i = 0; i < LUMat->Nb; i++) {
        LMat->rowPointers[i + 1] = LMat->rowPointers[i];
        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j < diagIndex[i]) {
                memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
                LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
            }
        }
    }
    // Reverse the order or the (blocked) rows for the U matrix,
    // because the rows are accessed in reverse order when applying the ILU0
    int URowIndex = 0;
    for (i = LUMat->Nb - 1; i >= 0; i--) {
        UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
            if (j > diagIndex[i]) {
                memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
                UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
                UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
            }
        }
        URowIndex++;
    }
    if (verbosity >= 3) {
        std::ostringstream out;
        out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
        OpmLog::info(out.str());
    }
    std::vector<int> URowsPerColor(numColors);
    rowSize = block_size * rMat->Nb;
    LRowSize = block_size * LMat->Nb;
    URowSize = block_size * UMat->Nb;
    LNumColors = numColors;
    UNumColors = numColors;
    for (int c = 0; c < numColors; c++) {
        URowsPerColor[numColors - c - 1] = rowsPerColor[c];
    }
    int err;
    err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                      colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
                      nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
    if (err != 0) {
        return false;
    }
    err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
                      LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
                      LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
    if (err != 0) {
        return false;
    }
    err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
                      UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
                      UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
    if (err != 0) {
        return false;
    }
    blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
    // resultPointers are set in the init method
    resultSizes[0] = rowSize;
    resultSizes[1] = colorSizes[3]; // zeropadded valSize;
    resultSizes[2] = numColors;
    resultSizes[3] = colorSizes[2]; //totalCols
    resultSizes[4] = colorSizes[5]; //NRFlagSize
    resultSizes[5] = colorSizes[6]; //diagValsSize
    resultSizes[6] = LRowSize;
    resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
    resultSizes[8] = LNumColors;
    resultSizes[9] = LColorSizes[2]; //LTotalCols
    resultSizes[10] = LColorSizes[5]; //LNRFlagSize
    resultSizes[11] = LColorSizes[6]; //LDiagValsSize
    resultSizes[12] = URowSize;
    resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
    resultSizes[14] = UNumColors;
    resultSizes[15] = UColorSizes[2]; //UTotalCols
    resultSizes[16] = UColorSizes[5]; //UNRFlagSize
    resultSizes[17] = UColorSizes[6]; //UDiagValsSize
    return true;
 } // end create_preconditioner()
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
 template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);  \
 template FPGABILU0<n>::~FPGABILU0();                                    \
 template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                   \
 template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *);  \
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
 INSTANTIATE_BDA_FUNCTIONS(3);
 INSTANTIATE_BDA_FUNCTIONS(4);
 #undef INSTANTIATE_BDA_FUNCTIONS
 } //namespace bda
--- a/opm/simulators/linalg/bda/FPGABILU0.hpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.hpp
@ -0,0 +1,117 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef FPGA_BILU0_HEADER_INCLUDED
 #define FPGA_BILU0_HEADER_INCLUDED
 #include <vector>
 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 namespace bda
 {
 /*
 * This class implements a Blocked ILU0 preconditioner, with output data
 * specifically formatted for the FPGA.
 * The decomposition and reorders of the rows of the matrix are done on CPU.
 */
 template <unsigned int block_size>
 class FPGABILU0
 {
 private:
    int N;       // number of rows of the matrix
    int Nb;      // number of blockrows of the matrix
    int nnz;     // number of nonzeroes of the matrix (scalar)
    int nnzbs;   // number of blocks of the matrix
    std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
    std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
    double *invDiagVals = nullptr;
    std::vector<int> diagIndex;
    std::vector<int> toOrder, fromOrder;
    std::vector<int> rowsPerColor;
    int numColors;
    int verbosity;
    // sizes and arrays used during RDF generation
    std::vector<std::vector<double> > nnzValues, LnnzValues, UnnzValues;
    std::vector<short int> colIndices, LColIndices, UColIndices;
    std::vector<unsigned char> NROffsets, LNROffsets, UNROffsets;
    std::vector<int> PIndicesAddr, LPIndicesAddr, UPIndicesAddr;
    std::vector<int> colorSizes, LColorSizes, UColorSizes;
    std::vector<int> nnzValsSizes, LnnzValsSizes, UnnzValsSizes;
    std::vector<std::vector<int> > colIndicesInColor, LColIndicesInColor, UColIndicesInColor;
    int rowSize, valSize;
    int LRowSize, LValSize, LNumColors;
    int URowSize, UValSize, UNumColors;
    std::vector<double> blockDiag;
    ILUReorder opencl_ilu_reorder;
    bool level_scheduling = false, graph_coloring = false;
    int numResultPointers = 21;
    std::vector<void *> resultPointers;
    int numResultSizes = 18;
    std::vector<int> resultSizes;
    int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor
 public:
    FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors);
    ~FPGABILU0();
    // analysis (optional)
    bool init(BlockedMatrix<block_size> *mat);
    // ilu_decomposition
    bool create_preconditioner(BlockedMatrix<block_size> *mat);
    int* getToOrder()
    {
        return toOrder.data();
    }
    int* getFromOrder()
    {
        return fromOrder.data();
    }
    BlockedMatrix<block_size>* getRMat()
    {
        return rMat.get();
    }
    void **getResultPointers()
    {
        return resultPointers.data();
    }
    int *getResultSizes()
    {
        return resultSizes.data();
    }
 };
 } //namespace bda
 #endif // FPGA_BILU0_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/FPGAMatrix.cpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.cpp
@ -0,0 +1,249 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>
 namespace bda
 {
 /*Sort a row of matrix elements from a CSR-format.*/
 void sortRow(int *colIndices, double *data, int left, int right) {
    int l = left;
    int r = right;
    int middle = colIndices[(l + r) >> 1];
    do {
        while (colIndices[l] < middle)
            l++;
        while (colIndices[r] > middle)
            r--;
        if (l <= r) {
            int lColIndex = colIndices[l];
            colIndices[l] = colIndices[r];
            colIndices[r] = lColIndex;
            double lDatum = data[l];
            data[l] = data[r];
            data[r] = lDatum;
            l++;
            r--;
        }
    } while (l < r);
    if (left < r)
        sortRow(colIndices, data, left, r);
    if (right > l)
        sortRow(colIndices, data, l, right);
 }
 /*
 * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
 * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
 *   and colors, and the size, in elements, of the NROffsets array, followed by:
 *   the number of rows (rounded to the nearest 32), the number of rows (not rounded),
 *   the number of columns (not rounded) and the number of non-zero values
 *   (rounded to the nearest 32) for every partition.
 *   This array is zero padded up to the nearest 64-byte cacheline.
 * - The "colIndicesInColor" array, which contains for every partition, from which elements
 *   in the global X vector the elements of that X vector partition came.
 *   For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then
 *   that X vector partition will only have three elements, and the color_col_indices array
 *   will contain 1, 3 and 6 for that partition.
 *   This array is zero padded up to the nearest 64-byte cacheline for every partition.
 * - The "nnzValues" array contains all non-zero values of each partition of the matrix.
 *   This array is zero-padded so that each color has a multiple of 32 elements (to have the
 *   same number of elements per partition as the column indices array).
 * - The "colIndices" array contains all column indices of each partition of the matrix.
 *   These column indices are the local indices for that partition, so to be used, first a
 *   local X vector partition needs to be loaded into some local memory (this is done using
 *   data from the _color_col_indices array), before these column indices can be used as
 *   addresses to that local memory to read the desired X vector values.
 *   This array is zero-padded so that data for every partition fills up a number of complete
 *   cachelines (this means every color has a multiple of 32 elements).
 * - "NROffsets" is the name of the array that contains the new row offsets for
 *   all elements of every partition of the matrix. New row offsets are 8-bit values which
 *   are 0 if that element is not the first element in a row, or which, if that element is
 *   the first element of a row) is equal to the amount of empty rows between that new row
 *   and the row before it plus 1. This array is zero-padded so that data for every partition
 *   fills up a number of complete cachelines (this means every color has a multiple of 64 elements).
 */
 int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
                  std::vector<std::vector<int> >& colIndicesInColor, int nnzsThisRowLimit,
                  std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes)
 {
    auto mat = this;
    int doneRows = 0;
    int totalRowNum = 0;  // total number of non-empty rows
    int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color
    int maxNNZsPerColor = 0; // max of nnzsPerColor
    int totalValSize = 0; // sum of nnzsPerColor, padded
    std::vector<int> nnzRowsPerColor(numColors);
    // find number of nnzs per color and number of non-empty rows
    for (int c = 0; c < numColors; c++) {
        int numRows = 0;
        nnzRowsPerColor[c] = 0;
        int firstNnzOfColor = mat->rowPointers[doneRows];
        int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]];
        nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to nearest 16 for short ints of column indices
        totalValSize += nnzsPerColor;
        maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor);
        int row = doneRows;
        for (; row < doneRows + nodesPerColor[c]; row++) {
            if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) {
                numRows++;
                nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1;
            }
        }
        doneRows = row;
        totalRowNum += numRows;
    }
    int conseqZeroRows = 0;       // number of consecutive empty rows
    int maxConseqZeroRows = 0;
    int numEmptyRows = 0;         // total number of empty rows
    std::vector<int> rowOffsets(totalRowNum);
    std::vector<int> nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows
    std::vector<int> colorValPointers(numColors + 1);    // points to first nnz of first row of each color
    std::vector<int> colorValZeroPointers(numColors);    // points to first padded zero for each color
    int nonEmptyRowIdx = 0;   // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows where seen
    doneRows = 0;
    int totalPaddingSize = 0;  // number of padded zeros from previous colors
    int NROffsetSize = 0;  // number of NROffsets entries, padded to multiple of 64 for each color
    int maxRows = 0;
    int maxNNZsPerRow = 0;
    // determine the row offset of each row (amount of zero rows between it and the previous non-zero row)
    // this is later converted to rowOffset for each nnz
    for (int c = 0; c < numColors; c++) {
        conseqZeroRows = 0;
        for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
            int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row];
            if (nnzsThisRow == 0) {
                conseqZeroRows++;
                numEmptyRows++;
            } else {
                maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow);
                nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1];
                rowOffsets[nonEmptyRowIdx] = conseqZeroRows;
                maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows);
                conseqZeroRows = 0;
                nonEmptyRowIdx++;
            }
        }
        // calculate sizes that include zeropadding
        colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize;
        colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32);
        totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c];
        NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64);
        doneRows += nodesPerColor[c];
        maxRows = std::max(nodesPerColor[c], maxRows);
    }
    if (maxNNZsPerRow > nnzsThisRowLimit) {
        std::ostringstream errorstring;
        errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit;
        OPM_THROW(std::logic_error, errorstring.str());
    }
    // create and fill RDF arrays
    colorSizes[3] = colorValPointers[numColors];  // total number of nnzs the FPGA has to process, including zeropadding
    colorSizes[5] = NROffsetSize;
    for (int c = 0; c < numColors; c++) {
        colorSizes[c * 4 + 8] = nnzRowsPerColor[c];
        colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c];
    }
    int rowIndex = 0;  // keep track of where to read/write
    int valIndex = 0;
    int NRIndex = 0;
    int halfwayPoint = colorValPointers[numColors] / 2;
    nnzValsSizes[0] = colorValPointers[numColors];
    colorSizes[7] = halfwayPoint;
    for (int c = 0; c < numColors; c++) {
        int nnzsThisRow;
        // make sure 32 values are written in batches (pad with zeros if needed)
        for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) {
            for (int vb = 0; vb < 32; vb++) {
                // if there are enough values for the whole cacheline
                if (v + vb < colorValZeroPointers[c]) {
                    ubNnzValues[0][v + vb] = mat->nnzValues[valIndex];
                    ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex]]);
                    // if this val is the first of a row
                    if (nnzRowPointers[rowIndex] == valIndex) {
                        if (rowOffsets[rowIndex] + 1 >= 255) {
                            std::ostringstream errorstring;
                            errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1;
                            OPM_THROW(std::logic_error, errorstring.str());
                        }
                        NROffsets[NRIndex] =  static_cast<unsigned char>(rowOffsets[rowIndex] + 1);
                        // skip all empty rows
                        while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) {
                            rowIndex++;
                            nnzsThisRow = 0;
                        }
                        nnzsThisRow++;
                    }
                    else
                    {
                        NROffsets[NRIndex] = (unsigned char) 0;
                        nnzsThisRow++;
                    }
                    valIndex++;
                }
                else // zeropadding is needed
                {
                    ubNnzValues[0][v + vb] = 0.0;
                    ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex - 1]]);
                    NROffsets[NRIndex] = 0;
                }
                NRIndex++;
            }
        }
        // zeropad the NROffsets file
        while (NRIndex % 64 != 0) {
            NROffsets[NRIndex] = 0;
            NRIndex++;
        }
    }
    return 0;
 }
 } // end namespace bda
--- a/opm/simulators/linalg/bda/FPGAMatrix.hpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.hpp
@ -0,0 +1,84 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef FPGA_MATRIX_HEADER_INCLUDED
 #define FPGA_MATRIX_HEADER_INCLUDED
 #include <vector>
 namespace bda
 {
 /// This struct resembles a csr matrix, only doubles are supported
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
 class Matrix {
 public:
    /// Allocate Matrix and data arrays with given sizes
    /// \param[in] N               number of rows
    /// \param[in] nnzs            number of nonzeros
    Matrix(int N_, int nnzs_)
    : nnzValues(new double[nnzs_]),
      colIndices(new int[nnzs_]),
      rowPointers(new int[N_+1]),
      N(N_),
      nnzs(nnzs_)
    {}
    /// All constructors allocate new memory, so always delete here
    ~Matrix(){
        delete[] nnzValues;
        delete[] colIndices;
        delete[] rowPointers;
    }
    /// Converts this matrix to the dataformat used by the FPGA.
    /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
    /// The purpose of this format is to allow the data to be streamable.
    /// The rowPointers array has an unpredictable reading pattern/timing,
    /// it also needs a extra work if a row is shorter than a cacheline.
    /// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets.
    /// The value of this offset is 0, unless the corresponding nnz is the first of a row,
    /// in that case it is 'the number of empty rows preceeding it + 1'.
    /// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx.
    /// Example:
    /// [1 0 0 3 0]    nnzValues   [1 3 2 2 1 4 3 4 1]
    /// [0 2 2 0 1]    colIndices  [0 3 1 2 4 0 1 2 4]
    /// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9]
    /// [0 0 0 0 0]    rowOffsets  [1 0 1 0 0 1 2 0 0]
    /// [0 3 4 0 1]
    /// The rowOffset is stored in 1 byte, meaning the maximum value is 255.
    int toRDF(int numColors, std::vector<int>& nodesPerColor,
        std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, 
        std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
    double *nnzValues;
    int *colIndices;
    int *rowPointers;
    int N;
    int nnzs;
 };
 void sortRow(int *colIndices, double *data, int left, int right);
 } // end namespace bda
 #endif // FPGA_MATRIX_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@ -0,0 +1,732 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <config.h>
 #include <cmath>
 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/material/common/Unused.hpp>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
 #include <opm/simulators/linalg/bda/FPGAUtils.hpp>
 #include <opm/simulators/linalg/bda/Reorder.hpp>
 // if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA
 // kernel will be disabled and execution will continue using DUNE
 #define FPGA_EXIT_WITH_HW_FAILURE
 //#undef FPGA_EXIT_WITH_HW_FAILURE
 // if defined, the function generate_statistics will create a CSV-formatted file
 // with detailed statistics about the FPGA backend performance
 //#define FPGA_STATISTICS_FILE_ENABLED
 #undef FPGA_STATISTICS_FILE_ENABLED
 namespace bda
 {
 using Opm::OpmLog;
 template <unsigned int block_size>
 FpgaSolverBackend<block_size>::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(fpga_bitstream, verbosity_, maxit_, tolerance_)
 {
    int err;
    std::ostringstream oss;
    double start = second();
    // currently, only block size == 3 is supported by the FPGA backend
    assert(block_size == 3);
    if (verbosity < 1) {
        perf_call_enabled = false;
    }
    // setup bitstream name and other parameters
    if (fpga_bitstream.compare("") == 0) {
        OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified");
    }
    if (!fileExists(fpga_bitstream.c_str())) {
        OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable");
    }
    // -----------------------------
    // FPGA: setup the OpenCL platform
    // -----------------------------
    std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp
    // auto-select the proper FPGA device and create context and other CL objects
    err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1);
    if (err != 0) {
        oss << "Failed to setup the OpenCL device (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    oss << "Detected FPGA platform type is ";
    if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; }
    OpmLog::info(oss.str());
    oss.str("");
    oss.clear();
    // -----------------------------
    // FPGA: setup the debug buffer
    // -----------------------------
    // set kernel debug lines depending on an environment variable
    const char *xem = getenv("XCL_EMULATION_MODE");
    if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) {
        debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU;
        oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n";
        OpmLog::info(oss.str());
        oss.str("");
        oss.clear();
    } else {
        // set to 2 to reduce overhead in reading back and interpreting the debug lines;
        // increase to get more debug info from the kernel
        // range is 2..DEBUG_OUTBUF_WORDS_MAX-1
        debug_outbuf_words = 2;
    }
    // host debug buffer setup
    err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize);
    if (err != 0) {
        oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // device debug buffer setup
    err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize);
    if (err != 0) {
        oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n";
        OPM_THROW(std::logic_error, oss.str());
    }
    // copy debug buffer to device
    err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words);
    if (err != 0) {
        oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n";
        OPM_THROW(std::logic_error, oss.str());
    }
    // ------------------------------------------------
    // FPGA: query the kernel for limits/configuration
    // ------------------------------------------------
    err = fpga_kernel_query(context, commands, kernel, cldebug,
                            debugBuffer, debug_outbuf_words,
                            rst_assert_cycles, rst_settle_cycles,
                            &hw_x_vector_elem, &hw_max_row_size,
                            &hw_max_column_size, &hw_max_colors_size,
                            &hw_max_nnzs_per_row, &hw_max_matrix_size,
                            &hw_use_uram, &hw_write_ilu0_results,
                            &hw_dma_data_width, &hw_mult_num,
                            &hw_x_vector_latency, &hw_add_latency, &hw_mult_latency,
                            &hw_num_read_ports, &hw_num_write_ports,
                            &hw_reset_cycles, &hw_reset_settle);
    if (err != 0) {
        oss << "Failed to call fpga_kernel_query (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    if (verbosity >= 1) {
        oss << "FPGA kernel limits/configuration:\n";
        oss << "  x_vector_elem=" << hw_max_colors_size << ", max_row_size=" << hw_max_nnzs_per_row << ", max_column_size=" << hw_max_matrix_size << "\n";
        oss << "  max_colors_size=" << hw_x_vector_elem << ", max_nnzs_per_row=" << hw_max_row_size << ", max_matrix_size=" << hw_max_column_size << "\n";
        oss << "  use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n";
        oss << "  dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n";
        oss << "  x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n";
        oss << "  add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n";
        oss << "  num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n";
        oss << "  reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle;
        OpmLog::info(oss.str());
        oss.str("");
        oss.clear();
    }
    // check that LU results are generated by the kernel
    if (use_LU_res && !hw_write_ilu0_results) {
        OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage");
        oss.str("");
        oss.clear();
        use_LU_res = false;
    }
    // setup preconditioner
    double start_prec = second();
    prec = std::make_unique<Preconditioner>(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size);
    perf_total.s_preconditioner_setup = second() - start_prec;
    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
        level_scheduling = true;
    }
    perf_total.s_initialization = second() - start;
 } // end fpgaSolverBackend
 template <unsigned int block_size>
 FpgaSolverBackend<block_size>::~FpgaSolverBackend()
 {
    if (verbosity >= 1) {
        generate_statistics();
    }
    delete[] rx;
    delete[] rb;
    if (nnzValArrays != nullptr) { free(nnzValArrays); }
    if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); }
    if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); }
    // FPGA: buffers
    free(debugBuffer);
    for (int b = 0; b < RW_BUF; b++) {
        free(dataBuffer[b]);
    }
    free(databufferSize);
    // FPGA: OpenCL objects
    if (cldebug != nullptr) { clReleaseMemObject(cldebug); }
    for (int b = 0; b < RW_BUF; b++) {
        if (cldata[b] != nullptr) {
            clReleaseMemObject(cldata[b]);
        }
    }
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseDevice(device_id);
 } // end ~fpgaSolverBackend()
 // copy result to host memory
 // caller must be sure that x is a valid array
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::get_result(double *x_)
 {
    double start = 0;
    if (perf_call_enabled) {
        start = second();
    }
    // apply to results the reordering (stored in toOrder)
    reorderBlockedVectorByPattern<block_size>(mat->Nb, rx, toOrder, x_);
    // TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private)
    if (perf_call_enabled) {
        perf_call.back().s_postprocess = second() - start;
    }
 } // end get_result()
 template <unsigned int block_size>
 SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res)
 {
    if (initialized == false) {
        initialize(N_, nnz_,  dim, vals, rows, cols);
        if (!analyse_matrix()) {
            return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
        }
    }
    perf_call.emplace_back();
    update_system(vals, b);
    if (!create_preconditioner()) {
        return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
    }
    solve_system(res);
    if (verbosity >= 1) {
        std::ostringstream oss;
        oss << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged << \
            ", iterations: " << res.iterations << ", reduction: " << res.reduction << \
            ", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed;
        OpmLog::info(oss.str());
    }
    return SolverStatus::BDA_SOLVER_SUCCESS;
 }
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
 {
    double start = second();
    this->N = N_;
    this->nnz = nnz_;
    this->nnzb = nnz_ / block_size / block_size;
    Nb = (N + dim - 1) / dim;
    // allocate host memory for matrices and vectors
    // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
    mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_ / block_size / block_size, vals, cols, rows));
    std::ostringstream oss;
    oss << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, " << \
        "block size: " << dim << ", total nnz: " << this->nnz << "\n";
    oss << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
    OpmLog::info(oss.str());
    rx = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
    rb = new double[roundUpTo(N_, CACHELINE_BYTES / sizeof(double))];
    perf_total.s_initialization += second() - start;
    initialized = true;
 } // end initialize()
 template <unsigned int block_size>
 bool FpgaSolverBackend<block_size>::analyse_matrix()
 {
    std::ostringstream oss;
    int err;
    double start = second();
    bool success = prec->init(mat.get());
    if (!success) {
        OpmLog::warning("Preconditioner for FPGA solver failed to initialize");
        return success;
    }
    toOrder = prec->getToOrder();
    fromOrder = prec->getFromOrder();
    rMat = prec->getRMat();
    processedPointers = prec->getResultPointers();
    processedSizes = prec->getResultSizes();
    processedPointers[19] = rb;
    processedPointers[20] = rx;
    nnzValArrays_size = static_cast<int*>(processedPointers[5])[0];
    L_nnzValArrays_size = static_cast<int*>(processedPointers[11])[0];
    U_nnzValArrays_size = static_cast<int*>(processedPointers[17])[0];
    // -------------------------------------
    // FPGA: setup host/device data buffers
    // -------------------------------------
    // allocate memory and setup data layout
    err = fpga_setup_host_datamem(level_scheduling, fpga_config_bits,
                                  processedSizes,
                                  &setupArray,
                                  &nnzValArrays,   &nnzValArrays_size,   &columnIndexArray,   &newRowOffsetArray,   &PIndexArray,   &colorSizesArray,
                                  &L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray,
                                  &U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray,
                                  &BLKDArray, &X1Array, &R1Array,
                                  &X2Array, &R2Array,
                                  &LresArray, &UresArray,
                                  &databufferSize, dataBuffer,
                                  result_offsets, 1 /*num_nnz_arrays*/,
                                  true /*reset_data_buffers*/,  /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */
                                  debugbufferSize);
    if (err) {
        oss << "Failed to call fpga_setup_host_datamem (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // results buffers setup
    if (use_LU_res) {
        resultsBufferNum = 4;
    } else {
        resultsBufferNum = 2;
    }
    if (resultsBufferNum > RES_BUF_MAX) {
        oss << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    resultsNum = processedSizes[0]; // rowSize, invariant between system solves
    for (int i = 0; i < resultsBufferNum; i++) {
        resultsBufferSize[i] = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double);
    }
    // device data memory setup
    err = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata);
    if (err != 0) {
        oss << "Failed to call fpga_setup_device_datamem (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // ------------------------------------
    // FPGA: setup the kernel's parameters
    // ------------------------------------
    err = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit,
                                     debug_sample_rate, tolerance, cldata, cldebug);
    if (err != 0) {
        oss << "Failed to call fpga_set_kernel_parameters (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    perf_total.s_analysis = second() - start;
    analysis_done = true;
    return success;
 } // end analyse_matrix()
 template <unsigned int block_size>
 bool FpgaSolverBackend<block_size>::create_preconditioner()
 {
    double start = 0;
    if (perf_call_enabled) {
        start = second();
    }
    memset(rx, 0, sizeof(double) * N);
    bool result = prec->create_preconditioner(mat.get());
    if (!result) {
        OpmLog::warning("fpgaSolverBackend: create_preconditioner failed");
    }
    if (perf_call_enabled) {
        perf_call.back().s_preconditioner_create = second() - start;
    }
    return result;
 } // end create_preconditioner()
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::solve_system(BdaResult &res)
 {
    std::ostringstream oss;
    int err;
    double start = 0, start_total = 0;
    // ------------------------------------
    // FPGA: return immediately if FPGA is disabled
    // ------------------------------------
    if (fpga_disabled) {
        res.converged = false;
        OpmLog::warning("FPGA is disabled, fallback to SW execution");
        return;
    }
    fpga_calls++;
    if (perf_call_enabled) {
        start = second();
        start_total = start;
    }
    // check if any buffer is larger than the size set in preconditioner->init
    // TODO: add check for all other buffer sizes that may overflow?
    err = 0;
    if ( ((int *)processedPointers[5])[0]  > nnzValArrays_size ||
            ((int *)processedPointers[11])[0] > L_nnzValArrays_size ||
            ((int *)processedPointers[17])[0] > U_nnzValArrays_size ) {
        err = 1;
    }
    if (err != 0) {
        OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)");
    }
    // ------------------------------------
    // FPGA: copy input data to host data buffers
    // ------------------------------------
    if (perf_call_enabled) {
        start = second();
    }
    err = fpga_copy_host_datamem(
              processedPointers, processedSizes, setupArray,
              nnzValArrays,   &nnzValArrays_size,   columnIndexArray,   newRowOffsetArray,   PIndexArray,   colorSizesArray,
              L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray,
              U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray,
              BLKDArray, X1Array, R1Array, X2Array, R2Array,
              use_LU_res, LresArray, UresArray,
              databufferSize, dataBuffer,
              1 /* nnzValArrays_num */,
              reset_data_buffers, fill_results_buffers,
              dump_data_buffers, fpga_calls);
    if (perf_call_enabled) {
        perf_call.back().s_mem_setup = second() - start;
    }
    if (err != 0) {
        oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // ------------------------------------
    // FPGA: copy buffers to device
    // ------------------------------------
    // copy debug buffer to device
    if (perf_call_enabled) {
        start = second();
    }
    err = fpga_copy_to_device_debugbuf(commands,
                                       cldebug, debugBuffer, debugbufferSize,
                                       debug_outbuf_words);
    if (err != 0) {
        oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // copy data buffers to device
    err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata);
    if (err != 0) {
        oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    if (perf_call_enabled) {
        perf_call.back().s_mem_h2d = second() - start;
    }
    // ------------------------------------
    // FPGA: execute the kernel
    // ------------------------------------
    double time_elapsed_ms;
    if (perf_call_enabled) {
        start = second();
    }
    err = fpga_kernel_run(commands, kernel, &time_elapsed_ms);
    if (perf_call_enabled) {
        perf_call.back().s_kernel_exec = second() - start;
    }
    if (err != 0) {
        oss << "Failed to call fpga_kernel_run (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    // ----------------------------------------
    // FPGA: read back debug buffer from device
    // ----------------------------------------
    if (perf_call_enabled) {
        start = second();
    }
    err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10),
                                         commands,
                                         debug_outbuf_words, debugbufferSize,
                                         cldebug, debugBuffer,
                                         abort_cycles,
                                         &kernel_cycles, &kernel_iter_run,
                                         norms, &last_norm_idx,
                                         &kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults,
                                         &kernel_wrafterend, &kernel_dbgfifofull);
    if (err != 0) {
        oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")";
        OPM_THROW(std::logic_error, oss.str());
    }
    if (kernel_wrafterend) {
        OpmLog::warning("Detected recoverable FPGA error: kernel write after end");
    }
    if (kernel_dbgfifofull) {
        OpmLog::warning("Detected recoverable FPGA error: debug FIFO full");
    }
    if (kernel_aborted || kernel_signature || kernel_overflow) {
 #if defined(FPGA_EXIT_WITH_HW_FAILURE)
        oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
            ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")";
        OPM_THROW(std::logic_error, oss.str());
 #else
        oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted << \
            ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n";
        oss << "Disabling FPGA kernel: execution will continue with SW kernel";
        OpmLog::warning(oss.str());
        oss.str("");
        oss.clear();
        fpga_disabled = true;
 #endif
    }
    if (perf_call_enabled) {
        perf_call.back().n_kernel_exec_cycles = kernel_cycles;
    }
    // copy (back) results only if FPGA is not disabled
    if (!fpga_disabled) {
        if (kernel_noresults) {
            OpmLog::warning("FPGA kernel did not return results because the required precision is already reached");
            // rx still contains zeros from initial guess
        } else {
            // ------------------------------------
            // FPGA: read back results from device
            // ------------------------------------
            err = fpga_map_results(even(kernel_iter_run),
                                   use_residuals, use_LU_res, commands,
                                   resultsNum, resultsBufferNum, resultsBufferSize,
                                   debugbufferSize,
                                   cldata, resultsBuffer,
                                   result_offsets,
                                   dump_results, data_dir, basename, sequence);
            if (err != 0) {
                oss << "Failed to call fpga_map_results (" << err << ")";
                OPM_THROW(std::logic_error, oss.str());
            }
            // TODO: copy results buffers to reordering output buffers
            memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double));
            err = fpga_unmap_results(even(kernel_iter_run),
                                     use_residuals, use_LU_res,
                                     commands, cldata, resultsBuffer);
            if (err != 0) {
                oss << "Failed to call fpga_unmap_results (" << err << ")";
                OPM_THROW(std::logic_error, oss.str());
            }
        }
    }
    // set results and update statistics (if enabled)
    if (perf_call_enabled) {
        perf_call.back().s_mem_d2h = second() - start;
    }
    float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterationns
    res.iterations = (int)iter;
    res.reduction = norms[0] / norms[last_norm_idx]; // norms[0] is the initial norm
    res.conv_rate = pow(res.reduction, 1.0 / iter);
    res.elapsed = second() - start_total;
    if (perf_call_enabled) {
        perf_call.back().s_solve = res.elapsed;
        perf_call.back().n_kernel_exec_iters = iter;
    }
    // convergence depends on number of iterations reached and hw execution errors
    res.converged = true;
    if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) {
        res.converged = false;
        if (verbosity >= 1) {
            oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled << \
                ", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature << \
                ", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit);
            OpmLog::warning(oss.str());
            oss.str("");
            oss.clear();
        }
    }
    if (perf_call_enabled) {
        perf_call.back().converged = res.converged;
        perf_call.back().converged_flags = ((unsigned int)fpga_disabled) +
                                           ((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) +
                                           ((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4);
    }
 } // end solve_system()
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::update_system(double *vals, double *b)
 {
    double start = 0;
    mat->nnzValues = vals;
    // reorder inputs using previously found ordering (stored in fromOrder)
    if (perf_call_enabled) {
        start = second();
    }
    reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
    if (perf_call_enabled) {
        perf_call.back().s_reorder = second() - start;
    }
 } // end update_system()
 template <unsigned int block_size>
 void FpgaSolverBackend<block_size>::generate_statistics()
 {
    std::ostringstream oss;
    unsigned int conv_iter = 0, conv_ovf = 0;
    if (!perf_call_enabled || fpga_calls == 0) {
        OpmLog::warning("FPGA statistics were not collected");
        return;
    }
    std::printf("--- FPGA statistics ---\n");
    std::printf("total solver calls..........: %u\n", fpga_calls);
    std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization);
    std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup);
 #if defined(FPGA_STATISTICS_FILE_ENABLED)
    // DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file
    FILE *fout = fopen("fpga_statistics_details.csv", "w");
    if (fout != nullptr) {
        std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n");
    }
 #endif
    unsigned int num_data_points = perf_call.size();
    for (unsigned int i = 0; i < num_data_points; i++) {
        perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create;
        if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; }
        if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; }
        perf_total.s_analysis += perf_call[i].s_analysis;
        if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; }
        if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; }
        perf_total.s_reorder += perf_call[i].s_reorder;
        if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; }
        if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; }
        perf_total.s_mem_setup += perf_call[i].s_mem_setup;
        if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; }
        if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; }
        perf_total.s_mem_h2d += perf_call[i].s_mem_h2d;
        if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; }
        if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; }
        perf_total.s_kernel_exec += perf_call[i].s_kernel_exec;
        if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; }
        if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; }
        perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles;
        if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; }
        if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; }
        perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters;
        if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; }
        if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; }
        perf_total.s_mem_d2h += perf_call[i].s_mem_d2h;
        if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; }
        if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; }
        perf_total.s_solve += perf_call[i].s_solve;
        if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; }
        if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; }
        perf_total.s_postprocess += perf_call[i].s_postprocess;
        if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; }
        if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; }
        perf_total.n_converged += (unsigned int)perf_call[i].converged;
        if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; }
        if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; }
 #if defined(FPGA_STATISTICS_FILE_ENABLED)
        if (fout != nullptr) {
            std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n",
                         i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder,
                         perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles,
                         perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess,
                         (unsigned int)perf_call[i].converged);
        }
 #endif
    }
 #if defined(FPGA_STATISTICS_FILE_ENABLED)
    if (fout != nullptr) {
        fclose(fout);
    }
 #endif
    perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points;
    perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points;
    perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points;
    perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points;
    perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points;
    perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points;
    perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points;
    perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points;
    perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points;
    perf_total.s_solve_avg = perf_total.s_solve / num_data_points;
    perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points;
    std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max);
    std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max);
    std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max);
    std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max);
    std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max);
    std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max);
    std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n",
                perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max);
    std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n",
                perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max);
    std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max);
    std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max);
    std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
                perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max);
    std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n",
                perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf);
    std::printf("-----------------------\n");
 } //end generate_statistics()
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                          \
 template FpgaSolverBackend<n>::FpgaSolverBackend(std::string, int, int, double, ILUReorder);  \
 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
 INSTANTIATE_BDA_FUNCTIONS(3);
 INSTANTIATE_BDA_FUNCTIONS(4);
 #undef INSTANTIATE_BDA_FUNCTIONS
 } //namespace bda
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@ -0,0 +1,265 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
 #define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
 #include <opm/simulators/linalg/bda/BdaSolver.hpp>
 #include <opm/simulators/linalg/bda/FPGABILU0.hpp>
 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
 #include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
 namespace bda
 {
 /// This class implements an ilu0-bicgstab solver on FPGA
 template <unsigned int block_size>
 class FpgaSolverBackend : public BdaSolver<block_size>
 {
    typedef BdaSolver<block_size> Base;
    typedef FPGABILU0<block_size> Preconditioner;
    using Base::N;
    using Base::Nb;
    using Base::nnz;
    using Base::nnzb;
    using Base::verbosity;
    using Base::maxit;
    using Base::tolerance;
    using Base::initialized;
 private:
    double *rx = nullptr; // reordered x
    double *rb = nullptr; // reordered b
    int *fromOrder = nullptr, *toOrder = nullptr;
    bool analysis_done = false;
    bool level_scheduling = false;
    // LUMat will shallow copy rowPointers and colIndices of mat/rMat
    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
    BlockedMatrix<block_size> *rMat = nullptr;
    std::unique_ptr<Preconditioner> prec = nullptr;
    // vectors with data processed by the preconditioner (input to the kernel)
    void **processedPointers = nullptr;
    int *processedSizes = nullptr;
    unsigned int fpga_calls = 0;
    bool perf_call_enabled = true;
    // per call performance metrics
    typedef struct {
      double s_preconditioner_create = 0.0;
      double s_analysis = 0.0;
      double s_reorder = 0.0;
      double s_mem_setup = 0.0;
      double s_mem_h2d = 0.0;
      double s_kernel_exec = 0.0;
      unsigned int n_kernel_exec_cycles = 0;
      float n_kernel_exec_iters = 0.0;
      double s_mem_d2h = 0.0;
      double s_solve = 0.0;
      double s_postprocess = 0.0;
      bool converged = false;
      unsigned int converged_flags = 0;
    } perf_call_metrics_t;
    // cumulative performance metrics
    typedef struct {
      double s_initialization;
      double s_preconditioner_setup;
      double s_preconditioner_create;
      double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
      double s_analysis;
      double s_analysis_min,s_analysis_max,s_analysis_avg;
      double s_reorder;
      double s_reorder_min,s_reorder_max,s_reorder_avg;
      double s_mem_setup;
      double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
      double s_mem_h2d;
      double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
      double s_kernel_exec;
      double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
      unsigned long n_kernel_exec_cycles;
      unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
      float n_kernel_exec_iters;
      float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
      double s_mem_d2h;
      double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
      double s_solve;
      double s_solve_min,s_solve_max,s_solve_avg;
      double s_postprocess;
      double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
      unsigned int n_converged;
    } perf_total_metrics_t;
    std::vector<perf_call_metrics_t> perf_call;
    perf_total_metrics_t perf_total;
    // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
    // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
    unsigned int fpga_config_bits = 0;
    bool fpga_disabled = false;
    bool platform_awsf1;
    unsigned int debugbufferSize;
    unsigned long int *debugBuffer = nullptr;
    unsigned int *databufferSize = nullptr;
    unsigned char *dataBuffer[RW_BUF] = {nullptr};
    unsigned int debug_outbuf_words;
    int resultsNum;
    int resultsBufferNum;
    unsigned int resultsBufferSize[RES_BUF_MAX];
    unsigned int result_offsets[6];
    unsigned int kernel_cycles, kernel_iter_run;
    double norms[4];
    unsigned char last_norm_idx;
    bool kernel_aborted, kernel_signature, kernel_overflow;
    bool kernel_noresults;
    bool kernel_wrafterend, kernel_dbgfifofull;
    bool use_residuals = false;
    bool use_LU_res = false;
    int sequence = 0;
    // TODO: these values may be sent via command line parameters
    unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
    unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
    int nnzValArrays_size = 0;
    int L_nnzValArrays_size = 0;
    int U_nnzValArrays_size = 0;
    // aliases to areas of the host data buffers
    long unsigned int *setupArray = nullptr;
    double **nnzValArrays  = nullptr;
    short unsigned int *columnIndexArray = nullptr;
    unsigned char *newRowOffsetArray = nullptr;
    unsigned int *PIndexArray = nullptr;
    unsigned int *colorSizesArray = nullptr;
    double **L_nnzValArrays = nullptr;
    short unsigned int *L_columnIndexArray = nullptr;
    unsigned char *L_newRowOffsetArray = nullptr;
    unsigned int *L_PIndexArray = nullptr;
    unsigned int *L_colorSizesArray = nullptr;
    double **U_nnzValArrays = nullptr;
    short unsigned int *U_columnIndexArray = nullptr;
    unsigned char *U_newRowOffsetArray = nullptr;
    unsigned int *U_PIndexArray = nullptr;
    unsigned int *U_colorSizesArray = nullptr;
    double *BLKDArray = nullptr;
    double *X1Array = nullptr, *X2Array = nullptr;
    double *R1Array = nullptr, *R2Array = nullptr;
    double *LresArray = nullptr, *UresArray = nullptr;
    double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
    // OpenCL variables
    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program;
    cl_kernel kernel;
    cl_mem cldata[RW_BUF] = {nullptr};
    cl_mem cldebug = nullptr;
    // HW limits/configuration variables
    unsigned int hw_x_vector_elem;
    unsigned int hw_max_row_size;
    unsigned int hw_max_column_size;
    unsigned int hw_max_colors_size;
    unsigned short hw_max_nnzs_per_row;
    unsigned int hw_max_matrix_size;
    bool hw_use_uram;
    bool hw_write_ilu0_results;
    unsigned short hw_dma_data_width;
    unsigned char hw_x_vector_latency;
    unsigned char hw_add_latency;
    unsigned char hw_mult_latency;
    unsigned char hw_mult_num;
    unsigned char hw_num_read_ports;
    unsigned char hw_num_write_ports;
    unsigned short hw_reset_cycles;
    unsigned short hw_reset_settle;
    // debug
    bool reset_data_buffers = false;
    bool fill_results_buffers = false;
    int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
    bool dump_results = false;
    char *data_dir = nullptr;
    char *basename = nullptr;
    unsigned short rst_assert_cycles = 0;
    unsigned short rst_settle_cycles = 0;
    /// Allocate host memory
    /// \param[in] N              number of nonzeroes, divide by dim*dim to get number of blocks
    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
    /// \param[in] dim            size of block
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
    /// \param[in] cols           array of columnIndices, contains nnz values
    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
    /// Reorder the linear system so it corresponds with the coloring
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] b              input vector
    void update_system(double *vals, double *b);
    /// Analyse sparsity pattern to extract parallelism
    /// \return true iff analysis was successful
    bool analyse_matrix();
    /// Perform ilu0-decomposition
    /// \return true iff decomposition was successful
    bool create_preconditioner();
    /// Solve linear system
    /// \param[inout] res         summary of solver result
    void solve_system(BdaResult &res);
    /// Generate FPGA backend statistics
    void generate_statistics(void);
 public:
    /// Construct an fpgaSolver
    /// \param[in] fpga_bitstream             FPGA bitstream file name
    /// \param[in] linear_solver_verbosity    verbosity of fpgaSolver
    /// \param[in] maxit                      maximum number of iterations for fpgaSolver
    /// \param[in] tolerance                  required relative tolerance for fpgaSolver
    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
    FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
    /// Destroy an fpgaSolver, and free memory
    ~FpgaSolverBackend();
    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
    /// \param[in] N              number of rows, divide by dim to get number of blockrows
    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
    /// \param[in] dim            size of block
    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
    /// \param[in] cols           array of columnIndices, contains nnz values
    /// \param[in] b              input vector, contains N values
    /// \param[in] wellContribs   WellContributions, not used in FPGA solver because it requires them already added to matrix A
    /// \param[inout] res         summary of solver result
    /// \return                   status code
    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
    /// Get result after linear solve, and peform postprocessing if necessary
    /// \param[inout] x           resulting x vector, caller must guarantee that x points to a valid array
    void get_result(double *x) override;
 }; // end class fpgaSolverBackend
 } //namespace bda
 #endif
--- a/opm/simulators/linalg/bda/FPGAUtils.cpp
+++ b/opm/simulators/linalg/bda/FPGAUtils.cpp
@ -0,0 +1,63 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <sys/time.h>
 #include <fstream>
 namespace bda
 {
 double second(void)
 {
    struct timeval tv;
    gettimeofday(&tv, nullptr);
    return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0;
 }
 bool even(int n)
 {
    if (n % 2 == 0) {
        return true;
    } else {
        return false;
    }
 }
 int roundUpTo(int i, int n)
 {
    if (i % n == 0) {
        return i;
    } else {
        return (i / n + 1) * n;
    }
 }
 bool fileExists(const char *filename)
 {
    FILE *fin;
    fin = fopen(filename, "r");
    if (fin == nullptr) {
        return false;
    } else {
        fclose(fin);
        return true;
    }
 }
 } //namespace bda
--- a/opm/simulators/linalg/bda/FPGAUtils.hpp
+++ b/opm/simulators/linalg/bda/FPGAUtils.hpp
@ -0,0 +1,39 @@
 /*
  Copyright 2020 Equinor ASA
  This file is part of the Open Porous Media project (OPM).
  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef FPGA_UTILS_HEADER_INCLUDED
 #define FPGA_UTILS_HEADER_INCLUDED
 namespace bda
 {
 union double2int
 {
    unsigned long int int_val;
    double double_val;
 };
 double second(void);
 bool even(int n);
 int roundUpTo(int i, int n);
 bool fileExists(const char *filename);
 } // end namespace bda
 #endif // FPGA_UTILS_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@ -17,12 +17,7 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <vector>
 #include <cstring>
 #include <algorithm> // for fill()
 #include <random>
 #include <limits>
 #include <sstream>
 #include <opm/common/ErrorMacros.hpp>
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@ -20,6 +20,8 @@
 #ifndef REORDER_HPP
 #define REORDER_HPP
 #include <vector>
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
 namespace bda
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -28,15 +28,18 @@
 namespace Opm
 {
-WellContributions::WellContributions(std::string gpu_mode){
+WellContributions::WellContributions(std::string accelerator_mode){
-    if(gpu_mode.compare("cusparse") == 0){
+    if(accelerator_mode.compare("cusparse") == 0){
        cuda_gpu = true;
    }
-    else if(gpu_mode.compare("opencl") == 0){
+    else if(accelerator_mode.compare("opencl") == 0){
        opencl_gpu = true;
    }
    else if(accelerator_mode.compare("fpga") == 0){
        // unused for FPGA, but must be defined to avoid error
    }
    else{
-        OPM_THROW(std::logic_error, "Error: invalid GPU mode");
+        OPM_THROW(std::logic_error, "Invalid accelerator mode");
    }
 }
--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -176,7 +176,7 @@ public:
    void alloc();
    /// Create a new WellContributions
-    WellContributions(std::string gpu_mode);
+    WellContributions(std::string accelerator_mode);
    /// Destroy a WellContributions, and free memory
    ~WellContributions();