Merge pull request #2998 from g-marchiori/fpgasolver-integration

Added fpgaSolver, requires Xilinx Alveo U280 FPGA board
2025-02-25 18:55:30 -06:00 · 2021-04-15 11:21:39 +02:00 · 2021-04-15 11:21:39 +02:00 · 13f62a718b
commit 13f62a718b
parent 481c72a0fc 8ea19c66aa
24 changed files with 2697 additions and 162 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -29,6 +29,7 @@ option(BUILD_EBOS_DEBUG_EXTENSIONS "Build the ebos variants which are purely for
 option(BUILD_FLOW_POLY_GRID "Build flow blackoil with polyhedral grid" OFF)
 option(OPM_ENABLE_PYTHON "Enable python bindings?" OFF)
 option(OPM_ENABLE_PYTHON_TESTS "Enable tests for the python bindings?" ON)
+option(ENABLE_FPGA "Enable FPGA kernels integration?" OFF)

 if(SIBLING_SEARCH AND NOT opm-common_DIR)
  # guess the sibling dir
@ -132,9 +133,72 @@ if(CUDA_FOUND)
  include_directories(${CUDA_INCLUDE_DIRS})
 endif()

-
 find_package(OpenCL)

+# Optional FPGA backend (fpgaSolver): configured only when ENABLE_FPGA is set to ON.
+# On success HAVE_FPGA is set to 1 and the external project FPGA_library is
+# defined; it builds the FPGA library with the FPGA repository's own makefile.
+# Any missing prerequisite aborts the configuration with a fatal error.
+if(ENABLE_FPGA)
+  # must include opencl.h which is a wrapper for all the OpenCL extensions
+  find_file(OPENCL_H CL/opencl.h HINTS ${OpenCL_INCLUDE_DIRS})
+
+  # FIXME: devise a check for the presence of the "FPGA" module; currently looking for makefile in <opm-simulators>/../FPGA
+  find_file(FPGA_MODULE linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile HINTS ${opm-simulators_SOURCE_DIR}/../FPGA)
+
+  # this code only runs if the user explicitly enabled the FPGA
+  # hence fatal_error instead of warning
+  if(NOT OPENCL_H OR NOT OpenCL_FOUND)
+    set(HAVE_FPGA 0)
+    message(FATAL_ERROR " OpenCL packages/headers were not found. Make sure CL/opencl.h exists or deactivate FPGA.")
+  elseif(NOT FPGA_MODULE)
+    set(HAVE_FPGA 0)
+    message(FATAL_ERROR " FPGA module was not found. Make sure FPGA repository exists or deactivate FPGA.")
+  elseif(NOT DEFINED ENV{XILINX_XRT})
+    # Xilinx XRT must be installed and properly setup
+    # note: the environment variable is only inspected at configure time
+    set(HAVE_FPGA 0)
+    message(FATAL_ERROR " Xilinx XRT not found. Make sure it is installed and setup (check its documentation) or deactivate FPGA.")
+  else()
+    set(HAVE_FPGA 1)
+    message(STATUS "FPGA library and kernel integration active.")
+
+    # FIXME: set the correct path to the FPGA module
+    set(FPGA_SOURCE_DIR ${opm-simulators_SOURCE_DIR}/../FPGA)
+
+    # configuration variables with default values: they can be overridden on the cmake command line
+    # FPGA_PORTS_CONFIG selects the kernel's memory ports configuration; must be in sync with available kernel bitstream
+    if(NOT FPGA_PORTS_CONFIG)
+      set(FPGA_PORTS_CONFIG 2r_3r3w_ddr)
+    endif()
+    # FPGA_DEBUG_LEVEL sets the debug messages level for FPGA library functions (should be set to 0 for Release build)
+    if(NOT FPGA_DEBUG_LEVEL)
+      set(FPGA_DEBUG_LEVEL 0)
+    endif()
+    message(STATUS "Using the following settings for the FPGA library compilation: "
+      "FPGA_PORTS_CONFIG=${FPGA_PORTS_CONFIG}, "
+      "FPGA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}")
+    # the same two settings are handed to the FPGA makefile below, so that the
+    # simulator sources and the FPGA library are compiled with matching values
+    add_compile_options(-DPORTS_CONFIG=PORTS_${FPGA_PORTS_CONFIG})
+    add_compile_options(-DBDA_DEBUG_LEVEL=${FPGA_DEBUG_LEVEL})
+    # include directories for the FPGA library
+    include_directories(${FPGA_SOURCE_DIR})
+    include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app)
+    include_directories(${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common)
+    # add external project to compile the FPGA library
+    include (ExternalProject)
+    ExternalProject_Add(FPGA_library
+      # force the build step to always be run because source dependencies cannot be made explicit
+      BUILD_ALWAYS 1
+      DOWNLOAD_COMMAND ""
+      UPDATE_COMMAND ""
+      CONFIGURE_COMMAND ""
+      BUILD_COMMAND make
+        -f ${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/alveo_u280/vitis_20192/OPM-integration/makefile
+        SRCDIR=${FPGA_SOURCE_DIR}/linearalgebra/ilu0bicgstab/xilinx/src/sda_app
+        PORTS_CONFIG=${FPGA_PORTS_CONFIG}
+        DEBUG_LEVEL=${FPGA_DEBUG_LEVEL}
+      # declare the archive that is later linked into opmsimulators as an output of
+      # the build step; generators that need a producing rule for every linked file
+      # (e.g. Ninja) otherwise stop with "missing and no known rule to make it"
+      # NOTE(review): assumes the makefile writes fpga_lib_alveo_u280.a into the
+      # external project's binary directory, as the link step expects - confirm
+      BUILD_BYPRODUCTS <BINARY_DIR>/fpga_lib_alveo_u280.a
+      INSTALL_COMMAND ""
+      TEST_COMMAND ""
+    )
+  endif()
+endif()
+
 if(OpenCL_FOUND)
  # the current OpenCL implementation relies on cl.hpp, not cl2.hpp
  # make sure it is available, otherwise disable OpenCL
@ -454,7 +518,8 @@ endif()

 add_custom_target(extra_test ${CMAKE_CTEST_COMMAND} -C ExtraTests)

-# must link libraries after target 'flow' has been defined
+# must link libraries after target 'opmsimulators' has been defined
+
 if(CUDA_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cusparse_LIBRARY} )
  target_link_libraries( opmsimulators PUBLIC ${CUDA_cublas_LIBRARY} )
@ -463,3 +528,9 @@ endif()
 if(OpenCL_FOUND)
  target_link_libraries( opmsimulators PUBLIC ${OpenCL_LIBRARIES} )
 endif()
+
+if(HAVE_FPGA)
+  # look up the directory in which the FPGA_library external project is built
+  ExternalProject_Get_Property(FPGA_library binary_dir)
+  set(fpga_static_library ${binary_dir}/fpga_lib_alveo_u280.a)
+  # make sure the FPGA library is built before opmsimulators, then link its archive
+  add_dependencies(opmsimulators FPGA_library)
+  target_link_libraries(opmsimulators PUBLIC ${fpga_static_library})
+endif()
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -66,6 +66,16 @@ if(OPENCL_FOUND)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
 endif()
+if(HAVE_FPGA)
+  # Sources required by the FPGA backend (fpgaSolver).
+  # WellContributions.cpp and MultisegmentWellContribution.cpp are also appended
+  # by the OpenCL branch above, so each file is added only when it is not already
+  # present; this keeps MAIN_SOURCE_FILES free of duplicate entries when several
+  # accelerator backends are enabled at the same time.
+  foreach(fpga_source_file
+      opm/simulators/linalg/bda/BdaBridge.cpp
+      opm/simulators/linalg/bda/FPGAMatrix.cpp
+      opm/simulators/linalg/bda/FPGABILU0.cpp
+      opm/simulators/linalg/bda/FPGASolverBackend.cpp
+      opm/simulators/linalg/bda/FPGAUtils.cpp
+      opm/simulators/linalg/bda/MultisegmentWellContribution.cpp
+      opm/simulators/linalg/bda/WellContributions.cpp)
+    # list(FIND) stores -1 in the output variable when the entry is absent
+    list(FIND MAIN_SOURCE_FILES ${fpga_source_file} fpga_source_index)
+    if(fpga_source_index EQUAL -1)
+      list(APPEND MAIN_SOURCE_FILES ${fpga_source_file})
+    endif()
+  endforeach()
+endif()
+
 if(MPI_FOUND)
  list(APPEND MAIN_SOURCE_FILES opm/simulators/utils/ParallelEclipseState.cpp
                                opm/simulators/utils/ParallelSerialization.cpp)
@ -184,6 +194,10 @@ list (APPEND PUBLIC_HEADER_FILES
  opm/simulators/linalg/bda/cuda_header.hpp
  opm/simulators/linalg/bda/cusparseSolverBackend.hpp
  opm/simulators/linalg/bda/ChowPatelIlu.hpp
+  opm/simulators/linalg/bda/FPGAMatrix.hpp
+  opm/simulators/linalg/bda/FPGABILU0.hpp
+  opm/simulators/linalg/bda/FPGASolverBackend.hpp
+  opm/simulators/linalg/bda/FPGAUtils.hpp
  opm/simulators/linalg/bda/Reorder.hpp
  opm/simulators/linalg/bda/ILUReorder.hpp
  opm/simulators/linalg/bda/opencl.hpp
--- a/opm-simulators-prereqs.cmake
+++ b/opm-simulators-prereqs.cmake
@ -7,6 +7,7 @@ set (opm-simulators_CONFIG_VAR
  HAVE_PETSC
  HAVE_CUDA
  HAVE_OPENCL
+  HAVE_FPGA
  HAVE_SUITESPARSE_UMFPACK_H
  HAVE_DUNE_ISTL
  DUNE_ISTL_VERSION_MAJOR
--- a/opm/simulators/linalg/FlowLinearSolverParameters.hpp
+++ b/opm/simulators/linalg/FlowLinearSolverParameters.hpp
@ -118,7 +118,7 @@ struct Linsolver {
    using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
-struct GpuMode {
+struct AcceleratorMode {
    using type = UndefinedProperty;
 };
 template<class TypeTag, class MyTypeTag>
@ -133,6 +133,10 @@ template<class TypeTag, class MyTypeTag>
 struct OpenclIluReorder {
    using type = UndefinedProperty;
 };
+template<class TypeTag, class MyTypeTag>
+struct FpgaBitstream {
+    using type = UndefinedProperty;
+};

 template<class TypeTag>
 struct LinearSolverReduction<TypeTag, TTag::FlowIstlSolverParams> {
@ -213,7 +217,7 @@ struct Linsolver<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr auto value = "ilu0";
 };
 template<class TypeTag>
-struct GpuMode<TypeTag, TTag::FlowIstlSolverParams> {
+struct AcceleratorMode<TypeTag, TTag::FlowIstlSolverParams> {
    static constexpr auto value = "none";
 };
 template<class TypeTag>
@ -226,7 +230,11 @@ struct OpenclPlatformId<TypeTag, TTag::FlowIstlSolverParams> {
 };
 template<class TypeTag>
 struct OpenclIluReorder<TypeTag, TTag::FlowIstlSolverParams> {
-    static constexpr auto value = "graph_coloring";
+    static constexpr auto value = ""; // note: default value is chosen depending on the solver used
+};
+template<class TypeTag>
+struct FpgaBitstream<TypeTag, TTag::FlowIstlSolverParams> {
+    static constexpr auto value = "";
 };

 } // namespace Opm::Properties
@ -252,12 +260,13 @@ namespace Opm
        bool   ignoreConvergenceFailure_;
        bool scale_linear_system_;
        std::string linsolver_;
-        std::string gpu_mode_;
+        std::string accelerator_mode_;
        int bda_device_id_;
        int opencl_platform_id_;
        int cpr_max_ell_iter_ = 20;
        int cpr_reuse_setup_ = 0;
        std::string opencl_ilu_reorder_;
+        std::string fpga_bitstream_;

        template <class TypeTag>
        void init()
@ -279,10 +288,11 @@ namespace Opm
            cpr_max_ell_iter_  =  EWOMS_GET_PARAM(TypeTag, int, CprMaxEllIter);
            cpr_reuse_setup_  =  EWOMS_GET_PARAM(TypeTag, int, CprReuseSetup);
            linsolver_ = EWOMS_GET_PARAM(TypeTag, std::string, Linsolver);
-            gpu_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
+            accelerator_mode_ = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
            bda_device_id_ = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
            opencl_platform_id_ = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
            opencl_ilu_reorder_ = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
+            fpga_bitstream_ = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
        }

        template <class TypeTag>
@ -304,10 +314,11 @@ namespace Opm
            EWOMS_REGISTER_PARAM(TypeTag, int, CprMaxEllIter, "MaxIterations of the elliptic pressure part of the cpr solver");
            EWOMS_REGISTER_PARAM(TypeTag, int, CprReuseSetup, "Reuse preconditioner setup. Valid options are 0: recreate the preconditioner for every linear solve, 1: recreate once every timestep, 2: recreate if last linear solve took more than 10 iterations, 3: never recreate");
            EWOMS_REGISTER_PARAM(TypeTag, std::string, Linsolver, "Configuration of solver. Valid options are: ilu0 (default), cpr (an alias for cpr_trueimpes), cpr_quasiimpes, cpr_trueimpes or amg. Alternatively, you can request a configuration to be read from a JSON file by giving the filename here, ending with '.json.'");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, GpuMode, "Use GPU cusparseSolver or openclSolver as the linear solver, usage: '--gpu-mode=[none|cusparse|opencl]'");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, AcceleratorMode, "Use GPU (cusparseSolver or openclSolver) or FPGA (fpgaSolver) as the linear solver, usage: '--accelerator-mode=[none|cusparse|opencl|fpga]'");
            EWOMS_REGISTER_PARAM(TypeTag, int, BdaDeviceId, "Choose device ID for cusparseSolver or openclSolver, use 'nvidia-smi' or 'clinfo' to determine valid IDs");
            EWOMS_REGISTER_PARAM(TypeTag, int, OpenclPlatformId, "Choose platform ID for openclSolver, use 'clinfo' to determine valid platform IDs");
-            EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, OpenclIluReorder, "Choose the reordering strategy for ILU for openclSolver and fpgaSolver, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring], level_scheduling behaves like Dune and cusparse, graph_coloring is more aggressive and likely to be faster, but is random-based and generally increases the number of linear solves and linear iterations significantly.");
+            EWOMS_REGISTER_PARAM(TypeTag, std::string, FpgaBitstream, "Specify the bitstream file for fpgaSolver (including path), usage: '--fpga-bitstream=<filename>'");
        }

        FlowLinearSolverParameters() { reset(); }
@ -327,10 +338,11 @@ namespace Opm
            ilu_milu_                 = MILU_VARIANT::ILU;
            ilu_redblack_             = false;
            ilu_reorder_sphere_       = true;
-            gpu_mode_                 = "none";
+            accelerator_mode_         = "none";
            bda_device_id_            = 0;
            opencl_platform_id_       = 0;
-            opencl_ilu_reorder_       = "graph_coloring";
+            opencl_ilu_reorder_       = "";  // note: the default value is chosen depending on the solver used
+            fpga_bitstream_           = "";
        }
    };

--- a/opm/simulators/linalg/ISTLSolverEbos.hpp
+++ b/opm/simulators/linalg/ISTLSolverEbos.hpp
@ -35,7 +35,7 @@
 #include <opm/simulators/linalg/setupPropertyTree.hpp>


-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
 #include <opm/simulators/linalg/bda/BdaBridge.hpp>
 #endif

@ -92,7 +92,7 @@ namespace Opm
        using WellModelOperator = WellModelAsLinearOperator<WellModel, Vector, Vector>;
        using ElementMapper = GetPropType<TypeTag, Properties::ElementMapper>;

-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
        static const unsigned int block_size = Matrix::block_type::rows;
        std::unique_ptr<BdaBridge<Matrix, Vector, block_size>> bdaBridge;
 #endif
@ -126,14 +126,14 @@ namespace Opm
 #endif
            parameters_.template init<TypeTag>();
            prm_ = setupPropertyTree<TypeTag>(parameters_);
-#if HAVE_CUDA || HAVE_OPENCL
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
            {
-                std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
-                if ((simulator_.vanguard().grid().comm().size() > 1) && (gpu_mode != "none")) {
+                std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
+                if ((simulator_.vanguard().grid().comm().size() > 1) && (accelerator_mode != "none")) {
                    if (on_io_rank) {
-                        OpmLog::warning("Cannot use GPU with MPI, GPU is disabled");
+                        OpmLog::warning("Cannot use GPU or FPGA with MPI, GPU/FPGA are disabled");
                    }
-                    gpu_mode = "none";
+                    accelerator_mode = "none";
                }
                const int platformID = EWOMS_GET_PARAM(TypeTag, int, OpenclPlatformId);
                const int deviceID = EWOMS_GET_PARAM(TypeTag, int, BdaDeviceId);
@ -141,11 +141,12 @@ namespace Opm
                const double tolerance = EWOMS_GET_PARAM(TypeTag, double, LinearSolverReduction);
                const std::string opencl_ilu_reorder = EWOMS_GET_PARAM(TypeTag, std::string, OpenclIluReorder);
                const int linear_solver_verbosity = parameters_.linear_solver_verbosity_;
-                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(gpu_mode, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
+                std::string fpga_bitstream = EWOMS_GET_PARAM(TypeTag, std::string, FpgaBitstream);
+                bdaBridge.reset(new BdaBridge<Matrix, Vector, block_size>(accelerator_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder));
            }
 #else
-            if (EWOMS_GET_PARAM(TypeTag, std::string, GpuMode) != "none") {
-                OPM_THROW(std::logic_error,"Error cannot use GPU solver since neither CUDA nor OpenCL were found by cmake");
+            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) != "none") {
+                OPM_THROW(std::logic_error,"Cannot use accelerated solver since neither CUDA nor OpenCL were found by cmake and FPGA was not enabled");
            }
 #endif
            extractParallelGridInformationToISTL(simulator_.vanguard().grid(), parallelInformation_);
@ -157,6 +158,12 @@ namespace Opm
            detail::findOverlapAndInterior(simulator_.vanguard().grid(), elemMapper, overlapRows_, interiorRows_);

            useWellConn_ = EWOMS_GET_PARAM(TypeTag, bool, MatrixAddWellContributions);
+#if HAVE_FPGA
+            // check usage of MatrixAddWellContributions: for FPGA they must be included
+            if (EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode) == "fpga" && !useWellConn_) {
+                OPM_THROW(std::logic_error,"fpgaSolver needs --matrix-add-well-contributions=true");
+            }
+#endif
            const bool ownersFirst = EWOMS_GET_PARAM(TypeTag, bool, OwnerCellsFirst);
            if (!ownersFirst) {
                const std::string msg = "The linear solver no longer supports --owner-cells-first=false.";
@ -242,43 +249,42 @@ namespace Opm

            // Solve system.
            Dune::InverseOperatorResult result;
-            bool gpu_was_used = false;
+            bool accelerator_was_used = false;

            // Use GPU if: available, chosen by user, and successful.
-#if HAVE_CUDA || HAVE_OPENCL
+            // Use FPGA if: support compiled, chosen by user, and successful.
+#if HAVE_CUDA || HAVE_OPENCL || HAVE_FPGA
            bool use_gpu = bdaBridge->getUseGpu();
-            if (use_gpu) {
-                const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
-                WellContributions wellContribs(gpu_mode);
-
+            bool use_fpga = bdaBridge->getUseFpga();
+            if (use_gpu || use_fpga) {
+                const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
+                WellContributions wellContribs(accelerator_mode);
                bdaBridge->initWellContributions(wellContribs);

                if (!useWellConn_) {
                    simulator_.problem().wellModel().getWellContributions(wellContribs);
                }
+
                // Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
                bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
                if (result.converged) {
                    // get result vector x from non-Dune backend, iff solve was successful
                    bdaBridge->get_result(x);
-                    gpu_was_used = true;
+                    accelerator_was_used = true;
                } else {
-                    // CPU fallback
-                    use_gpu = bdaBridge->getUseGpu();  // update value, BdaBridge might have disabled cusparseSolver
-                    if (use_gpu && simulator_.gridView().comm().rank() == 0) {
-                        if (gpu_mode.compare("cusparse") == 0) {
-                            OpmLog::warning("cusparseSolver did not converge, now trying Dune to solve current linear system...");
-                        }
-                        if (gpu_mode.compare("opencl") == 0) {
-                            OpmLog::warning("openclSolver did not converge, now trying Dune to solve current linear system...");
-                        }
+                    // warn about CPU fallback
+                    // BdaBridge might have disabled its BdaSolver for this simulation due to some error
+                    // in that case the BdaBridge is disabled and flexibleSolver is always used
+                    // or maybe the BdaSolver did not converge in time, then it will be used next linear solve
+                    if (simulator_.gridView().comm().rank() == 0) {
+                        OpmLog::warning(bdaBridge->getAccleratorName() + " did not converge, now trying Dune to solve current linear system...");
                    }
                }
            }
 #endif

            // Otherwise, use flexible istl solver.
-            if (!gpu_was_used) {
+            if (!accelerator_was_used) {
                assert(flexibleSolver_);
                flexibleSolver_->apply(x, *rhs_, result);
            }
--- a/opm/simulators/linalg/bda/BILU0.cpp
+++ b/opm/simulators/linalg/bda/BILU0.cpp
@ -33,24 +33,19 @@
 namespace bda
 {

-    using Opm::OpmLog;
-    using Dune::Timer;
+using Opm::OpmLog;
+using Dune::Timer;

-    template <unsigned int block_size>
-    BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
+template <unsigned int block_size>
+BILU0<block_size>::BILU0(ILUReorder opencl_ilu_reorder_, int verbosity_) :
    verbosity(verbosity_), opencl_ilu_reorder(opencl_ilu_reorder_)
-    {}
+{}

-    template <unsigned int block_size>
-    BILU0<block_size>::~BILU0()
-    {
+template <unsigned int block_size>
+BILU0<block_size>::~BILU0()
+{
    delete[] invDiagVals;
-        delete[] diagIndex;
-        if (opencl_ilu_reorder != ILUReorder::NONE) {
-            delete[] toOrder;
-            delete[] fromOrder;
-        }
-    }
+}

    template <unsigned int block_size>
    bool BILU0<block_size>::init(BlockedMatrix<block_size> *mat)
@ -68,8 +63,8 @@ namespace bda
        if (opencl_ilu_reorder == ILUReorder::NONE) {
            LUmat = std::make_unique<BlockedMatrix<block_size> >(*mat);
        } else {
-            toOrder = new int[Nb];
-            fromOrder = new int[Nb];
+            toOrder.resize(Nb);
+            fromOrder.resize(Nb);
            CSCRowIndices = new int[nnzbs];
            CSCColPointers = new int[Nb + 1];
            rmat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
@ -88,10 +83,10 @@ namespace bda
        std::ostringstream out;
        if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
            out << "BILU0 reordering strategy: " << "level_scheduling\n";
-            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+            findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
        } else if (opencl_ilu_reorder == ILUReorder::GRAPH_COLORING) {
            out << "BILU0 reordering strategy: " << "graph_coloring\n";
-            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder, fromOrder, rowsPerColor);
+            findGraphColoring<block_size>(mat->colIndices, mat->rowPointers, CSCRowIndices, CSCColPointers, mat->Nb, mat->Nb, mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
        } else if (opencl_ilu_reorder == ILUReorder::NONE) {
            out << "BILU0 reordering strategy: none\n";
            // numColors = 1;
@ -111,12 +106,13 @@ namespace bda
        }
        OpmLog::info(out.str());

+
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            delete[] CSCRowIndices;
            delete[] CSCColPointers;
        }

-        diagIndex = new int[mat->Nb];
+        diagIndex.resize(mat->Nb);
        invDiagVals = new double[mat->Nb * bs * bs];

 #if CHOW_PATEL
@ -159,7 +155,7 @@ namespace bda
        }

    return true;
-    } // end init()
+} // end init()


    // implements Fine-Grained Parallel ILU algorithm (FGPILU), Chow and Patel 2015
@ -482,7 +478,7 @@ namespace bda
                diagIndex[row] = candidate - LUmat->colIndices;
            }
            events.resize(8);
-            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[3]);
+            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[3]);
            queue->enqueueWriteBuffer(s.Lcols, CL_FALSE, 0, Lmat->nnzbs * sizeof(int), Lmat->colIndices, nullptr, &events[4]);
            queue->enqueueWriteBuffer(s.Lrows, CL_FALSE, 0, (Lmat->Nb + 1) * sizeof(int), Lmat->rowPointers, nullptr, &events[5]);
            queue->enqueueWriteBuffer(s.Ucols, CL_FALSE, 0, Umat->nnzbs * sizeof(int), cols.data(), nullptr, &events[6]);
@ -516,7 +512,7 @@ namespace bda
        if (opencl_ilu_reorder != ILUReorder::NONE) {
            m = rmat.get();
            Timer t_reorder;
-            reorderBlockedMatrixByPattern<block_size>(mat, toOrder, fromOrder, rmat.get());
+            reorderBlockedMatrixByPattern<block_size>(mat, toOrder.data(), fromOrder.data(), rmat.get());

            if (verbosity >= 3){
                std::ostringstream out;
@ -556,7 +552,7 @@ namespace bda
                diagIndex[row] = candidate - LUmat->colIndices;
            }
            events.resize(4);
-            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex, nullptr, &events[1]);
+            queue->enqueueWriteBuffer(s.diagIndex, CL_FALSE, 0, Nb * sizeof(int), diagIndex.data(), nullptr, &events[1]);
            queue->enqueueWriteBuffer(s.LUcols, CL_FALSE, 0, LUmat->nnzbs * sizeof(int), LUmat->colIndices, nullptr, &events[2]);
            queue->enqueueWriteBuffer(s.LUrows, CL_FALSE, 0, (LUmat->Nb + 1) * sizeof(int), LUmat->rowPointers, nullptr, &events[3]);
        });
@ -637,30 +633,30 @@ namespace bda
    }


-    template <unsigned int block_size>
-    void BILU0<block_size>::setOpenCLContext(cl::Context *context_){
+template <unsigned int block_size>
+void BILU0<block_size>::setOpenCLContext(cl::Context *context_) {
    this->context = context_;
-    }
-    template <unsigned int block_size>
-    void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_){
+}
+template <unsigned int block_size>
+void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
    this->queue = queue_;
-    }
-    template <unsigned int block_size>
-    void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_){
+}
+template <unsigned int block_size>
+void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
    this->work_group_size = work_group_size_;
    this->total_work_items = total_work_items_;
    this->lmem_per_work_group = lmem_per_work_group_;
-    }
-    template <unsigned int block_size>
-    void BILU0<block_size>::setKernels(
+}
+template <unsigned int block_size>
+void BILU0<block_size>::setKernels(
    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
    cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
    cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
-    ){
+){
    this->ILU_apply1 = ILU_apply1_;
    this->ILU_apply2 = ILU_apply2_;
    this->ilu_decomp_k = ilu_decomp_k_;
-    }
+}


 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                     \
--- a/opm/simulators/linalg/bda/BILU0.hpp
+++ b/opm/simulators/linalg/bda/BILU0.hpp
@ -61,10 +61,10 @@ namespace bda
        std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
 #endif
        double *invDiagVals = nullptr;
-        int *diagIndex = nullptr;
+        std::vector<int> diagIndex;
        std::vector<int> rowsPerColor;  // color i contains rowsPerColor[i] rows, which are processed in parallel
        std::vector<int> rowsPerColorPrefix;  // the prefix sum of rowsPerColor
-        int *toOrder = nullptr, *fromOrder = nullptr;
+        std::vector<int> toOrder, fromOrder;
        int numColors;
        int verbosity;
        std::once_flag pattern_uploaded;
@ -128,12 +128,12 @@ namespace bda

        int* getToOrder()
        {
-            return toOrder;
+            return toOrder.data();
        }

        int* getFromOrder()
        {
-            return fromOrder;
+            return fromOrder.data();
        }

        BlockedMatrix<block_size>* getRMat()
--- a/opm/simulators/linalg/bda/BdaBridge.cpp
+++ b/opm/simulators/linalg/bda/BdaBridge.cpp
@ -18,8 +18,6 @@
 */

 #include <config.h>
-#include <memory>
-#include <sstream>

 #include <opm/common/OpmLog/OpmLog.hpp>
 #include <opm/common/ErrorMacros.hpp>
@ -54,21 +52,23 @@ namespace Opm
    using bda::ILUReorder;

 template <class BridgeMatrix, class BridgeVector, int block_size>
-BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID OPM_UNUSED, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
-: gpu_mode(gpu_mode_)
+BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder OPM_UNUSED)
+: accelerator_mode(accelerator_mode_)
 {
-    if (gpu_mode.compare("cusparse") == 0) {
+    if (accelerator_mode.compare("cusparse") == 0) {
 #if HAVE_CUDA
        use_gpu = true;
        backend.reset(new bda::cusparseSolverBackend<block_size>(linear_solver_verbosity, maxit, tolerance, deviceID));
 #else
        OPM_THROW(std::logic_error, "Error cusparseSolver was chosen, but CUDA was not found by CMake");
 #endif
-    } else if (gpu_mode.compare("opencl") == 0) {
+    } else if (accelerator_mode.compare("opencl") == 0) {
 #if HAVE_OPENCL
        use_gpu = true;
-        ILUReorder ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
-        if (opencl_ilu_reorder == "level_scheduling") {
+        ILUReorder ilu_reorder;
+        if (opencl_ilu_reorder == "") {
+            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;  // default when not selected by user
+        } else if (opencl_ilu_reorder == "level_scheduling") {
            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
        } else if (opencl_ilu_reorder == "graph_coloring") {
            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
@ -81,10 +81,28 @@ BdaBridge<BridgeMatrix, BridgeVector, block_size>::BdaBridge(std::string gpu_mod
 #else
        OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
 #endif
-    } else if (gpu_mode.compare("none") == 0) {
-        use_gpu = false;
+    } else if (accelerator_mode.compare("fpga") == 0) {
+#if HAVE_FPGA
+        use_fpga = true;
+        ILUReorder ilu_reorder;
+        if (opencl_ilu_reorder == "") {
+            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;  // default when not selected by user
+        } else if (opencl_ilu_reorder == "level_scheduling") {
+            ilu_reorder = bda::ILUReorder::LEVEL_SCHEDULING;
+        } else if (opencl_ilu_reorder == "graph_coloring") {
+            ilu_reorder = bda::ILUReorder::GRAPH_COLORING;
        } else {
-        OPM_THROW(std::logic_error, "Error unknown value for parameter 'GpuMode', should be passed like '--gpu-mode=[none|cusparse|opencl]");
+            OPM_THROW(std::logic_error, "Error invalid argument for --opencl-ilu-reorder, usage: '--opencl-ilu-reorder=[level_scheduling|graph_coloring]'");
+        }
+        backend.reset(new bda::FpgaSolverBackend<block_size>(fpga_bitstream, linear_solver_verbosity, maxit, tolerance, ilu_reorder));
+#else
+        OPM_THROW(std::logic_error, "Error fpgaSolver was chosen, but FPGA was not enabled by CMake");
+#endif
+    } else if (accelerator_mode.compare("none") == 0) {
+        use_gpu = false;
+        use_fpga = false;
+    } else {
+        OPM_THROW(std::logic_error, "Error unknown value for parameter 'AcceleratorMode', should be passed like '--accelerator-mode=[none|cusparse|opencl|fpga]");
    }
 }

@ -130,7 +148,7 @@ int checkZeroDiagonal(BridgeMatrix& mat) {


 // iterate sparsity pattern from Matrix and put colIndices and rowPointers in arrays
-// sparsity pattern should stay the same due to matrix-add-well-contributions
+// sparsity pattern should stay the same
 // this could be removed if Dune::BCRSMatrix features an API call that returns colIndices and rowPointers
 template <class BridgeMatrix>
 void getSparsityPattern(BridgeMatrix& mat, std::vector<int> &h_rows, std::vector<int> &h_cols) {
@ -161,14 +179,16 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatrix *mat OPM_UNUSED, BridgeVector &b OPM_UNUSED, WellContributions& wellContribs OPM_UNUSED, InverseOperatorResult &res OPM_UNUSED)
 {

-    if (use_gpu) {
+    if (use_gpu || use_fpga) {
        BdaResult result;
        result.converged = false;
        static std::vector<int> h_rows;
        static std::vector<int> h_cols;
        const int dim = (*mat)[0][0].N();
-        const int N = mat->N()*dim;
-        const int nnz = (h_rows.empty()) ? mat->nonzeroes()*dim*dim : h_rows.back()*dim*dim;
+        const int Nb = mat->N();
+        const int N = Nb * dim;
+        const int nnzb = (h_rows.empty()) ? mat->nonzeroes() : h_rows.back();
+        const int nnz = nnzb * dim * dim;

        if (dim != 3) {
            OpmLog::warning("cusparseSolver only accepts blocksize = 3 at this time, will use Dune for the remainder of the program");
@ -177,8 +197,8 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
        }

        if (h_rows.capacity() == 0) {
-            h_rows.reserve(N+1);
-            h_cols.reserve(nnz);
+            h_rows.reserve(Nb+1);
+            h_cols.reserve(nnzb);
 #if PRINT_TIMERS_BRIDGE
            Dune::Timer t;
 #endif
@ -225,7 +245,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri
        res.converged = result.converged;
        res.conv_rate = result.conv_rate;
        res.elapsed = result.elapsed;
-    }else{
+    } else {
        res.converged = false;
    }
 }
@ -233,14 +253,14 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::solve_system(BridgeMatri

 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector &x OPM_UNUSED) {
-    if (use_gpu) {
+    if (use_gpu || use_fpga) {  // GPU and FPGA modes both fetch the result through the shared backend pointer
        backend->get_result(static_cast<double*>(&(x[0][0])));
    }
 }

 template <class BridgeMatrix, class BridgeVector, int block_size>
 void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(WellContributions& wellContribs) {
-    if(gpu_mode.compare("opencl") == 0){
+    if(accelerator_mode.compare("opencl") == 0){
 #if HAVE_OPENCL
        const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(backend.get());
        wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
@ -250,12 +270,12 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions(We
    }
 }

-
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                                                                                \
 template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,              \
 Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
 n>::BdaBridge                                                                                                                       \
-(std::string gpu_mode_, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);  \
+(std::string accelerator_mode_, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance,               \
+unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);                                                    \
                                                                                                                                    \
 template void BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >,         \
 Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >,                                    \
--- a/opm/simulators/linalg/bda/BdaBridge.hpp
+++ b/opm/simulators/linalg/bda/BdaBridge.hpp
@ -30,6 +30,10 @@
 #include <opm/simulators/linalg/bda/WellContributions.hpp>


+#if HAVE_FPGA
+#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
+#endif
+
 namespace Opm
 {

@ -42,19 +46,22 @@ class BdaBridge
 {
 private:
    bool use_gpu = false;
-    std::string gpu_mode;
+    bool use_fpga = false;
+    std::string accelerator_mode;
    std::unique_ptr<bda::BdaSolver<block_size> > backend;

 public:
    /// Construct a BdaBridge
-    /// \param[in] gpu_mode                   to select if a gpu solver is used, is passed via command-line: '--gpu-mode=[none|cusparse|opencl]'
+    /// \param[in] accelerator_mode           to select if an accelerated solver is used, is passed via command-line: '--accelerator-mode=[none|cusparse|opencl|fpga]'
+    /// \param[in] fpga_bitstream             FPGA programming bitstream file name, is passed via command-line: '--fpga-bitstream=[<filename>]'
    /// \param[in] linear_solver_verbosity    verbosity of BdaSolver
    /// \param[in] maxit                      maximum number of iterations for BdaSolver
    /// \param[in] tolerance                  required relative tolerance for BdaSolver
    /// \param[in] platformID                 the OpenCL platform ID to be used
    /// \param[in] deviceID                   the device ID to be used by the cusparse- and openclSolvers, too high values could cause runtime errors
-    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
-    BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
+    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
+    BdaBridge(std::string accelerator_mode, std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
+

    /// Solve linear system, A*x = b
    /// \warning Values of A might get overwritten!
@ -79,6 +86,16 @@ public:
    /// \param[in] wellContribs   container to hold all WellContributions
    void initWellContributions(WellContributions& wellContribs);

+    /// Query whether the FPGA backend was selected for this bridge
+    /// \return true if the BdaBridge will use the FPGA, false otherwise
+    bool getUseFpga() const {
+        return use_fpga;
+    }
+
+    /// Return the selected accelerator mode, this is input via the command-line
+    /// \note the spelling of the method name ("Acclerator") is kept as is so that existing callers keep compiling
+    /// \return a copy of the accelerator mode string stored in this bridge
+    std::string getAccleratorName() const {
+        return accelerator_mode;
+    }

 }; // end class BdaBridge

--- a/opm/simulators/linalg/bda/BdaSolver.hpp
+++ b/opm/simulators/linalg/bda/BdaSolver.hpp
@ -55,6 +55,8 @@ namespace bda
        int maxit = 200;
        double tolerance = 1e-2;

+        std::string bitstream = "";
+
        int N;           // number of rows
        int Nb;          // number of blocked rows (Nb*block_size == N)
        int nnz;         // number of nonzeroes (scalars)
@ -66,7 +68,8 @@ namespace bda
        bool initialized = false;

    public:
-        /// Construct a BdaSolver, can be cusparseSolver or openclSolver
+        /// Construct a BdaSolver, can be cusparseSolver, openclSolver, fpgaSolver
+        /// \param[in] fpga_bitstream             FPGA bitstream file name (only for fpgaSolver)
        /// \param[in] linear_solver_verbosity    verbosity of solver
        /// \param[in] maxit                      maximum number of iterations for solver
        /// \param[in] tolerance                  required relative tolerance for solver
@ -74,6 +77,7 @@ namespace bda
        /// \param[in] deviceID                   the device to be used
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), deviceID(deviceID_) {};
        BdaSolver(int linear_solver_verbosity, int max_it, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : verbosity(linear_solver_verbosity), maxit(max_it), tolerance(tolerance_), platformID(platformID_), deviceID(deviceID_) {};
+        /// FPGA overload: stores the bitstream file name next to the common solver settings, platformID and deviceID are not set by this overload
+        BdaSolver(std::string fpga_bitstream, int linear_solver_verbosity, int max_it, double tolerance_)
+            : verbosity(linear_solver_verbosity),
+              maxit(max_it),
+              tolerance(tolerance_),
+              bitstream(fpga_bitstream)
+        {}

        /// Define virtual destructor, so that the derivedclass destructor will be called
        virtual ~BdaSolver() {};
--- a/opm/simulators/linalg/bda/BlockedMatrix.cpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.cpp
@ -20,13 +20,19 @@
 #include <cstring>
 #include <cmath>

-#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <config.h>

-using bda::BlockedMatrix;
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/FPGAUtils.hpp>

 namespace bda
 {

+using Opm::OpmLog;
+
 /*Sort a row of matrix elements from a blocked CSR-format.*/

 template <unsigned int block_size>
@ -93,23 +99,380 @@ void blockMult(double *mat1, double *mat2, double *resMat) {
    }
 }

-// subtract c from b and store in a
-// a = b - c
+#if HAVE_FPGA
+
+/*Subtract two row-major block_size x block_size blocks element by element: resMat = mat1 - mat2. The result is the LAST argument; the replaced signature had the result first (a = b - c), so existing call sites must be checked*/
 template <unsigned int block_size>
-void blockSub(double *a, double *b, double *c)
-{
+void blockSub(double *mat1, double *mat2, double *resMat) {
    for (unsigned int row = 0; row < block_size; row++) {
        for (unsigned int col = 0; col < block_size; col++) {
-            a[block_size * row + col] = b[block_size * row + col] - c[block_size * row + col];
+            resMat[row * block_size + col] = mat1[row * block_size + col] - mat2[row * block_size + col];
        }
    }
 }

+// Blocked matrix-vector product that accumulates into resVect: resVect += scale * mat * vect.
+// mat is a row-major block_size x block_size block, vect and resVect hold block_size entries each.
+// When resetRes is true every entry of resVect is zeroed before its row is accumulated.
+template <unsigned int block_size>
+void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes) {
+    for (unsigned int r = 0; r < block_size; ++r) {
+        const double *matRow = mat + r * block_size;
+        if (resetRes) {
+            resVect[r] = 0.0;
+        }
+        // accumulate directly into resVect, one column at a time
+        for (unsigned int c = 0; c < block_size; ++c) {
+            resVect[r] += scale * matRow[c] * vect[c];
+        }
+    }
+}
+
+
+
+// Count the scalar entries, over all stored blocks, whose magnitude exceeds nnzThreshold.
+// rowPointers[Nb] is the number of stored blocks, each block holds block_size*block_size scalars.
+template <unsigned int block_size>
+int BlockedMatrix<block_size>::countUnblockedNnzs() {
+    const int numBlocks = rowPointers[Nb];
+    const unsigned int numScalars = numBlocks * block_size * block_size;
+    int count = 0;
+    for (unsigned int i = 0; i < numScalars; ++i) {
+        count += (fabs(nnzValues[i]) > nnzThreshold) ? 1 : 0;
+    }
+    return count;
+}
+
+/*
+ * Expand this blocked matrix into the scalar (unblocked) CSR matrix 'mat'.
+ * Entries whose magnitude does not exceed nnzThreshold are dropped.
+ * If unblocking the U matrix, the rows inside every block are written to the new matrix in reverse order.
+ * 'mat' is written in place, it is not (re)allocated here.
+*/
+template <unsigned int block_size>
+void BlockedMatrix<block_size>::unblock(Matrix *mat, bool isUMatrix) {
+    const unsigned int bs = block_size;
+    int written = 0;    // number of scalars stored in mat so far
+
+    mat->rowPointers[0] = 0;
+    // walk the block rows, and inside each block row the bs scalar rows it expands to
+    for (int blockRow = 0; blockRow < Nb; ++blockRow) {
+        const int firstBlock = rowPointers[blockRow];
+        const int lastBlock = rowPointers[blockRow + 1];
+        for (unsigned int inner = 0; inner < bs; ++inner) {
+            // row inside the block that feeds this output row (mirrored for the U matrix)
+            const unsigned int srcRow = isUMatrix ? (bs - inner - 1) : inner;
+            const int rowStart = written;
+            for (int blk = firstBlock; blk < lastBlock; ++blk) {
+                for (unsigned int innerCol = 0; innerCol < bs; ++innerCol) {
+                    const int src = blk * bs * bs + srcRow * bs + innerCol;
+                    if (fabs(nnzValues[src]) > nnzThreshold) {
+                        mat->nnzValues[written] = nnzValues[src];
+                        mat->colIndices[written] = colIndices[blk] * bs + innerCol;
+                        ++written;
+                    }
+                }
+            }
+            // close the scalar row: previous row pointer plus the entries just written
+            const unsigned int ubRow = blockRow * bs + inner;
+            mat->rowPointers[ubRow + 1] = mat->rowPointers[ubRow] + (written - rowStart);
+        }
+    }
+}
+
+
+
+// Convert this blocked matrix to the dataformat used by the FPGA.
+// The matrix is unblocked into a scalar CSR matrix first, the per-color block-row counts are scaled
+// to scalar rows, and the conversion itself is delegated to Matrix::toRDF(), whose result is returned.
+// valSize receives the nnzs member of the unblocked matrix.
+// ub* prefixes indicate unblocked data
+template <unsigned int block_size>
+int BlockedMatrix<block_size>::toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
+                                     std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
+                                     std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, unsigned char *NROffsets, int *colorSizes, int *valSize)
+{
+    // size the scalar matrix by the number of entries that pass the threshold
+    const int ubNnzs = countUnblockedNnzs();
+    auto ubMat = std::make_unique<Matrix>(Nb * block_size, ubNnzs);
+    unblock(ubMat.get(), isUMatrix);
+
+    // every blocked row expands into block_size scalar rows
+    std::vector<int> ubNodesPerColor(numColors);
+    for (int c = 0; c < numColors; ++c) {
+        ubNodesPerColor[c] = nodesPerColor[c] * block_size;
+    }
+
+    *valSize = ubMat->nnzs;
+
+    return ubMat->toRDF(numColors, ubNodesPerColor,
+                        colIndicesInColor, nnzsPerRowLimit,
+                        ubNnzValues, ubColIndices, nnzValsSizes,
+                        NROffsets, colorSizes);
+}
+
+
+// coloring is already done, numColors and nodesPerColor are set
+// [rows|columns]PerColorLimit are already queried from the FPGA
+// colIndicesInColor, PIndicesAddr and colorSizes are written here
+// There are 3 matrices analysed: the full matrix for spmv, L and U for ILU
+// node == row
+// color == partition
+// colorSizes: contains meta info about a color/partition, like number of rows and number of nnzs
+// colIndicesInColor: for each color: mapping of colIdx to colValue, unblocked. Used in Matrix::toRDF().
+//                    due to partitioning, lots of columns are removed, this matrix keeps track of the mapping
+// PIndicesAddr: contiguously for each color: indices of x in global x vector, unblocked
+//               if color 0 has A unique colAccesses, PIndicesAddr[0 - A] are for color 0
+//               then PIndicesAddr[A - A+B] are for color 1. Directly copied to FPGA
+// Entries of the [L|U]colorSizes arrays written here (all in unblocked units):
+//   [0], [1]:              number of rows and columns (Nb * block_size)
+//   [2]:                   total number of column accesses over all colors, including padding
+//   [4]:                   number of colors
+//   [6]:                   size of the diagonal values, including zeropadding
+//   [c*4 + 9], [c*4 + 10]: number of rows and of unique column accesses of color c
+//   entries of colors between numColors and the next multiple of 4 are zeroed
+// The U arrays are filled in reverse color order.
+// Returns 0. Raises std::logic_error through OPM_THROW when a color has more rows than rowsPerColorLimit
+// or accesses more columns than columnsPerColorLimit (both limits are in unblocked units).
+template <unsigned int block_size>
+int BlockedMatrix<block_size>::findPartitionColumns(int numColors, int *nodesPerColor,
+        int rowsPerColorLimit, int columnsPerColorLimit,
+        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
+        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
+        std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes)
+{
+    // Data related to column indices per partition
+    int doneRows = 0;
+    std::vector<bool> isColAccessed(Nb); // std::vector<bool> might have some different optimized implementation, initialize in a loop
+    std::vector<bool> isLColAccessed(Nb);
+    int totalCols = 0;    // sum of numColAccesses for each color, blocked
+    int LTotalCols = 0, UTotalCols = 0;
+    int maxCols = 0;         // max value of numColAccesses for any color
+    int maxRowsPerColor = 0; // max value of numRows for any color
+    int maxColsPerRow = 0;   // max value of colsPerRow for any color
+    // colsInColor holds all (blocked) columnIndices that are accessed by that color without duplicates
+    // colsInColor[c][i] contains the ith column that color c accesses
+    // initial size allows for each color to access all columns, with space for padding
+    std::vector<std::vector<int> > colsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
+    std::vector<std::vector<int> > LColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
+    std::vector<std::vector<int> > UColsInColor(numColors, std::vector<int>(roundUpTo(Nb, 16)));
+
+    // find which columns are accessed in each color, as well as how many non-zeroes there are per color.
+    for (int c = 0; c < numColors; c++) {
+        int numRows = 0;
+        // initialize
+        for (int row = 0; row < Nb; row++) {
+            isColAccessed[row] = false;
+            isLColAccessed[row] = false;
+        }
+        // the rows of the previous color are appended to the L partition below,
+        // mark them as accessed so they are not added twice
+        if (c > 0) {
+            for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
+                isLColAccessed[i] = true;
+            }
+        }
+        int numColAccesses = 0, LNumColAccesses = 0, UNumColAccesses = 0;   // number of unique accesses, blocked
+        // for every row in this color
+        for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
+            int colsPerRow = 0;    // number of blocks for this row
+            bool rowIsEmpty = (rowPointers[row] == rowPointers[row + 1]);
+            for (int idx = rowPointers[row]; idx < rowPointers[row + 1]; idx++) {
+                // for every column in the current row, check if that column was accessed before this color
+                int col = colIndices[idx];
+                if (isColAccessed[col] == false) {
+                    colsInColor[c][numColAccesses] = col;
+                    isColAccessed[col] = true;
+                    numColAccesses++;
+                    if (col > row) {
+                        UColsInColor[numColors - c - 1][UNumColAccesses] = col;
+                        UNumColAccesses++;
+                    }
+                }
+                if (isLColAccessed[col] == false) {
+                    if (col < row) {
+                        LColsInColor[c][LNumColAccesses] = col;
+                        LNumColAccesses++;
+                        isLColAccessed[col] = true;
+                    }
+                }
+                colsPerRow++;
+            }
+            if (rowIsEmpty != true) {
+                numRows++;
+            }
+            maxColsPerRow = std::max(maxColsPerRow, colsPerRow);
+        }
+
+        // add columns from previous color into L partition to simplify data forwarding
+        if (c > 0) {
+            for (int i = doneRows - nodesPerColor[c - 1]; i < doneRows; i++) {
+                LColsInColor[c][LNumColAccesses] = i;
+                LNumColAccesses++;
+            }
+        }
+
+        // number of unique column accesses per color, unblocked and without padding
+        colorSizes[c * 4 + 10] = numColAccesses * block_size;
+        LColorSizes[c * 4 + 10] = LNumColAccesses * block_size;
+        UColorSizes[(numColors - c - 1) * 4 + 10] = UNumColAccesses * block_size;
+
+        // store mapping
+        for (int col = 0; col < numColAccesses; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                colIndicesInColor[c][colsInColor[c][col]*block_size + i] = col * block_size + i;
+            }
+        }
+        for (int col = 0; col < LNumColAccesses; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                LColIndicesInColor[c][LColsInColor[c][col]*block_size + i] = col * block_size + i;
+            }
+        }
+        for (int col = 0; col < UNumColAccesses; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                UColIndicesInColor[numColors - c - 1][UColsInColor[numColors - c - 1][col]*block_size + i] = col * block_size + i;
+            }
+        }
+
+        // zeropad the colsInColor number to the nearest multiple of 16, because there are 16 32-bit color_col_index values per cacheline
+        while (numColAccesses % 16 != 0) {
+            colsInColor[c][numColAccesses] = colsInColor[c][numColAccesses - 1];
+            numColAccesses++;
+        }
+        while (LNumColAccesses % 16 != 0) {
+            LColsInColor[c][LNumColAccesses] = LColsInColor[c][LNumColAccesses - 1];
+            LNumColAccesses++;
+        }
+        while (UNumColAccesses % 16 != 0) {
+            UColsInColor[numColors - c - 1][UNumColAccesses] = UColsInColor[numColors - c - 1][UNumColAccesses - 1];
+            UNumColAccesses++;
+        }
+        maxCols = std::max(numColAccesses, maxCols);
+        totalCols += numColAccesses;
+        LTotalCols += LNumColAccesses;
+        UTotalCols += UNumColAccesses;
+        doneRows = doneRows + nodesPerColor[c];
+        maxRowsPerColor = std::max(numRows, maxRowsPerColor);
+    }
+
+    if (maxCols * static_cast<int>(block_size) > columnsPerColorLimit) {
+        std::ostringstream errorstring;
+        errorstring << "ERROR: Current reordering exceeds maximum number of columns per color limit: " << maxCols * block_size << " > " << columnsPerColorLimit;
+        OPM_THROW(std::logic_error, errorstring.str());
+    }
+
+    doneRows = 0;
+    int diagValsSize = 0;
+    int maxRows = 0;     // max number of blocked rows of any color
+
+    for (int c = 0; c < numColors; c++) {
+        // calculate sizes that include zeropadding
+        diagValsSize += roundUpTo(nodesPerColor[c] * block_size * 4, 8);
+        doneRows += nodesPerColor[c];
+        // maxRows is kept in blocked rows, so compare blocked against blocked;
+        // comparing the unblocked count against it would let a smaller color overwrite the maximum
+        maxRows = std::max(maxRows, nodesPerColor[c]);
+        colorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
+        LColorSizes[c * 4 + 9] = nodesPerColor[c] * block_size;
+        UColorSizes[c * 4 + 9] = nodesPerColor[numColors - c - 1] * block_size;
+    }
+
+    if (maxRows * static_cast<int>(block_size) > rowsPerColorLimit) {
+        std::ostringstream errorstring;
+        errorstring << "ERROR: Current reordering exceeds maximum number of rows per color limit: " << maxRows * block_size << " > " << rowsPerColorLimit;
+        OPM_THROW(std::logic_error, errorstring.str());
+    }
+
+    // create and fill sizes array as far as already possible
+    colorSizes[0] = Nb * block_size;
+    LColorSizes[0] = Nb * block_size;
+    UColorSizes[0] = Nb * block_size;
+    // col_sizes (but the matrix is square)
+    colorSizes[1] = Nb * block_size;
+    LColorSizes[1] = Nb * block_size;
+    UColorSizes[1] = Nb * block_size;
+    colorSizes[2] = totalCols * block_size;
+    LColorSizes[2] = LTotalCols * block_size;
+    UColorSizes[2] = UTotalCols * block_size;
+    // missing val_size, written in Matrix::toRDF()
+    colorSizes[4] = numColors;
+    LColorSizes[4] = numColors;
+    UColorSizes[4] = numColors;
+    // missing NRFlagsSize, written in Matrix::toRDF()
+    colorSizes[6] = diagValsSize;
+    LColorSizes[6] = diagValsSize;
+    UColorSizes[6] = diagValsSize;
+
+    // zero the per-color entries of the colors up to the next multiple of 4
+    int paddingIdx = numColors;
+    while (paddingIdx % 4 != 0) {
+        for (unsigned int i = 0; i < 4; i++) {
+            colorSizes[paddingIdx * 4 + 8 + i] = 0;
+            LColorSizes[paddingIdx * 4 + 8 + i] = 0;
+            UColorSizes[paddingIdx * 4 + 8 + i] = 0;
+        }
+        paddingIdx++;
+    }
+
+    int index = 0, Lindex = 0, Uindex = 0;
+    for (int c = 0; c < numColors; c++) {
+        // for each unique col access
+        for (int col = 0; col < colorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                PIndicesAddr[index] = colsInColor[c][col] * block_size + i;
+                index++;
+            }
+        }
+        // add padding
+        while (index % 16 != 0) {
+            PIndicesAddr[index] = PIndicesAddr[index - 1];
+            index++;
+        }
+        for (int col = 0; col < LColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                LPIndicesAddr[Lindex] = LColsInColor[c][col] * block_size + i;
+                Lindex++;
+            }
+        }
+        while (Lindex % 16 != 0) {
+            LPIndicesAddr[Lindex] = LPIndicesAddr[Lindex - 1];
+            Lindex++;
+        }
+        // UColorSizes and UColsInColor were already stored in reverse color order above
+        for (int col = 0; col < UColorSizes[c * 4 + 10] / static_cast<int>(block_size) ; col++) {
+            for (unsigned int i = 0; i < block_size; i++) {
+                UPIndicesAddr[Uindex] = UColsInColor[c][col] * block_size + i;
+                Uindex++;
+            }
+        }
+        while (Uindex % 16 != 0) {
+            UPIndicesAddr[Uindex] = UPIndicesAddr[Uindex - 1];
+            Uindex++;
+        }
+    }
+    return 0;
+}
+
+
+// Write the blocked (3x3) diagonal values into RDFDiag, color by color.
+// Blocks are read starting from the last block row and going backwards, and the rows inside a block
+// are emitted in reverse order. Every emitted row gets a trailing zero (3 -> 4 values), and the
+// output of each color is zero-filled up to the next multiple of 8 values.
+void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag) {
+    const unsigned int block_size = 3;
+    int lastRow = rowSize - 1;    // block row that the current color starts reading from
+    int out = 0;                  // next position to write in RDFDiag
+    for (int color = 0; color < numColors; ++color) {
+        const int rowsInColor = rowsPerColor[color];
+        for (int r = 0; r < rowsInColor; ++r) {
+            const unsigned int blockStart = (lastRow - r) * block_size * block_size;
+            for (unsigned int k = 0; k < block_size; ++k) {
+                // k-th emitted row is row (block_size - 1 - k) of the block
+                const unsigned int rowStart = blockStart + (block_size - 1 - k) * block_size;
+                for (unsigned int j = 0; j < block_size; ++j) {
+                    RDFDiag[out++] = blockedDiagVals[rowStart + j];
+                }
+                // 4th column, zeropadding
+                RDFDiag[out++] = 0.0;
+            }
+        }
+        lastRow -= rowsInColor;
+
+        // fill the remainder of the cacheline so the next color starts on a multiple of 8
+        for (; out % 8 != 0; ++out) {
+            RDFDiag[out] = 0.0;
+        }
+    }
+    assert(out % 8 == 0);
+}
+
+#endif // HAVE_FPGA
+
+
+
 #define INSTANTIATE_BDA_FUNCTIONS(n)                                        \
 template void sortBlockedRow<n>(int *, double *, int, int);                 \
 template void blockMultSub<n>(double *, double *, double *);                \
 template void blockMult<n>(double *, double *, double *);                   \
-template void blockSub<n>(double *, double *, double *);             \

 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
@ -118,4 +481,26 @@ INSTANTIATE_BDA_FUNCTIONS(4);

 #undef INSTANTIATE_BDA_FUNCTIONS

+#if HAVE_FPGA  // explicit instantiations of the FPGA-only templates, emitted below for block sizes 1 to 4
+#define INSTANTIATE_BDA_FPGA_FUNCTIONS(n)                                             \
+template void blockSub<n>(double *, double *, double *);                              \
+template void blockVectMult<n>(double *, double *, double, double *, bool);           \
+template int BlockedMatrix<n>::toRDF(int, int *, bool,                                \
+    std::vector<std::vector<int> >& , int, int *,                                     \
+    std::vector<std::vector<double> >&, short int *, unsigned char *, int *,  int *); \
+template int BlockedMatrix<n>::findPartitionColumns(int, int *,                       \
+        int, int,                                                                     \
+        std::vector<std::vector<int> >& , int *, int *,                               \
+        std::vector<std::vector<int> >& , int *, int *,                               \
+        std::vector<std::vector<int> >& , int *, int *);
+
+INSTANTIATE_BDA_FPGA_FUNCTIONS(1);
+INSTANTIATE_BDA_FPGA_FUNCTIONS(2);
+INSTANTIATE_BDA_FPGA_FUNCTIONS(3);
+INSTANTIATE_BDA_FPGA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FPGA_FUNCTIONS  // the helper macro is only needed for the four lines above
+#endif // HAVE_FPGA
+
+
 } // end namespace bda
--- a/opm/simulators/linalg/bda/BlockedMatrix.hpp
+++ b/opm/simulators/linalg/bda/BlockedMatrix.hpp
@ -20,29 +20,40 @@
 #ifndef BLOCKED_MATRIX_HPP
 #define BLOCKED_MATRIX_HPP

+#if HAVE_FPGA
+#include <vector>
+#endif
+
+#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
+
 namespace bda
 {

 /// This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
 /// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
-template<int BS>
-struct BlockedMatrix{
+template<unsigned int block_size>
+class BlockedMatrix
+{
+
+public:
+
    /// Allocate BlockedMatrix and data arrays with given sizes
    /// \param[in] Nb               number of blockrows
    /// \param[in] nnzbs            number of nonzero blocks
    BlockedMatrix(int Nb_, int nnzbs_)
-    : nnzValues(new double[nnzbs_*BS*BS]),
-      colIndices(new int[nnzbs_*BS*BS]),
+    : nnzValues(new double[nnzbs_*block_size*block_size]),
+      colIndices(new int[nnzbs_*block_size*block_size]),
      rowPointers(new int[Nb_+1]),
      Nb(Nb_),
      nnzbs(nnzbs_),
      deleteNnzs(true),
      deleteSparsity(true)
    {}
+
    /// Allocate BlockedMatrix, but copy sparsity pattern instead of allocating new memory
    /// \param[in] M              matrix to be copied
    BlockedMatrix(const BlockedMatrix& M)
-    : nnzValues(new double[M.nnzbs*BS*BS]),
+    : nnzValues(new double[M.nnzbs*block_size*block_size]),
      colIndices(M.colIndices),
      rowPointers(M.rowPointers),
      Nb(M.Nb),
@ -50,10 +61,11 @@ struct BlockedMatrix{
      deleteNnzs(true),
      deleteSparsity(false)
    {}
+
    /// Allocate BlockedMatrix, but let data arrays point to existing arrays
    /// \param[in] Nb             number of blockrows
    /// \param[in] nnzbs          number of nonzero blocks
-    /// \param[in] nnzValues      array of nonzero values, contains nnzb*BS*BS scalars
+    /// \param[in] nnzValues      array of nonzero values, contains nnzb*block_size*block_size scalars
    /// \param[in] colIndices     array of column indices, contains nnzb entries
    /// \param[in] rowPointers    array of row pointers, contains Nb+1 entries
    BlockedMatrix(int Nb_, int nnzbs_, double *nnzValues_, int *colIndices_, int *rowPointers_)
@ -76,6 +88,28 @@ struct BlockedMatrix{
        }
    }

+#if HAVE_FPGA
+    constexpr static double nnzThreshold = 1e-80;  // for unblocking, a nonzero must be bigger than this threshold to be considered a nonzero
+
+    int countUnblockedNnzs();  // number of scalar entries, over all stored blocks, whose magnitude exceeds nnzThreshold
+
+    void unblock(Matrix *mat, bool isUMatrix);  // write this matrix as scalar CSR into mat, keeping only entries above nnzThreshold; rows inside each block are reversed when isUMatrix is true
+
+    /// Converts this matrix to the dataformat used by the FPGA
+    /// Is done every linear solve. The exact sparsity pattern can change every time since the zeros are removed during unblocking
+    int toRDF(int numColors, int *nodesPerColor, bool isUMatrix,
+        std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, int *nnzValsSizes,
+        std::vector<std::vector<double> >& nnzValues, short int *colIndices, unsigned char *NROffsets, int *colorSizes, int *valSize);
+
+    /// Analyses the sparsity pattern and prepares for toRDF()
+    /// Is only called once
+    int findPartitionColumns(int numColors, int *nodesPerColor,
+        int rowsPerColorLimit, int columnsPerColorLimit,
+        std::vector<std::vector<int> >& colIndicesInColor, int *PIndicesAddr, int *colorSizes,
+        std::vector<std::vector<int> >& LColIndicesInColor, int *LPIndicesAddr, int *LColorSizes,
+        std::vector<std::vector<int> >& UColIndicesInColor, int *UPIndicesAddr, int *UColorSizes);
+#endif
+
    double *nnzValues;
    int *colIndices;
    int *rowPointers;
@ -109,13 +143,26 @@ void blockMultSub(double *a, double *b, double *c);
 template <unsigned int block_size>
 void blockMult(double *mat1, double *mat2, double *resMat);

-/// Subtract blocks
-/// a = b - c
-/// \param[out] a                result block
-/// \param[in] b                 input block
-/// \param[in] c                 input block
+
+#if HAVE_FPGA
 template <unsigned int block_size>
-void blockSub(double *a, double *b, double *c);
+void blockSub(double *mat1, double *mat2, double *resMat);  // resMat = mat1 - mat2, element by element; the result is now the last argument
+
+template <unsigned int block_size>
+void blockVectMult(double *mat, double *vect, double scale, double *resVect, bool resetRes);  // resVect += scale * mat * vect; resVect is zeroed first when resetRes is true
+
+/// Convert a blocked inverse diagonal to the FPGA format.
+/// This is the only blocked structure on the FPGA, since it needs blocked matrix-vector multiplication after the backwards substitution of U.
+/// Since the rows of U are reversed, the rows of the diag are also reversed.
+/// The cachelines can hold 8 doubles, a block has 9 doubles.
+/// The format converts 3x3 blocks to 3x4 blocks, so 1 cacheline holds 2 unblocked rows.
+/// Then 2 blocks (24 doubles) fit on 3 cachelines.
+/// Example:
+/// [1 2 3]    [1 2 3 0]              [1 2 3 0 4 5 6 0]
+/// [4 5 6] -> [4 5 6 0] -> hardware: [7 8 9 0 block2 row1]
+/// [7 8 9]    [7 8 9 0]              [block2 row2 block2 row3]
+void blockedDiagtoRDF(double *blockedDiagVals, int rowSize, int numColors, std::vector<int>& rowsPerColor, double *RDFDiag);
+#endif

 } // end namespace bda

--- a/opm/simulators/linalg/bda/FPGABILU0.cpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.cpp
@ -0,0 +1,413 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+#include <opm/simulators/linalg/MatrixBlock.hpp>
+#include <dune/common/timer.hh>
+
+#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
+
+namespace bda
+{
+
+using Opm::OpmLog;
+using Dune::Timer;
+
+/// Store the hardware limits and verbosity, and translate the requested reordering
+/// strategy into the level_scheduling / graph_coloring flags.
+/// Throws std::logic_error (via OPM_THROW) for any other strategy value.
+template <unsigned int block_size>
+FPGABILU0<block_size>::FPGABILU0(ILUReorder opencl_ilu_reorder_, int verbosity_, int maxRowsPerColor_, int maxColsPerColor_, int maxNNZsPerRow_, int maxNumColors_) :
+    verbosity(verbosity_),
+    opencl_ilu_reorder(opencl_ilu_reorder_),
+    maxRowsPerColor(maxRowsPerColor_),
+    maxColsPerColor(maxColsPerColor_),
+    maxNNZsPerRow(maxNNZsPerRow_),
+    maxNumColors(maxNumColors_)
+{
+    // exactly one of the two flags is raised; both default to false in the class
+    switch (opencl_ilu_reorder) {
+    case ILUReorder::LEVEL_SCHEDULING:
+        level_scheduling = true;
+        break;
+    case ILUReorder::GRAPH_COLORING:
+        graph_coloring = true;
+        break;
+    default:
+        OPM_THROW(std::logic_error, "Error ilu reordering strategy not set correctly\n");
+    }
+}
+
+
+/// Release the inverse-diagonal buffer allocated with new[] in init().
+/// invDiagVals is nullptr until init() has run, so deleting it is safe either way.
+template <unsigned int block_size>
+FPGABILU0<block_size>::~FPGABILU0()
+{
+    delete[] invDiagVals;
+}
+
+
+/// Analyse the sparsity pattern of mat: compute a row reordering (level scheduling or
+/// graph coloring), derive worst-case sizes for all FPGA-format arrays, allocate them,
+/// and publish their addresses and sizes through resultPointers / resultSizes.
+/// \param[in] mat   matrix whose pattern (colIndices, rowPointers, Nb, nnzbs) is analysed
+/// \return          always true; failures are raised with OPM_THROW(std::logic_error, ...)
+template <unsigned int block_size>
+bool FPGABILU0<block_size>::init(BlockedMatrix<block_size> *mat)
+{
+    const unsigned int bs = block_size;
+
+    resultPointers.resize(numResultPointers, nullptr);
+    resultSizes.resize(numResultSizes);
+
+    // Set nnzSplit as hardcoded constant until support for more than one nnzVals read array is added.
+    const unsigned int nnzSplit = 1;
+
+    this->N = mat->Nb * block_size;
+    this->Nb = mat->Nb;
+    this->nnz = mat->nnzbs * block_size * block_size;
+    this->nnzbs = mat->nnzbs;
+
+    toOrder.resize(Nb);
+    fromOrder.resize(Nb);
+
+    std::vector<int> CSCRowIndices(nnzbs);
+    std::vector<int> CSCColPointers(Nb + 1);
+
+    // NOTE(review): the CSC pattern is only computed for level scheduling; on the graph
+    // coloring path findGraphColoring() below receives these arrays zero-initialized.
+    // Confirm that findGraphColoring() does not read them.
+    if (level_scheduling) {
+        Timer t_convert;
+        csrPatternToCsc(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb);
+        if (verbosity >= 3) {
+            std::ostringstream out;
+            out << "FPGABILU0 convert CSR to CSC: " << t_convert.stop() << " s";
+            OpmLog::info(out.str());
+        }
+    }
+
+    Timer t_analysis;
+    // rMat receives the reordered matrix; LUMat is copy-constructed from it and later
+    // holds the in-place decomposition (see create_preconditioner())
+    rMat = std::make_shared<BlockedMatrix<block_size> >(mat->Nb, mat->nnzbs);
+    LUMat = std::make_unique<BlockedMatrix<block_size> >(*rMat);
+    std::ostringstream out;
+    if (level_scheduling) {
+        out << "FPGABILU0 reordering strategy: " << "level_scheduling\n";
+        findLevelScheduling(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    } else if (graph_coloring) {
+        out << "FPGABILU0 reordering strategy: " << "graph_coloring\n";
+        findGraphColoring<bs>(mat->colIndices, mat->rowPointers, CSCRowIndices.data(), CSCColPointers.data(), mat->Nb, maxRowsPerColor, maxColsPerColor, &numColors, toOrder.data(), fromOrder.data(), rowsPerColor);
+    }
+
+    if (numColors > maxNumColors) {
+        std::ostringstream errorstring;
+        errorstring << "ERROR: the matrix was reordered into too many colors. Created " << numColors << " colors, while hardware only supports up to " << maxNumColors << "\n";
+        OPM_THROW(std::logic_error, errorstring.str());
+    }
+
+    // the strategy line is always logged; the timing is appended only at verbosity >= 3
+    if (verbosity >= 3) {
+        out << "FPGABILU0 analysis took: " << t_analysis.stop() << " s, " << numColors << " colors";
+    }
+    OpmLog::info(out.str());
+
+    int colorRoundedValSize = 0, LColorRoundedValSize = 0, UColorRoundedValSize = 0;
+    int NROffsetSize = 0, LNROffsetSize = 0, UNROffsetSize = 0;
+    int blockDiagSize = 0;
+    // This reordering is needed here only so that the result can be used to calculate worst-case scenario array sizes
+    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+    int doneRows = 0;
+    // NOTE(review): the constants 9 and 12 below look like the number of doubles in a 3x3
+    // block and in its padded 3x4 form; this class is also instantiated for block sizes
+    // 1, 2 and 4 -- confirm that the sizes are intended to be independent of block_size.
+    for (int c = 0; c < numColors; c++) {
+        for (int i = doneRows; i < doneRows + rowsPerColor[c]; i++) {
+            for (int j = rMat->rowPointers[i]; j < rMat->rowPointers[i + 1]; j++) {
+                int columnIndex = rMat->colIndices[j];
+                // strictly lower blocks count towards L, strictly upper blocks towards U
+                if (columnIndex < i) {
+                    LColorRoundedValSize += 9;
+                    LNROffsetSize += 9;
+                }
+                if (columnIndex > i) {
+                    UColorRoundedValSize += 9;
+                    UNROffsetSize += 9;
+                }
+                // every block (including the diagonal) counts towards the full matrix
+                colorRoundedValSize += 9;
+                NROffsetSize += 9;
+            }
+            blockDiagSize += 12;
+        }
+        // End of color: round all sizes to nearest cacheline
+        colorRoundedValSize = roundUpTo(colorRoundedValSize, 32);
+        LColorRoundedValSize = roundUpTo(LColorRoundedValSize, 32);
+        UColorRoundedValSize = roundUpTo(UColorRoundedValSize, 32);
+        NROffsetSize = roundUpTo(NROffsetSize, 64);
+        LNROffsetSize = roundUpTo(LNROffsetSize, 64);
+        UNROffsetSize = roundUpTo(UNROffsetSize, 64);
+        blockDiagSize = roundUpTo(blockDiagSize, 8);
+
+        doneRows += rowsPerColor[c];
+    }
+    // 8 header entries followed by 4 entries per color, padded to a multiple of 16 ints
+    int colorSizesNum = 8 + roundUpTo(4 * numColors, 16);
+    int worstCaseColumnAccessNum = numColors * maxColsPerColor;
+
+    nnzValues.resize(nnzSplit, std::vector<double>(colorRoundedValSize));
+    LnnzValues.resize(nnzSplit, std::vector<double>(LColorRoundedValSize));
+    UnnzValues.resize(nnzSplit, std::vector<double>(UColorRoundedValSize));
+    // initial number of nnz, used to allocate
+    nnzValsSizes.resize(nnzSplit, colorRoundedValSize);
+    LnnzValsSizes.resize(nnzSplit, LColorRoundedValSize);
+    UnnzValsSizes.resize(nnzSplit, UColorRoundedValSize);
+    colIndices.resize(colorRoundedValSize);
+    LColIndices.resize(LColorRoundedValSize);
+    UColIndices.resize(UColorRoundedValSize);
+    NROffsets.resize(NROffsetSize);
+    LNROffsets.resize(LNROffsetSize);
+    UNROffsets.resize(UNROffsetSize);
+    PIndicesAddr.resize(worstCaseColumnAccessNum);
+    LPIndicesAddr.resize(worstCaseColumnAccessNum);
+    UPIndicesAddr.resize(worstCaseColumnAccessNum);
+    colorSizes.resize(colorSizesNum);
+    LColorSizes.resize(colorSizesNum);
+    UColorSizes.resize(colorSizesNum);
+    blockDiag.resize(blockDiagSize);
+    colIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
+    LColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
+    UColIndicesInColor.resize(numColors, std::vector<int>(rMat->Nb * block_size, 0));
+
+    int err = rMat->findPartitionColumns(numColors, rowsPerColor.data(),
+                                         maxRowsPerColor, maxColsPerColor,
+                                         colIndicesInColor, PIndicesAddr.data(), colorSizes.data(),
+                                         LColIndicesInColor, LPIndicesAddr.data(), LColorSizes.data(),
+                                         UColIndicesInColor, UPIndicesAddr.data(), UColorSizes.data());
+    if (err != 0) {
+        std::ostringstream errorstring;
+        errorstring << "ERROR: findPartitionColumns failed, code " << err << "\n";
+        OPM_THROW(std::logic_error, errorstring.str());
+    }
+
+    diagIndex.resize(mat->Nb, 0);
+    invDiagVals = new double[mat->Nb * bs * bs];
+    // L and U are each sized for half of the off-diagonal blocks, i.e. this assumes the
+    // strictly lower and strictly upper triangles hold the same number of blocks
+    LMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    UMat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
+    // NOTE(review): entries 2, 8 and 14 point at the outer std::vector<std::vector<double>>
+    // storage (an array of std::vector<double> objects), not at doubles -- confirm the
+    // consumer of resultPointers interprets them accordingly.
+    resultPointers[0] = (void *) colorSizes.data();
+    resultPointers[1] = (void *) PIndicesAddr.data();
+    resultPointers[2] = (void *) nnzValues.data();
+    resultPointers[3] = (void *) colIndices.data();
+    resultPointers[4] = (void *) NROffsets.data();
+    resultPointers[5] = (void *) nnzValsSizes.data();
+    resultPointers[6] = (void *) LColorSizes.data();
+    resultPointers[7] = (void *) LPIndicesAddr.data();
+    resultPointers[8] = (void *) LnnzValues.data();
+    resultPointers[9] = (void *) LColIndices.data();
+    resultPointers[10] = (void *) LNROffsets.data();
+    resultPointers[11] = (void *) LnnzValsSizes.data();
+    resultPointers[12] = (void *) UColorSizes.data();
+    resultPointers[13] = (void *) UPIndicesAddr.data();
+    resultPointers[14] = (void *) UnnzValues.data();
+    resultPointers[15] = (void *) UColIndices.data();
+    resultPointers[16] = (void *) UNROffsets.data();
+    resultPointers[17] = (void *) UnnzValsSizes.data();
+    resultPointers[18] = (void *) blockDiag.data();
+    //resultPointers[19] and [20] are set by the caller
+    // worst-case sizes; create_preconditioner() overwrites these with the actual sizes
+    resultSizes[0] = mat->Nb * block_size;
+    resultSizes[1] = colorRoundedValSize; // zeropadded valSize;
+    resultSizes[2] = numColors;
+    resultSizes[3] = worstCaseColumnAccessNum; //totalCols
+    resultSizes[4] = NROffsetSize; //NRFlagSize
+    resultSizes[5] = blockDiagSize; //diagValsSize
+    resultSizes[6] = mat->Nb * block_size;
+    resultSizes[7] = LColorRoundedValSize; // zeropadded LValSize;
+    resultSizes[8] = numColors;
+    resultSizes[9] = worstCaseColumnAccessNum; //LTotalCols
+    resultSizes[10] = LNROffsetSize; //LNRFlagSize
+    resultSizes[11] = blockDiagSize; //LDiagValsSize
+    resultSizes[12] = mat->Nb * block_size;
+    resultSizes[13] = UColorRoundedValSize; // zeropadded UValSize;
+    resultSizes[14] = numColors;
+    resultSizes[15] = worstCaseColumnAccessNum; //UTotalCols
+    resultSizes[16] = UNROffsetSize; //UNRFlagSize
+    resultSizes[17] = blockDiagSize; //UDiagValsSize
+    return true;
+} // end init()
+
+
+/// Reorder mat with the permutation found in init(), compute its blocked ILU0
+/// decomposition on the CPU, split the result into L and (row-reversed) U, and convert
+/// the reordered matrix, L, U and the inverted diagonal to the FPGA data format.
+/// init() must have been called first: it allocates rMat, LUMat, LMat, UMat, invDiagVals
+/// and all output arrays.
+/// \param[in] mat   matrix to decompose, with the same sparsity pattern that was passed to init()
+/// \return          true on success; false if a row has no diagonal block, if L or U do not
+///                  fit in the storage allocated by init(), or if a toRDF() conversion fails
+template <unsigned int block_size>
+bool FPGABILU0<block_size>::create_preconditioner(BlockedMatrix<block_size> *mat)
+{
+    const unsigned int bs = block_size;
+    Timer t_reorder;
+    reorderBlockedMatrixByPattern<bs>(mat, toOrder.data(), fromOrder.data(), rMat.get());
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "FPGABILU0 reorder matrix: " << t_reorder.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    // TODO: remove this copy by replacing inplace ilu decomp by out-of-place ilu decomp
+    Timer t_memcpy;
+    memcpy(LUMat->nnzValues, rMat->nnzValues, sizeof(double) * bs * bs * rMat->nnzbs);
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "FPGABILU0 memcpy: " << t_memcpy.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    int i, j, ij, ik, jk;
+    int iRowStart, iRowEnd, jRowEnd;
+    double pivot[bs * bs];
+    Opm::Detail::Inverter<bs> inverter;   // reuse inverter to invert blocks
+
+    Timer t_decomposition;
+
+    // go through all rows
+    for (i = 0; i < LUMat->Nb; i++) {
+        iRowStart = LUMat->rowPointers[i];
+        iRowEnd = LUMat->rowPointers[i + 1];
+        bool diagFound = false;
+
+        // go through all elements of the row
+        for (ij = iRowStart; ij < iRowEnd; ij++) {
+            j = LUMat->colIndices[ij];
+            // if the element is the diagonal, store the index and go to next row
+            if (j == i) {
+                diagIndex[i] = ij;
+                diagFound = true;
+                break;
+            }
+            // if an element beyond the diagonal is reached, no diagonal was found
+            if (j > i) {
+                break;
+            }
+
+            // calculate the pivot of this row
+            blockMult<bs>(LUMat->nnzValues + ij * bs * bs, invDiagVals + j * bs * bs, &pivot[0]);
+
+            memcpy(LUMat->nnzValues + ij * bs * bs, &pivot[0], sizeof(double) * bs * bs);
+
+            jRowEnd = LUMat->rowPointers[j + 1];
+            jk = diagIndex[j] + 1;
+            ik = ij + 1;
+            // subtract that row scaled by the pivot from this row.
+            while (ik < iRowEnd && jk < jRowEnd) {
+                if (LUMat->colIndices[ik] == LUMat->colIndices[jk]) {
+                    blockMultSub<bs>(LUMat->nnzValues + ik * bs * bs, pivot, LUMat->nnzValues + jk * bs * bs);
+                    ik++;
+                    jk++;
+                } else {
+                    if (LUMat->colIndices[ik] < LUMat->colIndices[jk])
+                    { ik++; }
+                    else
+                    { jk++; }
+                }
+            }
+        }
+
+        // A missing diagonal shows up either as an entry beyond the diagonal or as the row
+        // ending (possibly empty) before the diagonal is reached. In the latter case ij is
+        // iRowEnd and must not be used as a block index below.
+        // TODO: perform reordering earlier to prevent this
+        if (!diagFound) {
+            std::ostringstream out;
+            out << "BILU0 Error could not find diagonal value in row: " << i;
+            OpmLog::error(out.str());
+            return false;
+        }
+
+        // store the inverse in the diagonal!
+        inverter(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs);
+        memcpy(LUMat->nnzValues + ij * bs * bs, invDiagVals + i * bs * bs, sizeof(double) * bs * bs);
+    }
+
+    LMat->rowPointers[0] = 0;
+    UMat->rowPointers[0] = 0;
+
+    // Split the LU matrix into two by comparing column indices to diagonal indices
+    for (i = 0; i < LUMat->Nb; i++) {
+        LMat->rowPointers[i + 1] = LMat->rowPointers[i];
+        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
+            if (j < diagIndex[i]) {
+                // init() sized LMat for half of the off-diagonal blocks; refuse to write past it
+                if (LMat->rowPointers[i + 1] >= LMat->nnzbs) {
+                    std::ostringstream out;
+                    out << "FPGABILU0 Error L has more blocks than allocated, lower and upper triangle differ in size, row: " << i;
+                    OpmLog::error(out.str());
+                    return false;
+                }
+                memcpy(LMat->nnzValues + (LMat->rowPointers[i + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
+                LMat->colIndices[LMat->rowPointers[i + 1]] = LUMat->colIndices[j];
+                LMat->rowPointers[i + 1] = LMat->rowPointers[i + 1] + 1;
+            }
+        }
+    }
+    // Reverse the order of the (blocked) rows for the U matrix,
+    // because the rows are accessed in reverse order when applying the ILU0
+    int URowIndex = 0;
+    for (i = LUMat->Nb - 1; i >= 0; i--) {
+        UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex];
+        for (j = LUMat->rowPointers[i]; j < LUMat->rowPointers[i + 1]; j++) {
+            if (j > diagIndex[i]) {
+                // same capacity assumption as for L
+                if (UMat->rowPointers[URowIndex + 1] >= UMat->nnzbs) {
+                    std::ostringstream out;
+                    out << "FPGABILU0 Error U has more blocks than allocated, lower and upper triangle differ in size, row: " << i;
+                    OpmLog::error(out.str());
+                    return false;
+                }
+                memcpy(UMat->nnzValues + (UMat->rowPointers[URowIndex + 1]) * bs * bs, LUMat->nnzValues + j * bs * bs, sizeof(double) * bs * bs);
+                UMat->colIndices[UMat->rowPointers[URowIndex + 1]] = LUMat->colIndices[j];
+                UMat->rowPointers[URowIndex + 1] = UMat->rowPointers[URowIndex + 1] + 1;
+            }
+        }
+        URowIndex++;
+    }
+
+    if (verbosity >= 3) {
+        std::ostringstream out;
+        out << "FPGABILU0 decomposition: " << t_decomposition.stop() << " s";
+        OpmLog::info(out.str());
+    }
+
+    std::vector<int> URowsPerColor(numColors);
+    rowSize = block_size * rMat->Nb;
+    LRowSize = block_size * LMat->Nb;
+    URowSize = block_size * UMat->Nb;
+    LNumColors = numColors;
+    UNumColors = numColors;
+    // U stores its rows in reverse order, so its colors are traversed in reverse as well
+    for (int c = 0; c < numColors; c++) {
+        URowsPerColor[numColors - c - 1] = rowsPerColor[c];
+    }
+    int err;
+    err = rMat->toRDF(numColors, rowsPerColor.data(), /*isUMatrix:*/ false,
+                      colIndicesInColor, maxNNZsPerRow, nnzValsSizes.data(),
+                      nnzValues, colIndices.data(), NROffsets.data(), colorSizes.data(), &valSize);
+    if (err != 0) {
+        return false;
+    }
+    err = LMat->toRDF(LNumColors, rowsPerColor.data(), /*isUMatrix:*/ false,
+                      LColIndicesInColor, maxNNZsPerRow, LnnzValsSizes.data(),
+                      LnnzValues, LColIndices.data(), LNROffsets.data(), LColorSizes.data(), &LValSize);
+    if (err != 0) {
+        return false;
+    }
+    err = UMat->toRDF(UNumColors, URowsPerColor.data(), /*isUMatrix:*/ true,
+                      UColIndicesInColor, maxNNZsPerRow, UnnzValsSizes.data(),
+                      UnnzValues, UColIndices.data(), UNROffsets.data(), UColorSizes.data(), &UValSize);
+    if (err != 0) {
+        return false;
+    }
+    blockedDiagtoRDF(invDiagVals, rMat->Nb, numColors, URowsPerColor, blockDiag.data());
+    // resultPointers are set in the init method
+    // replace the worst-case sizes from init() with the sizes actually produced
+    resultSizes[0] = rowSize;
+    resultSizes[1] = colorSizes[3]; // zeropadded valSize;
+    resultSizes[2] = numColors;
+    resultSizes[3] = colorSizes[2]; //totalCols
+    resultSizes[4] = colorSizes[5]; //NRFlagSize
+    resultSizes[5] = colorSizes[6]; //diagValsSize
+    resultSizes[6] = LRowSize;
+    resultSizes[7] = LColorSizes[3]; // zeropadded LValSize;
+    resultSizes[8] = LNumColors;
+    resultSizes[9] = LColorSizes[2]; //LTotalCols
+    resultSizes[10] = LColorSizes[5]; //LNRFlagSize
+    resultSizes[11] = LColorSizes[6]; //LDiagValsSize
+    resultSizes[12] = URowSize;
+    resultSizes[13] = UColorSizes[3]; // zeropadded UValSize;
+    resultSizes[14] = UNumColors;
+    resultSizes[15] = UColorSizes[2]; //UTotalCols
+    resultSizes[16] = UColorSizes[5]; //UNRFlagSize
+    resultSizes[17] = UColorSizes[6]; //UDiagValsSize
+    return true;
+} // end create_preconditioner()
+
+
+// Explicitly instantiate the constructor, destructor, init() and create_preconditioner()
+// for block sizes 1 to 4; the member definitions live in this .cpp file, so other
+// translation units can only link against the sizes listed here.
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                    \
+template FPGABILU0<n>::FPGABILU0(ILUReorder, int, int, int, int, int);  \
+template FPGABILU0<n>::~FPGABILU0();                                    \
+template bool FPGABILU0<n>::init(BlockedMatrix<n> *);                   \
+template bool FPGABILU0<n>::create_preconditioner(BlockedMatrix<n> *);  \
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} //namespace bda
--- a/opm/simulators/linalg/bda/FPGABILU0.hpp
+++ b/opm/simulators/linalg/bda/FPGABILU0.hpp
@ -0,0 +1,117 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef FPGA_BILU0_HEADER_INCLUDED
+#define FPGA_BILU0_HEADER_INCLUDED
+
+#include <memory>
+#include <vector>
+
+#include <opm/simulators/linalg/bda/ILUReorder.hpp>
+#include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
+
+namespace bda
+{
+
+/*
+ * This class implements a Blocked ILU0 preconditioner, with output data
+ * specifically formatted for the FPGA.
+ * The decomposition and reorders of the rows of the matrix are done on CPU.
+ */
+
+/// Blocked ILU0 preconditioner that runs on the CPU and emits its data in the FPGA format.
+/// Typical use: construct, call init() once per sparsity pattern, then
+/// create_preconditioner() for each new set of matrix values.
+template <unsigned int block_size>
+class FPGABILU0
+{
+
+private:
+    int N;       // number of rows of the matrix
+    int Nb;      // number of blockrows of the matrix
+    int nnz;     // number of nonzeroes of the matrix (scalar)
+    int nnzbs;   // number of blocks of the matrix
+    // LUMat holds the in-place decomposition; LMat/UMat are the split factors (all created in init())
+    std::unique_ptr<BlockedMatrix<block_size> > LMat = nullptr, UMat = nullptr, LUMat = nullptr;
+    std::shared_ptr<BlockedMatrix<block_size> > rMat = nullptr; // reordered mat
+    double *invDiagVals = nullptr;   // inverted diagonal blocks, allocated in init(), freed in the destructor
+    std::vector<int> diagIndex;      // per block row: index of the diagonal block inside LUMat
+    std::vector<int> toOrder, fromOrder;   // reordering permutations produced by the analysis in init()
+    std::vector<int> rowsPerColor;   // number of block rows in each color
+    int numColors;
+    int verbosity;
+
+    // sizes and arrays used during RDF generation
+    std::vector<std::vector<double> > nnzValues, LnnzValues, UnnzValues;
+    std::vector<short int> colIndices, LColIndices, UColIndices;
+    std::vector<unsigned char> NROffsets, LNROffsets, UNROffsets;
+    std::vector<int> PIndicesAddr, LPIndicesAddr, UPIndicesAddr;
+    std::vector<int> colorSizes, LColorSizes, UColorSizes;
+    std::vector<int> nnzValsSizes, LnnzValsSizes, UnnzValsSizes;
+    std::vector<std::vector<int> > colIndicesInColor, LColIndicesInColor, UColIndicesInColor;
+
+    int rowSize, valSize;
+    int LRowSize, LValSize, LNumColors;
+    int URowSize, UValSize, UNumColors;
+    std::vector<double> blockDiag;   // inverted diagonal in the FPGA layout, filled by blockedDiagtoRDF()
+    ILUReorder opencl_ilu_reorder;
+    // exactly one of these is set by the constructor, depending on opencl_ilu_reorder
+    bool level_scheduling = false, graph_coloring = false;
+    int numResultPointers = 21;      // init() fills entries 0..18; 19 and 20 are set by the caller
+    std::vector<void *> resultPointers;
+    int numResultSizes = 18;
+    std::vector<int> resultSizes;
+    int maxRowsPerColor, maxColsPerColor, maxNNZsPerRow, maxNumColors; // are set via the constructor
+
+public:
+
+    /// Store the reordering strategy, verbosity and hardware limits.
+    /// Throws std::logic_error if the strategy is neither level scheduling nor graph coloring.
+    FPGABILU0(ILUReorder opencl_ilu_reorder, int verbosity, int maxRowsPerColor, int maxColsPerColor, int maxNNZsPerRow, int maxNumColors);
+
+    ~FPGABILU0();
+
+    // analysis: must be called before create_preconditioner(), because it computes the
+    // reordering and allocates rMat, LUMat, LMat, UMat, invDiagVals and the result arrays
+    bool init(BlockedMatrix<block_size> *mat);
+
+    // ilu_decomposition and conversion to the FPGA format; returns false on failure
+    bool create_preconditioner(BlockedMatrix<block_size> *mat);
+
+    // pointer into toOrder (Nb entries after init()); owned by this object
+    int* getToOrder()
+    {
+        return toOrder.data();
+    }
+
+    // pointer into fromOrder (Nb entries after init()); owned by this object
+    int* getFromOrder()
+    {
+        return fromOrder.data();
+    }
+
+    // reordered matrix; nullptr until init() has been called
+    BlockedMatrix<block_size>* getRMat()
+    {
+        return rMat.get();
+    }
+
+    // array of numResultPointers pointers to the FPGA-format arrays owned by this object
+    void **getResultPointers()
+    {
+        return resultPointers.data();
+    }
+
+    // array of numResultSizes sizes; worst-case values after init(),
+    // actual values after create_preconditioner()
+    int *getResultSizes()
+    {
+        return resultSizes.data();
+    }
+
+};
+
+} //namespace bda
+
+#endif // FPGA_BILU0_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/FPGAMatrix.cpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.cpp
@ -0,0 +1,249 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/FPGAMatrix.hpp>
+#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
+
+namespace bda
+{
+
+/* Sort the entries of one CSR row by ascending column index (in-place quicksort).
+ * data[] is permuted together with colIndices[], so every value stays with its column.
+ * left and right are the inclusive bounds of the range to sort. */
+void sortRow(int *colIndices, double *data, int left, int right) {
+    const int pivot = colIndices[(left + right) >> 1];
+    int lo = left;
+    int hi = right;
+    for (;;) {
+        // move both cursors to the next pair lying on the wrong side of the pivot
+        for (; colIndices[lo] < pivot; ++lo) {}
+        for (; colIndices[hi] > pivot; --hi) {}
+        if (lo <= hi) {
+            const int tmpCol = colIndices[hi];
+            colIndices[hi] = colIndices[lo];
+            colIndices[lo] = tmpCol;
+            const double tmpVal = data[hi];
+            data[hi] = data[lo];
+            data[lo] = tmpVal;
+            ++lo;
+            --hi;
+        }
+        if (lo >= hi) {
+            break;
+        }
+    }
+    // recurse into the lower partition first, then the upper one
+    if (hi > left) {
+        sortRow(colIndices, data, left, hi);
+    }
+    if (lo < right) {
+        sortRow(colIndices, data, lo, right);
+    }
+}
+
+/*
+ * Write all data used by the VHDL testbenches to raw data arrays. The arrays are as follows:
+ * - The "colorSizes" array, which first contains the number of rows, columns, non-zero values
+ *   and colors, and the size, in elements, of the NROffsets array, followed by:
+ *   the number of rows (rounded to the nearest 32), the number of rows (not rounded),
+ *   the number of columns (not rounded) and the number of non-zero values
+ *   (rounded to the nearest 32) for every partition.
+ *   This array is zero padded up to the nearest 64-byte cacheline.
+ *   Of these, this function itself only writes entries [3] (padded number of values),
+ *   [5] (NROffsets size), [7] (half of the padded number of values) and, per color c,
+ *   [4*c + 8] (non-empty rows) and [4*c + 11] (padded number of values).
+ * - The "colIndicesInColor" array, which contains for every partition, from which elements
+ *   in the global X vector the elements of that X vector partition came.
+ *   For example, if a matrix partition only has non-zero values in columns 1, 3 and 6, then
+ *   that X vector partition will only have three elements, and the colIndicesInColor array
+ *   will contain 1, 3 and 6 for that partition.
+ *   This array is zero padded up to the nearest 64-byte cacheline for every partition.
+ * - The "nnzValues" array contains all non-zero values of each partition of the matrix.
+ *   This array is zero-padded so that each color has a multiple of 32 elements (to have the
+ *   same number of elements per partition as the column indices array).
+ * - The "colIndices" array contains all column indices of each partition of the matrix.
+ *   These column indices are the local indices for that partition, so to be used, first a
+ *   local X vector partition needs to be loaded into some local memory (this is done using
+ *   data from the colIndicesInColor array), before these column indices can be used as
+ *   addresses to that local memory to read the desired X vector values.
+ *   This array is zero-padded so that data for every partition fills up a number of complete
+ *   cachelines (this means every color has a multiple of 32 elements).
+ * - "NROffsets" is the name of the array that contains the new row offsets for
+ *   all elements of every partition of the matrix. New row offsets are 8-bit values which
+ *   are 0 if that element is not the first element in a row, or which (if that element is
+ *   the first element of a row) are equal to the amount of empty rows between that new row
+ *   and the row before it plus 1. This array is zero-padded so that data for every partition
+ *   fills up a number of complete cachelines (this means every color has a multiple of 64 elements).
+ *
+ * Returns 0. Throws std::logic_error (via OPM_THROW) if a row holds more than
+ * nnzsThisRowLimit non-zeros, or if a new row offset would reach 255.
+ */
+int Matrix::toRDF(int numColors, std::vector<int>& nodesPerColor,
+                  std::vector<std::vector<int> >& colIndicesInColor, int nnzsThisRowLimit,
+                  std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes)
+{
+    auto mat = this;
+
+    int doneRows = 0;
+    int totalRowNum = 0;  // total number of non-empty rows
+    int nnzsPerColor = 0; // total number of nnzs in current color, padded to multiple of 32 for each color
+    int maxNNZsPerColor = 0; // max of nnzsPerColor; NOTE(review): computed but never read in this function
+
+    int totalValSize = 0; // sum of nnzsPerColor, padded; NOTE(review): computed but never read in this function
+
+    std::vector<int> nnzRowsPerColor(numColors);
+
+    // find number of nnzs per color and number of non-empty rows
+    for (int c = 0; c < numColors; c++) {
+        int numRows = 0;
+        nnzRowsPerColor[c] = 0;
+        int firstNnzOfColor = mat->rowPointers[doneRows];
+        int lastNnzOfColor = mat->rowPointers[doneRows + nodesPerColor[c]];
+        nnzsPerColor = roundUpTo(lastNnzOfColor - firstNnzOfColor, 32); // round up to a multiple of 32, the per-color padding of values and column indices
+        totalValSize += nnzsPerColor;
+        maxNNZsPerColor = std::max(nnzsPerColor, maxNNZsPerColor);
+        int row = doneRows;
+        for (; row < doneRows + nodesPerColor[c]; row++) {
+            if ( mat->rowPointers[row] != mat->rowPointers[row + 1]) {
+                numRows++;
+                nnzRowsPerColor[c] = nnzRowsPerColor[c] + 1;
+            }
+        }
+
+        doneRows = row;
+        totalRowNum += numRows;
+    }
+
+    int conseqZeroRows = 0;       // number of consecutive empty rows
+    int maxConseqZeroRows = 0;    // NOTE(review): computed but never read in this function
+    int numEmptyRows = 0;         // total number of empty rows
+    std::vector<int> rowOffsets(totalRowNum);
+    std::vector<int> nnzRowPointers(totalRowNum + 1, 0); // rowPointers, but only for non empty rows
+    std::vector<int> colorValPointers(numColors + 1);    // points to first nnz of first row of each color
+    std::vector<int> colorValZeroPointers(numColors);    // points to first padded zero for each color
+
+    int nonEmptyRowIdx = 0;   // read all rows, but only keep non empty rows, this idx keeps track of how many non empty rows were seen
+    doneRows = 0;
+
+    int totalPaddingSize = 0;  // number of padded zeros from previous colors
+    int NROffsetSize = 0;  // number of NROffsets entries, padded to multiple of 64 for each color
+    int maxRows = 0;
+    int maxNNZsPerRow = 0;
+
+    // determine the row offset of each row (amount of zero rows between it and the previous non-zero row)
+    // this is later converted to rowOffset for each nnz
+    for (int c = 0; c < numColors; c++) {
+        conseqZeroRows = 0;
+        for (int row = doneRows; row < doneRows + nodesPerColor[c]; row++) {
+            int nnzsThisRow = mat->rowPointers[row + 1] - mat->rowPointers[row];
+            if (nnzsThisRow == 0) {
+                conseqZeroRows++;
+                numEmptyRows++;
+            } else {
+                maxNNZsPerRow = std::max(nnzsThisRow, maxNNZsPerRow);
+                nnzRowPointers[nonEmptyRowIdx + 1] = mat->rowPointers[row + 1];
+                rowOffsets[nonEmptyRowIdx] = conseqZeroRows;
+                maxConseqZeroRows = std::max(conseqZeroRows, maxConseqZeroRows);
+                conseqZeroRows = 0;
+                nonEmptyRowIdx++;
+            }
+        }
+        // calculate sizes that include zeropadding
+        // (positions are in the padded output arrays, hence the shift by the padding of earlier colors)
+        colorValZeroPointers[c] = nnzRowPointers[nonEmptyRowIdx] + totalPaddingSize;
+        colorValPointers[c + 1] = roundUpTo(colorValZeroPointers[c], 32);
+        totalPaddingSize += colorValPointers[c + 1] - colorValZeroPointers[c];
+        NROffsetSize += roundUpTo(colorValPointers[c + 1] - colorValPointers[c], 64);
+
+        doneRows += nodesPerColor[c];
+        maxRows = std::max(nodesPerColor[c], maxRows);
+    }
+
+    if (maxNNZsPerRow > nnzsThisRowLimit) {
+        std::ostringstream errorstring;
+        errorstring << "ERROR: Current reordering exceeds maximum number of non-zero values per row limit: " << maxNNZsPerRow << " > " << nnzsThisRowLimit;
+        OPM_THROW(std::logic_error, errorstring.str());
+    }
+
+    // create and fill RDF arrays
+    colorSizes[3] = colorValPointers[numColors];  // total number of nnzs the FPGA has to process, including zeropadding
+    colorSizes[5] = NROffsetSize;
+
+    for (int c = 0; c < numColors; c++) {
+        colorSizes[c * 4 + 8] = nnzRowsPerColor[c];
+        colorSizes[c * 4 + 11] = colorValPointers[c + 1] - colorValPointers[c];
+    }
+
+    int rowIndex = 0;  // keep track of where to read/write
+    int valIndex = 0;
+    int NRIndex = 0;
+    int halfwayPoint = colorValPointers[numColors] / 2;
+    nnzValsSizes[0] = colorValPointers[numColors];
+
+    colorSizes[7] = halfwayPoint;
+
+    for (int c = 0; c < numColors; c++) {
+        // NOTE(review): nnzsThisRow is declared without an initializer and is only ever
+        // written/incremented, never read for a decision -- it looks like leftover bookkeeping
+        int nnzsThisRow;
+        // make sure 32 values are written in batches (pad with zeros if needed)
+        for (int v = colorValPointers[c]; v < colorValPointers[c + 1]; v += 32) {
+            for (int vb = 0; vb < 32; vb++) {
+
+                // if there are enough values for the whole cacheline
+                if (v + vb < colorValZeroPointers[c]) {
+                    ubNnzValues[0][v + vb] = mat->nnzValues[valIndex];
+                    // translate the global column index into the index local to this color
+                    ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex]]);
+
+                    // if this val is the first of a row
+                    if (nnzRowPointers[rowIndex] == valIndex) {
+
+                        // the offset is stored in an unsigned char further down
+                        if (rowOffsets[rowIndex] + 1 >= 255) {
+                            std::ostringstream errorstring;
+                            errorstring << "ERROR: row offset size exceeded in row " << rowIndex << " with an offset of " << rowOffsets[rowIndex] + 1;
+                            OPM_THROW(std::logic_error, errorstring.str());
+                        }
+
+                        NROffsets[NRIndex] =  static_cast<unsigned char>(rowOffsets[rowIndex] + 1);
+
+                        // skip all empty rows
+                        while (rowIndex < mat->N && nnzRowPointers[rowIndex] == valIndex) {
+                            rowIndex++;
+                            nnzsThisRow = 0;
+                        }
+                        nnzsThisRow++;
+                    }
+                    else
+                    {
+                        NROffsets[NRIndex] = (unsigned char) 0;
+                        nnzsThisRow++;
+                    }
+                    valIndex++;
+                }
+                else // zeropadding is needed
+                {
+                    // padded entries carry value 0.0 and repeat the column index of the last real value
+                    ubNnzValues[0][v + vb] = 0.0;
+                    ubColIndices[v + vb] = static_cast<short int>(colIndicesInColor[c][mat->colIndices[valIndex - 1]]);
+                    NROffsets[NRIndex] = 0;
+                }
+                NRIndex++;
+
+            }
+        }
+
+        // zeropad the NROffsets file
+        while (NRIndex % 64 != 0) {
+            NROffsets[NRIndex] = 0;
+            NRIndex++;
+        }
+    }
+
+    return 0;
+}
+
+} // end namespace bda
--- a/opm/simulators/linalg/bda/FPGAMatrix.hpp
+++ b/opm/simulators/linalg/bda/FPGAMatrix.hpp
@ -0,0 +1,84 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef FPGA_MATRIX_HEADER_INCLUDED
+#define FPGA_MATRIX_HEADER_INCLUDED
+
+#include <vector>
+
+namespace bda
+{
+
+/// This class resembles a csr matrix, only doubles are supported
+/// The data is stored in contiguous memory, such that they can be copied to a device in one transfer.
+/// The object owns its three arrays: they are allocated with new[] in the constructor and
+/// released with delete[] in the destructor. Copying is therefore disabled, because a
+/// memberwise copy would make two objects delete the same arrays.
+class Matrix {
+
+public:
+
+    /// Allocate Matrix and data arrays with given sizes
+    /// \param[in] N_              number of rows
+    /// \param[in] nnzs_           number of nonzeros
+    Matrix(int N_, int nnzs_)
+    : nnzValues(new double[nnzs_]),
+      colIndices(new int[nnzs_]),
+      rowPointers(new int[N_+1]),
+      N(N_),
+      nnzs(nnzs_)
+    {}
+
+    /// Not copyable: the destructor unconditionally deletes the owned arrays,
+    /// so a copied object would cause a double delete.
+    Matrix(const Matrix&) = delete;
+    Matrix& operator=(const Matrix&) = delete;
+
+    /// All constructors allocate new memory, so always delete here
+    ~Matrix(){
+        delete[] nnzValues;
+        delete[] colIndices;
+        delete[] rowPointers;
+    }
+
+
+    /// Converts this matrix to the dataformat used by the FPGA.
+    /// The FPGA uses a new data format called CSRO (Compressed Sparse Row Offset).
+    /// The purpose of this format is to allow the data to be streamable.
+    /// The rowPointers array has an unpredictable reading pattern/timing,
+    /// it also needs extra work if a row is shorter than a cacheline.
+    /// The array of N+1 rowPointers is replaced by an array of nnz rowOffsets.
+    /// The value of this offset is 0, unless the corresponding nnz is the first of a row,
+    /// in that case it is 'the number of empty rows preceding it + 1'.
+    /// The FPGA can simply add the rowOffset to the current rowIdx to get the new rowIdx.
+    /// Example:
+    /// [1 0 0 3 0]    nnzValues   [1 3 2 2 1 4 3 4 1]
+    /// [0 2 2 0 1]    colIndices  [0 3 1 2 4 0 1 2 4]
+    /// [4 0 0 0 0] -> rowPointers [0 2 5 6 6 9]
+    /// [0 0 0 0 0]    rowOffsets  [1 0 1 0 0 1 2 0 0]
+    /// [0 3 4 0 1]
+    /// The rowOffset is stored in 1 byte, meaning the maximum value is 255.
+    /// NOTE(review): the meaning of the individual parameters is not visible from this header;
+    /// judging by the names, the ub*/NROffsets/nnzValsSizes/colorSizes arguments are outputs -- confirm.
+    int toRDF(int numColors, std::vector<int>& nodesPerColor,
+        std::vector<std::vector<int> >& colIndicesInColor, int nnzsPerRowLimit, 
+        std::vector<std::vector<double> >& ubNnzValues, short int *ubColIndices, int *nnzValsSizes, unsigned char *NROffsets, int *colorSizes);
+
+    double *nnzValues;   // owned array of nnzs values
+    int *colIndices;     // owned array of nnzs column indices
+    int *rowPointers;    // owned array of N+1 row pointers
+    int N;               // number of rows
+    int nnzs;            // number of nonzeros
+};
+
+/// Declaration only; the definition is not part of this header.
+/// NOTE(review): presumably sorts the entries of a single row, permuting colIndices and data
+/// together over the index range [left, right] -- confirm against the implementation.
+void sortRow(int *colIndices, double *data, int left, int right);
+
+} // end namespace bda
+
+#endif // FPGA_MATRIX_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/FPGASolverBackend.cpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.cpp
@ -0,0 +1,732 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <cmath>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/material/common/Unused.hpp>
+#include <opm/common/ErrorMacros.hpp>
+
+#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
+#include <opm/simulators/linalg/bda/FPGAUtils.hpp>
+#include <opm/simulators/linalg/bda/Reorder.hpp>
+
+// if defined, any FPGA kernel failure will terminate flow; otherwise, the FPGA
+// kernel will be disabled and execution will continue using DUNE
+#define FPGA_EXIT_WITH_HW_FAILURE
+//#undef FPGA_EXIT_WITH_HW_FAILURE
+
+// if defined, the function generate_statistics will create a CSV-formatted file
+// with detailed statistics about the FPGA backend performance
+//#define FPGA_STATISTICS_FILE_ENABLED
+#undef FPGA_STATISTICS_FILE_ENABLED
+
+namespace bda
+{
+
+using Opm::OpmLog;
+
+// Construct the FPGA solver backend.
+// Steps, in order: validate the bitstream path, set up the OpenCL objects, set up
+// the host/device debug buffer, query the kernel for its hardware limits and
+// create the preconditioner with those limits.
+// \param[in] fpga_bitstream      path of the bitstream file; must be non-empty and pass fileExists()
+// \param[in] verbosity_          verbosity level, forwarded to BdaSolver and to the preconditioner
+// \param[in] maxit_              forwarded to BdaSolver
+// \param[in] tolerance_          forwarded to BdaSolver
+// \param[in] opencl_ilu_reorder  reordering strategy handed to the preconditioner;
+//                                LEVEL_SCHEDULING additionally sets level_scheduling
+// Every failure in this function is reported with OPM_THROW(std::logic_error, ...).
+template <unsigned int block_size>
+FpgaSolverBackend<block_size>::FpgaSolverBackend(std::string fpga_bitstream, int verbosity_, int maxit_, double tolerance_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(fpga_bitstream, verbosity_, maxit_, tolerance_)
+{
+    int err;
+    std::ostringstream oss;
+    double start = second();
+
+    // currently, only block size == 3 is supported by the FPGA backend
+    assert(block_size == 3);
+
+    // per-call performance data is only collected when verbosity is at least 1
+    if (verbosity < 1) {
+        perf_call_enabled = false;
+    }
+    // setup bitstream name and other parameters
+    if (fpga_bitstream.compare("") == 0) {
+        OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file has not been specified");
+    }
+    if (!fileExists(fpga_bitstream.c_str())) {
+        OPM_THROW(std::logic_error, "fpgaSolver called but bitstream file specified does not exists or is not readable");
+    }
+    // -----------------------------
+    // FPGA: setup the OpenCL platform
+    // -----------------------------
+    std::string main_kernel_name(KERNEL_NAME); // macro defined in bicgstab_solver_config.hpp
+    // auto-select the proper FPGA device and create context and other CL objects
+    err = setup_opencl(nullptr, &device_id, &context, &commands, &program, &kernel, main_kernel_name.c_str(), fpga_bitstream.c_str(), &platform_awsf1);
+    if (err != 0) {
+        oss << "Failed to setup the OpenCL device (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    oss << "Detected FPGA platform type is ";
+    if (platform_awsf1) { oss << "AWS-F1."; } else { oss << "Xilinx Alveo."; }
+    OpmLog::info(oss.str());
+    oss.str("");
+    oss.clear();
+    // -----------------------------
+    // FPGA: setup the debug buffer
+    // -----------------------------
+    // set kernel debug lines depending on an environment variable
+    const char *xem = getenv("XCL_EMULATION_MODE");
+    if ((xem != nullptr) && (strcmp(xem, "sw_emu") == 0 || strcmp(xem, "hw_emu") == 0)) {
+        debug_outbuf_words = DEBUG_OUTBUF_WORDS_MAX_EMU;
+        oss << "Detected co-simulation mode, debug_outbuf_words set to " << debug_outbuf_words << ".\n";
+        OpmLog::info(oss.str());
+        oss.str("");
+        oss.clear();
+    } else {
+        // set to 2 to reduce overhead in reading back and interpreting the debug lines;
+        // increase to get more debug info from the kernel
+        // range is 2..DEBUG_OUTBUF_WORDS_MAX-1
+        debug_outbuf_words = 2;
+    }
+
+    // host debug buffer setup
+    err = fpga_setup_host_debugbuf(debug_outbuf_words, &debugBuffer, &debugbufferSize);
+    if (err != 0) {
+        oss << "Failed to call fpga_setup_host_debug_buffer (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // device debug buffer setup
+    err = fpga_setup_device_debugbuf(context, debugBuffer, &cldebug, debugbufferSize);
+    if (err != 0) {
+        oss << "Failed to call fpga_setup_device_debug_buffer (" << err << ").\n";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // copy debug buffer to device
+    err = fpga_copy_to_device_debugbuf(commands, cldebug, debugBuffer, debugbufferSize, debug_outbuf_words);
+    if (err != 0) {
+        oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ").\n";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // ------------------------------------------------
+    // FPGA: query the kernel for limits/configuration
+    // ------------------------------------------------
+    err = fpga_kernel_query(context, commands, kernel, cldebug,
+                            debugBuffer, debug_outbuf_words,
+                            rst_assert_cycles, rst_settle_cycles,
+                            &hw_x_vector_elem, &hw_max_row_size,
+                            &hw_max_column_size, &hw_max_colors_size,
+                            &hw_max_nnzs_per_row, &hw_max_matrix_size,
+                            &hw_use_uram, &hw_write_ilu0_results,
+                            &hw_dma_data_width, &hw_mult_num,
+                            &hw_x_vector_latency, &hw_add_latency, &hw_mult_latency,
+                            &hw_num_read_ports, &hw_num_write_ports,
+                            &hw_reset_cycles, &hw_reset_settle);
+    if (err != 0) {
+        oss << "Failed to call fpga_kernel_query (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+
+    if (verbosity >= 1) {
+        // each label prints the hw_* member of the same name
+        oss << "FPGA kernel limits/configuration:\n";
+        oss << "  x_vector_elem=" << hw_x_vector_elem << ", max_row_size=" << hw_max_row_size << ", max_column_size=" << hw_max_column_size << "\n";
+        oss << "  max_colors_size=" << hw_max_colors_size << ", max_nnzs_per_row=" << hw_max_nnzs_per_row << ", max_matrix_size=" << hw_max_matrix_size << "\n";
+        oss << "  use_uram=" << hw_use_uram << ", write_ilu0_results=" << hw_write_ilu0_results << "\n";
+        oss << "  dma_data_width=" << hw_dma_data_width << ", mult_num=" << (unsigned int)hw_mult_num << "\n";
+        oss << "  x_vector_latency=" << (unsigned int)hw_x_vector_latency << "\n";
+        oss << "  add_latency=" << (unsigned int)hw_add_latency << ", mult_latency=" << (unsigned int)hw_mult_latency << "\n";
+        oss << "  num_read_ports=" << (unsigned int)hw_num_read_ports << ", num_write_ports=" << (unsigned int)hw_num_write_ports << "\n";
+        oss << "  reset_cycles=" << hw_reset_cycles << ", reset_settle=" << hw_reset_settle;
+        OpmLog::info(oss.str());
+        oss.str("");
+        oss.clear();
+    }
+
+    // check that LU results are generated by the kernel
+    if (use_LU_res && !hw_write_ilu0_results) {
+        OpmLog::warning("Kernel reports that LU results are not written to memory, but use_LU_res is set; disabling LU results usage");
+        oss.str("");
+        oss.clear();
+        use_LU_res = false;
+    }
+
+    // setup preconditioner, passing the limits reported by the kernel
+    double start_prec = second();
+    prec = std::make_unique<Preconditioner>(opencl_ilu_reorder, verbosity_, hw_max_row_size, hw_max_column_size, hw_max_nnzs_per_row, hw_max_colors_size);
+    perf_total.s_preconditioner_setup = second() - start_prec;
+
+    if (opencl_ilu_reorder == ILUReorder::LEVEL_SCHEDULING) {
+        level_scheduling = true;
+    }
+
+    perf_total.s_initialization = second() - start;
+} // end fpgaSolverBackend
+
+
+// Destructor: optionally reports the collected statistics, then releases the
+// host buffers and the OpenCL objects held by this backend.
+template <unsigned int block_size>
+FpgaSolverBackend<block_size>::~FpgaSolverBackend()
+{
+    // statistics are only reported when verbosity is at least 1
+    if (verbosity >= 1) {
+        generate_statistics();
+    }
+    // rx and rb are allocated with new[] in initialize()
+    delete[] rx;
+    delete[] rb;
+    // NOTE(review): these arrays are released with free(); presumably they are
+    // allocated by fpga_setup_host_datamem with a malloc-family call -- confirm
+    if (nnzValArrays != nullptr) { free(nnzValArrays); }
+    if (L_nnzValArrays != nullptr) { free(L_nnzValArrays); }
+    if (U_nnzValArrays != nullptr) { free(U_nnzValArrays); }
+    // FPGA: buffers
+    free(debugBuffer);
+    for (int b = 0; b < RW_BUF; b++) {
+        free(dataBuffer[b]);
+    }
+    free(databufferSize);
+    // FPGA: OpenCL objects
+    // memory objects are only released when they were actually created
+    if (cldebug != nullptr) { clReleaseMemObject(cldebug); }
+    for (int b = 0; b < RW_BUF; b++) {
+        if (cldata[b] != nullptr) {
+            clReleaseMemObject(cldata[b]);
+        }
+    }
+    // release the remaining OpenCL handles obtained from setup_opencl in the constructor
+    clReleaseCommandQueue(commands);
+    clReleaseContext(context);
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    clReleaseDevice(device_id);
+} // end ~fpgaSolverBackend()
+
+
+// Write the solution into x_, undoing the reordering applied before the solve.
+// The caller is responsible for passing a valid array in x_.
+// When per-call statistics are enabled, the time spent here is stored as
+// s_postprocess of the most recent perf_call entry.
+template <unsigned int block_size>
+void FpgaSolverBackend<block_size>::get_result(double *x_)
+{
+    const double t_begin = perf_call_enabled ? second() : 0;
+
+    // map rx back through the ordering stored in toOrder
+    // TODO: check if it is more efficient to avoid copying resultsBuffer[0] to rx in solve_system (private)
+    reorderBlockedVectorByPattern<block_size>(mat->Nb, rx, toOrder, x_);
+
+    if (perf_call_enabled) {
+        perf_call.back().s_postprocess = second() - t_begin;
+    }
+} // end get_result()
+
+
+// Public entry point for one linear solve.
+// On the first call the backend is initialized and the matrix is analysed; every call
+// then appends a statistics record, updates the system values, recreates the
+// preconditioner and runs the FPGA solve, whose outcome is written to res.
+// Returns BDA_SOLVER_ANALYSIS_FAILED or BDA_SOLVER_CREATE_PRECONDITIONER_FAILED when the
+// corresponding step reports failure, BDA_SOLVER_SUCCESS otherwise.
+template <unsigned int block_size>
+SolverStatus FpgaSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs OPM_UNUSED, BdaResult &res)
+{
+    // one-time setup
+    if (!initialized) {
+        initialize(N_, nnz_,  dim, vals, rows, cols);
+        const bool analysis_ok = analyse_matrix();
+        if (!analysis_ok) {
+            return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
+        }
+    }
+
+    // statistics record for this call; the steps below write into perf_call.back()
+    perf_call.emplace_back();
+    update_system(vals, b);
+
+    const bool preconditioner_ok = create_preconditioner();
+    if (!preconditioner_ok) {
+        return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
+    }
+
+    solve_system(res);
+
+    if (verbosity >= 1) {
+        std::ostringstream summary;
+        summary << "fpgaSolverBackend::" << __func__ << " - converged: " << res.converged;
+        summary << ", iterations: " << res.iterations << ", reduction: " << res.reduction;
+        summary << ", conv_rate: " << res.conv_rate << ", elapsed: " << res.elapsed;
+        OpmLog::info(summary.str());
+    }
+    return SolverStatus::BDA_SOLVER_SUCCESS;
+}
+
+
+// One-time setup of sizes and host storage.
+// Stores N, nnz, nnzb and Nb, wraps the caller's arrays in a BlockedMatrix, logs the
+// problem dimensions and allocates the rx/rb work vectors (length rounded up to a
+// whole number of cachelines). Sets initialized to true on completion.
+template <unsigned int block_size>
+void FpgaSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols)
+{
+    const double t_begin = second();
+
+    const auto nnz_blocks = nnz_ / block_size / block_size;
+    this->N = N_;
+    this->nnz = nnz_;
+    this->nnzb = nnz_blocks;
+    Nb = (N + dim - 1) / dim;
+
+    // allocate host memory for matrices and vectors
+    // actual data for mat points to std::vector.data() in ISTLSolverEbos, so no alloc/free here
+    mat.reset(new BlockedMatrix<block_size>(N_ / block_size, nnz_blocks, vals, cols, rows));
+
+    std::ostringstream msg;
+    msg << "Initializing FPGA data, matrix size: " << this->N << " blocks, nnz: " << this->nnzb << " blocks, ";
+    msg << "block size: " << dim << ", total nnz: " << this->nnz << "\n";
+    msg << "Maxit: " << maxit << std::scientific << ", tolerance: " << tolerance;
+    OpmLog::info(msg.str());
+
+    // both work vectors share the same padded length
+    const auto padded_len = roundUpTo(N_, CACHELINE_BYTES / sizeof(double));
+    rx = new double[padded_len];
+    rb = new double[padded_len];
+
+    perf_total.s_initialization += second() - t_begin;
+    initialized = true;
+} // end initialize()
+
+
+// Analyse the matrix and prepare all FPGA data buffers.
+// Initializes the preconditioner, fetches its orderings and result pointers/sizes,
+// lays out the host data memory, sizes the results buffers, creates the device data
+// memory and sets the kernel parameters.
+// Returns false (after a warning) when the preconditioner fails to initialize, true
+// otherwise; failures of the fpga_* helpers are raised with OPM_THROW(std::logic_error, ...).
+template <unsigned int block_size>
+bool FpgaSolverBackend<block_size>::analyse_matrix()
+{
+    const double t_begin = second();
+
+    if (!prec->init(mat.get())) {
+        OpmLog::warning("Preconditioner for FPGA solver failed to initialize");
+        return false;
+    }
+
+    std::ostringstream errmsg;
+    int status;
+
+    toOrder = prec->getToOrder();
+    fromOrder = prec->getFromOrder();
+    rMat = prec->getRMat();
+    processedPointers = prec->getResultPointers();
+    processedSizes = prec->getResultSizes();
+    // slots 19 and 20 are filled here with the right-hand side and solution work vectors
+    processedPointers[19] = rb;
+    processedPointers[20] = rx;
+    // remember the sizes reported at analysis time; solve_system compares against them
+    nnzValArrays_size = *static_cast<int*>(processedPointers[5]);
+    L_nnzValArrays_size = *static_cast<int*>(processedPointers[11]);
+    U_nnzValArrays_size = *static_cast<int*>(processedPointers[17]);
+
+    // -------------------------------------
+    // FPGA: setup host/device data buffers
+    // -------------------------------------
+    // allocate memory and setup data layout
+    status = fpga_setup_host_datamem(level_scheduling, fpga_config_bits,
+                                     processedSizes,
+                                     &setupArray,
+                                     &nnzValArrays,   &nnzValArrays_size,   &columnIndexArray,   &newRowOffsetArray,   &PIndexArray,   &colorSizesArray,
+                                     &L_nnzValArrays, &L_nnzValArrays_size, &L_columnIndexArray, &L_newRowOffsetArray, &L_PIndexArray, &L_colorSizesArray,
+                                     &U_nnzValArrays, &U_nnzValArrays_size, &U_columnIndexArray, &U_newRowOffsetArray, &U_PIndexArray, &U_colorSizesArray,
+                                     &BLKDArray, &X1Array, &R1Array,
+                                     &X2Array, &R2Array,
+                                     &LresArray, &UresArray,
+                                     &databufferSize, dataBuffer,
+                                     result_offsets, 1 /*num_nnz_arrays*/,
+                                     true /*reset_data_buffers*/,  /* WARNING: leave reset_data_buffers always ENABLED to avoid data corruption! */
+                                     debugbufferSize);
+    if (status != 0) {
+        errmsg << "Failed to call fpga_setup_host_datamem (" << status << ")";
+        OPM_THROW(std::logic_error, errmsg.str());
+    }
+
+    // results buffers setup: four buffers when LU results are used, two otherwise
+    resultsBufferNum = use_LU_res ? 4 : 2;
+    if (resultsBufferNum > RES_BUF_MAX) {
+        errmsg << "Number of results buffer (" << resultsBufferNum << ") is out of range (max " << RES_BUF_MAX << ")";
+        OPM_THROW(std::logic_error, errmsg.str());
+    }
+
+    resultsNum = processedSizes[0]; // rowSize, invariant between system solves
+    // every results buffer has the same size: resultsNum doubles, padded to whole cachelines
+    const auto results_bytes = roundUpTo(resultsNum, CACHELINE_BYTES / sizeof(double)) * sizeof(double);
+    for (int buf = 0; buf < resultsBufferNum; ++buf) {
+        resultsBufferSize[buf] = results_bytes;
+    }
+
+    // device data memory setup
+    status = fpga_setup_device_datamem(context, databufferSize, dataBuffer, cldata);
+    if (status != 0) {
+        errmsg << "Failed to call fpga_setup_device_datamem (" << status << ")";
+        OPM_THROW(std::logic_error, errmsg.str());
+    }
+
+    // ------------------------------------
+    // FPGA: setup the kernel's parameters
+    // ------------------------------------
+    status = fpga_set_kernel_parameters(kernel, abort_cycles, debug_outbuf_words - 1, maxit,
+                                        debug_sample_rate, tolerance, cldata, cldebug);
+    if (status != 0) {
+        errmsg << "Failed to call fpga_set_kernel_parameters (" << status << ")";
+        OPM_THROW(std::logic_error, errmsg.str());
+    }
+
+    perf_total.s_analysis = second() - t_begin;
+    analysis_done = true;
+
+    return true;
+} // end analyse_matrix()
+
+
+// Zero the solution work vector and let the preconditioner object build itself from
+// the current matrix. Returns the preconditioner's result; a failure is also logged
+// as a warning. When per-call statistics are enabled, the elapsed time is stored as
+// s_preconditioner_create of the most recent perf_call entry.
+template <unsigned int block_size>
+bool FpgaSolverBackend<block_size>::create_preconditioner()
+{
+    const double t_begin = perf_call_enabled ? second() : 0;
+
+    // clear the first N entries of rx before the preconditioner is created
+    memset(rx, 0, sizeof(double) * N);
+
+    const bool created = prec->create_preconditioner(mat.get());
+    if (!created) {
+        OpmLog::warning("fpgaSolverBackend: create_preconditioner failed");
+    }
+
+    if (perf_call_enabled) {
+        perf_call.back().s_preconditioner_create = second() - t_begin;
+    }
+    return created;
+} // end create_preconditioner()
+
+
+// Run one solve on the FPGA and report the outcome in res.
+// Sequence: check buffer sizes against the analysis-time allocation, copy the inputs
+// into the host data buffers, transfer debug and data buffers to the device, run the
+// kernel, read back the debug buffer (cycles, iterations, norms, status flags) and,
+// unless the kernel was disabled or produced no results, copy the solution into rx.
+// Fields written in res: iterations, reduction (last norm divided by initial norm),
+// conv_rate, elapsed and converged.
+// If fpga_disabled is already set, only res.converged = false is written and the
+// function returns immediately.
+// Failures of the fpga_* helpers are raised with OPM_THROW(std::logic_error, ...).
+template <unsigned int block_size>
+void FpgaSolverBackend<block_size>::solve_system(BdaResult &res)
+{
+    std::ostringstream oss;
+    int err;
+    double start = 0, start_total = 0;
+
+    // ------------------------------------
+    // FPGA: return immediately if FPGA is disabled
+    // ------------------------------------
+    if (fpga_disabled) {
+        res.converged = false;
+        OpmLog::warning("FPGA is disabled, fallback to SW execution");
+        return;
+    }
+
+    fpga_calls++;
+
+    if (perf_call_enabled) {
+        start = second();
+        start_total = start;
+    }
+
+    // check if any buffer is larger than the size set in preconditioner->init
+    // TODO: add check for all other buffer sizes that may overflow?
+    if ( ((int *)processedPointers[5])[0]  > nnzValArrays_size ||
+            ((int *)processedPointers[11])[0] > L_nnzValArrays_size ||
+            ((int *)processedPointers[17])[0] > U_nnzValArrays_size ) {
+        OPM_THROW(std::logic_error, "A buffer size is larger than the initial allocation in solve_system (check preconditioner init)");
+    }
+
+    // ------------------------------------
+    // FPGA: copy input data to host data buffers
+    // ------------------------------------
+    if (perf_call_enabled) {
+        start = second();
+    }
+    err = fpga_copy_host_datamem(
+              processedPointers, processedSizes, setupArray,
+              nnzValArrays,   &nnzValArrays_size,   columnIndexArray,   newRowOffsetArray,   PIndexArray,   colorSizesArray,
+              L_nnzValArrays, &L_nnzValArrays_size, L_columnIndexArray, L_newRowOffsetArray, L_PIndexArray, L_colorSizesArray,
+              U_nnzValArrays, &U_nnzValArrays_size, U_columnIndexArray, U_newRowOffsetArray, U_PIndexArray, U_colorSizesArray,
+              BLKDArray, X1Array, R1Array, X2Array, R2Array,
+              use_LU_res, LresArray, UresArray,
+              databufferSize, dataBuffer,
+              1 /* nnzValArrays_num */,
+              reset_data_buffers, fill_results_buffers,
+              dump_data_buffers, fpga_calls);
+    if (perf_call_enabled) {
+        perf_call.back().s_mem_setup = second() - start;
+    }
+    if (err != 0) {
+        // report the helper that actually failed
+        oss << "Failed to call fpga_copy_host_datamem (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // ------------------------------------
+    // FPGA: copy buffers to device
+    // ------------------------------------
+    // copy debug buffer to device
+    if (perf_call_enabled) {
+        start = second();
+    }
+    err = fpga_copy_to_device_debugbuf(commands,
+                                       cldebug, debugBuffer, debugbufferSize,
+                                       debug_outbuf_words);
+    if (err != 0) {
+        oss << "Failed to call fpga_copy_to_device_debugbuf (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // copy data buffers to device
+    err = fpga_copy_to_device_datamem(commands, RW_BUF, cldata);
+    if (err != 0) {
+        oss << "Failed to call fpga_copy_to_device_datamem (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    if (perf_call_enabled) {
+        perf_call.back().s_mem_h2d = second() - start;
+    }
+    // ------------------------------------
+    // FPGA: execute the kernel
+    // ------------------------------------
+    double time_elapsed_ms;
+    if (perf_call_enabled) {
+        start = second();
+    }
+    err = fpga_kernel_run(commands, kernel, &time_elapsed_ms);
+    if (perf_call_enabled) {
+        perf_call.back().s_kernel_exec = second() - start;
+    }
+    if (err != 0) {
+        oss << "Failed to call fpga_kernel_run (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    // ----------------------------------------
+    // FPGA: read back debug buffer from device
+    // ----------------------------------------
+    if (perf_call_enabled) {
+        start = second();
+    }
+    err = fpga_copy_from_device_debugbuf((bool)(verbosity < 10),
+                                         commands,
+                                         debug_outbuf_words, debugbufferSize,
+                                         cldebug, debugBuffer,
+                                         abort_cycles,
+                                         &kernel_cycles, &kernel_iter_run,
+                                         norms, &last_norm_idx,
+                                         &kernel_aborted, &kernel_signature, &kernel_overflow, &kernel_noresults,
+                                         &kernel_wrafterend, &kernel_dbgfifofull);
+    if (err != 0) {
+        oss << "Failed to call fpga_copy_from_device_debugbuf (" << err << ")";
+        OPM_THROW(std::logic_error, oss.str());
+    }
+    if (kernel_wrafterend) {
+        OpmLog::warning("Detected recoverable FPGA error: kernel write after end");
+    }
+    if (kernel_dbgfifofull) {
+        OpmLog::warning("Detected recoverable FPGA error: debug FIFO full");
+    }
+    if (kernel_aborted || kernel_signature || kernel_overflow) {
+#if defined(FPGA_EXIT_WITH_HW_FAILURE)
+        oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted
+            << ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")";
+        OPM_THROW(std::logic_error, oss.str());
+#else
+        oss << "Detected unrecoverable FPGA error (ABRT=" << kernel_aborted
+            << ",SIG=" << kernel_signature << ",OVF=" << kernel_overflow << ")\n";
+        oss << "Disabling FPGA kernel: execution will continue with SW kernel";
+        OpmLog::warning(oss.str());
+        oss.str("");
+        oss.clear();
+        fpga_disabled = true;
+#endif
+    }
+    if (perf_call_enabled) {
+        perf_call.back().n_kernel_exec_cycles = kernel_cycles;
+    }
+    // copy (back) results only if FPGA is not disabled
+    if (!fpga_disabled) {
+        if (kernel_noresults) {
+            OpmLog::warning("FPGA kernel did not return results because the required precision is already reached");
+            // rx still contains zeros from initial guess
+        } else {
+            // ------------------------------------
+            // FPGA: read back results from device
+            // ------------------------------------
+            err = fpga_map_results(even(kernel_iter_run),
+                                   use_residuals, use_LU_res, commands,
+                                   resultsNum, resultsBufferNum, resultsBufferSize,
+                                   debugbufferSize,
+                                   cldata, resultsBuffer,
+                                   result_offsets,
+                                   dump_results, data_dir, basename, sequence);
+            if (err != 0) {
+                oss << "Failed to call fpga_map_results (" << err << ")";
+                OPM_THROW(std::logic_error, oss.str());
+            }
+            // TODO: copy results buffers to reordering output buffers
+            memcpy(rx, resultsBuffer[0], resultsNum * sizeof(double));
+            err = fpga_unmap_results(even(kernel_iter_run),
+                                     use_residuals, use_LU_res,
+                                     commands, cldata, resultsBuffer);
+            if (err != 0) {
+                oss << "Failed to call fpga_unmap_results (" << err << ")";
+                OPM_THROW(std::logic_error, oss.str());
+            }
+        }
+    }
+    // set results and update statistics (if enabled)
+    if (perf_call_enabled) {
+        perf_call.back().s_mem_d2h = second() - start;
+    }
+    float iter = ((float)kernel_iter_run / 2.0) + 0.5; // convert from half iteration int to actual iterations
+    res.iterations = (int)iter;
+    // reduction is the last norm relative to the initial one; norms[0] is the initial norm
+    res.reduction = norms[last_norm_idx] / norms[0];
+    res.conv_rate = pow(res.reduction, 1.0 / iter);
+    res.elapsed = second() - start_total;
+    if (perf_call_enabled) {
+        perf_call.back().s_solve = res.elapsed;
+        perf_call.back().n_kernel_exec_iters = iter;
+    }
+    // convergence depends on number of iterations reached and hw execution errors
+    res.converged = true;
+    if (fpga_disabled || kernel_aborted || kernel_signature || kernel_overflow || iter >= (float)maxit) {
+        res.converged = false;
+        if (verbosity >= 1) {
+            oss << "FPGA kernel did not converge, reason: fpga_disabled=" << fpga_disabled
+                << ", kernel_aborted=" << kernel_aborted << ", kernel_signature=" << kernel_signature
+                << ", kernel_overflow=" << kernel_overflow << ", (iter>=" << maxit << ")=" << (iter >= (float)maxit);
+            OpmLog::warning(oss.str());
+            oss.str("");
+            oss.clear();
+        }
+    }
+    if (perf_call_enabled) {
+        perf_call.back().converged = res.converged;
+        // bit 0: fpga_disabled, bit 1: aborted, bit 2: signature, bit 3: overflow, bit 4: maxit reached
+        perf_call.back().converged_flags = ((unsigned int)fpga_disabled) +
+                                           ((unsigned int)kernel_aborted << 1) + ((unsigned int)kernel_signature << 2) +
+                                           ((unsigned int)kernel_overflow << 3) + ((unsigned int)(iter >= (float)maxit) << 4);
+    }
+} // end solve_system()
+
+
+// Point the matrix at the new values and reorder the right-hand side b into rb.
+// When per-call statistics are enabled, the reordering time is stored as s_reorder
+// of the most recent perf_call entry.
+template <unsigned int block_size>
+void FpgaSolverBackend<block_size>::update_system(double *vals, double *b)
+{
+    mat->nnzValues = vals;
+
+    const double t_begin = perf_call_enabled ? second() : 0;
+
+    // reorder inputs using previously found ordering (stored in fromOrder)
+    reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
+
+    if (perf_call_enabled) {
+        perf_call.back().s_reorder = second() - t_begin;
+    }
+} // end update_system()
+
+
+template <unsigned int block_size>
+void FpgaSolverBackend<block_size>::generate_statistics()
+{
+    std::ostringstream oss;
+    unsigned int conv_iter = 0, conv_ovf = 0;
+
+    if (!perf_call_enabled || fpga_calls == 0) {
+        OpmLog::warning("FPGA statistics were not collected");
+        return;
+    }
+    std::printf("--- FPGA statistics ---\n");
+    std::printf("total solver calls..........: %u\n", fpga_calls);
+    std::printf("time initialization.........: %8.6f s\n", perf_total.s_initialization);
+    std::printf("time preconditioner setup...: %8.6f s\n", perf_total.s_preconditioner_setup);
+
+#if defined(FPGA_STATISTICS_FILE_ENABLED)
+    // DEBUG: this can be enabled to gather all the statistics in a CSV-formatted file
+    FILE *fout = fopen("fpga_statistics_details.csv", "w");
+    if (fout != nullptr) {
+        std::fprintf(fout, "call,preconditioner_create,analysis,reorder,mem_setup,mem_h2d,kernel_exec,kernel_cycles,kernel_iters,mem_d2h,solve,postprocess,converged\n");
+    }
+#endif
+    unsigned int num_data_points = perf_call.size();
+    for (unsigned int i = 0; i < num_data_points; i++) {
+        perf_total.s_preconditioner_create += perf_call[i].s_preconditioner_create;
+        if (perf_call[i].s_preconditioner_create > perf_total.s_preconditioner_create_max) { perf_total.s_preconditioner_create_max = perf_call[i].s_preconditioner_create; }
+        if (perf_call[i].s_preconditioner_create < perf_total.s_preconditioner_create_min) { perf_total.s_preconditioner_create_min = perf_call[i].s_preconditioner_create; }
+        perf_total.s_analysis += perf_call[i].s_analysis;
+        if (perf_call[i].s_analysis > perf_total.s_analysis_max) { perf_total.s_analysis_max = perf_call[i].s_analysis; }
+        if (perf_call[i].s_analysis < perf_total.s_analysis_min) { perf_total.s_analysis_min = perf_call[i].s_analysis; }
+        perf_total.s_reorder += perf_call[i].s_reorder;
+        if (perf_call[i].s_reorder > perf_total.s_reorder_max) { perf_total.s_reorder_max = perf_call[i].s_reorder; }
+        if (perf_call[i].s_reorder < perf_total.s_reorder_min) { perf_total.s_reorder_min = perf_call[i].s_reorder; }
+        perf_total.s_mem_setup += perf_call[i].s_mem_setup;
+        if (perf_call[i].s_mem_setup > perf_total.s_mem_setup_max) { perf_total.s_mem_setup_max = perf_call[i].s_mem_setup; }
+        if (perf_call[i].s_mem_setup < perf_total.s_mem_setup_min) { perf_total.s_mem_setup_min = perf_call[i].s_mem_setup; }
+        perf_total.s_mem_h2d += perf_call[i].s_mem_h2d;
+        if (perf_call[i].s_mem_h2d > perf_total.s_mem_h2d_max) { perf_total.s_mem_h2d_max = perf_call[i].s_mem_h2d; }
+        if (perf_call[i].s_mem_h2d < perf_total.s_mem_h2d_min) { perf_total.s_mem_h2d_min = perf_call[i].s_mem_h2d; }
+        perf_total.s_kernel_exec += perf_call[i].s_kernel_exec;
+        if (perf_call[i].s_kernel_exec > perf_total.s_kernel_exec_max) { perf_total.s_kernel_exec_max = perf_call[i].s_kernel_exec; }
+        if (perf_call[i].s_kernel_exec < perf_total.s_kernel_exec_min) { perf_total.s_kernel_exec_min = perf_call[i].s_kernel_exec; }
+        perf_total.n_kernel_exec_cycles += (unsigned long)perf_call[i].n_kernel_exec_cycles;
+        if (perf_call[i].n_kernel_exec_cycles > perf_total.n_kernel_exec_cycles_max) { perf_total.n_kernel_exec_cycles_max = perf_call[i].n_kernel_exec_cycles; }
+        if (perf_call[i].n_kernel_exec_cycles < perf_total.n_kernel_exec_cycles_min) { perf_total.n_kernel_exec_cycles_min = perf_call[i].n_kernel_exec_cycles; }
+        perf_total.n_kernel_exec_iters += perf_call[i].n_kernel_exec_iters;
+        if (perf_call[i].n_kernel_exec_iters > perf_total.n_kernel_exec_iters_max) { perf_total.n_kernel_exec_iters_max = perf_call[i].n_kernel_exec_iters; }
+        if (perf_call[i].n_kernel_exec_iters < perf_total.n_kernel_exec_iters_min) { perf_total.n_kernel_exec_iters_min = perf_call[i].n_kernel_exec_iters; }
+        perf_total.s_mem_d2h += perf_call[i].s_mem_d2h;
+        if (perf_call[i].s_mem_d2h > perf_total.s_mem_d2h_max) { perf_total.s_mem_d2h_max = perf_call[i].s_mem_d2h; }
+        if (perf_call[i].s_mem_d2h < perf_total.s_mem_d2h_min) { perf_total.s_mem_d2h_min = perf_call[i].s_mem_d2h; }
+        perf_total.s_solve += perf_call[i].s_solve;
+        if (perf_call[i].s_solve > perf_total.s_solve_max) { perf_total.s_solve_max = perf_call[i].s_solve; }
+        if (perf_call[i].s_solve < perf_total.s_solve_min) { perf_total.s_solve_min = perf_call[i].s_solve; }
+        perf_total.s_postprocess += perf_call[i].s_postprocess;
+        if (perf_call[i].s_postprocess > perf_total.s_postprocess_max) { perf_total.s_postprocess_max = perf_call[i].s_postprocess; }
+        if (perf_call[i].s_postprocess < perf_total.s_postprocess_min) { perf_total.s_postprocess_min = perf_call[i].s_postprocess; }
+        perf_total.n_converged += (unsigned int)perf_call[i].converged;
+        if (perf_call[i].converged_flags & 1 << 4) { conv_iter += 1; }
+        if (perf_call[i].converged_flags & 1 << 3) { conv_ovf += 1; }
+#if defined(FPGA_STATISTICS_FILE_ENABLED)
+        if (fout != nullptr) {
+            std::fprintf(fout, "%d,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%8.6f,%u,%.1f,%8.6f,%8.6f,%8.6f,%u\n",
+                         i, perf_call[i].s_preconditioner_create, perf_call[i].s_analysis, perf_call[i].s_reorder,
+                         perf_call[i].s_mem_setup, perf_call[i].s_mem_h2d, perf_call[i].s_kernel_exec, perf_call[i].n_kernel_exec_cycles,
+                         perf_call[i].n_kernel_exec_iters, perf_call[i].s_mem_d2h, perf_call[i].s_solve, perf_call[i].s_postprocess,
+                         (unsigned int)perf_call[i].converged);
+        }
+#endif
+    }
+#if defined(FPGA_STATISTICS_FILE_ENABLED)
+    if (fout != nullptr) {
+        fclose(fout);
+    }
+#endif
+    perf_total.s_preconditioner_create_avg = perf_total.s_preconditioner_create / num_data_points;
+    perf_total.s_analysis_avg = perf_total.s_analysis / num_data_points;
+    perf_total.s_reorder_avg = perf_total.s_reorder / num_data_points;
+    perf_total.s_mem_setup_avg = perf_total.s_mem_setup / num_data_points;
+    perf_total.s_mem_h2d_avg = perf_total.s_mem_h2d / num_data_points;
+    perf_total.s_kernel_exec_avg = perf_total.s_kernel_exec / num_data_points;
+    perf_total.n_kernel_exec_cycles_avg = perf_total.n_kernel_exec_cycles / num_data_points;
+    perf_total.n_kernel_exec_iters_avg = perf_total.n_kernel_exec_iters / num_data_points;
+    perf_total.s_mem_d2h_avg = perf_total.s_mem_d2h / num_data_points;
+    perf_total.s_solve_avg = perf_total.s_solve / num_data_points;
+    perf_total.s_postprocess_avg = perf_total.s_postprocess / num_data_points;
+    std::printf("time preconditioner creation: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_preconditioner_create, perf_total.s_preconditioner_create_avg, perf_total.s_preconditioner_create_min, perf_total.s_preconditioner_create_max);
+    std::printf("time analysis...............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_analysis, perf_total.s_analysis_avg, perf_total.s_analysis_min, perf_total.s_analysis_max);
+    std::printf("time reorder................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_reorder, perf_total.s_reorder_avg, perf_total.s_reorder_min, perf_total.s_reorder_max);
+    std::printf("time memory setup...........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_mem_setup, perf_total.s_mem_setup_avg, perf_total.s_mem_setup_min, perf_total.s_mem_setup_max);
+    std::printf("time memory host2dev........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_mem_h2d, perf_total.s_mem_h2d_avg, perf_total.s_mem_h2d_min, perf_total.s_mem_h2d_max);
+    std::printf("time kernel execution.......: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_kernel_exec, perf_total.s_kernel_exec_avg, perf_total.s_kernel_exec_min, perf_total.s_kernel_exec_max);
+    std::printf("cycles kernel execution.....: total %lu, avg %lu, min %lu, max %lu\n",
+                perf_total.n_kernel_exec_cycles, perf_total.n_kernel_exec_cycles_avg, perf_total.n_kernel_exec_cycles_min, perf_total.n_kernel_exec_cycles_max);
+    std::printf("iterations kernel execution.: total %.1f, avg %.1f, min %.1f, max %.1f\n",
+                perf_total.n_kernel_exec_iters, perf_total.n_kernel_exec_iters_avg, perf_total.n_kernel_exec_iters_min, perf_total.n_kernel_exec_iters_max);
+    std::printf("time memory dev2host........: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_mem_d2h, perf_total.s_mem_d2h_avg, perf_total.s_mem_d2h_min, perf_total.s_mem_d2h_max);
+    std::printf("time solve..................: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_solve, perf_total.s_solve_avg, perf_total.s_solve_min, perf_total.s_solve_max);
+    std::printf("time postprocess............: total %8.6f s, avg %8.6f s, min %8.6f s, max %8.6f s\n",
+                perf_total.s_postprocess, perf_total.s_postprocess_avg, perf_total.s_postprocess_min, perf_total.s_postprocess_max);
+    std::printf("converged...................: %u/%u, with iter>%d=%u, overflow=%u\n",
+                perf_total.n_converged, num_data_points, maxit, conv_iter, conv_ovf);
+    std::printf("-----------------------\n");
+} //end generate_statistics()
+
+
+// Explicitly instantiate the constructor for every supported block size.
+// The macro body deliberately carries no trailing semicolon and no trailing
+// line continuation: each use below supplies the terminating ';', so no empty
+// declaration is produced and the macro cannot swallow a following line.
+#define INSTANTIATE_BDA_FUNCTIONS(n)                                                          \
+template FpgaSolverBackend<n>::FpgaSolverBackend(std::string, int, int, double, ILUReorder)
+
+INSTANTIATE_BDA_FUNCTIONS(1);
+INSTANTIATE_BDA_FUNCTIONS(2);
+INSTANTIATE_BDA_FUNCTIONS(3);
+INSTANTIATE_BDA_FUNCTIONS(4);
+
+#undef INSTANTIATE_BDA_FUNCTIONS
+
+} //namespace bda
--- a/opm/simulators/linalg/bda/FPGASolverBackend.hpp
+++ b/opm/simulators/linalg/bda/FPGASolverBackend.hpp
@ -0,0 +1,265 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
+#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
+
+#include <opm/simulators/linalg/bda/BdaSolver.hpp>
+#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
+
+#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
+#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
+#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
+
+namespace bda
+{
+
+/// This class implements an ilu0-bicgstab solver on FPGA
+/// It derives from BdaSolver<block_size> and overrides its solve_system() and get_result() entry points.
+template <unsigned int block_size>
+class FpgaSolverBackend : public BdaSolver<block_size>
+{
+    typedef BdaSolver<block_size> Base;
+    typedef FPGABILU0<block_size> Preconditioner;
+
+    // members inherited from the dependent base class must be named explicitly
+    using Base::N;
+    using Base::Nb;
+    using Base::nnz;
+    using Base::nnzb;
+    using Base::verbosity;
+    using Base::maxit;
+    using Base::tolerance;
+    using Base::initialized;
+
+private:
+    double *rx = nullptr; // reordered x
+    double *rb = nullptr; // reordered b
+    int *fromOrder = nullptr, *toOrder = nullptr;
+    bool analysis_done = false;
+    bool level_scheduling = false;
+
+    // LUMat will shallow copy rowPointers and colIndices of mat/rMat
+    std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr;
+    BlockedMatrix<block_size> *rMat = nullptr;
+    std::unique_ptr<Preconditioner> prec = nullptr;
+
+    // vectors with data processed by the preconditioner (input to the kernel)
+    void **processedPointers = nullptr;
+    int *processedSizes = nullptr;
+
+    unsigned int fpga_calls = 0;
+    bool perf_call_enabled = true;
+
+    // per call performance metrics
+    // Declared as a named struct rather than 'typedef struct { ... } name;':
+    // an unnamed class that receives its name from a typedef may not have
+    // default member initializers.
+    struct perf_call_metrics_t {
+      double s_preconditioner_create = 0.0;
+      double s_analysis = 0.0;
+      double s_reorder = 0.0;
+      double s_mem_setup = 0.0;
+      double s_mem_h2d = 0.0;
+      double s_kernel_exec = 0.0;
+      unsigned int n_kernel_exec_cycles = 0;
+      float n_kernel_exec_iters = 0.0;
+      double s_mem_d2h = 0.0;
+      double s_solve = 0.0;
+      double s_postprocess = 0.0;
+      bool converged = false;
+      unsigned int converged_flags = 0;
+    };
+    // cumulative performance metrics
+    // NOTE: these fields have no default initializers; they must be set before being read
+    struct perf_total_metrics_t {
+      double s_initialization;
+      double s_preconditioner_setup;
+      double s_preconditioner_create;
+      double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
+      double s_analysis;
+      double s_analysis_min,s_analysis_max,s_analysis_avg;
+      double s_reorder;
+      double s_reorder_min,s_reorder_max,s_reorder_avg;
+      double s_mem_setup;
+      double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
+      double s_mem_h2d;
+      double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
+      double s_kernel_exec;
+      double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
+      unsigned long n_kernel_exec_cycles;
+      unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
+      float n_kernel_exec_iters;
+      float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
+      double s_mem_d2h;
+      double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
+      double s_solve;
+      double s_solve_min,s_solve_max,s_solve_avg;
+      double s_postprocess;
+      double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
+      unsigned int n_converged;
+    };
+    std::vector<perf_call_metrics_t> perf_call;
+    perf_total_metrics_t perf_total;
+    // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
+    // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
+    unsigned int fpga_config_bits = 0;
+    bool fpga_disabled = false;
+    bool platform_awsf1;
+    unsigned int debugbufferSize;
+    unsigned long int *debugBuffer = nullptr;
+    unsigned int *databufferSize = nullptr;
+    unsigned char *dataBuffer[RW_BUF] = {nullptr};
+    unsigned int debug_outbuf_words;
+    int resultsNum;
+    int resultsBufferNum;
+    unsigned int resultsBufferSize[RES_BUF_MAX];
+    unsigned int result_offsets[6];
+    unsigned int kernel_cycles, kernel_iter_run;
+    double norms[4];
+    unsigned char last_norm_idx;
+    bool kernel_aborted, kernel_signature, kernel_overflow;
+    bool kernel_noresults;
+    bool kernel_wrafterend, kernel_dbgfifofull;
+    bool use_residuals = false;
+    bool use_LU_res = false;
+    int sequence = 0;
+    // TODO: these values may be sent via command line parameters
+    unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
+    unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
+    int nnzValArrays_size = 0;
+    int L_nnzValArrays_size = 0;
+    int U_nnzValArrays_size = 0;
+    // aliases to areas of the host data buffers
+    long unsigned int *setupArray = nullptr;
+    double **nnzValArrays  = nullptr;
+    short unsigned int *columnIndexArray = nullptr;
+    unsigned char *newRowOffsetArray = nullptr;
+    unsigned int *PIndexArray = nullptr;
+    unsigned int *colorSizesArray = nullptr;
+    double **L_nnzValArrays = nullptr;
+    short unsigned int *L_columnIndexArray = nullptr;
+    unsigned char *L_newRowOffsetArray = nullptr;
+    unsigned int *L_PIndexArray = nullptr;
+    unsigned int *L_colorSizesArray = nullptr;
+    double **U_nnzValArrays = nullptr;
+    short unsigned int *U_columnIndexArray = nullptr;
+    unsigned char *U_newRowOffsetArray = nullptr;
+    unsigned int *U_PIndexArray = nullptr;
+    unsigned int *U_colorSizesArray = nullptr;
+    double *BLKDArray = nullptr;
+    double *X1Array = nullptr, *X2Array = nullptr;
+    double *R1Array = nullptr, *R2Array = nullptr;
+    double *LresArray = nullptr, *UresArray = nullptr;
+    double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
+    // OpenCL variables
+    cl_device_id device_id;
+    cl_context context;
+    cl_command_queue commands;
+    cl_program program;
+    cl_kernel kernel;
+    cl_mem cldata[RW_BUF] = {nullptr};
+    cl_mem cldebug = nullptr;
+    // HW limits/configuration variables
+    unsigned int hw_x_vector_elem;
+    unsigned int hw_max_row_size;
+    unsigned int hw_max_column_size;
+    unsigned int hw_max_colors_size;
+    unsigned short hw_max_nnzs_per_row;
+    unsigned int hw_max_matrix_size;
+    bool hw_use_uram;
+    bool hw_write_ilu0_results;
+    unsigned short hw_dma_data_width;
+    unsigned char hw_x_vector_latency;
+    unsigned char hw_add_latency;
+    unsigned char hw_mult_latency;
+    unsigned char hw_mult_num;
+    unsigned char hw_num_read_ports;
+    unsigned char hw_num_write_ports;
+    unsigned short hw_reset_cycles;
+    unsigned short hw_reset_settle;
+    // debug
+    bool reset_data_buffers = false;
+    bool fill_results_buffers = false;
+    int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
+    bool dump_results = false;
+    char *data_dir = nullptr;
+    char *basename = nullptr;
+    unsigned short rst_assert_cycles = 0;
+    unsigned short rst_settle_cycles = 0;
+
+    /// Allocate host memory
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
+
+    /// Reorder the linear system so it corresponds with the coloring
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] b              input vector
+    void update_system(double *vals, double *b);
+
+    /// Analyse sparsity pattern to extract parallelism
+    /// \return true iff analysis was successful
+    bool analyse_matrix();
+
+    /// Perform ilu0-decomposition
+    /// \return true iff decomposition was successful
+    bool create_preconditioner();
+
+    /// Solve linear system
+    /// \param[inout] res         summary of solver result
+    void solve_system(BdaResult &res);
+
+    /// Generate FPGA backend statistics
+    void generate_statistics(void);
+
+public:
+
+    /// Construct an fpgaSolver
+    /// \param[in] fpga_bitstream             FPGA bitstream file name
+    /// \param[in] linear_solver_verbosity    verbosity of fpgaSolver
+    /// \param[in] maxit                      maximum number of iterations for fpgaSolver
+    /// \param[in] tolerance                  required relative tolerance for fpgaSolver
+    /// \param[in] opencl_ilu_reorder         select either level_scheduling or graph_coloring, see ILUReorder.hpp for explanation
+    FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
+
+    /// Destroy an fpgaSolver, and free memory
+    ~FpgaSolverBackend();
+
+    /// Solve linear system, A*x = b, matrix A must be in blocked-CSR format
+    /// \param[in] N              number of rows, divide by dim to get number of blockrows
+    /// \param[in] nnz            number of nonzeroes, divide by dim*dim to get number of blocks
+    /// \param[in] dim            size of block
+    /// \param[in] vals           array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
+    /// \param[in] rows           array of rowPointers, contains N/dim+1 values
+    /// \param[in] cols           array of columnIndices, contains nnz values
+    /// \param[in] b              input vector, contains N values
+    /// \param[in] wellContribs   WellContributions, not used in FPGA solver because it requires them already added to matrix A
+    /// \param[inout] res         summary of solver result
+    /// \return                   status code
+    SolverStatus solve_system(int N, int nnz, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) override;
+
+    /// Get result after linear solve, and perform postprocessing if necessary
+    /// \param[inout] x           resulting x vector, caller must guarantee that x points to a valid array
+    void get_result(double *x) override;
+
+}; // end class FpgaSolverBackend
+
+} //namespace bda
+
+#endif
+
--- a/opm/simulators/linalg/bda/FPGAUtils.cpp
+++ b/opm/simulators/linalg/bda/FPGAUtils.cpp
@ -0,0 +1,63 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sys/time.h>
+#include <fstream>
+
+namespace bda
+{
+
+/// Return the current time of day in seconds, as reported by gettimeofday();
+/// the fractional part carries the microseconds.
+double second(void)
+{
+    struct timeval now;
+    gettimeofday(&now, nullptr);
+    const double whole_seconds = static_cast<double>(now.tv_sec);
+    const double micro_seconds = static_cast<double>(now.tv_usec);
+    return whole_seconds + micro_seconds / 1000000.0;
+}
+
+/// Tell whether n is divisible by 2.
+bool even(int n)
+{
+    return (n % 2) == 0;
+}
+
+/// Round i up to a multiple of n.
+/// \param[in] i   value to round
+/// \param[in] n   granularity, expected to be positive
+/// \return        smallest multiple of n that is not less than i; i itself when n <= 0
+int roundUpTo(int i, int n)
+{
+    // a non-positive granularity gives nothing sensible to round to,
+    // and n == 0 would divide by zero below
+    if (n <= 0) {
+        return i;
+    }
+    const int rem = i % n;
+    if (rem == 0) {
+        return i;
+    }
+    // the remainder takes the sign of i: for a negative i, removing it already
+    // moves the value upwards to the next multiple
+    return (rem > 0) ? i + (n - rem) : i - rem;
+}
+
+/// Check whether filename can be opened for reading.
+/// \param[in] filename   path of the file to probe
+/// \return               true iff the file could be opened in read mode
+bool fileExists(const char *filename)
+{
+    FILE *handle = fopen(filename, "r");
+    const bool opened = (handle != nullptr);
+    if (opened) {
+        // the handle was only needed for the probe
+        fclose(handle);
+    }
+    return opened;
+}
+
+} //namespace bda
--- a/opm/simulators/linalg/bda/FPGAUtils.hpp
+++ b/opm/simulators/linalg/bda/FPGAUtils.hpp
@ -0,0 +1,39 @@
+/*
+  Copyright 2020 Equinor ASA
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef FPGA_UTILS_HEADER_INCLUDED
+#define FPGA_UTILS_HEADER_INCLUDED
+
+namespace bda
+{
+
+/// Overlays an unsigned long int and a double in the same storage.
+/// NOTE(review): presumably used to get at the bit pattern of a double; that
+/// relies on both members having the same size and on reading the member that
+/// was not written last — confirm against the users of this union.
+union double2int
+{
+    unsigned long int int_val;
+    double double_val;
+};
+
+/// Current time of day in seconds; the fractional part carries the microseconds
+double second(void);
+/// True iff n is divisible by 2
+bool even(int n);
+/// Round i up to a multiple of n; i is returned unchanged when it already is one
+int roundUpTo(int i, int n);
+/// True iff filename could be opened for reading
+bool fileExists(const char *filename);
+
+} // end namespace bda
+
+#endif // FPGA_UTILS_HEADER_INCLUDED
--- a/opm/simulators/linalg/bda/Reorder.cpp
+++ b/opm/simulators/linalg/bda/Reorder.cpp
@ -17,12 +17,7 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#include <vector>
-#include <cstring>
-#include <algorithm> // for fill()
 #include <random>
-#include <limits>
-#include <sstream>

 #include <opm/common/ErrorMacros.hpp>

@ -60,7 +55,7 @@ int colorBlockedNodes(int rows, const int *CSRRowPointers, const int *CSRColIndi
        std::mt19937 gen(rd());
        std::uniform_int_distribution<> uniform(0, std::numeric_limits<int>::max());
        {
-            for(int i = 0; i < rows; ++i){
+            for (int i = 0; i < rows; ++i) {
                randoms[i] = uniform(gen);
            }
        }
@ -180,7 +175,7 @@ void reorderBlockedMatrixByPattern(BlockedMatrix<block_size> *mat, int *toOrder,
        // put thisRow from the old matrix into row i of the new matrix
        rmat->rowPointers[i + 1] = rmat->rowPointers[i] + mat->rowPointers[thisRow + 1] - mat->rowPointers[thisRow];
        for (k = mat->rowPointers[thisRow]; k < mat->rowPointers[thisRow + 1]; k++) {
-            for (j = 0; j < bs * bs; j++){
+            for (j = 0; j < bs * bs; j++) {
                rmat->nnzValues[rIndex * bs * bs + j] = mat->nnzValues[k * bs * bs + j];
            }
            rmat->colIndices[rIndex] = mat->colIndices[k];
--- a/opm/simulators/linalg/bda/Reorder.hpp
+++ b/opm/simulators/linalg/bda/Reorder.hpp
@ -20,6 +20,8 @@
 #ifndef REORDER_HPP
 #define REORDER_HPP

+#include <vector>
+
 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>

 namespace bda
--- a/opm/simulators/linalg/bda/WellContributions.cpp
+++ b/opm/simulators/linalg/bda/WellContributions.cpp
@ -28,15 +28,18 @@
 namespace Opm
 {

-WellContributions::WellContributions(std::string gpu_mode){
-    if(gpu_mode.compare("cusparse") == 0){
+WellContributions::WellContributions(std::string accelerator_mode){
+    if(accelerator_mode.compare("cusparse") == 0){
        cuda_gpu = true;
    }
-    else if(gpu_mode.compare("opencl") == 0){
+    else if(accelerator_mode.compare("opencl") == 0){
        opencl_gpu = true;
    }
+    else if(accelerator_mode.compare("fpga") == 0){
+        // no flag is set for FPGA; this branch only keeps "fpga" from reaching the error below
+    }
    else{
-        OPM_THROW(std::logic_error, "Error: invalid GPU mode");
+        OPM_THROW(std::logic_error, "Invalid accelerator mode");
    }
 }

--- a/opm/simulators/linalg/bda/WellContributions.hpp
+++ b/opm/simulators/linalg/bda/WellContributions.hpp
@ -176,7 +176,7 @@ public:
    void alloc();

    /// Create a new WellContributions
-    WellContributions(std::string gpu_mode);
+    WellContributions(std::string accelerator_mode);

    /// Destroy a WellContributions, and free memory
    ~WellContributions();