From cbfe25d0f08b0211f46405052e5977882dd52c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Atgeirr=20Fl=C3=B8=20Rasmussen?= Date: Wed, 14 Jun 2023 09:18:53 +0200 Subject: [PATCH] Add NLDD nonlinear solver option. --- opm/simulators/flow/Banners.cpp | 9 +- opm/simulators/flow/Banners.hpp | 4 +- opm/simulators/flow/BlackoilModelEbos.hpp | 970 +++++++++++++++++- .../flow/BlackoilModelParametersEbos.hpp | 105 +- opm/simulators/flow/FlowMainEbos.hpp | 2 +- opm/simulators/linalg/setupPropertyTree.cpp | 15 + opm/simulators/linalg/setupPropertyTree.hpp | 1 + .../timestepping/SimulatorReport.cpp | 1 + 8 files changed, 1068 insertions(+), 39 deletions(-) diff --git a/opm/simulators/flow/Banners.cpp b/opm/simulators/flow/Banners.cpp index 0c4ee1f24..d7ac75d6c 100644 --- a/opm/simulators/flow/Banners.cpp +++ b/opm/simulators/flow/Banners.cpp @@ -112,13 +112,20 @@ void printFlowBanner(int nprocs, int nthreads, std::string_view moduleVersionNam } void printFlowTrailer(int nprocs, int nthreads, - const SimulatorReport& report) + const SimulatorReport& report, + const SimulatorReportSingle& localsolves_report) { std::ostringstream ss; ss << "\n\n================ End of simulation ===============\n\n"; ss << fmt::format("Number of MPI processes: {:9}\n", nprocs); ss << fmt::format("Threads per MPI process: {:9}\n", nthreads); report.reportFullyImplicit(ss); + + if (localsolves_report.total_linearizations > 0) { + ss << "====== Accumulated local solve data ======\n"; + localsolves_report.reportFullyImplicit(ss); + } + OpmLog::info(ss.str()); } diff --git a/opm/simulators/flow/Banners.hpp b/opm/simulators/flow/Banners.hpp index d88132f2a..c95e2eff9 100644 --- a/opm/simulators/flow/Banners.hpp +++ b/opm/simulators/flow/Banners.hpp @@ -28,6 +28,7 @@ namespace Opm { struct SimulatorReport; +struct SimulatorReportSingle; // Print an ASCII-art header to the PRT and DEBUG files. void printPRTHeader(const std::string& parameters, @@ -39,7 +40,8 @@ void printFlowBanner(int nprocs, int threads, std::string_view moduleVersionName // Print flow application trailer. void printFlowTrailer(int nprocs, int nthreads, - const SimulatorReport& report); + const SimulatorReport& report, + const SimulatorReportSingle& localsolves_report); } // namespace Opm diff --git a/opm/simulators/flow/BlackoilModelEbos.hpp b/opm/simulators/flow/BlackoilModelEbos.hpp index 210543a27..53a227810 100644 --- a/opm/simulators/flow/BlackoilModelEbos.hpp +++ b/opm/simulators/flow/BlackoilModelEbos.hpp @@ -27,6 +27,24 @@ #include #include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + #include #include #include @@ -43,6 +61,9 @@ #include #include #include + + + #include #include #include @@ -52,17 +73,24 @@ #include #include + +#include +#include + #include #include #include #include +#include + #include #include #include #include #include #include +#include #include #include @@ -200,6 +228,8 @@ namespace Opm { typedef typename SparseMatrixAdapter::IstlMatrix Mat; typedef Dune::BlockVector BVector; + using Domain = SubDomain; + typedef ISTLSolverEbos ISTLSolverType; class ComponentName @@ -295,14 +325,111 @@ namespace Opm { // compute global sum of number of cells global_nc_ = detail::countGlobalCells(grid_); convergence_reports_.reserve(300); // Often insufficient, but avoids frequent moves. + // TODO: remember to fix! 
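+            // Select the nonlinear solver variant. The "nldd" option partitions the grid
+            // into subdomains up front (see setupSubDomains() below), while "newton" keeps
+            // the existing fully implicit Newton behaviour.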
+ if (param_.nonlinear_solver_ == "nldd") { + if (terminal_output) { + OpmLog::info("Using Non-Linear Domain Decomposition solver (nldd)."); + } + setupSubDomains(); + } else if (param_.nonlinear_solver_ == "newton") { + if (terminal_output) { + OpmLog::info("Using Newton nonlinear solver."); + } + } else { + OPM_THROW(std::runtime_error, "Unknown nonlinear solver option: " + param_.nonlinear_solver_); + } } + bool isParallel() const { return grid_.comm().size() > 1; } + const EclipseState& eclState() const { return ebosSimulator_.vanguard().eclState(); } + + + void setupSubDomains() + { + // Create partitions. + const auto& [partition_vector, num_domains] = + this->partitionCells(); + + // Scan through partitioning to get correct size for each. + std::vector sizes(num_domains, 0); + for (const auto& p : partition_vector) { + ++sizes[p]; + } + + // Set up correctly sized vectors of entity seeds and of indices for each partition. + using EntitySeed = typename Grid::template Codim<0>::EntitySeed; + std::vector> seeds(num_domains); + std::vector> partitions(num_domains); + for (int domain = 0; domain < num_domains; ++domain) { + seeds[domain].resize(sizes[domain]); + partitions[domain].resize(sizes[domain]); + } + + // Iterate through grid once, setting the seeds of all partitions. + std::vector count(num_domains, 0); + const auto beg = grid_.template leafbegin<0>(); + const auto end = grid_.template leafend<0>(); + int cell = 0; + for (auto it = beg; it != end; ++it, ++cell) { + const int p = partition_vector[cell]; + seeds[p][count[p]] = it->seed(); + partitions[p][count[p]] = cell; + ++count[p]; + } + assert(count == sizes); + + // Create the domains. + for (int index = 0; index < num_domains; ++index) { + std::vector interior(partition_vector.size(), false); + for (int ix : partitions[index]) { + interior[ix] = true; + } + + Dune::SubGridPart view { + ebosSimulator_.vanguard().grid(), + std::move(seeds[index]) + }; + + this->domains_.emplace_back(index, + std::move(partitions[index]), + std::move(interior), + std::move(view)); + } + + // Set up container for the local system matrices. + domain_matrices_.resize(num_domains); + + // Set up container for the local linear solvers. + for (int index = 0; index < num_domains; ++index) { + // TODO: The ISTLSolverEbos constructor will make + // parallel structures appropriate for the full grid + // only. This must be addressed before going parallel. + FlowLinearSolverParameters param; + param.template init(ebosSimulator_.vanguard().eclState().getSimulationConfig().useCPR()); + // Override solver type with umfpack if small domain. + // Otherwise hardcode to ILU0 + if (domains_[index].cells.size() < 200) { + param.linsolver_ = "umfpack"; + } else { + param.linsolver_ = "ilu0"; + param.linear_solver_reduction_ = 1e-2; + } + param.linear_solver_print_json_definition_ = false; + domain_linsolvers_.emplace_back(ebosSimulator_, param); + } + + assert(int(domains_.size()) == num_domains); + } + + + + /// Called once before each time step. /// \param[in] timer simulation timer SimulatorReportSingle prepareStep(const SimulatorTimerInterface& timer) @@ -335,6 +462,12 @@ namespace Opm { std::cout << "equation scaling not supported yet" << std::endl; //updateEquationsScaling(); } + + if (!domains_.empty()) { + // Setup domain->well mapping. 
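+            // The well model must know which wells belong to which subdomain so that the
+            // per-domain assembly and convergence checks (assembleDomain(),
+            // getDomainWellConvergence()) can include the well equations in the local solves.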
+ wellModel().setupDomains(domains_); + } + report.pre_post_time += perfTimer.stop(); return report; @@ -348,18 +481,11 @@ namespace Opm { /// \param[in] iteration should be 0 for the first call of a new timestep /// \param[in] timer simulation timer /// \param[in] nonlinear_solver nonlinear solver used (for oscillation/relaxation control) - /// \param[in, out] reservoir_state reservoir state variables - /// \param[in, out] well_state well state variables template SimulatorReportSingle nonlinearIteration(const int iteration, const SimulatorTimerInterface& timer, NonlinearSolverType& nonlinear_solver) { - SimulatorReportSingle report; - failureReport_ = SimulatorReportSingle(); - Dune::Timer perfTimer; - - perfTimer.start(); if (iteration == 0) { // For each iteration we store in a vector the norms of the residual of // the mass balance for each active phase, the well flux and the well equations. @@ -370,8 +496,32 @@ namespace Opm { convergence_reports_.back().report.reserve(11); } + if (iteration == 0) { + return nonlinearIterationNewton(iteration, timer, nonlinear_solver); + } + if (param_.nonlinear_solver_ == "nldd") { + return nonlinearIterationNldd(iteration, timer, nonlinear_solver); + } else { + return nonlinearIterationNewton(iteration, timer, nonlinear_solver); + } + } + + + template + SimulatorReportSingle nonlinearIterationNewton(const int iteration, + const SimulatorTimerInterface& timer, + NonlinearSolverType& nonlinear_solver) + { + + // ----------- Set up reports and timer ----------- + SimulatorReportSingle report; + failureReport_ = SimulatorReportSingle(); + Dune::Timer perfTimer; + + perfTimer.start(); report.total_linearizations = 1; + // ----------- Assemble ----------- try { report += assembleReservoir(timer, iteration); report.assemble_time += perfTimer.stop(); @@ -379,16 +529,17 @@ namespace Opm { catch (...) { report.assemble_time += perfTimer.stop(); failureReport_ += report; - // todo (?): make the report an attribute of the class throw; // continue throwing the stick } + + // ----------- Check if converged ----------- std::vector residual_norms; perfTimer.reset(); perfTimer.start(); // the step is not considered converged until at least minIter iterations is done { - auto convrep = getConvergence(timer, iteration,residual_norms); + auto convrep = getConvergence(timer, iteration, residual_norms); report.converged = convrep.converged() && iteration > nonlinear_solver.minIter();; ConvergenceReport::Severity severity = convrep.severityOfWorstFailure(); convergence_reports_.back().report.push_back(std::move(convrep)); @@ -402,14 +553,13 @@ namespace Opm { } report.update_time += perfTimer.stop(); residual_norms_history_.push_back(residual_norms); + + // ----------- If not converged, solve linear system and do Newton update ----------- if (!report.converged) { perfTimer.reset(); perfTimer.start(); report.total_newton_iterations = 1; - // enable single precision for solvers when dt is smaller then 20 days - //residual_.singlePrecision = (unit::convert::to(dt, unit::day) < 20.) ; - // Compute the nonlinear update. unsigned nc = ebosSimulator_.model().numGridDof(); BVector x(nc); @@ -417,13 +567,15 @@ namespace Opm { // Solve the linear system. linear_solve_setup_time_ = 0.0; try { - // apply the Schur compliment of the well model to the reservoir linearized - // equations + // Apply the Schur complement of the well model to + // the reservoir linearized equations. // Note that linearize may throw for MSwells. 
wellModel().linearize(ebosSimulator().model().linearizer().jacobian(), ebosSimulator().model().linearizer().residual()); + // ---- Solve linear system ---- solveJacobianSystem(x); + report.linear_solve_setup_time += linear_solve_setup_time_; report.linear_solve_time += perfTimer.stop(); report.total_linear_iterations += linearIterationsLastSolve(); @@ -449,7 +601,7 @@ namespace Opm { // Stabilize the nonlinear update. bool isOscillate = false; bool isStagnate = false; - nonlinear_solver.detectOscillations(residual_norms_history_, iteration, isOscillate, isStagnate); + nonlinear_solver.detectOscillations(residual_norms_history_, residual_norms_history_.size() - 1, isOscillate, isStagnate); if (isOscillate) { current_relaxation_ -= nonlinear_solver.relaxIncrement(); current_relaxation_ = std::max(current_relaxation_, nonlinear_solver.relaxMax()); @@ -462,6 +614,7 @@ namespace Opm { nonlinear_solver.stabilizeNonlinearUpdate(x, dx_old_, current_relaxation_); } + // ---- Newton update ---- // Apply the update, with considering model-dependent limitations and // chopping of the update. updateSolution(x); @@ -472,6 +625,338 @@ namespace Opm { return report; } + + + + template + SimulatorReportSingle nonlinearIterationNldd(const int iteration, + const SimulatorTimerInterface& timer, + NonlinearSolverType& nonlinear_solver) + { + // ----------- Set up reports and timer ----------- + SimulatorReportSingle report; + failureReport_ = SimulatorReportSingle(); + Dune::Timer perfTimer; + + perfTimer.start(); + report.total_linearizations = 1; + + // ----------- Assemble ----------- + try { + report += assembleReservoir(timer, iteration); + report.assemble_time += perfTimer.stop(); + } + catch (...) { + report.assemble_time += perfTimer.stop(); + failureReport_ += report; + throw; // continue throwing the stick + } + + // ----------- Check if converged ----------- + std::vector residual_norms; + perfTimer.reset(); + perfTimer.start(); + // the step is not considered converged until at least minIter iterations is done + { + auto convrep = getConvergence(timer, iteration, residual_norms); + report.converged = convrep.converged() && iteration > nonlinear_solver.minIter();; + ConvergenceReport::Severity severity = convrep.severityOfWorstFailure(); + convergence_reports_.back().report.push_back(std::move(convrep)); + + // Throw if any NaN or too large residual found. + if (severity == ConvergenceReport::Severity::NotANumber) { + OPM_THROW(NumericalProblem, "NaN residual found!"); + } else if (severity == ConvergenceReport::Severity::TooLarge) { + OPM_THROW_NOLOG(NumericalProblem, "Too large residual found!"); + } + } + report.update_time += perfTimer.stop(); + residual_norms_history_.push_back(residual_norms); + + if (report.converged) { + return report; + } + + // ----------- If not converged, do an NLDD iteration ----------- + + auto& solution = ebosSimulator().model().solution(0); + auto initial_solution = solution; + auto locally_solved = initial_solution; + + // ----------- Decide on an ordering for the domains ----------- + std::vector domain_order(domains_.size()); + if (param_.local_solve_approach_ == "gauss-seidel") { + // TODO: enable flexibility and choice in choosing domain ordering approach. + if (true) { + // Use average pressures to order domains. 
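+                    // With the Gauss-Seidel approach the ordering matters: domains solved later
+                    // in the sweep are assembled against the already-updated solutions of earlier
+                    // domains. Solving high-pressure regions first is a heuristic choice.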
+ std::vector> avgpress_per_domain(domains_.size()); + for (const auto& domain : domains_) { + double press_sum = 0.0; + for (const int c : domain.cells) { + press_sum += solution[c][Indices::pressureSwitchIdx]; + } + const double avgpress = press_sum / domain.cells.size(); + avgpress_per_domain[domain.index] = std::make_pair(avgpress, domain.index); + } + // Lexicographical sort by pressure, then index. + std::sort(avgpress_per_domain.begin(), avgpress_per_domain.end()); + // Reverse since we want high-pressure regions solved first. + std::reverse(avgpress_per_domain.begin(), avgpress_per_domain.end()); + for (size_t ii = 0; ii < domains_.size(); ++ii) { + domain_order[ii] = avgpress_per_domain[ii].second; + } + } else { + // Use maximum residual to order domains. + const auto& residual = ebosSimulator_.model().linearizer().residual(); + const int num_vars = residual[0].size(); + std::vector> maxres_per_domain(domains_.size()); + for (const auto& domain : domains_) { + double maxres = 0.0; + for (const int c : domain.cells) { + for (int ii = 0; ii < num_vars; ++ii) { + maxres = std::max(maxres, std::fabs(residual[c][ii])); + } + } + maxres_per_domain[domain.index] = std::make_pair(maxres, domain.index); + } + // Lexicographical sort by pressure, then index. + std::sort(maxres_per_domain.begin(), maxres_per_domain.end()); + // Reverse since we want high-pressure regions solved first. + std::reverse(maxres_per_domain.begin(), maxres_per_domain.end()); + for (size_t ii = 0; ii < domains_.size(); ++ii) { + domain_order[ii] = maxres_per_domain[ii].second; + } + } + } else { + std::iota(domain_order.begin(), domain_order.end(), 0); + } + + // ----------- Solve each domain separately ----------- + std::vector domain_reports(domains_.size()); + for (const int domain_index : domain_order) { + const auto& domain = domains_[domain_index]; + SimulatorReportSingle local_report; + if (param_.local_solve_approach_ == "jacobi") { + auto initial_local_well_primary_vars = wellModel().getPrimaryVarsDomain(domain); + auto initial_local_solution = Details::extractVector(solution, domain.cells); + auto res = solveDomain(domain, timer, iteration); + local_report = res.first; + if (local_report.converged) { + auto local_solution = Details::extractVector(solution, domain.cells); + Details::setGlobal(local_solution, domain.cells, locally_solved); + Details::setGlobal(initial_local_solution, domain.cells, solution); + ebosSimulator_.model().invalidateAndUpdateIntensiveQuantities(/*timeIdx=*/0, domain.view); + } else { + wellModel().setPrimaryVarsDomain(domain, initial_local_well_primary_vars); + Details::setGlobal(initial_local_solution, domain.cells, solution); + ebosSimulator_.model().invalidateAndUpdateIntensiveQuantities(/*timeIdx=*/0, domain.view); + } + } else { + assert(param_.local_solve_approach_ == "gauss-seidel"); + auto initial_local_well_primary_vars = wellModel().getPrimaryVarsDomain(domain); + auto initial_local_solution = Details::extractVector(solution, domain.cells); + auto res = solveDomain(domain, timer, iteration); + local_report = res.first; + if (!local_report.converged) { + // We look at the detailed convergence report to evaluate + // if we should accept the unconverged solution. + const auto& convrep = res.second; + // We do not accept a solution if the wells are unconverged. + if (!convrep.wellFailed()) { + // Calculare the sums of the mb and cnv failures. 
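+                        // (mb = scaled mass-balance residual, cnv = local CNV measure. Small
+                        // accumulated violations are tolerated here, since the global Newton
+                        // update at the end of the NLDD iteration must still satisfy the
+                        // global convergence criteria.)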
+ double mb_sum = 0.0; + double cnv_sum = 0.0; + for (const auto& rc : convrep.reservoirConvergence()) { + if (rc.type() == ConvergenceReport::ReservoirFailure::Type::MassBalance) { + mb_sum += rc.value(); + } else if (rc.type() == ConvergenceReport::ReservoirFailure::Type::Cnv) { + cnv_sum += rc.value(); + } + } + // If not too high, we overrule the convergence failure. + const double acceptable_local_mb_sum = 1e-3; + const double acceptable_local_cnv_sum = 1.0; + if (mb_sum < acceptable_local_mb_sum && cnv_sum < acceptable_local_cnv_sum) { + local_report.converged = true; + OpmLog::debug("Accepting solution in unconverged domain " + std::to_string(domain.index)); + } + } + } + if (local_report.converged) { + auto local_solution = Details::extractVector(solution, domain.cells); + Details::setGlobal(local_solution, domain.cells, locally_solved); + } else { + wellModel().setPrimaryVarsDomain(domain, initial_local_well_primary_vars); + Details::setGlobal(initial_local_solution, domain.cells, solution); + ebosSimulator_.model().invalidateAndUpdateIntensiveQuantities(/*timeIdx=*/0, domain.view); + } + } + // This should have updated the global matrix to be + // dR_i/du_j evaluated at new local solutions for + // i == j, at old solution for i != j. + if (!local_report.converged) { + // TODO: more proper treatment, including in parallel. + OpmLog::debug("Convergence failure in domain " + std::to_string(domain.index)); + } + domain_reports[domain.index] = local_report; + } + + // Log summary of local solve convergence to DBG file. + { + int num_converged = 0; + SimulatorReportSingle rep; + for (const auto& dr : domain_reports) { + if (dr.converged) { + ++num_converged; + } + rep += dr; + } + std::ostringstream os; + os << fmt::format("Local solves finished. Converged for {}/{} domains.\n", + num_converged, domain_reports.size()); + rep.reportFullyImplicit(os, nullptr); + OpmLog::debug(os.str()); + local_reports_accumulated_ += rep; + } + + if (param_.local_solve_approach_ == "jacobi") { + solution = locally_solved; + ebosSimulator_.model().invalidateAndUpdateIntensiveQuantities(/*timeIdx=*/0); + } + + // Finish with a Newton step. + // Note that the "iteration + 100" is a simple way to avoid entering + // "if (iteration == 0)" and similar blocks, and also makes it a little + // easier to spot the iteration residuals in the DBG file. A more sophisticated + // approach can be done later. + auto rep = nonlinearIterationNewton(iteration + 100, timer, nonlinear_solver); + report += rep; + if (rep.converged) { + report.converged = true; + } + return report; + } + + + + + std::pair + solveDomain(const Domain& domain, + const SimulatorTimerInterface& timer, + const int global_iteration, + const bool initial_assembly_required = false) + { + SimulatorReportSingle report; + Dune::Timer solveTimer; + solveTimer.start(); + Dune::Timer detailTimer; + + ebosSimulator_.model().newtonMethod().setIterationIndex(0); + + // When called, if assembly has already been performed + // with the initial values, we only need to check + // for local convergence. Otherwise, we must do a local + // assembly. + int iter = 0; + if (initial_assembly_required) { + detailTimer.start(); + ebosSimulator_.model().newtonMethod().setIterationIndex(iter); + // TODO: we should have a beginIterationLocal function() + // only handling the well model for now + ebosSimulator_.problem().wellModel().assembleDomain(ebosSimulator_.model().newtonMethod().numIterations(), + ebosSimulator_.timeStepSize(), + domain); + // Assemble reservoir locally. 
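+                // assembleReservoirDomain() is expected to linearize only the cells of this
+                // subdomain, leaving the rest of the global Jacobian at its previous values
+                // (cf. the dR_i/du_j comment in nonlinearIterationNldd()).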
+ report += assembleReservoirDomain(domain, iter); + report.assemble_time += detailTimer.stop(); + } + detailTimer.reset(); + detailTimer.start(); + std::vector resnorms; + auto convreport = getDomainConvergence(domain, timer, 0, resnorms); + if (convreport.converged()) { + // TODO: set more info, timing etc. + report.converged = true; + return { report, convreport }; + } + + // We have already assembled for the first iteration, + // but not done the Schur complement for the wells yet. + detailTimer.reset(); + detailTimer.start(); + wellModel().linearizeDomain(domain, + ebosSimulator().model().linearizer().jacobian(), + ebosSimulator().model().linearizer().residual()); + const double tt1 = detailTimer.stop(); + report.assemble_time += tt1; + report.assemble_time_well += tt1; + + // Local Newton loop. + const int max_iter = param_.max_local_solve_iterations_; + bool converged = false; + do { + // Solve local linear system. + // Note that x has full size, we expect it to be nonzero only for in-domain cells. + const int nc = grid_.size(0); + BVector x(nc); + detailTimer.reset(); + detailTimer.start(); + solveJacobianSystemDomain(domain, x); + wellModel().postSolveDomain(x, domain); + report.linear_solve_time += detailTimer.stop(); + report.linear_solve_setup_time += linear_solve_setup_time_; + report.total_linear_iterations = linearIterationsLastSolve(); + + // Update local solution. // TODO: x is still full size, should we optimize it? + detailTimer.reset(); + detailTimer.start(); + updateDomainSolution(domain, x); + report.update_time += detailTimer.stop(); + + // Assemble well and reservoir. + detailTimer.reset(); + detailTimer.start(); + ++iter; + ebosSimulator_.model().newtonMethod().setIterationIndex(iter); + // TODO: we should have a beginIterationLocal function() + // only handling the well model for now + // Assemble reservoir locally. + ebosSimulator_.problem().wellModel().assembleDomain(ebosSimulator_.model().newtonMethod().numIterations(), + ebosSimulator_.timeStepSize(), + domain); + report += assembleReservoirDomain(domain, iter); + report.assemble_time += detailTimer.stop(); + + // Check for local convergence. + detailTimer.reset(); + detailTimer.start(); + convreport = getDomainConvergence(domain, timer, iter, resnorms); + + // apply the Schur complement of the well model to the + // reservoir linearized equations + detailTimer.reset(); + detailTimer.start(); + wellModel().linearizeDomain(domain, + ebosSimulator().model().linearizer().jacobian(), + ebosSimulator().model().linearizer().residual()); + const double tt2 = detailTimer.stop(); + report.assemble_time += tt2; + report.assemble_time_well += tt2; + } while (!convreport.converged() && iter <= max_iter); + + ebosSimulator_.problem().endIteration(); + + report.converged = convreport.converged(); + report.total_newton_iterations = iter; + report.total_linearizations = iter; + report.total_time = solveTimer.stop(); + // TODO: set more info, timing etc. 
+ return { report, convreport }; + } + + + + void printIf(int c, double x, double y, double eps, std::string type) { if (std::abs(x-y) > eps) { std::cout << type << " " <(Details::extractMatrix(main_matrix, domain.cells)); + } + auto& jac = *domain_matrices_[domain.index]; + auto res = Details::extractVector(ebosSimulator_.model().linearizer().residual(), domain.cells); + auto x = res; + + // set initial guess + global_x = 0.0; + x = 0.0; + + auto& linsolver = domain_linsolvers_[domain.index]; + + linsolver.prepare(jac, res); + linear_solve_setup_time_ = perfTimer.stop(); + linsolver.setResidual(res); + linsolver.solve(x); + + Details::setGlobal(x, domain.cells, global_x); + } + /// Solve the Jacobian system Jx = r where J is the Jacobian and /// r is the residual. void solveJacobianSystem(BVector& x) { - auto& ebosJac = ebosSimulator_.model().linearizer().jacobian(); + auto& ebosJac = ebosSimulator_.model().linearizer().jacobian().istlMatrix(); auto& ebosResid = ebosSimulator_.model().linearizer().residual(); // set initial guess @@ -619,12 +1147,32 @@ namespace Opm { // account for parallelization properly. since the residual of ECFV // discretizations does not need to be synchronized across processes to be // consistent, this is not relevant for OPM-flow... - ebosSolver.setMatrix(ebosJac); ebosSolver.solve(x); } + /// Apply an update to the primary variables. + void updateDomainSolution(const Domain& domain, const BVector& dx) + { + auto& ebosNewtonMethod = ebosSimulator_.model().newtonMethod(); + SolutionVector& solution = ebosSimulator_.model().solution(/*timeIdx=*/0); + + ebosNewtonMethod.update_(/*nextSolution=*/solution, + /*curSolution=*/solution, + /*update=*/dx, + /*resid=*/dx, + domain.cells); // the update routines of the black + // oil model do not care about the + // residual + + // if the solution is updated, the intensive quantities need to be recalculated + ebosSimulator_.model().invalidateAndUpdateIntensiveQuantities(/*timeIdx=*/0, domain.view); + } + + + + /// Apply an update to the primary variables. void updateSolution(const BVector& dx) { @@ -719,7 +1267,8 @@ namespace Opm { /// of the cells associated with a numerical aquifer. 
std::tuple localConvergenceData(std::vector& R_sum, std::vector& maxCoeff, - std::vector& B_avg) + std::vector& B_avg, + std::vector& maxCoeffCell) { OPM_TIMEBLOCK(localConvergenceData); double pvSumLocal = 0.0; @@ -734,9 +1283,11 @@ namespace Opm { OPM_BEGIN_PARALLEL_TRY_CATCH(); for (const auto& elem : elements(gridView, Dune::Partitions::interior)) { elemCtx.updatePrimaryStencil(elem); - elemCtx.updatePrimaryIntensiveQuantities(/*timeIdx=*/0); + // elemCtx.updatePrimaryIntensiveQuantities(/*timeIdx=*/0); + const unsigned cell_idx = elemCtx.globalSpaceIndex(/*spaceIdx=*/0, /*timeIdx=*/0); - const auto& intQuants = elemCtx.intensiveQuantities(/*spaceIdx=*/0, /*timeIdx=*/0); + // const auto& intQuants = elemCtx.intensiveQuantities(/*spaceIdx=*/0, /*timeIdx=*/0); + const auto& intQuants = *(ebosSimulator_.model().cachedIntensiveQuantities(cell_idx, /*timeIdx=*/ 0)); const auto& fs = intQuants.fluidState(); const double pvValue = ebosProblem.referencePorosity(cell_idx, /*timeIdx=*/0) * ebosModel.dofTotalVolume( cell_idx ); @@ -759,7 +1310,140 @@ namespace Opm { const auto R2 = ebosResid[cell_idx][compIdx]; R_sum[ compIdx ] += R2; - maxCoeff[ compIdx ] = std::max( maxCoeff[ compIdx ], std::abs( R2 ) / pvValue ); + const double Rval = std::abs( R2 ) / pvValue; + if (Rval > maxCoeff[ compIdx ]) { + maxCoeff[ compIdx ] = Rval; + maxCoeffCell[ compIdx ] = cell_idx; + } + } + + if constexpr (has_solvent_) { + B_avg[ contiSolventEqIdx ] += 1.0 / intQuants.solventInverseFormationVolumeFactor().value(); + const auto R2 = ebosResid[cell_idx][contiSolventEqIdx]; + R_sum[ contiSolventEqIdx ] += R2; + maxCoeff[ contiSolventEqIdx ] = std::max( maxCoeff[ contiSolventEqIdx ], std::abs( R2 ) / pvValue ); + } + if constexpr (has_extbo_) { + B_avg[ contiZfracEqIdx ] += 1.0 / fs.invB(FluidSystem::gasPhaseIdx).value(); + const auto R2 = ebosResid[cell_idx][contiZfracEqIdx]; + R_sum[ contiZfracEqIdx ] += R2; + maxCoeff[ contiZfracEqIdx ] = std::max( maxCoeff[ contiZfracEqIdx ], std::abs( R2 ) / pvValue ); + } + if constexpr (has_polymer_) { + B_avg[ contiPolymerEqIdx ] += 1.0 / fs.invB(FluidSystem::waterPhaseIdx).value(); + const auto R2 = ebosResid[cell_idx][contiPolymerEqIdx]; + R_sum[ contiPolymerEqIdx ] += R2; + maxCoeff[ contiPolymerEqIdx ] = std::max( maxCoeff[ contiPolymerEqIdx ], std::abs( R2 ) / pvValue ); + } + if constexpr (has_foam_) { + B_avg[ contiFoamEqIdx ] += 1.0 / fs.invB(FluidSystem::gasPhaseIdx).value(); + const auto R2 = ebosResid[cell_idx][contiFoamEqIdx]; + R_sum[ contiFoamEqIdx ] += R2; + maxCoeff[ contiFoamEqIdx ] = std::max( maxCoeff[ contiFoamEqIdx ], std::abs( R2 ) / pvValue ); + } + if constexpr (has_brine_) { + B_avg[ contiBrineEqIdx ] += 1.0 / fs.invB(FluidSystem::waterPhaseIdx).value(); + const auto R2 = ebosResid[cell_idx][contiBrineEqIdx]; + R_sum[ contiBrineEqIdx ] += R2; + maxCoeff[ contiBrineEqIdx ] = std::max( maxCoeff[ contiBrineEqIdx ], std::abs( R2 ) / pvValue ); + } + + if constexpr (has_polymermw_) { + static_assert(has_polymer_); + + B_avg[contiPolymerMWEqIdx] += 1.0 / fs.invB(FluidSystem::waterPhaseIdx).value(); + // the residual of the polymer molecular equation is scaled down by a 100, since molecular weight + // can be much bigger than 1, and this equation shares the same tolerance with other mass balance equations + // TODO: there should be a more general way to determine the scaling-down coefficient + const auto R2 = ebosResid[cell_idx][contiPolymerMWEqIdx] / 100.; + R_sum[contiPolymerMWEqIdx] += R2; + maxCoeff[contiPolymerMWEqIdx] = std::max( 
maxCoeff[contiPolymerMWEqIdx], std::abs( R2 ) / pvValue ); + } + + if constexpr (has_energy_) { + B_avg[ contiEnergyEqIdx ] += 1.0; + const auto R2 = ebosResid[cell_idx][contiEnergyEqIdx]; + R_sum[ contiEnergyEqIdx ] += R2; + maxCoeff[ contiEnergyEqIdx ] = std::max( maxCoeff[ contiEnergyEqIdx ], std::abs( R2 ) / pvValue ); + } + + } + + OPM_END_PARALLEL_TRY_CATCH("BlackoilModelEbos::localConvergenceData() failed: ", grid_.comm()); + + // compute local average in terms of global number of elements + const int bSize = B_avg.size(); + for ( int i = 0; i& R_sum, + std::vector& maxCoeff, + std::vector& B_avg, + std::vector& maxCoeffCell) + { + double pvSumLocal = 0.0; + const auto& ebosModel = ebosSimulator_.model(); + const auto& ebosProblem = ebosSimulator_.problem(); + + const auto& ebosResid = ebosSimulator_.model().linearizer().residual(); + + ElementContext elemCtx(ebosSimulator_); + const auto& gridView = domain.view; + const auto& elemEndIt = gridView.template end(); + OPM_BEGIN_PARALLEL_TRY_CATCH(); + for (auto elemIt = gridView.template begin(); + elemIt != elemEndIt; + ++elemIt) + { + if (elemIt->partitionType() != Dune::InteriorEntity) { + continue; + } + const auto& elem = *elemIt; + elemCtx.updatePrimaryStencil(elem); + // elemCtx.updatePrimaryIntensiveQuantities(/*timeIdx=*/0); + const unsigned cell_idx = elemCtx.globalSpaceIndex(/*spaceIdx=*/0, /*timeIdx=*/0); + + // const auto& intQuants = elemCtx.intensiveQuantities(/*spaceIdx=*/0, /*timeIdx=*/0); + auto* intQuantsPtr = ebosSimulator_.model().cachedIntensiveQuantities(cell_idx, /*timeIdx=*/ 0); + // const auto& intQuants = *(ebosSimulator_.model().cachedIntensiveQuantities(cell_idx, /*timeIdx=*/ 0)); + if (intQuantsPtr == nullptr) { + OpmLog::debug("** no cached intensive quantities for cell " + std::to_string(cell_idx)); + elemCtx.updatePrimaryIntensiveQuantities(0); + intQuantsPtr = ebosSimulator_.model().cachedIntensiveQuantities(cell_idx, /*timeIdx=*/ 0); + assert(intQuantsPtr != nullptr); + } + const auto& intQuants = *intQuantsPtr; + + const auto& fs = intQuants.fluidState(); + + const double pvValue = ebosProblem.referencePorosity(cell_idx, /*timeIdx=*/0) * ebosModel.dofTotalVolume( cell_idx ); + pvSumLocal += pvValue; + + for (unsigned phaseIdx = 0; phaseIdx < FluidSystem::numPhases; ++phaseIdx) + { + if (!FluidSystem::phaseIsActive(phaseIdx)) { + continue; + } + + const unsigned compIdx = Indices::canonicalToActiveComponentIndex(FluidSystem::solventComponentIndex(phaseIdx)); + + B_avg[ compIdx ] += 1.0 / fs.invB(phaseIdx).value(); + const auto R2 = ebosResid[cell_idx][compIdx]; + + R_sum[ compIdx ] += R2; + const double Rval = std::abs( R2 ) / pvValue; + if (Rval > maxCoeff[ compIdx ]) { + maxCoeff[ compIdx ] = Rval; + maxCoeffCell[ compIdx ] = cell_idx; + } } if constexpr (has_solvent_) { @@ -842,12 +1526,43 @@ namespace Opm { const int bSize = B_avg.size(); for ( int i = 0; i& B_avg, double dt) + { + double errorPV{}; + const auto& ebosModel = ebosSimulator_.model(); + const auto& ebosProblem = ebosSimulator_.problem(); + const auto& ebosResid = ebosSimulator_.model().linearizer().residual(); + + for (const int cell_idx : domain.cells) + { + const double pvValue = ebosProblem.referencePorosity(cell_idx, /*timeIdx=*/0) * ebosModel.dofTotalVolume( cell_idx ); + const auto& cellResidual = ebosResid[cell_idx]; + bool cnvViolated = false; + + for (unsigned eqIdx = 0; eqIdx < cellResidual.size(); ++eqIdx) + { + using std::fabs; + Scalar CNV = cellResidual[eqIdx] * dt * B_avg[eqIdx] / pvValue; + cnvViolated = cnvViolated 
|| (fabs(CNV) > param_.tolerance_cnv_); + } + + if (cnvViolated) + { + errorPV += pvValue; + } + } + return errorPV; + } + + /// \brief Compute the total pore volume of cells violating CNV that are not part /// of a numerical aquifer. double computeCnvErrorPv(const std::vector& B_avg, double dt) @@ -870,7 +1585,7 @@ namespace Opm { continue; } elemCtx.updatePrimaryStencil(elem); - elemCtx.updatePrimaryIntensiveQuantities(/*timeIdx=*/0); + // elemCtx.updatePrimaryIntensiveQuantities(/*timeIdx=*/0); const unsigned cell_idx = elemCtx.globalSpaceIndex(/*spaceIdx=*/0, /*timeIdx=*/0); const double pvValue = ebosProblem.referencePorosity(cell_idx, /*timeIdx=*/0) * ebosModel.dofTotalVolume( cell_idx ); const auto& cellResidual = ebosResid[cell_idx]; @@ -894,6 +1609,119 @@ namespace Opm { return grid_.comm().sum(errorPV); } + ConvergenceReport getDomainReservoirConvergence(const double reportTime, + const double dt, + const int iteration, + const Domain& domain, + std::vector& B_avg, + std::vector& residual_norms) + { + typedef std::vector< Scalar > Vector; + + const int numComp = numEq; + Vector R_sum(numComp, 0.0 ); + Vector maxCoeff(numComp, std::numeric_limits< Scalar >::lowest() ); + std::vector maxCoeffCell(numComp, -1); + const double pvSum = localDomainConvergenceData(domain, R_sum, maxCoeff, B_avg, maxCoeffCell); + + // compute global sum and max of quantities + // const double pvSum = convergenceReduction(grid_.comm(), pvSumLocal, + // R_sum, maxCoeff, B_avg); + + auto cnvErrorPvFraction = computeCnvErrorPvLocal(domain, B_avg, dt); + cnvErrorPvFraction /= pvSum; + + const double tol_mb = param_.local_tolerance_scaling_mb_ * param_.tolerance_mb_; + // Default value of relaxed_max_pv_fraction_ is 0.03 and min_strict_cnv_iter_ is 0. + // For each iteration, we need to determine whether to use the relaxed CNV tolerance. + // To disable the usage of relaxed CNV tolerance, you can set the relaxed_max_pv_fraction_ to be 0. + const bool use_relaxed = cnvErrorPvFraction < param_.relaxed_max_pv_fraction_ && iteration >= param_.min_strict_cnv_iter_; + // Tighter bound for local convergence should increase the + // likelyhood of: local convergence => global convergence + const double tol_cnv = param_.local_tolerance_scaling_cnv_ + * (use_relaxed ? param_.tolerance_cnv_relaxed_ : param_.tolerance_cnv_); + + // Finish computation + std::vector CNV(numComp); + std::vector mass_balance_residual(numComp); + for ( int compIdx = 0; compIdx < numComp; ++compIdx ) + { + CNV[compIdx] = B_avg[compIdx] * dt * maxCoeff[compIdx]; + mass_balance_residual[compIdx] = std::abs(B_avg[compIdx]*R_sum[compIdx]) * dt / pvSum; + residual_norms.push_back(CNV[compIdx]); + } + + // Create convergence report. + ConvergenceReport report; + using CR = ConvergenceReport; + for (int compIdx = 0; compIdx < numComp; ++compIdx) { + double res[2] = { mass_balance_residual[compIdx], CNV[compIdx] }; + CR::ReservoirFailure::Type types[2] = { CR::ReservoirFailure::Type::MassBalance, + CR::ReservoirFailure::Type::Cnv }; + double tol[2] = { tol_mb, tol_cnv }; + int cell[2] = { -1, maxCoeffCell[compIdx] }; // No cell associated with MB failures. 
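+                // Note: cell[] is currently only referenced by the commented-out extended
+                // failure-reporting arguments below.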
+ for (int ii : {0, 1}) { + if (std::isnan(res[ii])) { + report.setReservoirFailed({types[ii], CR::Severity::NotANumber, compIdx});//, cell[ii], res[ii]}); + if ( terminal_output_ ) { + OpmLog::debug("NaN residual for " + compNames_.name(compIdx) + " equation."); + } + } else if (res[ii] > maxResidualAllowed()) { + report.setReservoirFailed({types[ii], CR::Severity::TooLarge, compIdx});//, cell[ii], res[ii]}); + if ( terminal_output_ ) { + OpmLog::debug("Too large residual for " + compNames_.name(compIdx) + " equation."); + } + } else if (res[ii] < 0.0) { + report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});//, cell[ii], res[ii]}); + if ( terminal_output_ ) { + OpmLog::debug("Negative residual for " + compNames_.name(compIdx) + " equation."); + } + } else if (res[ii] > tol[ii]) { + report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});//, cell[ii], res[ii]}); + } + } + } + + // Output of residuals. + if ( terminal_output_ ) + { + // Only rank 0 does print to std::cout + if (iteration == 0) { + std::string msg = fmt::format("Domain {}, size {}, containing cell {}\n| Iter", + domain.index, domain.cells.size(), domain.cells[0]); + for (int compIdx = 0; compIdx < numComp; ++compIdx) { + msg += " MB("; + msg += compNames_.name(compIdx)[0]; + msg += ") "; + } + for (int compIdx = 0; compIdx < numComp; ++compIdx) { + msg += " CNV("; + msg += compNames_.name(compIdx)[0]; + msg += ") "; + } + OpmLog::debug(msg); + } + std::ostringstream ss; + ss << "| "; + const std::streamsize oprec = ss.precision(3); + const std::ios::fmtflags oflags = ss.setf(std::ios::scientific); + ss << std::setw(4) << iteration; + for (int compIdx = 0; compIdx < numComp; ++compIdx) { + ss << std::setw(11) << mass_balance_residual[compIdx]; + } + for (int compIdx = 0; compIdx < numComp; ++compIdx) { + ss << std::setw(11) << CNV[compIdx]; + } + ss.precision(oprec); + ss.flags(oflags); + OpmLog::debug(ss.str()); + } + + return report; + } + + + ConvergenceReport getReservoirConvergence(const double reportTime, const double dt, const int iteration, @@ -906,7 +1734,8 @@ namespace Opm { const int numComp = numEq; Vector R_sum(numComp, 0.0 ); Vector maxCoeff(numComp, std::numeric_limits< Scalar >::lowest() ); - const auto [ pvSumLocal, numAquiferPvSumLocal] = localConvergenceData(R_sum, maxCoeff, B_avg); + std::vector maxCoeffCell(numComp, -1); + const auto [ pvSumLocal, numAquiferPvSumLocal] = localConvergenceData(R_sum, maxCoeff, B_avg, maxCoeffCell); // compute global sum and max of quantities const auto [ pvSum, numAquiferPvSum ] = @@ -942,24 +1771,25 @@ namespace Opm { CR::ReservoirFailure::Type types[2] = { CR::ReservoirFailure::Type::MassBalance, CR::ReservoirFailure::Type::Cnv }; double tol[2] = { tol_mb, tol_cnv }; + int cell[2] = { -1, maxCoeffCell[compIdx] }; // No cell associated with MB failures. 
for (int ii : {0, 1}) { if (std::isnan(res[ii])) { - report.setReservoirFailed({types[ii], CR::Severity::NotANumber, compIdx}); + report.setReservoirFailed({types[ii], CR::Severity::NotANumber, compIdx});//, cell[ii], res[ii]}); if ( terminal_output_ ) { OpmLog::debug("NaN residual for " + this->compNames_.name(compIdx) + " equation."); } } else if (res[ii] > maxResidualAllowed()) { - report.setReservoirFailed({types[ii], CR::Severity::TooLarge, compIdx}); + report.setReservoirFailed({types[ii], CR::Severity::TooLarge, compIdx});//, cell[ii], res[ii]}); if ( terminal_output_ ) { OpmLog::debug("Too large residual for " + this->compNames_.name(compIdx) + " equation."); } } else if (res[ii] < 0.0) { - report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx}); + report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});//, cell[ii], res[ii]}); if ( terminal_output_ ) { OpmLog::debug("Negative residual for " + this->compNames_.name(compIdx) + " equation."); } } else if (res[ii] > tol[ii]) { - report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx}); + report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});//, cell[ii], res[ii]}); } report.setReservoirConvergenceMetric(types[ii], compIdx, res[ii]); } @@ -1001,6 +1831,22 @@ namespace Opm { return report; } + ConvergenceReport getDomainConvergence(const Domain& domain, + const SimulatorTimerInterface& timer, + const int iteration, + std::vector& residual_norms) + { + std::vector B_avg(numEq, 0.0); + auto report = getDomainReservoirConvergence(timer.simulationTimeElapsed(), + timer.currentStepLength(), + iteration, + domain, + B_avg, + residual_norms); + report += wellModel().getDomainWellConvergence(domain, B_avg); + return report; + } + /// Compute convergence based on total mass balance (tol_mb) and maximum /// residual mass balance (tol_cnv). 
/// \param[in] timer simulation timer @@ -1059,6 +1905,10 @@ namespace Opm { const SimulatorReportSingle& failureReport() const { return failureReport_; } + /// return the statistics if the nonlinearIteration() method failed + const SimulatorReportSingle& localAccumulatedReports() const + { return local_reports_accumulated_; } + const std::vector& stepReports() const { return convergence_reports_; @@ -1081,6 +1931,7 @@ namespace Opm { ModelParameters param_; SimulatorReportSingle failureReport_; + SimulatorReportSingle local_reports_accumulated_; // Well Model BlackoilWellModel& well_model_; @@ -1096,6 +1947,9 @@ namespace Opm { std::vector convergence_reports_; ComponentName compNames_{}; + std::vector domains_; + std::vector> domain_matrices_; + std::vector domain_linsolvers_; public: /// return the StandardWells object @@ -1116,6 +1970,19 @@ namespace Opm { } private: + template + struct HasZoltanPartitioning : public std::false_type {}; + + template + struct HasZoltanPartitioning< + GridType, + std::void_t().zoltanPartitionWithoutScatter + (std::declval*>(), + std::declval(), + std::declval(), + std::declval()))> + > : public std::true_type {}; + template bool isNumericalAquiferCell(const Dune::CpGrid& grid, const T& elem) { @@ -1142,6 +2009,51 @@ namespace Opm { double maxResidualAllowed() const { return param_.max_residual_allowed_; } double linear_solve_setup_time_; + std::pair, int> partitionCells() const + { + const std::string& method = this->param_.local_domain_partition_method_; + if (method == "zoltan") { + if constexpr (HasZoltanPartitioning::value) { + return this->partitionCellsZoltan(); + } else { + OPM_THROW(std::runtime_error, "Zoltan requested for local domain partitioning, " + "but is not available for the current grid type."); + } + } else if (method == "simple") { + const int num_cells = detail::countLocalInteriorCells(this->grid_); + return partitionCellsSimple(num_cells, this->param_.num_local_domains_); + } else if (method.size() > 10 && method.substr(method.size() - 10, 10) == ".partition") { + // Method string ends with ".partition", interpret as filename for partitioning. + const int num_cells = detail::countLocalInteriorCells(this->grid_); + return partitionCellsFromFile(method, num_cells); + } else { + OPM_THROW(std::runtime_error, "Unknown local domain partitioning method requested: " + method); + } + } + + std::pair, int> partitionCellsZoltan() const + { + const auto wells = this->ebosSimulator_.vanguard().schedule().getWellsatEnd(); + + auto partition_vector = this->grid_.zoltanPartitionWithoutScatter + (&wells, nullptr, this->param_.num_local_domains_, + this->param_.local_domain_partition_imbalance_); + + return this->countDomains(std::move(partition_vector)); + } + + std::pair, int> + countDomains(std::vector partition_vector) const + { + auto maxPos = std::max_element(partition_vector.begin(), + partition_vector.end()); + + const auto num_domains = (maxPos == partition_vector.end()) + ? 
0 : *maxPos + 1; + + return { std::move(partition_vector), num_domains }; + } + public: std::vector wasSwitched_; }; diff --git a/opm/simulators/flow/BlackoilModelParametersEbos.hpp b/opm/simulators/flow/BlackoilModelParametersEbos.hpp index 074724cc5..f52b62e71 100644 --- a/opm/simulators/flow/BlackoilModelParametersEbos.hpp +++ b/opm/simulators/flow/BlackoilModelParametersEbos.hpp @@ -177,8 +177,38 @@ template struct NetworkMaxIterations { using type = UndefinedProperty; }; - - +template +struct NonlinearSolver { + using type = UndefinedProperty; +}; +template +struct LocalSolveApproach { + using type = UndefinedProperty; +}; +template +struct MaxLocalSolveIterations { + using type = UndefinedProperty; +}; +template +struct LocalToleranceScalingMb { + using type = UndefinedProperty; +}; +template +struct LocalToleranceScalingCnv { + using type = UndefinedProperty; +}; +template +struct NumLocalDomains { + using type = UndefinedProperty; +}; +template +struct LocalDomainsPartitioningImbalance { + using type = UndefinedProperty; +}; +template +struct LocalDomainsPartitioningMethod { + using type = UndefinedProperty; +}; template struct DbhpMaxRel { using type = GetPropType; @@ -335,11 +365,42 @@ template struct NetworkMaxIterations { static constexpr int value = 200; }; - - - - - +template +struct NonlinearSolver { + static constexpr auto value = "newton"; +}; +template +struct LocalSolveApproach { + static constexpr auto value = "jacobi"; +}; +template +struct MaxLocalSolveIterations { + static constexpr int value = 20; +}; +template +struct LocalToleranceScalingMb { + using type = GetPropType; + static constexpr type value = 1.0; +}; +template +struct LocalToleranceScalingCnv { + using type = GetPropType; + static constexpr type value = 0.01; +}; +template +struct NumLocalDomains { + using type = int; + static constexpr auto value = 0; +}; +template +struct LocalDomainsPartitioningImbalance { + using type = GetPropType; + static constexpr auto value = type{1.03}; +}; +template +struct LocalDomainsPartitioningMethod { + static constexpr auto value = "zoltan"; +}; // if openMP is available, determine the number threads per process automatically. #if _OPENMP template @@ -461,6 +522,19 @@ namespace Opm /// Maximum number of iterations in the network solver before giving up int network_max_iterations_; + /// Nonlinear solver type: newton or nldd. + std::string nonlinear_solver_; + /// 'jacobi' and 'gauss-seidel' supported. + std::string local_solve_approach_; + + int max_local_solve_iterations_; + + double local_tolerance_scaling_mb_; + double local_tolerance_scaling_cnv_; + + int num_local_domains_{0}; + double local_domain_partition_imbalance_{1.03}; + std::string local_domain_partition_method_; /// Construct from user parameters or defaults. 
BlackoilModelParametersEbos() @@ -497,6 +571,14 @@ namespace Opm check_well_operability_iter_ = EWOMS_GET_PARAM(TypeTag, bool, EnableWellOperabilityCheckIter); max_number_of_well_switches_ = EWOMS_GET_PARAM(TypeTag, int, MaximumNumberOfWellSwitches); use_average_density_ms_wells_ = EWOMS_GET_PARAM(TypeTag, bool, UseAverageDensityMsWells); + nonlinear_solver_ = EWOMS_GET_PARAM(TypeTag, std::string, NonlinearSolver); + local_solve_approach_ = EWOMS_GET_PARAM(TypeTag, std::string, LocalSolveApproach); + max_local_solve_iterations_ = EWOMS_GET_PARAM(TypeTag, int, MaxLocalSolveIterations); + local_tolerance_scaling_mb_ = EWOMS_GET_PARAM(TypeTag, double, LocalToleranceScalingMb); + local_tolerance_scaling_cnv_ = EWOMS_GET_PARAM(TypeTag, double, LocalToleranceScalingCnv); + num_local_domains_ = EWOMS_GET_PARAM(TypeTag, int, NumLocalDomains); + local_domain_partition_imbalance_ = std::max(1.0, EWOMS_GET_PARAM(TypeTag, double, LocalDomainsPartitioningImbalance)); + local_domain_partition_method_ = EWOMS_GET_PARAM(TypeTag, std::string, LocalDomainsPartitioningMethod); deck_file_name_ = EWOMS_GET_PARAM(TypeTag, std::string, EclDeckFileName); network_max_strict_iterations_ = EWOMS_GET_PARAM(TypeTag, int, NetworkMaxStrictIterations); network_max_iterations_ = EWOMS_GET_PARAM(TypeTag, int, NetworkMaxIterations); @@ -540,6 +622,15 @@ namespace Opm EWOMS_REGISTER_PARAM(TypeTag, bool, UseAverageDensityMsWells, "Approximate segment densitities by averaging over segment and its outlet"); EWOMS_REGISTER_PARAM(TypeTag, int, NetworkMaxStrictIterations, "Maximum iterations in network solver before relaxing tolerance"); EWOMS_REGISTER_PARAM(TypeTag, int, NetworkMaxIterations, "Maximum number of iterations in the network solver before giving up"); + EWOMS_REGISTER_PARAM(TypeTag, std::string, NonlinearSolver, "Choose nonlinear solver. Valid choices are newton or nldd."); + EWOMS_REGISTER_PARAM(TypeTag, std::string, LocalSolveApproach, "Choose local solve approach. Valid choices are jacobi and gauss-seidel"); + EWOMS_REGISTER_PARAM(TypeTag, int, MaxLocalSolveIterations, "Max iterations for local solves with NLDD nonlinear solver."); + EWOMS_REGISTER_PARAM(TypeTag, Scalar, LocalToleranceScalingMb, "Set lower than 1.0 to use stricter convergence tolerance for local solves."); + EWOMS_REGISTER_PARAM(TypeTag, Scalar, LocalToleranceScalingCnv, "Set lower than 1.0 to use stricter convergence tolerance for local solves."); + EWOMS_REGISTER_PARAM(TypeTag, int, NumLocalDomains, "Number of local domains for NLDD nonlinear solver."); + EWOMS_REGISTER_PARAM(TypeTag, Scalar, LocalDomainsPartitioningImbalance, "Subdomain partitioning imbalance tolerance. 1.03 is 3 percent imbalance."); + EWOMS_REGISTER_PARAM(TypeTag, std::string, LocalDomainsPartitioningMethod, "Subdomain partitioning method. 
" + "Allowed values are 'zoltan', 'simple', and the name of a partition file ending with '.partition'."); } }; } // namespace Opm diff --git a/opm/simulators/flow/FlowMainEbos.hpp b/opm/simulators/flow/FlowMainEbos.hpp index bddc5323d..601844b7e 100644 --- a/opm/simulators/flow/FlowMainEbos.hpp +++ b/opm/simulators/flow/FlowMainEbos.hpp @@ -495,7 +495,7 @@ void handleExtraConvergenceOutput(SimulatorReport& report, = omp_get_max_threads(); #endif - printFlowTrailer(mpi_size_, threads, report); + printFlowTrailer(mpi_size_, threads, report, simulator_->model().localAccumulatedReports()); detail::handleExtraConvergenceOutput(report, EWOMS_GET_PARAM(TypeTag, std::string, OutputExtraConvergenceInfo), diff --git a/opm/simulators/linalg/setupPropertyTree.cpp b/opm/simulators/linalg/setupPropertyTree.cpp index 69013575b..3a01b9314 100644 --- a/opm/simulators/linalg/setupPropertyTree.cpp +++ b/opm/simulators/linalg/setupPropertyTree.cpp @@ -96,6 +96,10 @@ setupPropertyTree(FlowLinearSolverParameters p, // Note: copying the parameters return setupILU(conf, p); } + if (conf == "umfpack") { + return setupUMFPack(conf, p); + } + // At this point, the only separate ISAI implementation is with the OpenCL code, and // it will check this argument to see if it should be using ISAI. The parameter tree // will be ignored, so this is just a dummy configuration to avoid the throw below. @@ -263,4 +267,15 @@ setupILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParamet } +PropertyTree +setupUMFPack([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) +{ + using namespace std::string_literals; + PropertyTree prm; + prm.put("verbosity", p.linear_solver_verbosity_); + prm.put("solver", "umfpack"s); + return prm; +} + + } // namespace Opm diff --git a/opm/simulators/linalg/setupPropertyTree.hpp b/opm/simulators/linalg/setupPropertyTree.hpp index abc4e2553..85b76c2c1 100644 --- a/opm/simulators/linalg/setupPropertyTree.hpp +++ b/opm/simulators/linalg/setupPropertyTree.hpp @@ -37,6 +37,7 @@ PropertyTree setupCPRW(const std::string& conf, const FlowLinearSolverParameters PropertyTree setupCPR(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupAMG(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupILU(const std::string& conf, const FlowLinearSolverParameters& p); +PropertyTree setupUMFPack(const std::string& conf, const FlowLinearSolverParameters& p); } // namespace Opm diff --git a/opm/simulators/timestepping/SimulatorReport.cpp b/opm/simulators/timestepping/SimulatorReport.cpp index 2a8d5f516..8ec00bbd0 100644 --- a/opm/simulators/timestepping/SimulatorReport.cpp +++ b/opm/simulators/timestepping/SimulatorReport.cpp @@ -240,6 +240,7 @@ namespace Opm void SimulatorReport::reportFullyImplicit(std::ostream& os) const { + os << fmt::format("Number of timesteps: {:9}\n", stepreports.size()); success.reportFullyImplicit(os, &failure); }