Improve parallel logging in NLDD solver.

This commit is contained in:
Atgeirr Flø Rasmussen 2024-01-12 15:56:00 +01:00
parent b9b42a01cc
commit 6eb670b5c7
3 changed files with 65 additions and 67 deletions

View File

@ -52,6 +52,7 @@
#include <fmt/format.h>
#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
@ -96,7 +97,7 @@ public:
//! \param param param Model parameters
//! \param compNames Names of the solution components
BlackoilModelEbosNldd(BlackoilModelEbos<TypeTag>& model)
: model_(model)
: model_(model), rank_(model_.ebosSimulator().vanguard().grid().comm().rank())
{
// Create partitions.
const auto& [partition_vector, num_domains] = this->partitionCells();
@ -208,6 +209,7 @@ public:
const auto domain_order = this->getSubdomainOrder();
// ----------- Solve each domain separately -----------
DeferredLogger logger;
std::vector<SimulatorReportSingle> domain_reports(domains_.size());
for (const int domain_index : domain_order) {
const auto& domain = domains_[domain_index];
@ -215,12 +217,12 @@ public:
try {
switch (model_.param().local_solve_approach_) {
case DomainSolveApproach::Jacobi:
solveDomainJacobi(solution, locally_solved, local_report,
solveDomainJacobi(solution, locally_solved, local_report, logger,
iteration, timer, domain);
break;
default:
case DomainSolveApproach::GaussSeidel:
solveDomainGaussSeidel(solution, locally_solved, local_report,
solveDomainGaussSeidel(solution, locally_solved, local_report, logger,
iteration, timer, domain);
break;
}
@ -234,25 +236,37 @@ public:
// i == j, at old solution for i != j.
if (!local_report.converged) {
// TODO: more proper treatment, including in parallel.
OpmLog::debug("Convergence failure in domain " + std::to_string(domain.index));
logger.debug(fmt::format("Convergence failure in domain {} on rank {}." , domain.index, rank_));
}
domain_reports[domain.index] = local_report;
}
// Communicate and log all messages.
auto global_logger = gatherDeferredLogger(logger, model_.ebosSimulator().vanguard().grid().comm());
global_logger.logMessages();
// Accumulate local solve data.
int num_converged = 0;
int num_domains = domain_reports.size();
// Putting the counts in a single array to avoid multiple
// comm.sum() calls. Keeping the named vars for readability.
std::array<int, 4> counts{ 0, 0, 0, static_cast<int>(domain_reports.size()) };
int& num_converged = counts[0];
int& num_converged_already = counts[1];
int& num_local_newtons = counts[2];
int& num_domains = counts[3];
{
SimulatorReportSingle rep;
for (const auto& dr : domain_reports) {
if (dr.converged) {
++num_converged;
if (dr.total_newton_iterations == 0) {
++num_converged_already;
}
}
rep += dr;
}
num_local_newtons = rep.total_newton_iterations;
local_reports_accumulated_ += rep;
}
bool is_iorank = true;
if (model_.param().local_solve_approach_ == DomainSolveApproach::Jacobi) {
solution = locally_solved;
@ -289,15 +303,14 @@ public:
model_.ebosSimulator().model().invalidateAndUpdateIntensiveQuantitiesOverlap(/*timeIdx=*/0);
// Make total counts of domains converged.
num_converged = comm.sum(num_converged);
num_domains = comm.sum(num_domains);
is_iorank = comm.rank() == 0;
comm.sum(counts.data(), counts.size());
}
#endif // HAVE_MPI
const bool is_iorank = this->rank_ == 0;
if (is_iorank) {
OpmLog::debug(fmt::format("Local solves finished. Converged for {}/{} domains.\n",
num_converged, num_domains));
OpmLog::debug(fmt::format("Local solves finished. Converged for {}/{} domains. {} domains did no work. {} total local Newton iterations.\n",
num_converged, num_domains, num_converged_already, num_local_newtons));
}
// Finish with a Newton step.
@ -344,6 +357,7 @@ private:
std::pair<SimulatorReportSingle, ConvergenceReport>
solveDomain(const Domain& domain,
const SimulatorTimerInterface& timer,
DeferredLogger& logger,
[[maybe_unused]] const int global_iteration,
const bool initial_assembly_required)
{
@ -376,7 +390,7 @@ private:
detailTimer.reset();
detailTimer.start();
std::vector<double> resnorms;
auto convreport = this->getDomainConvergence(domain, timer, 0, resnorms);
auto convreport = this->getDomainConvergence(domain, timer, 0, logger, resnorms);
if (convreport.converged()) {
// TODO: set more info, timing etc.
report.converged = true;
@ -433,7 +447,7 @@ private:
// Check for local convergence.
detailTimer.reset();
detailTimer.start();
convreport = this->getDomainConvergence(domain, timer, iter, resnorms);
convreport = this->getDomainConvergence(domain, timer, iter, logger, resnorms);
// apply the Schur complement of the well model to the
// reservoir linearized equations
@ -580,6 +594,7 @@ private:
const double dt,
const int iteration,
const Domain& domain,
DeferredLogger& logger,
std::vector<Scalar>& B_avg,
std::vector<Scalar>& residual_norms)
{
@ -629,32 +644,26 @@ private:
for (int ii : {0, 1}) {
if (std::isnan(res[ii])) {
report.setReservoirFailed({types[ii], CR::Severity::NotANumber, compIdx});
if (model_.terminalOutputEnabled()) {
OpmLog::debug("NaN residual for " + model_.compNames().name(compIdx) + " equation.");
}
logger.debug("NaN residual for " + model_.compNames().name(compIdx) + " equation.");
} else if (res[ii] > model_.param().max_residual_allowed_) {
report.setReservoirFailed({types[ii], CR::Severity::TooLarge, compIdx});
if (model_.terminalOutputEnabled()) {
OpmLog::debug("Too large residual for " + model_.compNames().name(compIdx) + " equation.");
}
logger.debug("Too large residual for " + model_.compNames().name(compIdx) + " equation.");
} else if (res[ii] < 0.0) {
report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});
if (model_.terminalOutputEnabled()) {
OpmLog::debug("Negative residual for " + model_.compNames().name(compIdx) + " equation.");
}
logger.debug("Negative residual for " + model_.compNames().name(compIdx) + " equation.");
} else if (res[ii] > tol[ii]) {
report.setReservoirFailed({types[ii], CR::Severity::Normal, compIdx});
}
}
}
// Output of residuals.
if (model_.terminalOutputEnabled())
{
// Only rank 0 does print to std::cout
// Output of residuals. If converged at initial state, log nothing.
const bool converged_at_initial_state = (report.converged() && iteration == 0);
if (!converged_at_initial_state) {
if (iteration == 0) {
std::string msg = fmt::format("Domain {}, size {}, containing cell {}\n| Iter",
domain.index, domain.cells.size(), domain.cells[0]);
// Log header.
std::string msg = fmt::format("Domain {} on rank {}, size {}, containing cell {}\n| Iter",
domain.index, this->rank_, domain.cells.size(), domain.cells[0]);
for (int compIdx = 0; compIdx < numComp; ++compIdx) {
msg += " MB(";
msg += model_.compNames().name(compIdx)[0];
@ -665,8 +674,9 @@ private:
msg += model_.compNames().name(compIdx)[0];
msg += ") ";
}
OpmLog::debug(msg);
logger.debug(msg);
}
// Log convergence data.
std::ostringstream ss;
ss << "| ";
const std::streamsize oprec = ss.precision(3);
@ -680,7 +690,7 @@ private:
}
ss.precision(oprec);
ss.flags(oflags);
OpmLog::debug(ss.str());
logger.debug(ss.str());
}
return report;
@ -689,6 +699,7 @@ private:
ConvergenceReport getDomainConvergence(const Domain& domain,
const SimulatorTimerInterface& timer,
const int iteration,
DeferredLogger& logger,
std::vector<double>& residual_norms)
{
std::vector<Scalar> B_avg(numEq, 0.0);
@ -696,9 +707,10 @@ private:
timer.currentStepLength(),
iteration,
domain,
logger,
B_avg,
residual_norms);
report += model_.wellModel().getDomainWellConvergence(domain, B_avg);
report += model_.wellModel().getDomainWellConvergence(domain, B_avg, logger);
return report;
}
@ -772,13 +784,14 @@ private:
void solveDomainJacobi(GlobalEqVector& solution,
GlobalEqVector& locally_solved,
SimulatorReportSingle& local_report,
DeferredLogger& logger,
const int iteration,
const SimulatorTimerInterface& timer,
const Domain& domain)
{
auto initial_local_well_primary_vars = model_.wellModel().getPrimaryVarsDomain(domain);
auto initial_local_solution = Details::extractVector(solution, domain.cells);
auto res = solveDomain(domain, timer, iteration, false);
auto res = solveDomain(domain, timer, logger, iteration, false);
local_report = res.first;
if (local_report.converged) {
auto local_solution = Details::extractVector(solution, domain.cells);
@ -796,13 +809,14 @@ private:
void solveDomainGaussSeidel(GlobalEqVector& solution,
GlobalEqVector& locally_solved,
SimulatorReportSingle& local_report,
DeferredLogger& logger,
const int iteration,
const SimulatorTimerInterface& timer,
const Domain& domain)
{
auto initial_local_well_primary_vars = model_.wellModel().getPrimaryVarsDomain(domain);
auto initial_local_solution = Details::extractVector(solution, domain.cells);
auto res = solveDomain(domain, timer, iteration, true);
auto res = solveDomain(domain, timer, logger, iteration, true);
local_report = res.first;
if (!local_report.converged) {
// We look at the detailed convergence report to evaluate
@ -825,7 +839,7 @@ private:
const double acceptable_local_cnv_sum = 1.0;
if (mb_sum < acceptable_local_mb_sum && cnv_sum < acceptable_local_cnv_sum) {
local_report.converged = true;
OpmLog::debug("Accepting solution in unconverged domain " + std::to_string(domain.index));
logger.debug(fmt::format("Accepting solution in unconverged domain {} on rank {}.", domain.index, rank_));
}
}
}
@ -942,6 +956,7 @@ private:
std::vector<std::unique_ptr<Mat>> domain_matrices_; //!< Vector of matrix operator for each subdomain
std::vector<ISTLSolverType> domain_linsolvers_; //!< Vector of linear solvers for each domain
SimulatorReportSingle local_reports_accumulated_; //!< Accumulated convergence report for subdomain solvers
int rank_ = 0; //!< MPI rank of this process
};
} // namespace Opm

View File

@ -291,7 +291,8 @@ namespace Opm {
// Check if well equations are converged locally.
ConvergenceReport getDomainWellConvergence(const Domain& domain,
const std::vector<Scalar>& B_avg) const;
const std::vector<Scalar>& B_avg,
DeferredLogger& local_deferredLogger) const;
const SimulatorReportSingle& lastReport() const;

View File

@ -1764,56 +1764,38 @@ namespace Opm {
ConvergenceReport
BlackoilWellModel<TypeTag>::
getDomainWellConvergence(const Domain& domain,
const std::vector<Scalar>& B_avg) const
const std::vector<Scalar>& B_avg,
DeferredLogger& local_deferredLogger) const
{
const auto& summary_state = ebosSimulator_.vanguard().summaryState();
const int iterationIdx = ebosSimulator_.model().newtonMethod().numIterations();
const bool relax_tolerance = iterationIdx > param_.strict_outer_iter_wells_;
Opm::DeferredLogger local_deferredLogger;
ConvergenceReport local_report;
ConvergenceReport report;
for (const auto& well : well_container_) {
if ((well_domain_.at(well->name()) == domain.index)) {
if (well->isOperableAndSolvable() || well->wellIsStopped()) {
local_report += well->getWellConvergence(summary_state,
this->wellState(),
B_avg,
local_deferredLogger,
relax_tolerance);
report += well->getWellConvergence(summary_state,
this->wellState(),
B_avg,
local_deferredLogger,
relax_tolerance);
} else {
ConvergenceReport report;
ConvergenceReport xreport;
using CR = ConvergenceReport;
report.setWellFailed({CR::WellFailure::Type::Unsolvable, CR::Severity::Normal, -1, well->name()});
local_report += report;
xreport.setWellFailed({CR::WellFailure::Type::Unsolvable, CR::Severity::Normal, -1, well->name()});
report += xreport;
}
}
}
// This function will be called once for each domain on a process,
// and the number of domains on a process will vary, so there is
// no way to communicate here. There is also no need, as a domain
// is local to a single process in our current approach.
// Therefore there is no call to gatherDeferredLogger() or to
// gatherConvergenceReport() below. However, as of now, log messages
// on non-output ranks will be lost here.
// TODO: create the DeferredLogger outside this scope to enable it
// to be gathered and printed on the output rank.
Opm::DeferredLogger global_deferredLogger = local_deferredLogger;
ConvergenceReport report = local_report;
if (terminal_output_) {
global_deferredLogger.logMessages();
}
// Log debug messages for NaN or too large residuals.
// TODO: This will as of now only be logged on the output rank.
// In the similar code in getWellConvergence(), all ranks will be
// at the same spot, that does not hold for this per-domain function.
if (terminal_output_) {
for (const auto& f : report.wellFailures()) {
if (f.severity() == ConvergenceReport::Severity::NotANumber) {
OpmLog::debug("NaN residual found with phase " + std::to_string(f.phase()) + " for well " + f.wellName());
local_deferredLogger.debug("NaN residual found with phase " + std::to_string(f.phase()) + " for well " + f.wellName());
} else if (f.severity() == ConvergenceReport::Severity::TooLarge) {
OpmLog::debug("Too large residual found with phase " + std::to_string(f.phase()) + " for well " + f.wellName());
local_deferredLogger.debug("Too large residual found with phase " + std::to_string(f.phase()) + " for well " + f.wellName());
}
}
}