Do a graceful exit instead of MPI_Abort for expected exceptions.

Instead of unconditionally issuing MPI_Abort when we encounter a fatal
exception, we first test whether all processes have experienced this
exception. If that is the case, we terminate normally with an exit
code that signals an error. We still use MPI_Abort if not all
processes raise an exception, as this is the only way to make sure
that the program really aborts.
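
As a minimal, self-contained sketch of that handshake (the helper name
and the payload value are illustrative, not part of the patch): every
rank that hits the expected exception reports to rank 0 with a
non-blocking send; rank 0 waits a bounded amount of time for all
reports and only falls back to MPI_Abort if someone is missing.

```cpp
#include <mpi.h>
#include <unistd.h>

#include <cstdlib>
#include <vector>

// Illustrative helper, called on every rank after an expected fatal
// exception has been caught (error handling of the MPI calls omitted).
void exitGracefullyIfAllFailed(MPI_Comm comm)
{
    int rank, size;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    const int tag = 357912;

    if (rank == 0 && size > 1) {
        // Post one non-blocking receive per other rank.
        std::vector<MPI_Request> requests(size - 1, MPI_REQUEST_NULL);
        std::vector<int> data(size - 1);
        for (int i = 1; i < size; ++i)
            MPI_Irecv(&data[i - 1], 1, MPI_INT, i, tag, comm, &requests[i - 1]);

        int allArrived = 0;
        for (int tries = 0; !allArrived && tries < 3; ++tries) {
            sleep(3); // give the other ranks a bounded amount of time
            MPI_Testall(size - 1, requests.data(), &allArrived, MPI_STATUSES_IGNORE);
        }
        if (!allArrived)
            MPI_Abort(comm, EXIT_FAILURE); // someone is stuck: abort is the only safe option
    } else if (size > 1) {
        // Report in to rank 0 without blocking.
        int data = 1;
        MPI_Request request = MPI_REQUEST_NULL;
        MPI_Isend(&data, 1, MPI_INT, 0, tag, comm, &request);
        int done = 0;
        for (int tries = 0; !done && tries < 3; ++tries) {
            sleep(3);
            MPI_Test(&request, &done, MPI_STATUS_IGNORE);
        }
        if (!done)
            MPI_Abort(comm, EXIT_FAILURE); // rank 0 never picked our message up
    }
    // Every rank reported the failure: a plain non-zero exit is enough
    // and avoids the noisy MPI_Abort help messages.
    MPI_Finalize();
    std::exit(EXIT_FAILURE);
}
```

The real implementation in checkAllMPIProcesses() below additionally
checks the return code of every MPI call and returns to its caller
instead of exiting, so the normal cleanup path still runs.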

This approach also works around issues in some MPI implementations
that might not correctly return the error.

With this change, messages like the following are gone:
```
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 1 in communicator MPI_COMM_WORLD
with errorcode 1.

NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
--------------------------------------------------------------------------
[smaug.dr-blatt.de:129359] 1 more process has sent help message help-mpi-api.txt / mpi-abort
[smaug.dr-blatt.de:129359] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
```

But we still see something like this:
```
--------------------------------------------------------------------------
Primary job  terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:

  Process name: [[35057,1],0]
  Exit code:    1
--------------------------------------------------------------------------
```
Author: Markus Blatt, 2023-07-19 12:15:35 +02:00
Parent: ac6b9b2f34
Commit: 7551229e77


```diff
@@ -320,6 +320,85 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
         return simtimer_.get();
     }
 
+    static void checkAllMPIProcesses()
+    {
+#if HAVE_MPI
+        const auto& comm = EclGenericVanguard::comm();
+        if (comm.size() > 1)
+        {
+            // We try to prevent the abort here: we need a signal that each
+            // process has reached this point, so every rank sends a message
+            // to rank 0.
+            const int tag = 357912;
+            if (comm.rank() == 0)
+            {
+                // Wait for a message from every other process.
+                std::vector<MPI_Request> requests(comm.size() - 1, MPI_REQUEST_NULL);
+                std::vector<int> data(comm.size() - 1);
+                for (decltype(comm.size()) i = 1; i < comm.size(); ++i)
+                {
+                    if (auto error = MPI_Irecv(data.data() + i - 1, 1, MPI_INT, i, tag, comm, requests.data() + i - 1);
+                        error != MPI_SUCCESS) {
+                        OpmLog::error(fmt::format("Error: Could not set up MPI receive (error code: {})", error));
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                    }
+                }
+                std::size_t msgs = comm.size() - 1;
+                for (std::size_t tries = 0; msgs > 0 && tries < 3; ++tries)
+                {
+                    sleep(3);
+                    int flag, idx;
+                    // Each completed request is one process that has reported in.
+                    for (auto left_msgs = msgs; left_msgs > 0; --left_msgs)
+                    {
+                        if (auto error = MPI_Testany(comm.size() - 1, requests.data(), &idx, &flag, MPI_STATUS_IGNORE);
+                            error != MPI_SUCCESS) {
+                            OpmLog::error(fmt::format("Error: Could not test for MPI message (error code: {})", error));
+                            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                        }
+                        if (flag)
+                        {
+                            --msgs;
+                        }
+                    }
+                }
+                if (msgs) {
+                    // Seems like some processes are stuck. Abort just to be safe.
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+            }
+            else
+            {
+                int data = 3;
+                MPI_Request request = MPI_REQUEST_NULL;
+                if (auto error = MPI_Isend(&data, 1, MPI_INT, 0, tag, comm, &request);
+                    error != MPI_SUCCESS) {
+                    OpmLog::error(fmt::format("Error: Could not send MPI message (error code: {})", error));
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+                bool completed = false;
+                for (std::size_t tries = 0; !completed && tries < 3; ++tries)
+                {
+                    sleep(3);
+                    int flag;
+                    if (auto error = MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
+                        error != MPI_SUCCESS) {
+                        OpmLog::error(fmt::format("Error: Could not test for MPI message (error code: {})", error));
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                    }
+                    if (flag)
+                    {
+                        completed = true;
+                    }
+                }
+                if (!completed) {
+                    // Rank 0 never picked up our message; abort to be safe.
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+            }
+        }
+#endif
+    }
+
 private:
     // called by execute() or executeInitStep()
     int execute_(int (FlowMainEbos::* runOrInitFunc)(), bool cleanup)
```
```diff
@@ -338,10 +417,7 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
                     std::cout << message.str() << "\n";
                 }
             }
-#if HAVE_MPI
-            if (this->mpi_size_ > 1)
-                MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-#endif
+            this->checkAllMPIProcesses();
             return EXIT_FAILURE;
         };
```
```diff
@@ -364,10 +440,14 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
             return exitCode;
         }
         catch (const LinearTimeSteppingBreakdown& e) {
-            return logger(e, "Simulation aborted: ");
+            auto exitCode = logger(e, "Simulation aborted: ");
+            executeCleanup_();
+            return exitCode;
         }
         catch (const std::exception& e) {
-            return logger(e, "Simulation aborted as program threw an unexpected exception: ");
+            auto exitCode = logger(e, "Simulation aborted as program threw an unexpected exception: ");
+            executeCleanup_();
+            return exitCode;
         }
     }
```