Mirror of https://github.com/OPM/opm-simulators.git, synced 2025-02-25 18:55:30 -06:00
Do a graceful exit instead of MPI_Abort for expected exceptions.
Instead of unconditionally issuing MPI_Abort when we encounter a fatal exception, we first test whether all processes have experienced this exception; if that is the case, we terminate normally with an exit code that signals an error. We still use MPI_Abort if not all processes get an exception, as this is the only way to make sure that the program aborts. This approach also works around issues in some MPI implementations that might not correctly return the error.

Multiple messages like this are gone now:
```
--------------------------------------------------------------------------
MPI_ABORT was invoked on rank 1 in communicator MPI_COMM_WORLD
with errorcode 1.

NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes.
You may or may not see output from other processes, depending on
exactly when Open MPI kills them.
--------------------------------------------------------------------------
[smaug.dr-blatt.de:129359] 1 more process has sent help message help-mpi-api.txt / mpi-abort
[smaug.dr-blatt.de:129359] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
```

But we still see something like this:
```
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status,
thus causing the job to be terminated. The first process to do so was:

  Process name: [[35057,1],0]
  Exit code:    1
--------------------------------------------------------------------------
```
This commit is contained in: parent ac6b9b2f34, commit 7551229e77
```diff
@@ -320,6 +320,85 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
         return simtimer_.get();
     }
 
+    static void checkAllMPIProcesses()
+    {
+#if HAVE_MPI
+        const auto& comm = EclGenericVanguard::comm();
+        if (comm.size() > 1)
+        {
+            // we try to prevent the abort here.
+            // For that we need a signal that each process is here.
+            // Each process sends a message to rank 0.
+            const int tag = 357912;
+            if (comm.rank() == 0)
+            {
+                // wait for a message from all processes.
+                std::vector<MPI_Request> requests(comm.size() - 1, MPI_REQUEST_NULL);
+                std::vector<int> data(comm.size()-1);
+
+                for(decltype(comm.size()) i = 1; i < comm.size(); ++i)
+                {
+                    if (auto error = MPI_Irecv(data.data() + i, 1, MPI_INT, i, tag, comm, requests.data() + i - 1);
+                        error != MPI_SUCCESS) {
+                        OpmLog::error(fmt::format("Error: Could not set up MPI receive (error code : {})", error));
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                    }
+                }
+                std::size_t msgs = comm.size() - 1;
+                for(std::size_t tries = 0; msgs >0 && tries < 3; ++tries)
+                {
+                    sleep(3);
+                    int flag, idx;
+                    for(auto left_msgs = msgs; left_msgs > 0; --left_msgs)
+                    {
+                        if( auto error = MPI_Testany(comm.size()-1, requests.data(), &idx, &flag, MPI_STATUS_IGNORE);
+                            error != MPI_SUCCESS) {
+                            OpmLog::error(fmt::format("Error: Could not test for MPI message (error code : {})", error));
+                            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                        }
+                        if (flag)
+                        {
+                            --msgs;
+                        }
+                    }
+                }
+                if (msgs) {
+                    // seems like some processes are stuck. Abort just to be save
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+            }
+            else
+            {
+                int data= 3;
+                MPI_Request request = MPI_REQUEST_NULL;
+                if (auto error = MPI_Isend(&data, 1, MPI_INT, 0, tag, comm, &request);
+                    error != MPI_SUCCESS) {
+                    OpmLog::error(fmt::format("Error: Could send MPI message (error code : {})", error));
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+                bool completed = false;
+                for(std::size_t tries = 0; !completed && tries < 3; tries++)
+                {
+                    sleep(3);
+                    int flag;
+                    if( auto error = MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
+                        error != MPI_SUCCESS) {
+                        OpmLog::error(fmt::format("Error: Could not test for MPI message (error code : {})", error));
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                    }
+                    if (flag)
+                    {
+                        completed = true;
+                    }
+                }
+                if (!completed) {
+                    MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+                }
+            }
+        }
+#endif
+    }
+
 private:
     // called by execute() or executeInitStep()
     int execute_(int (FlowMainEbos::* runOrInitFunc)(), bool cleanup)
@@ -338,10 +417,7 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
                     std::cout << message.str() << "\n";
                 }
             }
-#if HAVE_MPI
-            if (this->mpi_size_ > 1)
-                MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
-#endif
+            this->checkAllMPIProcesses();
             return EXIT_FAILURE;
         };
 
@@ -364,10 +440,14 @@ void handleExtraConvergenceOutput(SimulatorReport& report,
             return exitCode;
         }
         catch (const LinearTimeSteppingBreakdown& e) {
-            return logger(e, "Simulation aborted: ");
+            auto exitCode = logger(e, "Simulation aborted: ");
+            executeCleanup_();
+            return exitCode;
         }
         catch (const std::exception& e) {
-            return logger(e, "Simulation aborted as program threw an unexpected exception: ");
+            auto exitCode = logger(e, "Simulation aborted as program threw an unexpected exception: ");
+            executeCleanup_();
+            return exitCode;
         }
     }
```
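For reference, below is a minimal, self-contained sketch of the handshake that the commit message describes and that the diff above implements inside FlowMainEbos. It is not the opm-simulators code: the function name `checkAllRanksFailed` and the use of `std::this_thread::sleep_for` are illustrative choices, and the per-call MPI error-code checks and OpmLog reporting of the real implementation are omitted for brevity.

```cpp
// Sketch only: every rank that has caught a fatal exception calls
// checkAllRanksFailed(). If all ranks check in with rank 0 within the
// timeout, the function returns and the caller can exit normally with a
// failure code; otherwise it falls back to MPI_Abort.
#include <mpi.h>

#include <chrono>
#include <cstdlib>
#include <thread>
#include <vector>

void checkAllRanksFailed(MPI_Comm comm)
{
    int rank = 0, size = 0;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    if (size < 2)
        return;

    const int tag = 357912;   // arbitrary tag, same value as in the commit
    const int maxTries = 3;   // roughly 9 seconds of waiting in total

    if (rank == 0) {
        // Rank 0 posts one non-blocking receive per other rank and polls them.
        std::vector<MPI_Request> requests(size - 1, MPI_REQUEST_NULL);
        std::vector<int> data(size - 1, 0);
        for (int i = 1; i < size; ++i)
            MPI_Irecv(&data[i - 1], 1, MPI_INT, i, tag, comm, &requests[i - 1]);

        int outstanding = size - 1;
        for (int tries = 0; outstanding > 0 && tries < maxTries; ++tries) {
            std::this_thread::sleep_for(std::chrono::seconds(3));
            int flag = 0, idx = MPI_UNDEFINED;
            // Drain every check-in message that has arrived so far.
            do {
                MPI_Testany(size - 1, requests.data(), &idx, &flag, MPI_STATUS_IGNORE);
                if (flag && idx != MPI_UNDEFINED)
                    --outstanding;
            } while (flag && outstanding > 0);
        }
        if (outstanding > 0) {
            // Some ranks never reported an exception; they may be stuck,
            // so a hard abort is the only safe way to end the job.
            MPI_Abort(comm, EXIT_FAILURE);
        }
    } else {
        // Every other rank sends a single check-in message to rank 0.
        int payload = 1;
        MPI_Request request = MPI_REQUEST_NULL;
        MPI_Isend(&payload, 1, MPI_INT, 0, tag, comm, &request);

        bool completed = false;
        for (int tries = 0; !completed && tries < maxTries; ++tries) {
            std::this_thread::sleep_for(std::chrono::seconds(3));
            int flag = 0;
            MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
            completed = (flag != 0);
        }
        if (!completed)
            MPI_Abort(comm, EXIT_FAILURE);
    }
    // Reaching this point means every rank saw the failure; the caller can
    // now terminate gracefully instead of aborting.
}

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    // Pretend every rank has just caught a fatal exception ...
    checkAllRanksFailed(MPI_COMM_WORLD);
    // ... and exit with an error code instead of calling MPI_Abort.
    MPI_Finalize();
    return EXIT_FAILURE;
}
```

As in the commit, the fallback is still MPI_Abort: if some rank never checks in, it may be stuck, and aborting is the only way to make sure the whole job terminates.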