Do not check return values for MPI calls

The custom error handler installed on each slave-master communicator handles
errors from MPI_Recv() and MPI_Send() and eventually calls MPI_Abort(), so
there is no need to check the return values of these MPI calls.
Håkon Hægland 2024-12-03 14:05:31 +01:00
parent afab98a5a4
commit 8da3c203f3
3 changed files with 39 additions and 26 deletions
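
For context, the diffs below rely on ReservoirCoupling::setErrhandler(), whose implementation is not part of this commit. The following is a minimal sketch, assuming the handler is installed through the standard MPI_Comm_create_errhandler()/MPI_Comm_set_errhandler() mechanism; the names abortingErrorHandler and installAbortingErrhandler are illustrative only and do not appear in the OPM sources.

#include <mpi.h>
#include <cstdio>

namespace {

// Matches the MPI_Comm_errhandler_function signature expected by
// MPI_Comm_create_errhandler(). Invoked by the MPI library when an
// operation on the communicator fails.
void abortingErrorHandler(MPI_Comm* comm, int* errorCode, ...)
{
    char message[MPI_MAX_ERROR_STRING];
    int length = 0;
    MPI_Error_string(*errorCode, message, &length);
    std::fprintf(stderr, "MPI error on slave-master communicator: %s\n", message);
    // Terminate all processes connected to this communicator.
    MPI_Abort(*comm, *errorCode);
}

} // anonymous namespace

// Illustrative stand-in for ReservoirCoupling::setErrhandler(): after this
// call, a failing MPI_Recv()/MPI_Send() on comm never returns to the caller.
void installAbortingErrhandler(MPI_Comm comm)
{
    MPI_Errhandler errhandler;
    MPI_Comm_create_errhandler(abortingErrorHandler, &errhandler);
    MPI_Comm_set_errhandler(comm, errhandler);
    // The communicator keeps its own reference, so the local handle can be freed.
    MPI_Errhandler_free(&errhandler);
}

With such a handler in place, the removed if (result != MPI_SUCCESS) checks in the diffs below could never be reached, which is why they are dropped rather than replaced.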

View File

@@ -150,7 +150,10 @@ receiveNextReportDateFromSlaves()
     if (this->comm_.rank() == 0) {
         for (unsigned int i = 0; i < num_slaves; i++) {
             double slave_next_report_time_offset; // Elapsed time from the beginning of the simulation
-            int result = MPI_Recv(
+            // NOTE: All slave-master communicators have set a custom error handler, which eventually
+            // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+            // or MPI_Send() calls.
+            MPI_Recv(
                 &slave_next_report_time_offset,
                 /*count=*/1,
                 /*datatype=*/MPI_DOUBLE,
@@ -159,9 +162,6 @@ receiveNextReportDateFromSlaves()
                 this->getSlaveComm(i),
                 MPI_STATUS_IGNORE
             );
-            if (result != MPI_SUCCESS) {
-                OPM_THROW(std::runtime_error, "Failed to receive next report date from slave process");
-            }
             this->slave_next_report_time_offsets_[i] = slave_next_report_time_offset;
             OpmLog::info(
                 fmt::format(

View File

@@ -48,6 +48,9 @@ ReservoirCouplingSlave(
     if (this->slave_master_comm_ == MPI_COMM_NULL) {
         OPM_THROW(std::runtime_error, "Slave process is not spawned by a master process");
     }
+    // NOTE: By installing a custom error handler for all slave-master communicators, which
+    // eventually will call MPI_Abort(), there is no need to check the return value of any
+    // MPI_Recv() or MPI_Send() calls as errors will be caught by the error handler.
     ReservoirCoupling::setErrhandler(this->slave_master_comm_, /*is_master=*/false);
 }
@@ -56,7 +59,10 @@ ReservoirCouplingSlave::
 receiveNextTimeStepFromMaster() {
     double timestep;
     if (this->comm_.rank() == 0) {
-        int result = MPI_Recv(
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
+        MPI_Recv(
             &timestep,
             /*count=*/1,
             /*datatype=*/MPI_DOUBLE,
@@ -65,9 +71,6 @@ receiveNextTimeStepFromMaster() {
             this->slave_master_comm_,
             MPI_STATUS_IGNORE
         );
-        if (result != MPI_SUCCESS) {
-            OPM_THROW(std::runtime_error, "Failed to receive next time step from master");
-        }
         OpmLog::info(
             fmt::format("Slave rank 0 received next timestep {} from master.", timestep)
         );
@@ -84,7 +87,10 @@ receiveMasterGroupNamesFromMasterProcess() {
     std::vector<char> group_names;
     if (this->comm_.rank() == 0) {
         MPI_Aint asize = 0;
-        int result = MPI_Recv(
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
+        MPI_Recv(
             &asize,
             /*count=*/1,
             /*datatype=*/MPI_AINT,
@@ -94,15 +100,11 @@ receiveMasterGroupNamesFromMasterProcess() {
             MPI_STATUS_IGNORE
         );
         OpmLog::info("Received master group names size from master process rank 0");
-        if (result != MPI_SUCCESS) {
-            OPM_THROW(std::runtime_error,
-                      "Failed to receive master group names (size) from master process");
-        }
         // NOTE: MPI_Aint and std::size_t should be compatible on most systems, but we will
         // cast it to std::size_t to avoid any potential issues
         size = static_cast<std::size_t>(asize);
         group_names.resize(size);
-        int result2 = MPI_Recv(
+        MPI_Recv(
             group_names.data(),
             /*count=*/size,
             /*datatype=*/MPI_CHAR,
@@ -111,10 +113,6 @@ receiveMasterGroupNamesFromMasterProcess() {
             this->slave_master_comm_,
             MPI_STATUS_IGNORE
         );
-        if (result2 != MPI_SUCCESS) {
-            OPM_THROW(std::runtime_error,
-                      "Failed to receive master group names from master process");
-        }
         OpmLog::info("Received master group names from master process rank 0");
     }
     this->comm_.broadcast(&size, /*count=*/1, /*emitter_rank=*/0);
@@ -136,6 +134,9 @@ sendNextReportDateToMasterProcess() const
         // NOTE: This is an offset in seconds from the start date, so it will be 0 if the next report
         // would be the start date. In general, it should be a positive number.
         double next_report_time_offset = elapsed_time + current_step_length;
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
         MPI_Send(
             &next_report_time_offset,
             /*count=*/1,
@@ -155,6 +156,9 @@ sendActivationDateToMasterProcess() const
     if (this->comm_.rank() == 0) {
         // NOTE: The master process needs the s
         double activation_date = this->getGrupSlavActivationDate_();
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
         MPI_Send(
             &activation_date,
             /*count=*/1,
@@ -174,6 +178,9 @@ sendSimulationStartDateToMasterProcess() const
     if (this->comm_.rank() == 0) {
         // NOTE: The master process needs the s
         double start_date = this->schedule_.getStartTime();
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
         MPI_Send(
             &start_date,
             /*count=*/1,

View File

@@ -154,7 +154,10 @@ receiveActivationDateFromSlaves_()
     if (this->comm_.rank() == 0) {
         for (unsigned int i = 0; i < num_slaves; i++) {
             double slave_activation_date;
-            int result = MPI_Recv(
+            // NOTE: All slave-master communicators have set a custom error handler, which eventually
+            // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+            // or MPI_Send() calls.
+            MPI_Recv(
                 &slave_activation_date,
                 /*count=*/1,
                 /*datatype=*/MPI_DOUBLE,
@@ -163,9 +166,6 @@ receiveActivationDateFromSlaves_()
                 this->master_.getSlaveComm(i),
                 MPI_STATUS_IGNORE
             );
-            if (result != MPI_SUCCESS) {
-                OPM_THROW(std::runtime_error, "Failed to receive activation date from slave process");
-            }
             if (slave_activation_date < this->master_.getActivationDate()) {
                 OPM_THROW(std::runtime_error, "Slave process start date is earlier than "
                                               "the master process' activation date");
@@ -188,7 +188,10 @@ receiveSimulationStartDateFromSlaves_()
     if (this->comm_.rank() == 0) {
         for (unsigned int i = 0; i < num_slaves; i++) {
             double slave_start_date;
-            int result = MPI_Recv(
+            // NOTE: All slave-master communicators have set a custom error handler, which eventually
+            // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+            // or MPI_Send() calls.
+            MPI_Recv(
                 &slave_start_date,
                 /*count=*/1,
                 /*datatype=*/MPI_DOUBLE,
@@ -197,9 +200,6 @@ receiveSimulationStartDateFromSlaves_()
                 this->master_.getSlaveComm(i),
                 MPI_STATUS_IGNORE
             );
-            if (result != MPI_SUCCESS) {
-                OPM_THROW(std::runtime_error, "Failed to receive start date from slave process");
-            }
             this->master_.addSlaveStartDate(slave_start_date);
             OpmLog::info(
                 fmt::format(
@@ -227,6 +227,9 @@ sendMasterGroupNamesToSlaves_()
     for (unsigned int i = 0; i < num_slaves; i++) {
         auto slave_name = this->master_.getSlaveName(i);
         auto [group_names, size] = this->getMasterGroupNamesForSlave_(slave_name);
+        // NOTE: All slave-master communicators have set a custom error handler, which eventually
+        // will call MPI_Abort() so there is no need to check the return value of any MPI_Recv()
+        // or MPI_Send() calls.
         // NOTE: size should be of type std::size_t, so we can safely cast it to MPI_AINT
         MPI_Send(
             &size,
@@ -312,6 +315,9 @@ spawnSlaveProcesses_()
             }
             OPM_THROW(std::runtime_error, "Failed to spawn slave process");
         }
+        // NOTE: By installing a custom error handler for all slave-master communicators, which
+        // eventually will call MPI_Abort(), there is no need to check the return value of any
+        // MPI_Recv() or MPI_Send() calls as errors will be caught by the error handler.
         ReservoirCoupling::setErrhandler(master_slave_comm, /*is_master=*/true);
         OpmLog::info(
             fmt::format(