Merge pull request #5430 from atgeirr/fix-damaris-logging-parallel

Add and use DamarisOutput::handleError() helper.
This commit is contained in:
Bård Skaflestad 2024-06-19 11:30:08 +02:00 committed by GitHub
commit 46523e7cd9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 64 additions and 55 deletions

View File

@ -38,50 +38,27 @@
namespace Opm::DamarisOutput {
// Forwards `field` and the address of `pos` to damaris_set_position() and
// returns the status code that call produced.
// NOTE(review): two signature lines appear below (with and without `rank`),
// presumably the before/after forms of the same function. The warning in the
// body reads `rank`, so it only fits the first form - confirm which lines are current.
int setPosition(const char* field, int rank, int64_t pos)
int setPosition(const char* field, int64_t pos)
{
int dam_err = damaris_set_position(field, &pos);
if (dam_err != DAMARIS_OK) {
// Log the field name and the Damaris error string; the status is still returned to the caller.
OpmLog::warning(fmt::format("damariswriter::setPosition() : ( rank:{}) "
"damaris_set_position({}, ...), Damaris Error: {} ",
rank, field, damaris_error_string(dam_err)));
}
return dam_err;
}
// Forwards `field`, the address of `value` and sizeof(int) to
// damaris_parameter_set() and returns the status code that call produced.
// NOTE(review): two signature lines appear below (with and without `rank`),
// presumably the before/after forms of the same function. The warning in the
// body reads `rank`, so it only fits the first form - confirm which lines are current.
int setParameter(const char* field, int rank, int value)
int setParameter(const char* field, int value)
{
int dam_err = damaris_parameter_set(field, &value, sizeof(int));
if (dam_err != DAMARIS_OK) {
// Unlike the sibling wrappers, this message does not include damaris_error_string(dam_err).
OpmLog::warning(fmt::format("damariswriter::setParameter() (rank:{}) "
"damaris_parameter_set(\"{}\",...)", rank, field));
}
return dam_err;
}
// Forwards `field` and `data` to damaris_write() and returns the status code
// that call produced.
// NOTE(review): two signature lines appear below (with and without `rank`),
// presumably the before/after forms of the same function. The warning in the
// body reads `rank`, so it only fits the first form - confirm which lines are current.
int write(const char* field, int rank, const void* data)
int write(const char* field, const void* data)
{
int dam_err = damaris_write(field, data);
if (dam_err != DAMARIS_OK) {
// Log the field name and the Damaris error string; the status is still returned to the caller.
OpmLog::warning(fmt::format("damariswriter::write() : ( rank:{}) "
"damaris_write({}, ...), Damaris Error: {} ",
rank, field, damaris_error_string(dam_err)));
}
return dam_err;
}
// Calls damaris_end_iteration() and returns the status code that call produced.
// NOTE(review): two signature lines appear below (with and without `rank`),
// presumably the before/after forms of the same function. The warning in the
// body reads `rank`, so it only fits the first form - confirm which lines are current.
int endIteration(int rank)
int endIteration()
{
int dam_err = damaris_end_iteration();
if (dam_err != DAMARIS_OK) {
// Log the Damaris error string; the status is still returned to the caller.
OpmLog::warning(fmt::format("damariswriter::endIteration() : ( rank:{}) "
"damaris_end_iteration(), Damaris Error: {} ",
rank, damaris_error_string(dam_err)));
}
return dam_err;
}
@ -117,22 +94,48 @@ int setupWritingPars(Parallel::Communication comm,
// Set the parameter so that the Damaris servers can allocate the correct amount of memory for the variable
// Damaris parameters only support int data types. This will limit models to be under size of 2^32-1 elements
// ToDo: Do we need to check that local ranks are 0 based ?
int dam_err = setParameter("n_elements_local", comm.rank(), elements_rank_sizes[comm.rank()]);
int dam_err = setParameter("n_elements_local", elements_rank_sizes[comm.rank()]);
// Damaris parameters only support int data types. This will limit models to be under size of 2^32-1 elements
// ToDo: Do we need to check that n_elements_global_max will fit in a C int type (INT_MAX)
if( n_elements_global_max <= std::numeric_limits<int>::max() ) {
setParameter("n_elements_total", comm.rank(), n_elements_global_max);
if ( n_elements_global_max <= std::numeric_limits<int>::max() ) {
setParameter("n_elements_total", n_elements_global_max);
} else {
OpmLog::error(fmt::format("( rank:{} ) The size of the global array ({}) is"
"greater than what a Damaris paramater type supports ({}). ",
comm.rank(), n_elements_global_max, std::numeric_limits<int>::max() ));
// assert( n_elements_global_max <= std::numeric_limits<int>::max() ) ;
if (comm.rank() == 0) {
OpmLog::error(fmt::format("The size of the global array ({}) is"
"greater than what a Damaris paramater type supports ({}). ",
n_elements_global_max, std::numeric_limits<int>::max() ));
}
OPM_THROW(std::runtime_error, "setupDamarisWritingPars() n_elements_global_max "
"> std::numeric_limits<int>::max() " + std::to_string(dam_err));
}
return dam_err;
}
void
handleError(const int dam_err, Parallel::Communication comm, const std::string& message)
{
// Find if some rank has encountered an error.
const int isOk = (dam_err == DAMARIS_OK);
const bool error = (comm.sum(isOk) != comm.size());
if (error) {
// Form error message on ranks that had error, and put it into a DeferredLogger.
DeferredLogger logger;
if (dam_err != DAMARIS_OK) {
// Since the simulator will continue, this is a warning not an error
// from the OPM Flow point of view.
logger.warning("OPM_DAMARIS_ERROR",
fmt::format("Damaris error in {}, on rank {}, error string: {}",
message,
comm.rank(),
damaris_error_string(dam_err)));
}
DeferredLogger global = gatherDeferredLogger(logger, comm);
if (comm.rank() == 0) {
global.logMessages();
}
}
}
}

View File

@ -61,13 +61,14 @@ namespace Opm {
// Declarations of the free-function helpers in Opm::DamarisOutput.
// NOTE(review): endIteration/setParameter/setPosition/write are each declared
// twice below, once with an `int rank` parameter and once without. This looks
// like the before/after forms of the same declarations rather than intended
// overloads - confirm which set is current.
namespace DamarisOutput {
// Forms taking an explicit `rank` argument.
int endIteration(int rank);
int setParameter(const char* field, int rank, int value);
int setPosition(const char* field, int rank, int64_t pos);
int write(const char* field, int rank, const void* data);
// Forms without the `rank` argument.
int endIteration();
int setParameter(const char* field, int value);
int setPosition(const char* field, int64_t pos);
int write(const char* field, const void* data);
// Takes the communicator by value and the local element count; the offsets
// vector is a non-const reference, so presumably it is filled in by the call - confirm.
int setupWritingPars(Parallel::Communication comm,
const int n_elements_local_grid,
std::vector<unsigned long long>& elements_rank_offsets);
// Takes a Damaris status code, the communicator (by value) and a descriptive message.
void handleError(const int dam_err, Parallel::Communication comm, const std::string& message);
}
/*!
@ -229,12 +230,13 @@ public:
{
OPM_TIMEBLOCK(writeOutput);
const int reportStepNum = simulator_.episodeIndex() + 1;
const auto& cc = simulator_.vanguard().grid().comm();
// added this as localCellData was not being written
if (!isSubStep)
this->damarisOutputModule_->invalidateLocalData() ;
this->prepareLocalCellData(isSubStep, reportStepNum);
this->damarisOutputModule_->outputErrorLog(simulator_.gridView().comm());
this->damarisOutputModule_->outputErrorLog(cc);
// The damarisWriter is not outputting well or aquifer data (yet)
auto localWellData = simulator_.problem().wellModel().wellData(); // data::Well
@ -254,8 +256,7 @@ public:
// which define sizes of the Damaris variables, per-rank and globally (over all ranks).
// Also sets the offsets to where a ranks array data sits within the global array.
// This is useful for HDF5 output and for defining distributed arrays in Dask.
dam_err_ = DamarisOutput::setupWritingPars(simulator_.vanguard().grid().comm(),
numElements_, elements_rank_offsets_);
dam_err_ = DamarisOutput::setupWritingPars(cc, numElements_, elements_rank_offsets_);
// sets positions and data for non-time-varying variables MPI_RANK and GLOBAL_CELL_INDEX
this->setGlobalIndexForDamaris() ;
@ -286,15 +287,13 @@ public:
// Call damaris_set_position() for all available variables
// There is an assumption that all variables are the same size, with the same offset.
// see initDamarisTemplateXmlFile.cpp for the Damaris XML descriptions.
dam_err_ = DamarisOutput::setPosition(name.c_str(), rank_,
this->elements_rank_offsets_[rank_]);
dam_err_ = DamarisOutput::setPosition(name.c_str(), this->elements_rank_offsets_[rank_]);
// It does not seem I can test for what type of data is present (double or int)
// in the std::variant within the data::CellData, so I will use a try catch block.
try {
if (dataCol.data<double>().size() >= static_cast<std::vector<double>::size_type>(this->numElements_)) {
dam_err_ = DamarisOutput::write(name.c_str(), rank_,
dataCol.data<double>().data()) ;
dam_err_ = DamarisOutput::write(name.c_str(), dataCol.data<double>().data()) ;
} else {
OpmLog::info(fmt::format("( rank:{}) The variable \"{}\" was found to be of a different size {} (not {}).", rank_, name, dataCol.data<double>().size(), this->numElements_ ));
}
@ -302,8 +301,7 @@ public:
catch (std::bad_variant_access const& ex) {
// Not a std::vector<double>, must be a std::vector<int>
if (dataCol.data<int>().size() >= static_cast<std::vector<int>::size_type>(this->numElements_)) {
dam_err_ = DamarisOutput::write(name.c_str(), rank_,
dataCol.data<int>().data()) ;
dam_err_ = DamarisOutput::write(name.c_str(), dataCol.data<int>().data()) ;
} else {
OpmLog::info(fmt::format("( rank:{}) The variable \"{}\" was found to be of a different size {} (not {}).", rank_, name, dataCol.data<int>().size(), this->numElements_ ));
}
@ -311,6 +309,7 @@ public:
++cell_data_written ;
}
}
DamarisOutput::handleError(dam_err_, cc, "setPosition() and write() for available variables");
if (!cell_data_written) {
OpmLog::info(fmt::format("( rank:{}) No simulation data written to the Damaris server - check --damaris-limit-variables command line option (if used) has valid variable name(s) and that the Damaris XML file contains variable names that are available in your simulation.", rank_));
@ -329,12 +328,13 @@ public:
std::cout << "Name of Damaris Block Varaiable : (" << rank_ << ") " << name << " part : " << part << " Value : " << dataCol << std::endl ;
}
dam_err_ = DamarisOutput::endIteration(rank_);
dam_err_ = DamarisOutput::endIteration();
*/
if (this->damarisOutputModule_->getPRESSURE_ptr() != nullptr)
{
dam_err_ = DamarisOutput::endIteration(rank_);
dam_err_ = DamarisOutput::endIteration();
}
DamarisOutput::handleError(dam_err_, cc, "endIteration()");
} // end of ! isSubstep
}
@ -358,10 +358,12 @@ private:
void setGlobalIndexForDamaris ()
{
const auto& cc = simulator_.vanguard().grid().comm();
// Use damaris_set_position to set the offset in the global size of the array.
// This is used so that output functionality (e.g. HDF5Store) knows the global offsets of
// the data of the ranks data.
dam_err_ = DamarisOutput::setPosition("GLOBAL_CELL_INDEX", rank_, elements_rank_offsets_[rank_]);
dam_err_ = DamarisOutput::setPosition("GLOBAL_CELL_INDEX", elements_rank_offsets_[rank_]);
DamarisOutput::handleError(dam_err_, cc, "setPosition() for GLOBAL_CELL_INDEX");
// This is an example of writing to the Damaris shared memory directly (i.e. we allocate the
// variable directly in the shared memory region and do not use damaris_write() to copy data there.
@ -375,13 +377,14 @@ private:
if (this->collectOnIORank_.isParallel()) {
const std::vector<int>& local_to_global =
this->collectOnIORank_.localIdxToGlobalIdxMapping();
dam_err_ = DamarisOutput::write("GLOBAL_CELL_INDEX", rank_, local_to_global.data());
dam_err_ = DamarisOutput::write("GLOBAL_CELL_INDEX", local_to_global.data());
} else {
std::vector<int> local_to_global_filled ;
local_to_global_filled.resize(this->numElements_) ;
std::iota(local_to_global_filled.begin(), local_to_global_filled.end(), 0);
dam_err_ = DamarisOutput::write("GLOBAL_CELL_INDEX", rank_, local_to_global_filled.data());
dam_err_ = DamarisOutput::write("GLOBAL_CELL_INDEX", local_to_global_filled.data());
}
DamarisOutput::handleError(dam_err_, cc, "write() for GLOBAL_CELL_INDEX");
mpi_rank_var.setDamarisParameterAndShmem( {this->numElements_ } ) ;
// Fill the created memory area
@ -391,8 +394,10 @@ private:
// Python code (as an example) can use the path as required.
const auto& outputDir = simulator_.vanguard().eclState().cfg().io().getOutputDir();
if (outputDir.size() > 0) {
dam_err_ = DamarisOutput::setParameter("path_string_length", rank_, outputDir.size()) ;
dam_err_ = DamarisOutput::write("OUTPUTDIR", rank_, outputDir.c_str());
dam_err_ = DamarisOutput::setParameter("path_string_length", outputDir.size()) ;
DamarisOutput::handleError(dam_err_, cc, "setParameter() for path_string_length");
dam_err_ = DamarisOutput::write("OUTPUTDIR", outputDir.c_str());
DamarisOutput::handleError(dam_err_, cc, "write() for OUTPUTDIR");
}
}
@ -536,6 +541,7 @@ private:
damarisOutputModule_->validateLocalData();
OPM_END_PARALLEL_TRY_CATCH("DamarisWriter::prepareLocalCellData() failed: ", simulator_.vanguard().grid().comm());
}
};
} // namespace Opm