Merge pull request #5611 from multitalentloes/print_gpu_info_on_startup

Print gpus used on simulator startup
This commit is contained in:
Kjetil Olsen Lye 2024-10-31 21:29:08 +01:00 committed by GitHub
commit e81cf62e79
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 141 additions and 14 deletions

View File

@ -244,6 +244,8 @@ endif()
# add these files if we should compile the hip code
if (HAVE_CUDA)
list(APPEND MAIN_SOURCE_FILES opm/simulators/linalg/gpuistl/device_management.hpp) # should not be hipified to make main independent of library
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg device_management.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)

View File

@ -35,10 +35,6 @@
#include <opm/simulators/utils/DamarisOutputModule.hpp>
#endif
#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#endif
namespace Opm {
Main::Main(int argc, char** argv, bool ownMPI)
@ -163,7 +159,7 @@ void Main::initMPI()
}
#if HAVE_CUDA
Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
Opm::gpuistl::setDevice();
#endif
#endif // HAVE_MPI

View File

@ -72,6 +72,10 @@
#include <opm/simulators/utils/ParallelEclipseState.hpp>
#endif
#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/device_management.hpp>
#endif
#if HAVE_DAMARIS
#include <opm/simulators/utils/DamarisKeywords.hpp>
#endif
@ -426,6 +430,10 @@ protected:
return false;
}
#if HAVE_CUDA
Opm::gpuistl::printDevice();
#endif
exitCode = EXIT_SUCCESS;
return true;
}

View File

@ -0,0 +1,84 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/utils/DeferredLogger.hpp>
#if HAVE_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#endif
namespace Opm::gpuistl {
namespace
{
/// Round-robin mapping of an MPI rank onto the visible GPUs.
///
/// @param mpiRank     rank of the calling process
/// @param deviceCount number of devices reported by the runtime (may be <= 0 on failure)
/// @return mpiRank % deviceCount, or -1 when there is no device to map onto
[[maybe_unused]] int roundRobinDeviceId(const int mpiRank, const int deviceCount)
{
    // Guarding here avoids a modulo by zero when the runtime reports no devices.
    return deviceCount > 0 ? mpiRank % deviceCount : -1;
}
} // anonymous namespace

/*
 * Report the device name and compute capability associated with every rank.
 * Each rank formats one line, the lines are gathered with gatherDeferredLogger,
 * and rank 0 writes them to the log. When built without HAVE_CUDA this does nothing.
 *
 * The device reported is the one given by the round-robin rule
 * mpiRank % deviceCount; the currently active device is not queried.
 * A rank that finds no device reports that instead of querying properties.
 *
 * If you have an AMD GPU and you have an AMD CPU you might run
 * into problems with this code when using multiple MPI ranks.
 * The simulation might hang because the integrated GPU in the CPU
 * is detected as having Radeon compute units, but it does not support ROCm.
 * This is fixable by making only the GPUs on your system visible with the
 * ROCR_VISIBLE_DEVICES environment variable.
 */
void printDevice()
{
#if HAVE_CUDA
    int mpiRank = 0;
#if HAVE_MPI
    mpiRank = FlowGenericVanguard::comm().rank();
#endif

    // deviceCount stays at -1 if the query fails; the macro only warns.
    int deviceCount = -1;
    OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));
    const int deviceId = roundRobinDeviceId(mpiRank, deviceCount);

    std::string out;
    if (deviceId < 0) {
        out = fmt::format("rank: {}, no CUDA/HIP device available\n", mpiRank);
    } else {
        // Value-initialize so a failed (warn-only) query cannot leave garbage to be printed.
        struct cudaDeviceProp props {};
        OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));
        out = fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
                          mpiRank, props.name, props.major, props.minor, deviceId, deviceCount);
    }

    auto deferred_logger = ::Opm::DeferredLogger();
    deferred_logger.info(out);

    // Every rank must reach the gather (no early return above), otherwise
    // the remaining ranks could be left waiting on it.
    DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
    if (mpiRank == 0) {
        global.logMessages();
    }
#endif
}
/// Select the GPU for this process without the caller having to know about MPI.
///
/// Forwards to setDevice(mpiRank, numberOfMpiRanks) using the rank and size of
/// FlowGenericVanguard::comm() when MPI is available, and (0, 1) otherwise.
/// Compiles to an empty function when HAVE_CUDA is not set.
void setDevice()
{
#if HAVE_CUDA
    // Serial defaults; overwritten below for MPI builds.
    int rank = 0;
    int numberOfRanks = 1;
#if HAVE_MPI
    rank = FlowGenericVanguard::comm().rank();
    numberOfRanks = FlowGenericVanguard::comm().size();
#endif
    Opm::gpuistl::setDevice(rank, numberOfRanks);
#endif
}
} // namespace Opm::gpuistl

View File

@ -0,0 +1,34 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPUISTL_DEVICE_MANAGEMENT
#define OPM_GPUISTL_DEVICE_MANAGEMENT
/*
This file should not be hipified. It serves as a layer between main and gpuistl/set_device
that does not depend on the library, so that the simulator objects do not depend
on the library and can be built in parallel.
*/
namespace Opm::gpuistl
{

//! @brief Log which GPU is associated with each rank.
//!
//! @note Takes no arguments so that callers need not pass MPI information.
void printDevice();

//! @brief Select the GPU device to use for the calling process.
//!
//! @note Takes no arguments so that callers need not pass MPI information.
void setDevice();

} // namespace Opm::gpuistl
#endif // OPM_GPUISTL_DEVICE_MANAGEMENT

View File

@ -17,33 +17,34 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <sstream>
#include <cuda_runtime.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
namespace Opm::gpuistl
{
// Choose the GPU device for the given MPI rank.
//
// Queries the number of devices; if none is found a warning is logged and the
// function returns without selecting anything. Otherwise the device index is
// mpiRank % deviceCount, and the device is reset and then made current.
//
// mpiRank:          rank of the calling process, used for the round-robin choice
// numberOfMpiRanks: currently unused (hence [[maybe_unused]])
//
// NOTE(review): this span comes from a diff hunk and appears to contain both
// the pre-change and post-change lines (duplicated comment, warning, and
// reset/set-device statements) — confirm which lines are live before editing.
void
setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
{
// Starts at -1 so that a failed query is caught by the <= 0 check below.
int deviceCount = -1;
// The returned error code is not inspected; only deviceCount is checked.
[[maybe_unused]] auto cuError = cudaGetDeviceCount(&deviceCount);
if (deviceCount <= 0) {
// If they have CUDA enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// If they have CUDA/HIP enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// later down the line. At this point in the simulator, we can not determine if CUDA is enabled, so we can only
// issue a warning.
OpmLog::warning("Could not find any CUDA devices.");
OpmLog::warning("Could not find any CUDA/HIP devices.");
return;
}
// Now do a round robin kind of assignment
// TODO: We need to be more sophisticated here. We have no guarantee this will pick the correct device.
const auto deviceId = mpiRank % deviceCount;
OPM_GPU_SAFE_CALL(cudaDeviceReset());
OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
+ " devices).");
OPM_GPU_WARN_IF_ERROR(cudaDeviceReset());
OPM_GPU_WARN_IF_ERROR(cudaSetDevice(deviceId));
}
} // namespace Opm::gpuistl

View File

@ -17,8 +17,8 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUISTL_SET_DEVICE_HEADER
#define OPM_CUISTL_SET_DEVICE_HEADER
#ifndef OPM_GPUISTL_SET_DEVICE_HEADER
#define OPM_GPUISTL_SET_DEVICE_HEADER
namespace Opm::gpuistl
{
@ -32,5 +32,7 @@ namespace Opm::gpuistl
//!
//! @note If no CUDA device is present, this does nothing.
void setDevice(int mpiRank, int numberOfMpiRanks);
void printDevice(int mpiRank, int numberOfMpiRanks);
} // namespace Opm::gpuistl
#endif