mirror of
https://github.com/OPM/opm-simulators.git
synced 2024-12-22 15:33:29 -06:00
Merge pull request #5611 from multitalentloes/print_gpu_info_on_startup
Print gpus used on simulator startup
This commit is contained in:
commit
e81cf62e79
@ -244,6 +244,8 @@ endif()
|
||||
|
||||
# add these files if we should compile the hip code
|
||||
if (HAVE_CUDA)
|
||||
list(APPEND MAIN_SOURCE_FILES opm/simulators/linalg/gpuistl/device_management.hpp) # should not be hipified to make main independent of library
|
||||
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg device_management.cpp)
|
||||
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
|
||||
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
|
||||
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
|
||||
|
@ -35,10 +35,6 @@
|
||||
#include <opm/simulators/utils/DamarisOutputModule.hpp>
|
||||
#endif
|
||||
|
||||
#if HAVE_CUDA
|
||||
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm {
|
||||
|
||||
Main::Main(int argc, char** argv, bool ownMPI)
|
||||
@ -163,7 +159,7 @@ void Main::initMPI()
|
||||
}
|
||||
|
||||
#if HAVE_CUDA
|
||||
Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
|
||||
Opm::gpuistl::setDevice();
|
||||
#endif
|
||||
|
||||
#endif // HAVE_MPI
|
||||
|
@ -72,6 +72,10 @@
|
||||
#include <opm/simulators/utils/ParallelEclipseState.hpp>
|
||||
#endif
|
||||
|
||||
#if HAVE_CUDA
|
||||
#include <opm/simulators/linalg/gpuistl/device_management.hpp>
|
||||
#endif
|
||||
|
||||
#if HAVE_DAMARIS
|
||||
#include <opm/simulators/utils/DamarisKeywords.hpp>
|
||||
#endif
|
||||
@ -426,6 +430,10 @@ protected:
|
||||
return false;
|
||||
}
|
||||
|
||||
#if HAVE_CUDA
|
||||
Opm::gpuistl::printDevice();
|
||||
#endif
|
||||
|
||||
exitCode = EXIT_SUCCESS;
|
||||
return true;
|
||||
}
|
||||
|
84
opm/simulators/linalg/gpuistl/device_management.cpp
Normal file
84
opm/simulators/linalg/gpuistl/device_management.cpp
Normal file
@ -0,0 +1,84 @@
|
||||
/*
|
||||
Copyright 2024 SINTEF AS
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
|
||||
#include <opm/simulators/utils/DeferredLogger.hpp>
|
||||
#if HAVE_CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
|
||||
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
|
||||
#endif
|
||||
|
||||
namespace Opm::gpuistl {
|
||||
|
||||
/*
|
||||
* Print the device name and compute capability on every rank
|
||||
|
||||
If you have an AMD GPU and you have an AMD CPU you might run
|
||||
into problems with this code when using multiple MPI ranks.
|
||||
The simulation might hang because the integrated GPU in the CPU
|
||||
is detected as having Radeon compute units, but it does not support ROCm.
|
||||
This is fixable by making only the GPUs on your system visible with the
|
||||
ROCR_VISIBLE_DEVICES environment variable.
|
||||
*/
|
||||
/**
 * @brief Log which GPU each MPI rank maps to, together with its compute capability.
 *
 * Each rank records one message in a local DeferredLogger; the messages are
 * then gathered and written out by rank 0 only. The device id is computed as
 * `mpiRank % deviceCount`, the same round-robin rule used by
 * setDevice(int, int).
 *
 * If no device can be found (or the device count query fails), a warning is
 * recorded for that rank instead of querying device properties. The gather
 * is still executed on every rank in that case.
 *
 * Without HAVE_CUDA this function does nothing.
 */
void printDevice()
{
#if HAVE_CUDA
    int mpiRank = 0;
#if HAVE_MPI
    mpiRank = FlowGenericVanguard::comm().rank();
#endif

    int deviceCount = -1;
    OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));

    auto deferred_logger = ::Opm::DeferredLogger();

    if (deviceCount > 0) {
        const auto deviceId = mpiRank % deviceCount;

        // Value-initialise so that a failed property query (which only warns)
        // leaves an empty name and zeroed version numbers instead of garbage.
        cudaDeviceProp props {};
        OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));

        deferred_logger.info(
            fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
                        mpiRank, props.name, props.major, props.minor, deviceId, deviceCount));
    } else {
        // deviceCount is 0 or still -1: taking mpiRank % deviceCount would be
        // a division by zero / meaningless, so only report the situation.
        deferred_logger.warning(
            fmt::format("rank: {}, could not find any CUDA/HIP devices.", mpiRank));
    }

    // Must be reached by every rank, including ranks without a device:
    // the gather involves all ranks of the communicator.
    DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
    if (mpiRank == 0) {
        global.logMessages();
    }
#endif
}
|
||||
|
||||
/**
 * @brief Select the GPU for the calling process.
 *
 * Forwards to setDevice(int, int) with the rank and size of
 * FlowGenericVanguard::comm() when built with MPI, and with rank 0 of 1
 * otherwise. Without HAVE_CUDA this function does nothing.
 */
void setDevice()
{
#if HAVE_CUDA
#if HAVE_MPI
    const int rank = FlowGenericVanguard::comm().rank();
    const int numberOfRanks = FlowGenericVanguard::comm().size();
#else
    // Serial build: behave like a single-rank run.
    const int rank = 0;
    const int numberOfRanks = 1;
#endif
    Opm::gpuistl::setDevice(rank, numberOfRanks);
#endif
}
|
||||
|
||||
} // namespace Opm::gpuistl
|
34
opm/simulators/linalg/gpuistl/device_management.hpp
Normal file
34
opm/simulators/linalg/gpuistl/device_management.hpp
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
Copyright 2024 SINTEF AS
|
||||
|
||||
This file is part of the Open Porous Media project (OPM).
|
||||
|
||||
OPM is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
OPM is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_GPUISTL_DEVICE_MANAGEMENT
|
||||
#define OPM_GPUISTL_DEVICE_MANAGEMENT
|
||||
|
||||
/*
|
||||
This file should not be hipified, and serves as a layer between main and gpuistl/set_device
|
||||
that does not depend on the library, such that the simulator objects do not depend
|
||||
on the library and can be built in parallel.
|
||||
*/
|
||||
|
||||
namespace Opm::gpuistl {
|
||||
/// Log, for every MPI rank, the name and compute capability of the GPU the
/// rank maps to. Does nothing when built without HAVE_CUDA.
void printDevice();
|
||||
/// Select the GPU for the calling process from its MPI rank (rank 0 of 1
/// when built without MPI). Does nothing when built without HAVE_CUDA.
void setDevice();
|
||||
} // namespace Opm::gpuistl
|
||||
|
||||
#endif // OPM_GPUISTL_DEVICE_MANAGEMENT
|
@ -17,33 +17,34 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include <config.h>
|
||||
#include <sstream>
|
||||
#include <cuda_runtime.h>
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
|
||||
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
|
||||
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
|
||||
|
||||
namespace Opm::gpuistl
|
||||
{
|
||||
//! Select the GPU for the calling process by mapping its MPI rank onto the
//! available devices round-robin (mpiRank % deviceCount).
//!
//! @param mpiRank rank of the calling process
//! @param numberOfMpiRanks total number of ranks (not used in the body)
//!
//! If no device is found, a warning is logged and the function returns
//! without selecting a device.
//!
//! NOTE(review): this listing appears to show both the pre-change and the
//! post-change lines of the commit (duplicated comment/warning lines, and
//! both the OPM_GPU_SAFE_CALL and OPM_GPU_WARN_IF_ERROR variants of the
//! reset/set calls) -- confirm which set is current before relying on it.
void
|
||||
setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
|
||||
{
|
||||
|
||||
int deviceCount = -1;
|
||||
// The returned status is not inspected; a failed query is presumably
// expected to leave deviceCount <= 0, which is handled right below.
[[maybe_unused]] auto cuError = cudaGetDeviceCount(&deviceCount);
|
||||
|
||||
if (deviceCount <= 0) {
|
||||
// If they have CUDA enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
|
||||
// If they have CUDA/HIP enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
|
||||
// later down the line. At this point in the simulator, we can not determine if CUDA is enabled, so we can only
|
||||
// issue a warning.
|
||||
OpmLog::warning("Could not find any CUDA devices.");
|
||||
OpmLog::warning("Could not find any CUDA/HIP devices.");
|
||||
return;
|
||||
}
|
||||
|
||||
// Now do a round robin kind of assignment
|
||||
// TODO: We need to be more sophisticated here. We have no guarantee this will pick the correct device.
|
||||
const auto deviceId = mpiRank % deviceCount;
|
||||
OPM_GPU_SAFE_CALL(cudaDeviceReset());
|
||||
OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
|
||||
OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
|
||||
+ " devices).");
|
||||
OPM_GPU_WARN_IF_ERROR(cudaDeviceReset());
|
||||
OPM_GPU_WARN_IF_ERROR(cudaSetDevice(deviceId));
|
||||
}
|
||||
|
||||
} // namespace Opm::gpuistl
|
||||
|
@ -17,8 +17,8 @@
|
||||
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#ifndef OPM_CUISTL_SET_DEVICE_HEADER
|
||||
#define OPM_CUISTL_SET_DEVICE_HEADER
|
||||
#ifndef OPM_GPUISTL_SET_DEVICE_HEADER
|
||||
#define OPM_GPUISTL_SET_DEVICE_HEADER
|
||||
|
||||
namespace Opm::gpuistl
|
||||
{
|
||||
@ -32,5 +32,7 @@ namespace Opm::gpuistl
|
||||
//!
|
||||
//! @note If no CUDA device is present, this does nothing.
|
||||
void setDevice(int mpiRank, int numberOfMpiRanks);
|
||||
|
||||
void printDevice(int mpiRank, int numberOfMpiRanks);
|
||||
} // namespace Opm::gpuistl
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user