Print GPU used on every rank

This commit is contained in:
Tobias Meyer Andersen 2024-09-13 15:52:31 +02:00
parent 5032a37283
commit 798f7d5426
7 changed files with 134 additions and 14 deletions

View File

@ -244,6 +244,8 @@ endif()
# add these files if we should compile the hip code
if (HAVE_CUDA)
list(APPEND MAIN_SOURCE_FILES opm/simulators/linalg/gpuistl/device_management.hpp) # should not be hipified to make main independent of library
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg device_management.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)

View File

@ -35,10 +35,6 @@
#include <opm/simulators/utils/DamarisOutputModule.hpp>
#endif
#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#endif
namespace Opm {
Main::Main(int argc, char** argv, bool ownMPI)
@ -163,7 +159,7 @@ void Main::initMPI()
}
#if HAVE_CUDA
Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
Opm::gpuistl::setDevice();
#endif
#endif // HAVE_MPI

View File

@ -72,6 +72,10 @@
#include <opm/simulators/utils/ParallelEclipseState.hpp>
#endif
#if HAVE_CUDA
#include <opm/simulators/linalg/gpuistl/device_management.hpp>
#endif
#if HAVE_DAMARIS
#include <opm/simulators/utils/DamarisKeywords.hpp>
#endif
@ -426,6 +430,10 @@ protected:
return false;
}
#if HAVE_CUDA
Opm::gpuistl::printDevice();
#endif
exitCode = EXIT_SUCCESS;
return true;
}

View File

@ -0,0 +1,77 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/utils/DeferredLogger.hpp>
#if HAVE_CUDA
#include <cuda_runtime.h>
#include <cuda.h>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#endif
namespace Opm::gpuistl {
/*
 * Print the device name and compute capability on every rank.
 *
 * Each rank formats one line describing device index mpiRank % deviceCount
 * (the same round-robin rule setDevice(int, int) uses). The per-rank messages
 * are combined with gatherDeferredLogger and written out by rank 0.
 *
 * A rank on which no device can be queried contributes no message, but it
 * still takes part in the gather.
 *
 * Compiles to an empty function when HAVE_CUDA is not set.
 */
void printDevice()
{
#if HAVE_CUDA
    int mpiRank = 0;
#if HAVE_MPI
    mpiRank = FlowGenericVanguard::comm().rank();
#endif

    int deviceCount = -1;
    OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));

    auto deferred_logger = ::Opm::DeferredLogger();

    // The query above only warns on failure, so deviceCount may still be -1
    // (or 0 on a machine without a GPU). mpiRank % 0 is undefined behaviour,
    // so only compute a device index when at least one device was reported.
    if (deviceCount > 0) {
        const auto deviceId = mpiRank % deviceCount;

        // Value-initialise so name/major/minor are well defined even if the
        // property query below fails and leaves the struct untouched.
        cudaDeviceProp props{};
        OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));

        const std::string out
            = fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
                          mpiRank, props.name, props.major, props.minor, deviceId, deviceCount);
        deferred_logger.info(out);
    }

    // NOTE(review): the gather is presumably a collective operation, so it is
    // deliberately reached by every rank, including those without a device.
    DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
    if (mpiRank == 0) {
        global.logMessages();
    }
#endif
}
/*
 * Select the GPU for this process without requiring the caller to know the
 * MPI layout: forwards to setDevice(rank, size) with the rank and size of
 * FlowGenericVanguard::comm() when MPI is available, and with (0, 1) otherwise.
 *
 * Compiles to an empty function when HAVE_CUDA is not set.
 */
void setDevice()
{
#if HAVE_CUDA
    // Serial defaults; overwritten below when built with MPI.
    int rank = 0;
    int numberOfRanks = 1;
#if HAVE_MPI
    rank = FlowGenericVanguard::comm().rank();
    numberOfRanks = FlowGenericVanguard::comm().size();
#endif
    Opm::gpuistl::setDevice(rank, numberOfRanks);
#endif
}
} // namespace Opm::gpuistl

View File

@ -0,0 +1,34 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPUISTL_DEVICE_MANAGEMENT
#define OPM_GPUISTL_DEVICE_MANAGEMENT
/*
This file should not be hipified. It serves as a layer between main and gpuistl/set_device
that does not depend on the library, so that the simulator objects do not depend
on the library and can be built in parallel.
*/
namespace Opm::gpuistl {
//! Log, for every rank, the name and compute capability of the GPU with index
//! rank % deviceCount. Messages are gathered and written out by rank 0.
void printDevice();
//! Select the GPU for this process by forwarding the rank and size of
//! FlowGenericVanguard::comm() (or 0 and 1 without MPI) to setDevice(int, int).
void setDevice();
}
#endif // OPM_GPUISTL_DEVICE_MANAGEMENT

View File

@ -17,33 +17,34 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <sstream>
#include <cuda_runtime.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
namespace Opm::gpuistl
{
// Assign a GPU to this process. The device index is mpiRank modulo the number
// of devices reported by cudaGetDeviceCount; numberOfMpiRanks is not used in
// the body. If no device is reported, a warning is logged and nothing is selected.
void
setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
{
int deviceCount = -1;
// The returned status is not inspected; if the call does not write a positive
// count, deviceCount stays <= 0 and the branch below handles it.
[[maybe_unused]] auto cuError = cudaGetDeviceCount(&deviceCount);
if (deviceCount <= 0) {
// If they have CUDA enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// If they have CUDA/HIP enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
// later down the line. At this point in the simulator, we can not determine if CUDA is enabled, so we can only
// issue a warning.
OpmLog::warning("Could not find any CUDA devices.");
OpmLog::warning("Could not find any CUDA/HIP devices.");
return;
}
// Now do a round robin kind of assignment
// TODO: We need to be more sophisticated here. We have no guarantee this will pick the correct device.
const auto deviceId = mpiRank % deviceCount;
OPM_GPU_SAFE_CALL(cudaDeviceReset());
OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
+ " devices).");
OPM_GPU_WARN_IF_ERROR(cudaDeviceReset());
OPM_GPU_WARN_IF_ERROR(cudaSetDevice(deviceId));
}
} // namespace Opm::gpuistl

View File

@ -17,8 +17,8 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUISTL_SET_DEVICE_HEADER
#define OPM_CUISTL_SET_DEVICE_HEADER
#ifndef OPM_GPUISTL_SET_DEVICE_HEADER
#define OPM_GPUISTL_SET_DEVICE_HEADER
namespace Opm::gpuistl
{
@ -32,5 +32,7 @@ namespace Opm::gpuistl
//!
//! @note If no CUDA device is present, this does nothing.
void setDevice(int mpiRank, int numberOfMpiRanks);
void printDevice(int mpiRank, int numberOfMpiRanks);
} // namespace Opm::gpuistl
#endif