Merge pull request #5611 from multitalentloes/print_gpu_info_on_startup

Print gpus used on simulator startup
2025-02-25 18:55:30 -06:00 · 2024-10-31 21:29:08 +01:00 · 2024-10-31 21:29:08 +01:00 · e81cf62e79
commit e81cf62e79
parent a08d49d3a4 964844a636
7 changed files with 141 additions and 14 deletions
--- a/CMakeLists_files.cmake
+++ b/CMakeLists_files.cmake
@ -244,6 +244,8 @@ endif()

 # add these files if we should compile the hip code
 if (HAVE_CUDA)
+  list(APPEND MAIN_SOURCE_FILES opm/simulators/linalg/gpuistl/device_management.hpp) # should not be hipified, to keep main independent of the library
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg device_management.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuBlasHandle.cpp)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/gpusparse_matrix_operations.cu)
  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
--- a/opm/simulators/flow/Main.cpp
+++ b/opm/simulators/flow/Main.cpp
@ -35,10 +35,6 @@
 #include <opm/simulators/utils/DamarisOutputModule.hpp>
 #endif

-#if HAVE_CUDA
-#include <opm/simulators/linalg/gpuistl/set_device.hpp>
-#endif
-
 namespace Opm {

 Main::Main(int argc, char** argv, bool ownMPI)
@ -163,7 +159,7 @@ void Main::initMPI()
    }

 #if HAVE_CUDA
-    Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
+    Opm::gpuistl::setDevice();
 #endif

 #endif // HAVE_MPI
--- a/opm/simulators/flow/Main.hpp
+++ b/opm/simulators/flow/Main.hpp
@ -72,6 +72,10 @@
 #include <opm/simulators/utils/ParallelEclipseState.hpp>
 #endif

+#if HAVE_CUDA
+#include <opm/simulators/linalg/gpuistl/device_management.hpp>
+#endif
+
 #if HAVE_DAMARIS
 #include <opm/simulators/utils/DamarisKeywords.hpp>
 #endif
@ -426,6 +430,10 @@ protected:
            return false;
        }

+#if HAVE_CUDA
+    Opm::gpuistl::printDevice();
+#endif
+
        exitCode = EXIT_SUCCESS;
        return true;
    }
--- a/opm/simulators/linalg/gpuistl/device_management.cpp
+++ b/opm/simulators/linalg/gpuistl/device_management.cpp
@ -0,0 +1,84 @@
+/*
+  Copyright 2024 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <config.h>
+
+#include <opm/simulators/flow/FlowGenericVanguard.hpp>
+#include <opm/simulators/utils/DeferredLogger.hpp>
+#if HAVE_CUDA
+#include <cuda_runtime.h>
+#include <cuda.h>
+#include <opm/simulators/linalg/gpuistl/set_device.hpp>
+#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
+#endif
+
+namespace Opm::gpuistl {
+
+    /*
+        Report, for every MPI rank, the name and compute capability of the
+        GPU that rank maps to (rank modulo the number of visible devices).
+        Each rank writes its message to a local DeferredLogger; the loggers
+        are gathered and the combined messages are emitted on rank 0 only.
+
+        If no device is visible (or the device count query fails) a warning
+        is logged for that rank instead of the device information.
+
+        Does nothing when built without CUDA/HIP support.
+
+        If you have an AMD GPU and you have an AMD CPU you might run
+        into problems with this code when using multiple MPI ranks.
+        The simulation might hang because the integrated GPU in the CPU
+        is detected as having Radeon compute units, but it does not support ROCm.
+        This is fixable by making only the GPUs on your system visible with
+        the ROCR_VISIBLE_DEVICES environment variable.
+    */
+    void printDevice()
+    {
+        [[maybe_unused]] int mpiRank = 0;
+#if HAVE_CUDA
+#if HAVE_MPI
+        mpiRank = FlowGenericVanguard::comm().rank();
+#endif
+
+        // Stays at -1 if the query fails; the macro only warns.
+        int deviceCount = -1;
+        OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));
+
+        auto deferred_logger = ::Opm::DeferredLogger();
+        if (deviceCount > 0) {
+            // Same round-robin mapping that setDevice(int, int) applies.
+            const auto deviceId = mpiRank % deviceCount;
+
+            // Value-initialise so the fields are well defined even if the
+            // property query below fails (it only warns on error).
+            struct cudaDeviceProp props{};
+            OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));
+
+            deferred_logger.info(
+                fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
+                            mpiRank, props.name, props.major, props.minor, deviceId, deviceCount));
+        } else {
+            // Guard against the modulo by a non-positive device count.
+            deferred_logger.warning(
+                fmt::format("rank: {}, could not find any CUDA/HIP devices to report.\n", mpiRank));
+        }
+
+        // NOTE(review): gatherDeferredLogger presumably is a collective over the
+        // communicator, so every rank must reach it -- hence no early return above.
+        DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
+        if (mpiRank == 0) {
+            global.logMessages();
+        }
+
+#endif
+    }
+
+    /*
+        Select the GPU for this process by forwarding the MPI rank and the
+        communicator size to setDevice(int, int). Without MPI the process is
+        treated as rank 0 of 1. Does nothing when built without CUDA/HIP.
+    */
+    void setDevice()
+    {
+#if HAVE_CUDA
+#if HAVE_MPI
+        const auto& communicator = FlowGenericVanguard::comm();
+        const int rank = communicator.rank();
+        const int numberOfRanks = communicator.size();
+#else
+        const int rank = 0;
+        const int numberOfRanks = 1;
+#endif
+        Opm::gpuistl::setDevice(rank, numberOfRanks);
+#endif
+    }
+
+} // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/device_management.hpp
+++ b/opm/simulators/linalg/gpuistl/device_management.hpp
@ -0,0 +1,34 @@
+/*
+  Copyright 2024 SINTEF AS
+
+  This file is part of the Open Porous Media project (OPM).
+
+  OPM is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  OPM is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef OPM_GPUISTL_DEVICE_MANAGEMENT
+#define OPM_GPUISTL_DEVICE_MANAGEMENT
+
+/*
+  This file should not be hipified. It serves as a layer between main and gpuistl/set_device
+  that does not depend on the library, so that the simulator objects do not depend
+  on the library and can be built in parallel.
+*/
+
+namespace Opm::gpuistl
+{
+//! @brief Selects the GPU for this process.
+//!
+//! Forwards the MPI rank and communicator size (rank 0 of 1 without MPI)
+//! to setDevice(int, int).
+//!
+//! @note Does nothing when built without CUDA/HIP support.
+void setDevice();
+
+//! @brief Logs which GPU each MPI rank maps to.
+//!
+//! Every rank records the name and compute capability of device
+//! (rank modulo device count); the messages are gathered and written
+//! out on rank 0.
+//!
+//! @note Does nothing when built without CUDA/HIP support.
+void printDevice();
+} // namespace Opm::gpuistl
+
+#endif // OPM_GPUISTL_DEVICE_MANAGEMENT
--- a/opm/simulators/linalg/gpuistl/set_device.cpp
+++ b/opm/simulators/linalg/gpuistl/set_device.cpp
@ -17,33 +17,34 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include <config.h>
+#include <sstream>
 #include <cuda_runtime.h>
 #include <opm/common/OpmLog/OpmLog.hpp>
+#include <opm/simulators/flow/FlowGenericVanguard.hpp>
 #include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
 #include <opm/simulators/linalg/gpuistl/set_device.hpp>
+
 namespace Opm::gpuistl
 {
 void
 setDevice(int mpiRank, [[maybe_unused]] int numberOfMpiRanks)
 {
-
    int deviceCount = -1;
    [[maybe_unused]] auto cuError = cudaGetDeviceCount(&deviceCount);

    if (deviceCount <= 0) {
-        // If they have CUDA enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
+        // If they have CUDA/HIP enabled (ie. using a component that needs CUDA, eg. gpubicgstab or CUILU0), this will fail
        // later down the line. At this point in the simulator, we can not determine if CUDA is enabled, so we can only
        // issue a warning.
-        OpmLog::warning("Could not find any CUDA devices.");
+        OpmLog::warning("Could not find any CUDA/HIP devices.");
        return;
    }

    // Now do a round robin kind of assignment
    // TODO: We need to be more sophistacted here. We have no guarantee this will pick the correct device.
    const auto deviceId = mpiRank % deviceCount;
-    OPM_GPU_SAFE_CALL(cudaDeviceReset());
-    OPM_GPU_SAFE_CALL(cudaSetDevice(deviceId));
-    OpmLog::info("Set CUDA device to " + std::to_string(deviceId) + " (out of " + std::to_string(deviceCount)
-                 + " devices).");
+    OPM_GPU_WARN_IF_ERROR(cudaDeviceReset());
+    OPM_GPU_WARN_IF_ERROR(cudaSetDevice(deviceId));
 }
+
 } // namespace Opm::gpuistl
--- a/opm/simulators/linalg/gpuistl/set_device.hpp
+++ b/opm/simulators/linalg/gpuistl/set_device.hpp
@ -17,8 +17,8 @@
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */

-#ifndef OPM_CUISTL_SET_DEVICE_HEADER
-#define OPM_CUISTL_SET_DEVICE_HEADER
+#ifndef OPM_GPUISTL_SET_DEVICE_HEADER
+#define OPM_GPUISTL_SET_DEVICE_HEADER

 namespace Opm::gpuistl
 {
@ -32,5 +32,7 @@ namespace Opm::gpuistl
 //!
 //! @note If no CUDA device is present, this does nothing.
 void setDevice(int mpiRank, int numberOfMpiRanks);
+
+void printDevice(int mpiRank, int numberOfMpiRanks);
 } // namespace Opm::gpuistl
 #endif