opm-simulators/opm/simulators/linalg/gpuistl/device_management.cpp

/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>

#include <opm/simulators/flow/FlowGenericVanguard.hpp>
#include <opm/simulators/utils/DeferredLogger.hpp>
#include <opm/simulators/utils/gatherDeferredLogger.hpp>

#include <fmt/format.h>

#include <string>

#if HAVE_CUDA
#include <cuda.h>
#include <cuda_runtime.h>

#include <opm/simulators/linalg/gpuistl/detail/gpu_safe_call.hpp>
#include <opm/simulators/linalg/gpuistl/set_device.hpp>
#endif

namespace Opm::gpuistl {
/*
 * Print the device name and compute capability on every rank.
 *
 * If you have both an AMD GPU and an AMD CPU, you might run into problems
 * with this code when using multiple MPI ranks. The simulation might hang
 * because the integrated GPU in the CPU is detected as having Radeon
 * compute units, even though it does not support ROCm. This is fixable by
 * making only the discrete GPUs on your system visible with the
 * ROCR_VISIBLE_DEVICES environment variable.
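 *
 * For example (illustrative only; the device indices and launch command are
 * assumptions, not taken from this file), exposing just the two discrete
 * GPUs could look like:
 *
 *   ROCR_VISIBLE_DEVICES=0,1 mpirun -np 2 flow CASE.DATA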
*/
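// The rank-to-device mapping below is round-robin (rank % deviceCount), and the
// per-rank log lines are gathered so that only rank 0 prints them.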
void printDevice()
{
    int mpiRank = 0;
#if HAVE_CUDA
#if HAVE_MPI
    mpiRank = FlowGenericVanguard::comm().rank();
#endif

    int deviceCount = -1;
    OPM_GPU_WARN_IF_ERROR(cudaGetDeviceCount(&deviceCount));

    auto deferred_logger = ::Opm::DeferredLogger();
    if (deviceCount > 0) {
        const auto deviceId = mpiRank % deviceCount;

        struct cudaDeviceProp props;
        OPM_GPU_WARN_IF_ERROR(cudaGetDeviceProperties(&props, deviceId));

        std::string out;
        out = fmt::format("rank: {}, GPU: {}, Compute Capability: {}.{} (device {} out of {})\n",
                          mpiRank, props.name, props.major, props.minor, deviceId, deviceCount);
        deferred_logger.info(out);
    }

    DeferredLogger global = gatherDeferredLogger(deferred_logger, FlowGenericVanguard::comm());
    if (mpiRank == 0) {
        global.logMessages();
    }
#endif
}
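
/*
 * Select the GPU for the current MPI rank. This forwards to the two-argument
 * Opm::gpuistl::setDevice(rank, numRanks) overload from set_device.hpp;
 * without MPI, rank 0 of 1 is used.
 */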
void setDevice()
{
#if HAVE_CUDA
#if HAVE_MPI
    Opm::gpuistl::setDevice(FlowGenericVanguard::comm().rank(), FlowGenericVanguard::comm().size());
#else
    Opm::gpuistl::setDevice(0, 1);
#endif
#endif
}
} // namespace Opm::gpuistl