horovod · maxhgerlach · May 9, 2023 · May 5, 2023 · May 5, 2023 · May 5, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -25,7 +25,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - Fixed build with ROCm. ([#3839](https://github.com/horovod/horovod/pull/3839), [#3848](https://github.com/horovod/horovod/pull/3848))
 - Fixed build of Docker image horovod-nvtabular. ([#3851](https://github.com/horovod/horovod/pull/3851))
 - Fixed linking recent NCCL by defaulting CUDA runtime library linkage to static and ensuring that weak symbols are overridden. ([#3867](https://github.com/horovod/horovod/pull/3867), [#3846](https://github.com/horovod/horovod/pull/3846))
-
+- Updated with_device functions in MXNet and PyTorch to skip unnecessary cudaSetDevice calls. ([#3912](https://github.com/horovod/horovod/pull/3912))
 
 ## [v0.27.0] - 2023-02-01
 

diff --git a/horovod/mxnet/cuda_util.cc b/horovod/mxnet/cuda_util.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 // =============================================================================
 
+#include <dlfcn.h>
 #include <stdexcept>
 
 #if HAVE_CUDA
+#include "cuda.h"
 #include "cuda_runtime.h"
 #include <mxnet/base.h>
 #endif
@@ -27,13 +29,47 @@
 namespace horovod {
 namespace mxnet {
 
+#if HAVE_CUDA
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice)(CUdevice* device);
+static void* cudalib = nullptr; // handle returned by dlopen("libcuda.so")
+static PFN_cuCtxGetDevice pfn_cuCtxGetDevice = nullptr; // set by initialize_driver_api()
+
+// Loads libcuda.so and resolves cuCtxGetDevice; throws std::logic_error on
+// failure. cudalib is set last so a failed load is retried, never half-done.
+static void initialize_driver_api() {
+  (void) dlerror(); // Clear previous errors
+  void* handle = dlopen("libcuda.so", RTLD_LAZY);
+  if (!handle)
+    throw std::logic_error("Internal error. Could not dlopen libcuda.so.");
+  pfn_cuCtxGetDevice = (PFN_cuCtxGetDevice) dlsym(handle, "cuCtxGetDevice");
+  if (!pfn_cuCtxGetDevice) {
+    dlclose(handle); // do not leak the handle of an unusable library
+    throw std::logic_error("Internal error. Could not load cuCtxGetDevice.");
+  }
+  cudalib = handle;
+}
+#endif
+
 with_device::with_device(int device) {
   if (device == CPU_DEVICE_ID) {
     restore_device_ = CPU_DEVICE_ID;
   } else {
 #if HAVE_CUDA
-    CUDA_CALL(cudaGetDevice(&restore_device_));
-    CUDA_CALL(cudaSetDevice(device));
+    if (!cudalib) initialize_driver_api();
+    CUdevice cudev;
+    auto err = pfn_cuCtxGetDevice(&cudev);
+    if (err == CUDA_ERROR_NOT_INITIALIZED ||
+        err == CUDA_ERROR_INVALID_CONTEXT) {
+       // If device has never been set on this thread,
+       // restore to supplied device.
+       restore_device_ = device;
+     } else if (err == CUDA_SUCCESS) {
+       restore_device_ = static_cast<int>(cudev);
+     } else {
+       throw std::logic_error("Internal error. cuCtxGetDevice returned error code " +
+                              std::to_string(err));
+     }
+     CUDA_CALL(cudaSetDevice(device));
 #else
     throw std::logic_error("Internal error. Requested device context manager "
                            "with GPU device but not compiled with CUDA.");

diff --git a/horovod/torch/cuda_util.cc b/horovod/torch/cuda_util.cc
@@ -13,12 +13,14 @@
 // limitations under the License.
 // =============================================================================
 
+#include <dlfcn.h>
+#include <stdexcept>
+
 #if HAVE_GPU
+#include "cuda.h"
 #include "cuda_runtime.h"
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDAGuard.h>
-#else
-#include <stdexcept>
 #endif
 
 #include "../common/common.h"
@@ -27,12 +29,46 @@
 namespace horovod {
 namespace torch {
 
+#if HAVE_GPU
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice)(CUdevice* device);
+static void* cudalib = nullptr; // handle returned by dlopen("libcuda.so")
+static PFN_cuCtxGetDevice pfn_cuCtxGetDevice = nullptr; // set by initialize_driver_api()
+
+// Loads libcuda.so and resolves cuCtxGetDevice; throws std::logic_error on
+// failure. cudalib is set last so a failed load is retried, never half-done.
+static void initialize_driver_api() {
+  (void) dlerror(); // Clear previous errors
+  void* handle = dlopen("libcuda.so", RTLD_LAZY);
+  if (!handle)
+    throw std::logic_error("Internal error. Could not dlopen libcuda.so.");
+  pfn_cuCtxGetDevice = (PFN_cuCtxGetDevice) dlsym(handle, "cuCtxGetDevice");
+  if (!pfn_cuCtxGetDevice) {
+    dlclose(handle); // do not leak the handle of an unusable library
+    throw std::logic_error("Internal error. Could not load cuCtxGetDevice.");
+  }
+  cudalib = handle;
+}
+#endif
+
 with_device::with_device(int device) {
   if (device == CPU_DEVICE_ID) {
     restore_device_ = CPU_DEVICE_ID;
   } else {
 #if HAVE_GPU
-    C10_CUDA_CHECK(cudaGetDevice(&restore_device_));
+    if (!cudalib) initialize_driver_api();
+    CUdevice cudev;
+    auto err = pfn_cuCtxGetDevice(&cudev);
+    if (err == CUDA_ERROR_NOT_INITIALIZED ||
+        err == CUDA_ERROR_INVALID_CONTEXT) {
+       // If device has never been set on this thread,
+       // restore to supplied device.
+       restore_device_ = device;
+     } else if (err == CUDA_SUCCESS) {
+       restore_device_ = static_cast<int>(cudev);
+     } else {
+       throw std::logic_error("Internal error. cuCtxGetDevice returned error code " +
+                              std::to_string(err));
+     }
     C10_CUDA_CHECK(cudaSetDevice(device));
 #else
     throw std::logic_error("Internal error. Requested device context manager "