NVIDIA-NeMo · parthchadha · May 9, 2025 · May 8, 2025 · May 8, 2025 · May 8, 2025
diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py
@@ -61,12 +61,11 @@ def _get_node_ip_and_free_port():
 
 
 def init_ray(log_dir: Optional[str] = None):
-    """Initialize Ray and connect to an existing Ray cluster or fall back and start a local one. Should be called before any ray API is called.
+    """Initialise Ray.
 
-    This function:
-    1. Gathers common environment variables needed for distributed training
-    2. Sets up the working directory and Python executable
-    3. Connects to an existing Ray cluster
+    Try to attach to an existing Ray cluster (``address="auto"``).
+    If that cluster advertises a resource tag matching this driver's CUDA_VISIBLE_DEVICES, or the Slurm-managed tag, we reuse it.
+    Otherwise, we disconnect from it and start a fresh local cluster.
     """
     if "UV_CACHE_DIR" not in os.environ:
         logging.warning("UV_CACHE_DIR is not set, using default cache dir")
@@ -78,26 +77,76 @@ def init_ray(log_dir: Optional[str] = None):
         "py_executable": PY_EXECUTABLES.SYSTEM,
     }
 
-    # Initialize Ray connection
+    cvd = os.environ.get("CUDA_VISIBLE_DEVICES", "ALL")
+    # sort cvd to ensure consistent tag
+    cvd = ",".join(sorted(cvd.split(",")))
+    cvd_tag = f"nrl_tag_{cvd.replace(',', '_')}"
+    SLURM_MANAGED_TAG = "slurm_managed_ray_cluster"
+
+    # Try to attach to an existing cluster
     try:
-        # Try to connect to an existing cluster first.
         ray.init(
             address="auto",
             log_to_driver=True,
             include_dashboard=False,
             runtime_env=runtime_env,
             _temp_dir=os.path.abspath(log_dir) if log_dir else None,
         )
-        logger.info(f"Connected to existing Ray cluster: {ray.cluster_resources()}")
-    except ConnectionError:
-        # If no existing cluster, start a new one with local resources
-        ray.init(
-            log_to_driver=True,
-            include_dashboard=False,
-            runtime_env=runtime_env,
-            _temp_dir=os.path.abspath(log_dir) if log_dir else None,
+
+        cluster_res = ray.cluster_resources()
+
+        # Reuse if the driver's cvd_tag matches a tag in the cluster.
+        # This is for reusing a previously self-started local cluster.
+        if cvd_tag in cluster_res:
+            logger.info(
+                f"Connected to existing Ray cluster (driver CVD_TAG '{cvd_tag}' matched): {cluster_res}"
+            )
+            return
+
+        # Reuse if it's an externally managed SLURM cluster.
+        if SLURM_MANAGED_TAG in cluster_res:
+            logger.info(
+                f"Connected to existing SLURM-managed Ray cluster (tag '{SLURM_MANAGED_TAG}' found): {cluster_res}"
+            )
+            return
+
+        # If neither reuse condition is met, but we connected to *something*
+        logger.info(
+            f"Existing Ray cluster found ({cluster_res}) but it does not meet reuse criteria. "
+            f"Driver's cvd_tag: '{cvd_tag}'. Expected SLURM tag: '{SLURM_MANAGED_TAG}'. "
+            "Starting a new local cluster..."
         )
-        logger.info(f"Started local cluster with: {ray.cluster_resources()}")
+        ray.shutdown()
+
+        # Clear driver-side package cache so working_dir is re-uploaded
+        import importlib
+
+        import ray._private.runtime_env.packaging as _pkg
+
+        importlib.reload(_pkg)
+
+    except ConnectionError:
+        logger.debug("No existing Ray cluster found, will start a new one.")
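+        # No cluster to attach to: fall through and start a new local cluster below.
+        # NOTE(review): unlike the tag-mismatch path above, no package-cache reload happens here; ray.shutdown() presumably just resets driver state after the failed connect — confirm.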
+        ray.shutdown()
+        pass
+
+    # Start a brand-new local cluster
+    # Reuse `runtime_env` but drop `working_dir` to avoid packaging the whole repo (prevents ray OSError: Failed to download runtime_env file package issue)
+    local_runtime_env = dict(runtime_env)
+    local_runtime_env.pop("working_dir", None)
+
+    ray.init(
+        log_to_driver=True,
+        include_dashboard=False,
+        runtime_env=local_runtime_env,
+        _temp_dir=os.path.abspath(log_dir) if log_dir else None,
+        resources={cvd_tag: 1},
+    )
+    logger.info(
+        f"Started local cluster with tag '{cvd_tag}': {ray.cluster_resources()}"
+    )
 
 
 class ResourceInsufficientError(Exception):

diff --git a/ray.sub b/ray.sub
@@ -183,7 +183,7 @@ monitor-sidecar &
 cat <<EOFINNER | tee /launch-worker.sh
 ray start --address "$ip_head" \
           --disable-usage-stats \
-          --resources="{\"worker_units\": $gpus_per_node}" \
+          --resources="{\"worker_units\": $gpus_per_node, \"slurm_managed_ray_cluster\": 1}" \
           --min-worker-port=${MIN_WORKER_PORT} \
           --max-worker-port=${MAX_WORKER_PORT} \
           \

diff --git a/tests/unit/distributed/test_virtual_cluster.py b/tests/unit/distributed/test_virtual_cluster.py
@@ -112,3 +112,76 @@ def test_env_max_retries_exhausted():
         mock_sleep.assert_any_call(2)  # 2^1
         mock_sleep.assert_any_call(4)  # 2^2
         mock_sleep.assert_any_call(8)  # 2^3
+
+
+def test_ray_reinit_on_cuda_devices_change():
+    """Check that init_ray() shuts Ray down and starts a new tagged cluster when the driver's CUDA_VISIBLE_DEVICES tag is absent from the existing cluster's resources."""
+
+    with (
+        patch("ray.init") as mock_ray_init,
+        patch("ray.shutdown") as mock_ray_shutdown,
+        patch("ray.cluster_resources") as mock_cluster_resources,
+    ):
+        # First call with CUDA_VISIBLE_DEVICES=0: the mocked cluster already advertises "nrl_tag_0", so it is reused
+        mock_cluster_resources.return_value = {"GPU": 1, "nrl_tag_0": 1}
+        with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}, clear=True):
+            from nemo_rl.distributed.virtual_cluster import init_ray
+
+            init_ray()
+
+        assert mock_ray_init.call_count == 1
+        assert mock_ray_shutdown.call_count == 0
+        mock_ray_init.reset_mock()
+        mock_ray_shutdown.reset_mock()
+
+        # Second call with CUDA_VISIBLE_DEVICES=1: the mocked cluster still only advertises "nrl_tag_0"
+        mock_cluster_resources.return_value = {"GPU": 1, "nrl_tag_0": 1}
+        with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "1"}, clear=True):
+            init_ray()
+
+        # Ray should be shut down and reinitialized since the tag doesn't match
+        assert (
+            mock_ray_init.call_count == 2
+        )  # Once for the initial connect attempt, once for the reinit
+        assert mock_ray_shutdown.call_count == 1  # Should shut down after the tag mismatch
+
+        # Verify that the second init call passed the new tag as a custom resource
+        second_init_call = mock_ray_init.call_args_list[1]
+        assert "resources" in second_init_call[1]
+        assert "nrl_tag_1" in second_init_call[1]["resources"]
+
+
+def test_ray_uses_same_cluster_for_permuted_cuda_devices():
+    """Check that init_ray() reuses the existing cluster when CUDA_VISIBLE_DEVICES lists the same devices in a different order."""
+
+    with (
+        patch("ray.init") as mock_ray_init,
+        patch("ray.shutdown") as mock_ray_shutdown,
+        patch("ray.cluster_resources") as mock_cluster_resources,
+    ):
+        # init_ray() sorts the device list before building the tag, so "0,2" and "2,0" both map to this tag
+        expected_tag = "nrl_tag_0_2"
+
+        # First call with CUDA_VISIBLE_DEVICES="0,2": the tag is present, so only the address="auto" connect happens
+        mock_cluster_resources.return_value = {"GPU": 2, expected_tag: 1}
+        with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,2"}, clear=True):
+            from nemo_rl.distributed.virtual_cluster import init_ray
+
+            init_ray()
+
+        assert mock_ray_init.call_count == 1
+        assert mock_ray_init.call_args_list[0][1]["address"] == "auto"
+        assert mock_ray_shutdown.call_count == 0
+        mock_ray_init.reset_mock()
+        mock_ray_shutdown.reset_mock()
+
+        # Second call with CUDA_VISIBLE_DEVICES="2,0": same devices permuted, so the same tag must match again
+        mock_cluster_resources.return_value = {"GPU": 2, expected_tag: 1}
+        with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "2,0"}, clear=True):
+            from nemo_rl.distributed.virtual_cluster import init_ray  # NOTE(review): redundant re-import; init_ray is already bound above
+
+            init_ray()
+
+        assert mock_ray_init.call_count == 1
+        assert mock_ray_init.call_args_list[0][1]["address"] == "auto"
+        assert mock_ray_shutdown.call_count == 0