From d1ae5ace67c77bedfccc3213c531f2171979fe11 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 25 Apr 2025 00:22:07 -0700 Subject: [PATCH 1/2] fix: ray.sub early exit + fixing more ports Signed-off-by: Terry Kong --- ray.sub | 134 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 32 deletions(-) diff --git a/ray.sub b/ray.sub index 047d5427b..4baf4460b 100644 --- a/ray.sub +++ b/ray.sub @@ -18,19 +18,26 @@ CONTAINER=$CONTAINER MOUNTS=$MOUNTS COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up. ######################################################## -# Ray ports -GCS_SERVER_PORT=${GCS_SERVER_PORT:-6379} -DASHBOARD_PORT=${DASHBOARD_PORT:-8265} -OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-8076} -NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-8077} -DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:-52365} -DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-52366} -METRICS_PORT=${METRICS_PORT:-9002} -# On our clusters, the largest port range on an idle worker appeared between 52367-64607 +# Ports for all nodes (should be odd numbers since we place head/worker[0] on the same node) so all workers get the odd ports, but the head will get +1 the ports +NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-53001} +OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-53003} +RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:-53005} +DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007} +METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} + +# Ports for the head node +PORT=${PORT:-6379} +RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} +#REDIS_SHARD_PORTS=${REDIS_SHARD_PORTS:-"random"} ?? 
+DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} +DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger +DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365} + +# On our clusters, the largest port range on an idle worker appeared between 52369-64607 # (not including the other ports set by this script). So this range is chosen to be # somewhere in the middle -MIN_WORKER_PORT=${MIN_WORKER_PORT:-53001} -MAX_WORKER_PORT=${MAX_WORKER_PORT:-53257} +MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001} +MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257} ######################################################## # Defaults to placing uv cache inside the SLURM_SUBMIT_DIR @@ -39,7 +46,8 @@ UV_CACHE_DIR="${UV_CACHE_DIR:-$SLURM_SUBMIT_DIR/uv_cache}" mkdir -p $UV_CACHE_DIR # Create logs directory -LOG_DIR="$SLURM_SUBMIT_DIR/$SLURM_JOB_ID-logs" +BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}/ +LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs" mkdir -p $LOG_DIR COMMON_SRUN_ARGS="" @@ -56,7 +64,7 @@ COMMON_SRUN_ARGS+=" --gres=gpu:8" # Number of GPUs per node gpus_per_node=8 -num_retries=5 +num_retries=3 # Getting the node names and IP addresses in the SLURM allocation nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST") @@ -72,7 +80,7 @@ done head_node=${nodes_array[0]} head_node_ip=${ip_addresses_array[0]} -ip_head=$head_node_ip:$GCS_SERVER_PORT +ip_head=$head_node_ip:$PORT # First we start the head of the ray cluster on one of the physical nodes # Set GPU/CPU resources to 0 to avoid scheduling on the head node @@ -82,30 +90,60 @@ head_cmd=$(cat < Date: Fri, 25 Apr 2025 00:22:53 -0700 Subject: [PATCH 2/2] tiny Signed-off-by: Terry Kong --- ray.sub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 4baf4460b..0fe8cbdcd 100644 --- a/ray.sub +++ b/ray.sub @@ -46,7 +46,7 @@ UV_CACHE_DIR="${UV_CACHE_DIR:-$SLURM_SUBMIT_DIR/uv_cache}" mkdir -p $UV_CACHE_DIR # Create logs directory -BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}/ 
+BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR} LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs" mkdir -p $LOG_DIR