fix: improve port selection and exiting early from ray.sub by terrykong · Pull Request #272 · NVIDIA-NeMo/RL · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

fix: improve port selection and exiting early from ray.sub #272

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 28, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 102 additions & 32 deletions ray.sub
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,26 @@ CONTAINER=$CONTAINER
MOUNTS=$MOUNTS
COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up.
########################################################
# Ray ports
GCS_SERVER_PORT=${GCS_SERVER_PORT:-6379}
DASHBOARD_PORT=${DASHBOARD_PORT:-8265}
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-8076}
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-8077}
DASHBOARD_AGENT_PORT=${DASHBOARD_AGENT_PORT:-52365}
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-52366}
METRICS_PORT=${METRICS_PORT:-9002}
# On our clusters, the largest port range on an idle worker appeared between 52367-64607
# Ports for all nodes. Defaults should be odd numbers: the head and worker[0] are placed on the same node, so every worker uses these (odd) ports while the head uses each port + 1.
NODE_MANAGER_PORT=${NODE_MANAGER_PORT:-53001}
OBJECT_MANAGER_PORT=${OBJECT_MANAGER_PORT:-53003}
RUNTIME_ENV_AGENT_PORT=${RUNTIME_ENV_AGENT_PORT:-53005}
DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007}
METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009}

# Ports for the head node
PORT=${PORT:-6379}
RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001}
#REDIS_SHARD_PORTS=${REDIS_SHARD_PORTS:-"random"} ??
DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367}
DASHBOARD_PORT=${DASHBOARD_PORT:-8265} # Also used by debugger
DASHBOARD_AGENT_LISTEN_PORT=${DASHBOARD_AGENT_LISTEN_PORT:-52365}

# On our clusters, the largest port range on an idle worker appeared between 52369-64607
# (not including the other ports set by this script). So this range is chosen to be
# somewhere in the middle
MIN_WORKER_PORT=${MIN_WORKER_PORT:-53001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-53257}
MIN_WORKER_PORT=${MIN_WORKER_PORT:-54001}
MAX_WORKER_PORT=${MAX_WORKER_PORT:-54257}
########################################################

# Defaults to placing uv cache inside the SLURM_SUBMIT_DIR
Expand All @@ -39,7 +46,8 @@ UV_CACHE_DIR="${UV_CACHE_DIR:-$SLURM_SUBMIT_DIR/uv_cache}"
mkdir -p $UV_CACHE_DIR

# Create logs directory
LOG_DIR="$SLURM_SUBMIT_DIR/$SLURM_JOB_ID-logs"
BASE_LOG_DIR=${BASE_LOG_DIR:-$SLURM_SUBMIT_DIR}
LOG_DIR="$BASE_LOG_DIR/$SLURM_JOB_ID-logs"
mkdir -p $LOG_DIR

COMMON_SRUN_ARGS=""
Expand All @@ -56,7 +64,7 @@ COMMON_SRUN_ARGS+=" --gres=gpu:8"
# Number of GPUs per node
gpus_per_node=8

num_retries=5
num_retries=3

# Getting the node names and IP addresses in the SLURM allocation
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
Expand All @@ -72,7 +80,7 @@ done
head_node=${nodes_array[0]}
head_node_ip=${ip_addresses_array[0]}

ip_head=$head_node_ip:$GCS_SERVER_PORT
ip_head=$head_node_ip:$PORT

# First we start the head of the ray cluster on one of the physical nodes
# Set GPU/CPU resources to 0 to avoid scheduling on the head node
Expand All @@ -82,30 +90,60 @@ head_cmd=$(cat <<EOF
# Overlapping srun commands will check this file to determine if we can overlap a container command
touch $LOG_DIR/STARTED_RAY_HEAD
env

# Tear down the head step: signal child processes, signal the whole process
# group, then exit with status 1.
# NOTE(review): this definition appears to live inside an unquoted heredoc
# (sibling lines escape their dollar signs), so the PID placeholder passed to
# pkill is presumably expanded when the command string is built, not when the
# function runs. Confirm that this is the PID you intend to target.
exit-dramatically() {
# Use SIGTERM to terminate the children of the given parent PID; a failure here is deliberately ignored
pkill -P $$ || true
# PID 0 addresses every process in the current process group; failure is deliberately ignored
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}
export -f exit-dramatically

# Background watchdog for the head step: once a minute, check whether the
# ENDED marker file exists in the log directory and, if so, call
# exit-dramatically. The retry loops in this script touch that marker after
# they run out of attempts.
monitor-sidecar() {
# Disable command tracing inside the watchdog (presumably to keep the polling loop out of the traced log)
set +x
while true; do
# Poll interval: one check every 60 seconds
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &

cat <<EOFINNER | tee /launch-head.sh
ray start --head \
--disable-usage-stats \
--num-cpus=0 \
--num-gpus=0 \
--node-ip-address="$head_node_ip" \
--port=${GCS_SERVER_PORT} \
--dashboard-port=${DASHBOARD_PORT} \
--object-manager-port=${OBJECT_MANAGER_PORT} \
--node-manager-port=${NODE_MANAGER_PORT} \
--metrics-export-port=${METRICS_PORT} \
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
--dashboard-agent-listen-port=${DASHBOARD_AGENT_PORT} \
--block
--disable-usage-stats \
--num-cpus=0 \
--num-gpus=0 \
--node-ip-address="$head_node_ip" \
--port=${PORT} \
--ray-client-server-port=${RAY_CLIENT_SERVER_PORT} \
--dashboard-grpc-port=${DASHBOARD_GRPC_PORT} \
--dashboard-port=${DASHBOARD_PORT} \
\
--node-manager-port=$((${NODE_MANAGER_PORT} + 1)) \
--object-manager-port=$((${OBJECT_MANAGER_PORT} + 1)) \
--runtime-env-agent-port=$((${RUNTIME_ENV_AGENT_PORT} + 1)) \
--dashboard-agent-grpc-port=$((${DASHBOARD_AGENT_GRPC_PORT} + 1)) \
--dashboard-agent-listen-port=$((${DASHBOARD_AGENT_LISTEN_PORT} + 1)) \
--metrics-export-port=$((${METRICS_EXPORT_PORT} + 1)) \
\
--block
EOFINNER
chmod +x /launch-head.sh

count=0
while true; do
while [[ \$count -lt $num_retries ]]; do
bash /launch-head.sh
count=\$((count+1))
echo "Head node failed \$count times, restarting..."
echo "Head node failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
echo ret_code=\$?
touch $LOG_DIR/ENDED
exit 1
EOF
)
srun $COMMON_SRUN_ARGS --container-name=ray-head --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-head.log bash -x -c "$head_cmd" &
Expand All @@ -120,22 +158,54 @@ for ((i = 0; i < SLURM_JOB_NUM_NODES; i++)); do

worker_cmd=$(cat <<EOF
env

# Tear down the worker step: signal child processes, signal the whole process
# group, then exit with status 1.
# NOTE(review): this definition appears to live inside an unquoted heredoc
# (sibling lines escape their dollar signs), so the PID placeholder passed to
# pkill is presumably expanded when the command string is built, not when the
# function runs. Confirm that this is the PID you intend to target.
exit-dramatically() {
# Use SIGTERM to terminate the children of the given parent PID; a failure here is deliberately ignored
pkill -P $$ || true
# PID 0 addresses every process in the current process group; failure is deliberately ignored
kill -TERM 0 || true
# As a last resort, exit with a non-zero code
exit 1
}

# Background watchdog for the worker step: once a minute, check whether the
# ENDED marker file exists in the log directory and, if so, call
# exit-dramatically. The retry loops in this script touch that marker after
# they run out of attempts.
monitor-sidecar() {
# Disable command tracing inside the watchdog (presumably to keep the polling loop out of the traced log)
set +x
while true; do
# Poll interval: one check every 60 seconds
sleep 60
if [[ -f "$LOG_DIR/ENDED" ]]; then
echo "Detected ENDED file, terminating..."
exit-dramatically
fi
done
}
monitor-sidecar &

cat <<EOFINNER | tee /launch-worker.sh
ray start --address "$ip_head" \
--disable-usage-stats \
--resources="{\"worker_units\": $gpus_per_node}" \
--min-worker-port=${MIN_WORKER_PORT} \
--max-worker-port=${MAX_WORKER_PORT} \
\
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these separated just for readability?

--node-manager-port=${NODE_MANAGER_PORT} \
--object-manager-port=${OBJECT_MANAGER_PORT} \
--runtime-env-agent-port=${RUNTIME_ENV_AGENT_PORT} \
--dashboard-agent-grpc-port=${DASHBOARD_AGENT_GRPC_PORT} \
--dashboard-agent-listen-port=${DASHBOARD_AGENT_LISTEN_PORT} \
--metrics-export-port=${METRICS_EXPORT_PORT} \
\
--block
EOFINNER

count=0
while true; do
while [[ \$count -lt $num_retries ]]; do
bash /launch-worker.sh
count=\$((count+1))
echo "Worker failed \$count times, restarting..."
echo "Worker failed \$count/$num_retries times, restarting in 5 seconds..."
sleep 5
done
echo ret_code=\$?
touch $LOG_DIR/ENDED
exit 1
EOF
)
if [[ $i -eq 0 ]]; then
Expand Down
0