Open
Description
The code and scripts run normally on a single node. When I tried running them on two nodes, I added `ray start --head` to the launch script and set `trainer.nnodes=2`, but the program still hangs right after printing the resolved configuration.
bash
#!/usr/bin/env bash
# Launch GRPO training via verl's main_ppo entry point and tee output to grpo.log.
#
# NOTE(review): with trainer.nnodes=2, running `ray start --head` on THIS node
# alone is not sufficient — the second node must join the cluster first with
#   ray start --address='<head-ip>:<port>'
# (the address is printed by the head node, see the log). Until the worker node
# registers, the driver blocks waiting for 16 GPUs that never appear, which is
# exactly the observed hang after the config dump.
set -x
set -o pipefail  # don't let `| tee` mask the trainer's exit status

MODEL_PATH=/xx
export WANDB_API_KEY=xx

/xx/ray start --head

# Hydra-style overrides; every `key=value \` line is one argument.
# BUGFIX: the model.path line previously read `...=$MODEL_PATH\` (no space
# before the backslash), which line-continues the value straight into the next
# word and corrupts both arguments.
/xx/python -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/xx/train.parquet \
data.val_files=/xx/test.parquet \
data.train_batch_size=8 \
data.val_batch_size=8 \
data.max_prompt_length=400 \
data.max_response_length=2048 \
actor_rollout_ref.model.path="$MODEL_PATH" \
actor_rollout_ref.actor.optim.lr=3e-7 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=64 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.grad_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size=160 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size=160 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
"trainer.logger=['wandb']" \
trainer.project_name='xx' \
trainer.experiment_name='xx' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=2 \
trainer.default_local_dir=/xx/ \
trainer.default_hdfs_dir=null \
trainer.save_freq=20 \
trainer.test_freq=20 \
trainer.total_epochs=10 "$@" 2>&1 | tee grpo.log
logs:
+ /xx/bin/ray start --head
2025-02-10 17:06:24,769 INFO usage_lib.py:467 -- Usage stats collection is enabled by default without user confirmation because this terminal is detected to be non-interactive. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.
2025-02-10 17:06:24,769 INFO scripts.py:865 -- Local node IP: xx.xx.xxx.x
2025-02-10 17:06:26,075 SUCC scripts.py:902 -- --------------------
2025-02-10 17:06:26,075 SUCC scripts.py:903 -- Ray runtime started.
2025-02-10 17:06:26,075 SUCC scripts.py:904 -- --------------------
2025-02-10 17:06:26,075 INFO scripts.py:906 -- Next steps
2025-02-10 17:06:26,075 INFO scripts.py:909 -- To add another node to this Ray cluster, run
2025-02-10 17:06:26,076 INFO scripts.py:912 -- ray start --address='xx.xx.xxx.x:xxxx'
2025-02-10 17:06:26,076 INFO scripts.py:921 -- To connect to this Ray cluster:
2025-02-10 17:06:26,076 INFO scripts.py:923 -- import ray
2025-02-10 17:06:26,076 INFO scripts.py:924 -- ray.init()
2025-02-10 17:06:26,076 INFO scripts.py:955 -- To terminate the Ray runtime, run
2025-02-10 17:06:26,076 INFO scripts.py:956 -- ray stop
2025-02-10 17:06:26,076 INFO scripts.py:959 -- To view the status of the cluster, use
2025-02-10 17:06:26,076 INFO scripts.py:960 -- ray status
+ tee grpo.log
+ /xx/bin/python -m verl.trainer.main_ppo algorithm.adv_estimator=grpo data.train_files=/xx/train.parquet data.val_files=/xx/test.parquet data.train_batch_size=8 data.val_batch_size=8 data.max_prompt_length=400 data.max_response_length=2048 actor_rollout_ref.model.path=/xx/Qwen2.5-7B-Instruct-1M actor_rollout_ref.actor.optim.lr=3e-7 actor_rollout_ref.model.use_remove_padding=True actor_rollout_ref.actor.ppo_mini_batch_size=256 actor_rollout_ref.actor.ppo_micro_batch_size=64 actor_rollout_ref.actor.use_kl_loss=True actor_rollout_ref.actor.kl_loss_coef=0.001 actor_rollout_ref.actor.kl_loss_type=low_var_kl actor_rollout_ref.model.enable_gradient_checkpointing=True actor_rollout_ref.actor.fsdp_config.param_offload=True actor_rollout_ref.actor.fsdp_config.grad_offload=True actor_rollout_ref.actor.fsdp_config.optimizer_offload=True actor_rollout_ref.rollout.log_prob_micro_batch_size=160 actor_rollout_ref.rollout.tensor_model_parallel_size=1 actor_rollout_ref.rollout.name=vllm actor_rollout_ref.rollout.gpu_memory_utilization=0.6 actor_rollout_ref.rollout.n=16 actor_rollout_ref.ref.log_prob_micro_batch_size=160 actor_rollout_ref.ref.fsdp_config.param_offload=True algorithm.kl_ctrl.kl_coef=0.001 trainer.critic_warmup=0 'trainer.logger=[wandb]' trainer.project_name=GRPO_logic_KK trainer.experiment_name=Qwen-7B_3ppl_2nodes trainer.n_gpus_per_node=8 trainer.nnodes=2 trainer.default_local_dir=/xx/ trainer.default_hdfs_dir=null trainer.save_freq=20 trainer.test_freq=20 trainer.total_epochs=10
/xx/lib/python3.9/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
warnings.warn(
2025-02-10 17:06:34,182 INFO worker.py:1654 -- Connecting to existing Ray cluster at address: xx.xx.xxx.x:xxxx...
2025-02-10 17:06:34,193 INFO worker.py:1841 -- Connected to Ray cluster.
[36m(pid=4153)[0m /xx/lib/python3.9/site-packages/_distutils_hack/__init__.py:53: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
[36m(pid=4153)[0m warnings.warn(
[36m(main_task pid=4153)[0m {'actor_rollout_ref': {'actor': {'clip_ratio': 0.2,
[36m(main_task pid=4153)[0m 'entropy_coeff': 0.001,
[36m(main_task pid=4153)[0m 'fsdp_config': {'fsdp_size': -1,
[36m(main_task pid=4153)[0m 'grad_offload': True,
[36m(main_task pid=4153)[0m 'optimizer_offload': True,
[36m(main_task pid=4153)[0m 'param_offload': True,
[36m(main_task pid=4153)[0m 'wrap_policy': {'min_num_params': 0}},
[36m(main_task pid=4153)[0m 'grad_clip': 1.0,
[36m(main_task pid=4153)[0m 'kl_loss_coef': 0.001,
[36m(main_task pid=4153)[0m 'kl_loss_type': 'low_var_kl',
[36m(main_task pid=4153)[0m 'optim': {'lr': 3e-07,
[36m(main_task pid=4153)[0m 'lr_warmup_steps_ratio': 0.0,
[36m(main_task pid=4153)[0m 'min_lr_ratio': None,
[36m(main_task pid=4153)[0m 'total_training_steps': -1,
[36m(main_task pid=4153)[0m 'warmup_style': 'constant'},
[36m(main_task pid=4153)[0m 'ppo_epochs': 1,
[36m(main_task pid=4153)[0m 'ppo_max_token_len_per_gpu': 16384,
[36m(main_task pid=4153)[0m 'ppo_micro_batch_size': 64,
[36m(main_task pid=4153)[0m 'ppo_mini_batch_size': 256,
[36m(main_task pid=4153)[0m 'shuffle': False,
[36m(main_task pid=4153)[0m 'strategy': 'fsdp',
[36m(main_task pid=4153)[0m 'ulysses_sequence_parallel_size': 1,
[36m(main_task pid=4153)[0m 'use_dynamic_bsz': False,
[36m(main_task pid=4153)[0m 'use_kl_loss': True},
[36m(main_task pid=4153)[0m 'hybrid_engine': True,
[36m(main_task pid=4153)[0m 'model': {'enable_gradient_checkpointing': True,
[36m(main_task pid=4153)[0m 'external_lib': None,
[36m(main_task pid=4153)[0m 'override_config': {},
[36m(main_task pid=4153)[0m 'path': '/xx/Qwen2.5-7B-Instruct-1M',
[36m(main_task pid=4153)[0m 'use_remove_padding': True},
[36m(main_task pid=4153)[0m 'ref': {'fsdp_config': {'fsdp_size': -1,
[36m(main_task pid=4153)[0m 'param_offload': True,
[36m(main_task pid=4153)[0m 'wrap_policy': {'min_num_params': 0}},
[36m(main_task pid=4153)[0m 'log_prob_max_token_len_per_gpu': 16384,
[36m(main_task pid=4153)[0m 'log_prob_micro_batch_size': 160,
[36m(main_task pid=4153)[0m 'log_prob_use_dynamic_bsz': False,
[36m(main_task pid=4153)[0m 'ulysses_sequence_parallel_size': 1},
[36m(main_task pid=4153)[0m 'rollout': {'do_sample': True,
[36m(main_task pid=4153)[0m 'dtype': 'bfloat16',
[36m(main_task pid=4153)[0m 'enforce_eager': True,
[36m(main_task pid=4153)[0m 'free_cache_engine': True,
[36m(main_task pid=4153)[0m 'gpu_memory_utilization': 0.6,
[36m(main_task pid=4153)[0m 'ignore_eos': False,
[36m(main_task pid=4153)[0m 'load_format': 'dummy_dtensor',
[36m(main_task pid=4153)[0m 'log_prob_max_token_len_per_gpu': 16384,
[36m(main_task pid=4153)[0m 'log_prob_micro_batch_size': 160,
[36m(main_task pid=4153)[0m 'log_prob_use_dynamic_bsz': False,
[36m(main_task pid=4153)[0m 'max_num_batched_tokens': 8192,
[36m(main_task pid=4153)[0m 'max_num_seqs': 1024,
[36m(main_task pid=4153)[0m 'n': 16,
[36m(main_task pid=4153)[0m 'name': 'vllm',
[36m(main_task pid=4153)[0m 'prompt_length': 400,
[36m(main_task pid=4153)[0m 'response_length': 2048,
[36m(main_task pid=4153)[0m 'temperature': 1.0,
[36m(main_task pid=4153)[0m 'tensor_model_parallel_size': 1,
[36m(main_task pid=4153)[0m 'top_k': -1,
[36m(main_task pid=4153)[0m 'top_p': 1}},
[36m(main_task pid=4153)[0m 'algorithm': {'adv_estimator': 'grpo',
[36m(main_task pid=4153)[0m 'gamma': 1.0,
[36m(main_task pid=4153)[0m 'kl_ctrl': {'kl_coef': 0.001, 'type': 'fixed'},
[36m(main_task pid=4153)[0m 'kl_penalty': 'kl',
[36m(main_task pid=4153)[0m 'lam': 1.0},
[36m(main_task pid=4153)[0m 'critic': {'cliprange_value': 0.5,
[36m(main_task pid=4153)[0m 'forward_max_token_len_per_gpu': 32768,
[36m(main_task pid=4153)[0m 'forward_micro_batch_size': 64,
[36m(main_task pid=4153)[0m 'grad_clip': 1.0,
[36m(main_task pid=4153)[0m 'model': {'enable_gradient_checkpointing': False,
[36m(main_task pid=4153)[0m 'external_lib': None,
[36m(main_task pid=4153)[0m 'fsdp_config': {'fsdp_size': -1,
[36m(main_task pid=4153)[0m 'grad_offload': False,
[36m(main_task pid=4153)[0m 'optimizer_offload': False,
[36m(main_task pid=4153)[0m 'param_offload': False,
[36m(main_task pid=4153)[0m 'wrap_policy': {'min_num_params': 0}},
[36m(main_task pid=4153)[0m 'override_config': {},
[36m(main_task pid=4153)[0m 'path': '~/models/deepseek-llm-7b-chat',
[36m(main_task pid=4153)[0m 'tokenizer_path': '/xx/Qwen2.5-7B-Instruct-1M',
[36m(main_task pid=4153)[0m 'use_remove_padding': False},
[36m(main_task pid=4153)[0m 'optim': {'lr': 1e-05,
[36m(main_task pid=4153)[0m 'lr_warmup_steps_ratio': 0.0,
[36m(main_task pid=4153)[0m 'min_lr_ratio': None,
[36m(main_task pid=4153)[0m 'total_training_steps': -1,
[36m(main_task pid=4153)[0m 'warmup_style': 'constant'},
[36m(main_task pid=4153)[0m 'ppo_epochs': 1,
[36m(main_task pid=4153)[0m 'ppo_max_token_len_per_gpu': 32768,
[36m(main_task pid=4153)[0m 'ppo_micro_batch_size': 64,
[36m(main_task pid=4153)[0m 'ppo_mini_batch_size': 256,
[36m(main_task pid=4153)[0m 'shuffle': False,
[36m(main_task pid=4153)[0m 'strategy': 'fsdp',
[36m(main_task pid=4153)[0m 'ulysses_sequence_parallel_size': 1,
[36m(main_task pid=4153)[0m 'use_dynamic_bsz': False},
[36m(main_task pid=4153)[0m 'data': {'max_prompt_length': 400,
[36m(main_task pid=4153)[0m 'max_response_length': 2048,
[36m(main_task pid=4153)[0m 'prompt_key': 'prompt',
[36m(main_task pid=4153)[0m 'return_raw_chat': False,
[36m(main_task pid=4153)[0m /xx/lib/python3.9/site-packages/vllm/connections.py:8: RuntimeWarning: Failed to read commit hash:
[36m(main_task pid=4153)[0m No module named 'vllm._version'
[36m(main_task pid=4153)[0m from vllm.version import __version__ as VLLM_VERSION
Metadata
Metadata
Assignees
Labels
No labels