From 11befee3898cb9ab671bb0c289d7f20e25d3a66d Mon Sep 17 00:00:00 2001
From: zhiqiu
Date: Wed, 30 Oct 2024 17:34:25 +0800
Subject: [PATCH] align default custom black/white list for dygraph and static graph

---
 llm/auto_parallel/llama/README.md | 3 +++
 paddlenlp/trainer/auto_trainer.py | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/llm/auto_parallel/llama/README.md b/llm/auto_parallel/llama/README.md
index 4786269891e1..68f4849195ef 100644
--- a/llm/auto_parallel/llama/README.md
+++ b/llm/auto_parallel/llama/README.md
@@ -5,6 +5,9 @@
 - Unified dynamic-static auto-parallel model definition: [modeling_auto.py](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/llama/modeling_auto.py). It currently focuses on pre-training, covering both dynamic-graph and dynamic-to-static training; support for SFT and other workflows will be added in the future.
 
 ## 2. Pre-training Preparation
+
+Install the latest Paddle (the nightly build is recommended); see the [Paddle official site](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html) for installation instructions.
+
 Download the preprocessed data and extract it into the `./data` directory:
 ```shell
 # download the llama model data
diff --git a/paddlenlp/trainer/auto_trainer.py b/paddlenlp/trainer/auto_trainer.py
index 93dbf5d95f46..dd850c7060c0 100644
--- a/paddlenlp/trainer/auto_trainer.py
+++ b/paddlenlp/trainer/auto_trainer.py
@@ -115,6 +115,7 @@ def _wrap_for_dist_loader(self, train_dataloader):
         return dist_loader
 
     def _wrap_for_auto(self, model, train_dataloader):
+        logger.info("Wrapping model for auto parallel")
         dist_loader = self._wrap_for_dist_loader(train_dataloader)
 
         if ShardingOption.SHARD_OP in self.args.sharding:
@@ -135,6 +136,15 @@ def _wrap_for_auto(self, model, train_dataloader):
         if self.args.to_static:
             unified_strategy = dist.Strategy()
             unified_strategy._from_legacy_strategy(self.args.strategy)
+
+            # same logic as autocast_smart_context_manager() in trainer.py
+            if self.enable_autocast_context_manager:
+                unified_strategy.amp.custom_black_list.extend(["reduce_sum", "c_softmax_with_cross_entropy"])
+                if self.args.fp16_opt_level == "O2":
+                    print("custom_white_list", unified_strategy.amp.custom_white_list, flush=1)
+                    unified_strategy.amp.custom_white_list.extend(["lookup_table", "lookup_table_v2"])
+                    print("custom_white_list", unified_strategy.amp.custom_white_list, flush=1)
+
             # dist.to_static() obtains the input spec information through next(dataloader), but this has side effects
             # on the passed-in dataloader, altering the state of the sampler of the dataloader. In some cases, once
             # the state of the sampler is changed, it cannot be reverted. Therefore, a temporary dataloader is
@@ -156,9 +166,10 @@ def _wrap_amp_model(self, args, model):
                 master_grad=self.args.amp_master_grad,
                 excluded_layers=QuantizationLinear,
             )
+        self.enable_autocast_context_manager = True
+
         if args.to_static:
             return
-        self.enable_autocast_context_manager = True
         self.do_grad_scaling = True if self.args.fp16 else False
         self.scaler = dist.shard_scaler(paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss))
 
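For context, the auto_trainer.py change copies the dygraph-side AMP defaults into the static-graph `dist.Strategy()`, so dynamic-graph and dynamic-to-static training resolve ops to the same precision. The sketch below illustrates, under stated assumptions, the kind of logic `autocast_smart_context_manager()` applies on the dygraph side; the helper name `build_autocast_context` and the exact `paddle.amp.auto_cast` arguments are illustrative assumptions, not a copy of the trainer's code.

```python
# Minimal illustrative sketch (assumed), mirroring the lists the patch adds to
# unified_strategy.amp for the to_static path.
import paddle


def build_autocast_context(args, enable=True):
    # Ops forced to stay in float32 ("black list"), same entries the patch
    # extends onto unified_strategy.amp.custom_black_list.
    custom_black_list = ["reduce_sum", "c_softmax_with_cross_entropy"]
    custom_white_list = []
    if args.fp16_opt_level == "O2":
        # Under O2, embedding lookups may run in low precision ("white list"),
        # matching the lookup_table entries added for the static-graph path.
        custom_white_list = ["lookup_table", "lookup_table_v2"]
    return paddle.amp.auto_cast(
        enable,
        custom_white_list=custom_white_list,
        custom_black_list=custom_black_list,
        level=args.fp16_opt_level,
    )
```

With the patch applied, the `to_static` branch reads the same two lists from `unified_strategy.amp`, which is what aligns the default custom black/white lists between the two modes.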