From e5cbb73c8c38f58a4498586883fd0c4e09f599ac Mon Sep 17 00:00:00 2001 From: wuhuachaocoding Date: Wed, 21 Sep 2022 07:21:49 +0000 Subject: [PATCH 1/2] update moe recompute. --- examples/language_model/moe/dygraph/modeling.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/language_model/moe/dygraph/modeling.py b/examples/language_model/moe/dygraph/modeling.py index 64c1f220ca1d..60ea35d327c2 100644 --- a/examples/language_model/moe/dygraph/modeling.py +++ b/examples/language_model/moe/dygraph/modeling.py @@ -35,8 +35,6 @@ MoeLayer = moe.MoELayer from utils import get_timers -from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _initialize_recompute_setting, _initialize_recompute_hcg - __all__ = [ 'GPTModel', "GPTPretrainedModel", @@ -769,11 +767,6 @@ def __init__(self, self.hidden_size = hidden_size self.vocab_size = vocab_size - if recompute_interval > 0: - _initialize_recompute_hcg(hcg) - _initialize_recompute_setting(recompute_offload, - recompute_partition) - self.embeddings = GPTEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings, From 0b1e8f76a8d32e00f2fe7739c4367d9406155d82 Mon Sep 17 00:00:00 2001 From: sljlp Date: Sat, 24 Sep 2022 13:35:18 +0800 Subject: [PATCH 2/2] fix dist.wait call: pass use_calc_stream instead of unsupported sync_op --- examples/language_model/moe/dygraph/run_moe_pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/language_model/moe/dygraph/run_moe_pretrain.py b/examples/language_model/moe/dygraph/run_moe_pretrain.py index 183a96f39f69..2d0c411a172b 100644 --- a/examples/language_model/moe/dygraph/run_moe_pretrain.py +++ b/examples/language_model/moe/dygraph/run_moe_pretrain.py @@ -494,7 +494,7 @@ def do_train(args): group=sharding_group, sync_op=True) # Multi stream operation will be supported later - dist.wait(tensor=p, group=sharding_group, sync_op=True) + dist.wait(tensor=p, group=sharding_group, use_calc_stream=True) else: initialize_mp_dp_parameters(model, hcg)