From e667c5389e31a160c536ec98e2f793584ccbd63e Mon Sep 17 00:00:00 2001
From: Cheng <435405393@qq.com>
Date: Wed, 25 Oct 2023 15:30:11 +0800
Subject: [PATCH 1/7] Refactor trainer.py

---
 uer/initialize.py | 17 +++++++++
 uer/opts.py       |  2 +
 uer/trainer.py    | 94 +++++++++++++++++++++++++----------------------
 3 files changed, 69 insertions(+), 44 deletions(-)
 create mode 100644 uer/initialize.py

diff --git a/uer/initialize.py b/uer/initialize.py
new file mode 100644
index 00000000..bb731c46
--- /dev/null
+++ b/uer/initialize.py
@@ -0,0 +1,17 @@
+import torch
+
+
+def initialize(args):
+    if args.dist_train:
+        # Initialize multiprocessing distributed training environment.
+        args.global_rank = args.gpu_ranks[args.local_rank]
+        torch.distributed.init_process_group(backend=args.backend,
+                                             init_method=args.master_ip,
+                                             world_size=args.world_size,
+                                             rank=args.global_rank)
+    elif args.single_gpu:
+        args.global_rank = None
+    else:
+        args.global_rank = None
+
+    return None
\ No newline at end of file
diff --git a/uer/opts.py b/uer/opts.py
index 49f8c402..9000e97b 100644
--- a/uer/opts.py
+++ b/uer/opts.py
@@ -82,6 +82,8 @@ def training_opts(parser):
                         help="Specific steps to print prompt.")
     parser.add_argument("--seed", type=int, default=7,
                         help="Random seed.")
+    parser.add_argument("--local_rank", type=int, required=False)
+    parser.add_argument("--global_rank", type=int, required=False)

     log_opts(parser)

diff --git a/uer/trainer.py b/uer/trainer.py
index e80c379f..0dd72e4c 100644
--- a/uer/trainer.py
+++ b/uer/trainer.py
@@ -13,17 +13,7 @@
 from uer.utils.seed import set_seed


-def train_and_validate(args):
-    set_seed(args.seed)
-
-    # Load vocabulary.
-    if args.data_processor == "mt":
-        args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args, is_src=False)
-        args.tgt_vocab = args.tgt_tokenizer.vocab
-
-    args.tokenizer = str2tokenizer[args.tokenizer](args)
-    args.vocab = args.tokenizer.vocab
-
+def model_init(args):
     # Build model.
     model = build_model(args)

@@ -47,16 +37,54 @@
         for n, p in list(model.named_parameters()):
             if "gamma" not in n and "beta" not in n:
                 p.data.normal_(0, 0.02)
+    return model
+
+
+def optimizer_init(args, model):
+    # Build optimizer.
+    param_optimizer = list(model.named_parameters())
+    no_decay = ["bias", "gamma", "beta"]
+    optimizer_grouped_parameters = [
+        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
+        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
+    ]
+
+    if args.optimizer in ["adamw"]:
+        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
+    else:
+        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate, scale_parameter=False, relative_step=False)
+    if args.scheduler in ["constant"]:
+        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer)
+    elif args.scheduler in ["constant_with_warmup"]:
+        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup)
+    elif args.scheduler in ["tri_stage"]:
+        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup, args.total_steps*args.lr_decay, args.total_steps)
+    else:
+        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup, args.total_steps)
+
+    return custom_optimizer, custom_scheduler
+
+
+def train_and_validate(args):
+    set_seed(args.seed)
+
+    # Load vocabulary.
+    if args.data_processor == "mt":
+        args.tgt_tokenizer = str2tokenizer[args.tgt_tokenizer](args, is_src=False)
+        args.tgt_vocab = args.tgt_tokenizer.vocab
+
+    args.tokenizer = str2tokenizer[args.tokenizer](args)
+    args.vocab = args.tokenizer.vocab

     if args.dist_train:
         # Multiprocessing distributed mode.
-        mp.spawn(worker, nprocs=args.ranks_num, args=(args.gpu_ranks, args, model), daemon=False)
+        mp.spawn(worker, nprocs=args.ranks_num, args=(args), daemon=False)
     elif args.single_gpu:
         # Single GPU mode.
-        worker(args.local_rank, None, args, model)
+        worker(args.local_rank, args)
     else:
         # CPU mode.
-        worker(None, None, args, model)
+        worker(None, args)


 class Trainer(object):
@@ -423,33 +451,16 @@ def worker(local_rank, gpu_ranks, args, model):
     # Get logger
     args.logger = init_logger(args)

-    if args.dist_train:
-        global_rank = gpu_ranks[local_rank]
-    elif args.single_gpu:
-        global_rank = None
-    else:
-        global_rank = None
+    # Env initialize.
+    args.local_rank = local_rank
+    initialize(args)
+    global_rank = args.global_rank

-    # Build optimizer.
-    param_optimizer = list(model.named_parameters())
-    no_decay = ["bias", "gamma", "beta"]
-    optimizer_grouped_parameters = [
-        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
-        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
-    ]
+    # Build model.
+    model = model_init(args)

-    if args.optimizer in ["adamw"]:
-        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
-    else:
-        custom_optimizer = str2optimizer[args.optimizer](optimizer_grouped_parameters, lr=args.learning_rate, scale_parameter=False, relative_step=False)
-    if args.scheduler in ["constant"]:
-        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer)
-    elif args.scheduler in ["constant_with_warmup"]:
-        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup)
-    elif args.scheduler in ["tri_stage"]:
-        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup, args.total_steps*args.lr_decay, args.total_steps)
-    else:
-        custom_scheduler = str2scheduler[args.scheduler](custom_optimizer, args.total_steps*args.warmup, args.total_steps)
+    # Build optimizer.
+    custom_optimizer, custom_scheduler = optimizer_init(args, model)

     if local_rank is not None:
         model.cuda(local_rank)
@@ -457,11 +468,6 @@
     scheduler = custom_scheduler

     if args.dist_train:
-        # Initialize multiprocessing distributed training environment.
-        dist.init_process_group(backend=args.backend,
-                                init_method=args.master_ip,
-                                world_size=args.world_size,
-                                rank=global_rank)
         model = DistributedDataParallel(model, device_ids=[local_rank], find_unused_parameters=True)
         args.logger.info("Worker %d is training ... " % global_rank)
     else:

From ed85ff4f763949d4e81dbcb669f3dc03ca0448e9 Mon Sep 17 00:00:00 2001
From: Cheng <435405393@qq.com>
Date: Wed, 25 Oct 2023 15:33:33 +0800
Subject: [PATCH 2/7] Refactor trainer.py

---
 uer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/uer/trainer.py b/uer/trainer.py
index 0dd72e4c..b607b9cd 100644
--- a/uer/trainer.py
+++ b/uer/trainer.py
@@ -439,7 +439,7 @@ class PrefixlmTrainer(MlmTrainer):
                 "bart": BartTrainer, "prefixlm": PrefixlmTrainer, "cls_mlm": ClsMlmTrainer}


-def worker(local_rank, gpu_ranks, args, model):
+def workerworker(local_rank, args):
     """
     Args:
         local_rank: The id of GPU for single GPU mode;

From d425c3eea01598f31047acb8d1babbfbb5187c36 Mon Sep 17 00:00:00 2001
From: Cheng <435405393@qq.com>
Date: Wed, 25 Oct 2023 15:37:21 +0800
Subject: [PATCH 3/7] Refactor trainer.py

---
 uer/trainer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/uer/trainer.py b/uer/trainer.py
index b607b9cd..359f9932 100644
--- a/uer/trainer.py
+++ b/uer/trainer.py
@@ -3,6 +3,7 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel
+from uer.initialize import initialize
 from uer.model_loader import load_model
 from uer.model_saver import save_model
 from uer.model_builder import build_model
@@ -439,7 +440,7 @@ class PrefixlmTrainer(MlmTrainer):
                 "bart": BartTrainer, "prefixlm": PrefixlmTrainer, "cls_mlm": ClsMlmTrainer}


-def workerworker(local_rank, args):
+def worker(local_rank, args):
     """
     Args:
         local_rank: The id of GPU for single GPU mode;

From 90b09c9c0baae2f8b11cd7ff4411af53cee89cdd Mon Sep 17 00:00:00 2001
From: Cheng <435405393@qq.com>
Date: Wed, 25 Oct 2023 18:09:05 +0800
Subject: [PATCH 4/7] Refactor trainer.py

---
 uer/initialize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/uer/initialize.py b/uer/initialize.py
index bb731c46..2ba12c55 100644
--- a/uer/initialize.py
+++ b/uer/initialize.py
@@ -14,4 +14,4 @@ def initialize(args):
     else:
         args.global_rank = None

-    return None
\ No newline at end of file
+    return None

From 6e166a93d56cfc5e9505d58a1059686cdbdfd80c Mon Sep 17 00:00:00 2001
From: Cheng <435405393@qq.com>
Date: Wed, 25 Oct 2023 18:24:37 +0800
Subject: [PATCH 5/7] Refactor trainer.py

---
 uer/initialize.py |  2 +-
 uer/trainer.py    | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/uer/initialize.py b/uer/initialize.py
index 2ba12c55..26e57785 100644
--- a/uer/initialize.py
+++ b/uer/initialize.py
@@ -1,7 +1,7 @@
 import torch


-def initialize(args):
+def init_env(args):
     if args.dist_train:
         # Initialize multiprocessing distributed training environment.
         args.global_rank = args.gpu_ranks[args.local_rank]
diff --git a/uer/trainer.py b/uer/trainer.py
index 359f9932..ad68dec7 100644
--- a/uer/trainer.py
+++ b/uer/trainer.py
@@ -3,7 +3,7 @@
 import torch.distributed as dist
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel
-from uer.initialize import initialize
+from uer.initialize import init_env
 from uer.model_loader import load_model
 from uer.model_saver import save_model
 from uer.model_builder import build_model
@@ -14,7 +14,7 @@
 from uer.utils.seed import set_seed


-def model_init(args):
+def init_model(args):
     # Build model.
     model = build_model(args)

@@ -41,7 +41,7 @@
     return model


-def optimizer_init(args, model):
+def init_optimizer(args, model):
     # Build optimizer.
     param_optimizer = list(model.named_parameters())
     no_decay = ["bias", "gamma", "beta"]
@@ -454,14 +454,14 @@ def worker(local_rank, args):

     # Env initialize.
     args.local_rank = local_rank
-    initialize(args)
+    init_env(args)
     global_rank = args.global_rank

     # Build model.
-    model = model_init(args)
+    model = init_model(args)

     # Build optimizer.
-    custom_optimizer, custom_scheduler = optimizer_init(args, model)
+    custom_optimizer, custom_scheduler = init_optimizer(args, model)

     if local_rank is not None:
         model.cuda(local_rank)

From 01f70c11bd198b53e906687dc104fc1cde469ee9 Mon Sep 17 00:00:00 2001
From: Cheng hou <59219579+hhou435@users.noreply.github.com>
Date: Thu, 26 Oct 2023 21:21:30 +0800
Subject: [PATCH 6/7] Update trainer.py

---
 uer/trainer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/uer/trainer.py b/uer/trainer.py
index ad68dec7..2aa97b44 100644
--- a/uer/trainer.py
+++ b/uer/trainer.py
@@ -79,13 +79,13 @@ def train_and_validate(args):

     if args.dist_train:
         # Multiprocessing distributed mode.
-        mp.spawn(worker, nprocs=args.ranks_num, args=(args), daemon=False)
+        mp.spawn(worker, nprocs=args.ranks_num, args=(args.gpu_ranks, args), daemon=False)
     elif args.single_gpu:
         # Single GPU mode.
-        worker(args.local_rank, args)
+        worker(args.local_rank, None, args)
     else:
         # CPU mode.
-        worker(None, args)
+        worker(None, None, args)


 class Trainer(object):
@@ -440,7 +440,7 @@ class PrefixlmTrainer(MlmTrainer):
                 "bart": BartTrainer, "prefixlm": PrefixlmTrainer, "cls_mlm": ClsMlmTrainer}


-def worker(local_rank, args):
+def worker(local_rank, gpu_ranks, args):
     """
     Args:
         local_rank: The id of GPU for single GPU mode;

From dec8cf59f4cdd70335539df7f55ae3c32f729a27 Mon Sep 17 00:00:00 2001
From: Cheng hou <59219579+hhou435@users.noreply.github.com>
Date: Mon, 30 Oct 2023 18:40:27 +0800
Subject: [PATCH 7/7] Update opts.py

---
 uer/opts.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/uer/opts.py b/uer/opts.py
index 9000e97b..49f8c402 100644
--- a/uer/opts.py
+++ b/uer/opts.py
@@ -82,8 +82,6 @@ def training_opts(parser):
                         help="Specific steps to print prompt.")
     parser.add_argument("--seed", type=int, default=7,
                         help="Random seed.")
-    parser.add_argument("--local_rank", type=int, required=False)
-    parser.add_argument("--global_rank", type=int, required=False)

     log_opts(parser)