From 7eaa80cb1f1e947e7d24345b5875485d443267f6 Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Wed, 24 Jul 2024 15:14:28 +0800
Subject: [PATCH 1/4] fix(pt): use user seed in `DpLoaderSet`

---
 deepmd/pt/entrypoints/main.py | 13 ++++++++++++-
 deepmd/pt/train/training.py   |  5 -----
 deepmd/pt/utils/dataloader.py |  3 ++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index b0edf66878..f3cbbcf7bf 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -109,7 +109,9 @@ def get_trainer(
         assert dist.is_nccl_available()
         dist.init_process_group(backend="nccl")
 
-    def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
+    def prepare_trainer_input_single(
+        model_params_single, data_dict_single, rank=0, seed=None
+    ):
         training_dataset_params = data_dict_single["training_data"]
         validation_dataset_params = data_dict_single.get("validation_data", None)
         validation_systems = (
@@ -139,6 +141,9 @@ def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
                 validation_systems,
                 validation_dataset_params["batch_size"],
                 model_params_single["type_map"],
+                seed=(seed + rank) % (2**32)
+                if seed is not None
+                else None,  # avoid the same batch sequence among workers
             )
             if validation_systems
             else None
@@ -147,6 +152,9 @@ def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
             training_systems,
             training_dataset_params["batch_size"],
             model_params_single["type_map"],
+            seed=(seed + rank) % (2**32)
+            if seed is not None
+            else None,  # avoid the same batch sequence among workers
         )
         return (
             train_data_single,
@@ -155,6 +163,7 @@ def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
         )
 
     rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+    data_seed = config["training"].get("seed", None)
     if not multi_task:
         (
             train_data,
@@ -164,6 +173,7 @@ def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
             config["model"],
             config["training"],
             rank=rank,
+            seed=data_seed,
         )
     else:
         train_data, validation_data, stat_file_path = {}, {}, {}
@@ -176,6 +186,7 @@ def prepare_trainer_input_single(model_params_single, data_dict_single, rank=0):
                 config["model"]["model_dict"][model_key],
                 config["training"]["data_dict"][model_key],
                 rank=rank,
+                seed=data_seed,
             )
 
     trainer = training.Trainer(
diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py
index 389bf28508..a7a97b3f8d 100644
--- a/deepmd/pt/train/training.py
+++ b/deepmd/pt/train/training.py
@@ -274,10 +274,6 @@ def get_lr(lr_params):
         self.opt_type, self.opt_param = get_opt_param(training_params)
 
         # Model
-        dp_random.seed(training_params["seed"])
-        if training_params["seed"] is not None:
-            torch.manual_seed(training_params["seed"])
-
         self.model = get_model_for_wrapper(model_params)
 
         # Loss
@@ -302,7 +298,6 @@ def get_lr(lr_params):
         )
 
         # Data
-        dp_random.seed(training_params["seed"])
         if not self.multi_task:
             self.get_sample_func = single_model_stat(
                 self.model,
diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index 8ebe75868e..6b2070351d 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -80,7 +80,8 @@ def __init__(
         seed=10,
         shuffle=True,
     ):
-        setup_seed(seed)
+        if seed is not None:
+            setup_seed(seed)
         if isinstance(systems, str):
             with h5py.File(systems) as file:
                 systems = [os.path.join(systems, item) for item in file.keys()]
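The core of the change above is a single expression: the user's training seed is offset by the distributed rank, so every device gets a reproducible but distinct batch order, while a seed of None skips seeding entirely. The following is a minimal standalone sketch of that idea; make_rank_seed is a hypothetical helper introduced only for illustration, not a function from the patch or from deepmd-kit.

    import numpy as np

    def make_rank_seed(seed, rank):
        # Mirror the expression used in the patch: offset the user seed by the
        # rank and wrap it into the 32-bit range accepted by most RNGs;
        # None means "do not seed at all".
        return (seed + rank) % (2**32) if seed is not None else None

    # Each rank gets a reproducible but different shuffle order.
    rng_rank0 = np.random.default_rng(make_rank_seed(10, rank=0))
    rng_rank1 = np.random.default_rng(make_rank_seed(10, rank=1))
    print(rng_rank0.permutation(8))  # rank 0's batch order, stable across runs
    print(rng_rank1.permutation(8))  # rank 1's order differs from rank 0's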
From efd20f7b3b5ac85d3956e84cd33706604b9ee12d Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Wed, 24 Jul 2024 15:22:20 +0800
Subject: [PATCH 2/4] Update main.py

---
 deepmd/pt/entrypoints/main.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index f3cbbcf7bf..1da5b1bf35 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -136,14 +136,14 @@ def prepare_trainer_input_single(
             stat_file_path_single = DPPath(stat_file_path_single, "a")
 
         # validation and training data
+        # avoid the same batch sequence among workers
+        rank_seed = (seed + rank) % (2**32) if seed is not None else None
         validation_data_single = (
             DpLoaderSet(
                 validation_systems,
                 validation_dataset_params["batch_size"],
                 model_params_single["type_map"],
-                seed=(seed + rank) % (2**32)
-                if seed is not None
-                else None,  # avoid the same batch sequence among workers
+                seed=rank_seed,
             )
             if validation_systems
             else None
@@ -152,9 +152,7 @@ def prepare_trainer_input_single(
             training_systems,
             training_dataset_params["batch_size"],
             model_params_single["type_map"],
-            seed=(seed + rank) % (2**32)
-            if seed is not None
-            else None,  # avoid the same batch sequence among workers
+            seed=rank_seed,
         )
         return (
             train_data_single,

From 234ca1655b1e895cef14504eb6f86e64f739826e Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Wed, 24 Jul 2024 17:29:49 +0800
Subject: [PATCH 3/4] Update main.py

---
 deepmd/pt/entrypoints/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 1da5b1bf35..416e6ea1f8 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -136,7 +136,7 @@ def prepare_trainer_input_single(
             stat_file_path_single = DPPath(stat_file_path_single, "a")
         # validation and training data
-        # avoid the same batch sequence among workers
+        # avoid the same batch sequence among devices
         rank_seed = (seed + rank) % (2**32) if seed is not None else None
         validation_data_single = (
             DpLoaderSet(
                 validation_systems,

From 8786f20649eb6fc6381450d0f72c83e91aca55bf Mon Sep 17 00:00:00 2001
From: Duo <50307526+iProzd@users.noreply.github.com>
Date: Fri, 26 Jul 2024 00:16:01 +0800
Subject: [PATCH 4/4] Update dataloader.py

---
 deepmd/pt/utils/dataloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py
index 6b2070351d..6a37a4a843 100644
--- a/deepmd/pt/utils/dataloader.py
+++ b/deepmd/pt/utils/dataloader.py
@@ -77,7 +77,7 @@ def __init__(
         systems,
         batch_size,
         type_map,
-        seed=10,
+        seed=None,
         shuffle=True,
     ):
         if seed is not None:
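Taken together, patch 1 makes DpLoaderSet seed its RNG only when a seed is actually given, and patch 4 makes None the default so an unseeded run no longer falls back to the hard-coded seed=10. Below is a minimal sketch of that resulting behaviour; ToyLoaderSet and the simplified setup_seed are stand-ins invented for illustration, not the actual deepmd-kit implementation.

    import torch

    def setup_seed(seed):
        # Stand-in for the real setup_seed: seed torch's global RNG.
        torch.manual_seed(seed)

    class ToyLoaderSet:
        def __init__(self, n_samples, seed=None, shuffle=True):
            # The global RNG is re-seeded only when the caller passes a seed;
            # with seed=None the current RNG state is left untouched.
            if seed is not None:
                setup_seed(seed)
            self.order = (
                torch.randperm(n_samples) if shuffle else torch.arange(n_samples)
            )

    print(ToyLoaderSet(6, seed=42).order)    # same order for a given seed
    print(ToyLoaderSet(6, seed=None).order)  # not re-seeded; may vary run to run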