From c5f63dfd5ece64ed594218ed6b39a7ecae90a105 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:11:20 -0400
Subject: [PATCH 01/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 3557e2c83a761..4405c2087742b 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -135,6 +135,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                     'ddp2': self.num_nodes,
                     'ddp_cpu': self.num_processes * self.num_nodes
                 }
+                print(world_size['ddp'])
                 sampler = DistributedSampler(
                     dataloader.dataset,
                     num_replicas=world_size.get(self.distributed_backend, 0),
                     rank=self.proc_rank,
                 )

From 54f510c7324e7c9a9c418b7a5a2dcf6fb2f9bab3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:15:26 -0400
Subject: [PATCH 02/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 4405c2087742b..3b9adb098e22a 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -135,6 +135,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                     'ddp2': self.num_nodes,
                     'ddp_cpu': self.num_processes * self.num_nodes
                 }
+                print('----------------------------')
                 print(world_size['ddp'])
                 sampler = DistributedSampler(
                     dataloader.dataset,

From 47d016191654996bfdbe4c5974ea04422cc96c13 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:20:01 -0400
Subject: [PATCH 03/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 3b9adb098e22a..3631551e6dc9a 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -135,7 +135,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                     'ddp2': self.num_nodes,
                     'ddp_cpu': self.num_processes * self.num_nodes
                 }
-                print('----------------------------')
+                print('-------------***---------------')
                 print(world_size['ddp'])
                 sampler = DistributedSampler(
                     dataloader.dataset,

From b91d072aa5b310b713b32822166eb547697931ce Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:26:11 -0400
Subject: [PATCH 04/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 2f59cb2b2d748..61cb6b9b87ec2 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -221,7 +221,8 @@ def set_distributed_mode(self, distributed_backend):
                 self.use_ddp = True
             elif self.num_gpus > 1:
                 self.use_ddp = True
-            self.num_processes = self.num_gpus
+                self.num_processes = self.num_gpus
+
         elif distributed_backend == "ddp2":
             # do nothing if num_gpus == 0
             if self.num_gpus >= 1:

From 83d709d78d70c2c9ce08cccac561f1d2143faddf Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:29:11 -0400
Subject: [PATCH 05/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 2 +-
 pytorch_lightning/trainer/trainer.py               | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 61cb6b9b87ec2..fad2529bab808 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -221,7 +221,7 @@ def set_distributed_mode(self, distributed_backend):
                 self.use_ddp = True
             elif self.num_gpus > 1:
                 self.use_ddp = True
-                self.num_processes = self.num_gpus
+                self.num_processes = self.num_gpus

         elif distributed_backend == "ddp2":
             # do nothing if num_gpus == 0
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 9b27655675dc9..74b90c137afaa 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -462,6 +462,7 @@ def __init__(

         self.gpus = gpus
         self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
+        print(self.data_parallel_device_ids)
         self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)

         self.root_device = torch.device("cpu")

From ba8d8b5cd20d6ea8a72c7535aa5f74735699d3f4 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:30:01 -0400
Subject: [PATCH 06/37] debug

---
 pytorch_lightning/trainer/trainer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 74b90c137afaa..9b27655675dc9 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -462,7 +462,6 @@ def __init__(

         self.gpus = gpus
         self.data_parallel_device_ids = parse_gpu_ids(self.gpus)
-        print(self.data_parallel_device_ids)
         self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids)

         self.root_device = torch.device("cpu")
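Patches 01-06 probe the replica arithmetic behind auto_add_sampler: the DistributedSampler only shards data correctly when num_processes reflects the number of GPUs, which is what the indentation fix in patches 04-05 restores. That arithmetic in isolation, as a minimal runnable sketch (TrainerState is a hypothetical stand-in for the Trainer fields the real mixin reads, not a Lightning class):

    # Sketch of the replica math the prints above are probing.
    from dataclasses import dataclass

    @dataclass
    class TrainerState:
        num_nodes: int
        num_processes: int
        distributed_backend: str

    def num_replicas(state: TrainerState) -> int:
        world_size = {
            'ddp': state.num_nodes * state.num_processes,
            'ddp2': state.num_nodes,
            'ddp_cpu': state.num_processes * state.num_nodes,
        }
        # pre-patch-20 behaviour: an unmapped backend silently yields 0 replicas
        return world_size.get(state.distributed_backend, 0)

    # gpus=2 with an auto-selected 'ddp' backend must give 2 replicas,
    # which is why patch 04 sets num_processes = num_gpus
    assert num_replicas(TrainerState(1, 2, 'ddp')) == 2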
From c1a5c4afdaa0244418b0453898d53a61fcca5b65 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:36:11 -0400
Subject: [PATCH 07/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index fad2529bab808..b6161064a92e8 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -155,6 +155,7 @@ class TrainerDDPMixin(ABC):
     default_root_dir: str
     use_native_amp: bool
     progress_bar_callback: ...
+    num_processes: int

     @property
     @abstractmethod
@@ -213,6 +214,7 @@ def set_distributed_mode(self, distributed_backend):
             elif self.num_gpus > 1:
                 self.use_dp = True
         elif distributed_backend == "ddp":
+            import pdb; pdb.set_trace()
             if self.num_gpus == 0:
                 if self.num_nodes > 1 or self.num_processes > 1:
                     self.use_ddp = True  # ddp_cpu

From d734ba1b55325b17a801c0c13fd429f506b44b33 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:38:07 -0400
Subject: [PATCH 08/37] debug

---
 pytorch_lightning/trainer/data_loading.py          | 2 --
 pytorch_lightning/trainer/distrib_data_parallel.py | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 3631551e6dc9a..3557e2c83a761 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -135,8 +135,6 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                     'ddp2': self.num_nodes,
                     'ddp_cpu': self.num_processes * self.num_nodes
                 }
-                print('-------------***---------------')
-                print(world_size['ddp'])
                 sampler = DistributedSampler(
                     dataloader.dataset,
                     num_replicas=world_size.get(self.distributed_backend, 0),
diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index b6161064a92e8..4a884554231e5 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -206,7 +206,8 @@ def set_distributed_mode(self, distributed_backend):
                            ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                            ' Setting distributed_backend=ddp for you.')
             self.use_ddp = True
-        elif distributed_backend == "dp":
+
+        if distributed_backend == "dp":
             # do nothing if num_gpus == 0
             if self.num_gpus == 1:
                 self.single_gpu = True
@@ -214,7 +215,6 @@ def set_distributed_mode(self, distributed_backend):
             elif self.num_gpus > 1:
                 self.use_dp = True
         elif distributed_backend == "ddp":
-            import pdb; pdb.set_trace()
             if self.num_gpus == 0:
                 if self.num_nodes > 1 or self.num_processes > 1:
                     self.use_ddp = True  # ddp_cpu
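Patch 08 carries the first real fix of the series: `elif distributed_backend == "dp"` becomes a standalone `if`, so the branch above it (which auto-selects ddp when multiple GPUs are requested without a backend) can fall through into the dispatch instead of short-circuiting it. A condensed sketch of that control flow, not the full method; patches 09, 10 and 26 finish the job by updating both the attribute and the local variable:

    def set_distributed_mode_sketch(distributed_backend, num_gpus):
        # condensed, hypothetical stand-in for TrainerDDPMixin.set_distributed_mode
        use_dp = use_ddp = False
        if distributed_backend is None and num_gpus > 1:
            # warn, then behave as if the user had asked for ddp
            distributed_backend = 'ddp'

        if distributed_backend == "dp":        # was `elif`: skipped after auto-select
            use_dp = num_gpus >= 1
        elif distributed_backend == "ddp":
            use_ddp = True
        return use_dp, use_ddp

    assert set_distributed_mode_sketch(None, 2) == (False, True)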
From b17efc9f9c92217f359d75ad48ce976bdd279d94 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:38:57 -0400
Subject: [PATCH 09/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 4a884554231e5..7ec840f15917c 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -206,6 +206,7 @@ def set_distributed_mode(self, distributed_backend):
                            ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                            ' Setting distributed_backend=ddp for you.')
             self.use_ddp = True
+            self.distributed_backend = 'ddp'

         if distributed_backend == "dp":
             # do nothing if num_gpus == 0

From f5ccb6447381aabb368dadce4d35ca3772b0c918 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:39:10 -0400
Subject: [PATCH 10/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 7ec840f15917c..82598932205dc 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -205,7 +205,6 @@ def set_distributed_mode(self, distributed_backend):
             rank_zero_warn('You requested multiple GPUs but did not specify a backend, e.g.'
                            ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                            ' Setting distributed_backend=ddp for you.')
-            self.use_ddp = True
             self.distributed_backend = 'ddp'

         if distributed_backend == "dp":

From c7da23bc0b0ec41b364edf5e2810d72caff80da9 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:48:35 -0400
Subject: [PATCH 11/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 71f893e080fdf..52defad7f0d8d 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -222,6 +222,7 @@ def on_validation_end(self, trainer, pl_module):

         if self.save_top_k != -1:
             current = metrics.get(self.monitor)
+            print(current, self.monitor)

             if current is None:
                 rank_zero_warn(

From 93bd9da97a9c1afeb2e4b3343c488a6c286b9aa7 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:52:54 -0400
Subject: [PATCH 12/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 52defad7f0d8d..1e8e30d8e5183 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -222,7 +222,11 @@ def on_validation_end(self, trainer, pl_module):

         if self.save_top_k != -1:
             current = metrics.get(self.monitor)
-            print(current, self.monitor)
+
+            if not isinstance(current, 'str'):
+                rank_zero_warn(
+                    f'The metric you returned {current} must be a Torch.Tensor instance'
+                )

             if current is None:
                 rank_zero_warn(

From f181c440c42e5e1341bd4b95ea0d7ee9c3e423dc Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:54:29 -0400
Subject: [PATCH 13/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 1e8e30d8e5183..6557fe18aad38 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -223,7 +223,7 @@ def on_validation_end(self, trainer, pl_module):
         if self.save_top_k != -1:
             current = metrics.get(self.monitor)

-            if not isinstance(current, 'str'):
+            if not isinstance(current, str):
                 rank_zero_warn(
                     f'The metric you returned {current} must be a Torch.Tensor instance'
                 )

From a0ef963d5ac2536f9e80195ce36695296e2f3a6e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:54:50 -0400
Subject: [PATCH 14/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 6557fe18aad38..f60663ee290e6 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -223,7 +223,7 @@ def on_validation_end(self, trainer, pl_module):
         if self.save_top_k != -1:
             current = metrics.get(self.monitor)

-            if not isinstance(current, str):
+            if not isinstance(current, torch.Tensor):
                 rank_zero_warn(
                     f'The metric you returned {current} must be a Torch.Tensor instance'
                 )

From 0493245630f90d7c92720d42cf54a9f8820d596d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:56:21 -0400
Subject: [PATCH 15/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index f60663ee290e6..c86fda81a5d58 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -225,7 +225,8 @@ def on_validation_end(self, trainer, pl_module):

             if not isinstance(current, torch.Tensor):
                 rank_zero_warn(
-                    f'The metric you returned {current} must be a Torch.Tensor instance'
+                    f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved',
+                    f'hint: what is the value of {self.monitor} in validation_*_end()?'
                 )

             if current is None:

From 725231b978c45ac624aa283e4855dcc219f39332 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 15:58:55 -0400
Subject: [PATCH 16/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index c86fda81a5d58..faf0fb9f125e1 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -149,6 +149,10 @@ def check_monitor_top_k(self, current):
             return True

         if not isinstance(current, torch.Tensor):
+            rank_zero_warn(
+                f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly',
+                f'hint: check the value of {self.monitor} in your validation loop'
+            )
             current = torch.tensor(current)

         monitor_op = {
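Patches 11-16 converge on a type guard for the monitored metric: the checkpoint callback compares `current` against its best-k values with tensor operations, so anything that is not a `torch.Tensor` deserves a loud warning. The guard in isolation, as a self-contained sketch with simplified names (not the callback's actual API):

    import torch

    def guard_monitor_value(current, monitor='val_loss'):
        # shape of the isinstance check patches 12-16 iterate on
        if not isinstance(current, torch.Tensor):
            print(f'warning: {monitor}={current!r} should be a torch.Tensor; '
                  f'saving checkpoints may not work correctly')
            current = torch.tensor(current)  # the coercion used in check_monitor_top_k
        return current

    guard_monitor_value(0.42)                # warns, returns tensor(0.4200)
    guard_monitor_value(torch.tensor(0.42))  # silent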
From 7b3bce1ad08e32e3d1c0c58a069ed99cced53008 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:00:25 -0400
Subject: [PATCH 17/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index faf0fb9f125e1..18d6bf7701c4d 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -152,6 +152,7 @@ def check_monitor_top_k(self, current):
             rank_zero_warn(
                 f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly',
                 f'hint: check the value of {self.monitor} in your validation loop'
+                , RuntimeWarning
             )
             current = torch.tensor(current)

@@ -231,6 +232,7 @@ def on_validation_end(self, trainer, pl_module):
                 rank_zero_warn(
                     f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved',
                     f'hint: what is the value of {self.monitor} in validation_*_end()?'
+                    , RuntimeWarning
                 )

             if current is None:

From ad10a5575e679185e705355be3a4ef7080d3567e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:02:06 -0400
Subject: [PATCH 18/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 18d6bf7701c4d..fbe3acbb28da4 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -150,7 +150,7 @@ def check_monitor_top_k(self, current):

         if not isinstance(current, torch.Tensor):
             rank_zero_warn(
-                f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly',
+                f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly'
                 f'hint: check the value of {self.monitor} in your validation loop'
                 , RuntimeWarning
             )
@@ -230,8 +230,8 @@ def on_validation_end(self, trainer, pl_module):

             if not isinstance(current, torch.Tensor):
                 rank_zero_warn(
-                    f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved',
-                    f'hint: what is the value of {self.monitor} in validation_*_end()?'
+                    f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved'
+                    f'hint: what is the value of {self.monitor} in validation_end()?'
                     , RuntimeWarning
                 )
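The churn in patches 15-18 hinges on one Python detail: a comma between two f-strings passes `rank_zero_warn` two separate arguments, whereas plain adjacency concatenates them into a single message. Since `rank_zero_warn` wraps `warnings.warn`, whose second positional parameter must be a warning category, the comma variant hands the hint string to the category slot. In miniature (message text abbreviated):

    import warnings

    # adjacent literals concatenate into one message; the category is explicit
    warnings.warn(
        'The metric must be a torch.Tensor, checkpoint not saved '
        'HINT: check what validation_end() returns', RuntimeWarning
    )

    # with a comma, the hint string would become the second argument -- the
    # category slot -- and warnings.warn rejects it: category must be a
    # Warning subclass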
From 458e724ba522b4640055048ca10bde357a240195 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:03:45 -0400
Subject: [PATCH 19/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index fbe3acbb28da4..668b0e2b24a05 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -151,7 +151,7 @@ def check_monitor_top_k(self, current):
         if not isinstance(current, torch.Tensor):
             rank_zero_warn(
                 f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly'
-                f'hint: check the value of {self.monitor} in your validation loop'
+                f'. HINT: check the value of {self.monitor} in your validation loop'
                 , RuntimeWarning
             )
             current = torch.tensor(current)
@@ -231,7 +231,7 @@ def on_validation_end(self, trainer, pl_module):
             if not isinstance(current, torch.Tensor):
                 rank_zero_warn(
                     f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved'
-                    f'hint: what is the value of {self.monitor} in validation_end()?'
+                    f' HINT: what is the value of {self.monitor} in validation_end()?'
                     , RuntimeWarning
                 )

From f49412b41bc3baf89138a9cd96027989a6ff5655 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:11:12 -0400
Subject: [PATCH 20/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 3557e2c83a761..c08ce485643b9 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -137,7 +137,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                 }
                 sampler = DistributedSampler(
                     dataloader.dataset,
-                    num_replicas=world_size.get(self.distributed_backend, 0),
+                    num_replicas=world_size[self.distributed_backend],
                     rank=self.proc_rank,
                 )

From b593873f5f06fbba2020577f3452ca0adf599422 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:20:01 -0400
Subject: [PATCH 21/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index c08ce485643b9..c62d81085fec7 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -130,6 +130,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                                              num_replicas=hvd.size(),
                                              rank=hvd.rank())
             else:
+                print('----------------- SAMPLER --------------------')
                 world_size = {
                     'ddp': self.num_nodes * self.num_processes,
                     'ddp2': self.num_nodes,

From 807ed4b078af887f3ed7e903b9df38820f69926e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:21:49 -0400
Subject: [PATCH 22/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index c62d81085fec7..08e445763de0f 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -99,6 +99,7 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None:
                 ' in the `DataLoader` init to improve performance.')

     def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
+        print('----------------- SAMPLER --------------------')

         # don't do anything if it's not a dataloader
         # don't manipulate iterable datasets
@@ -130,7 +131,6 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
                                              num_replicas=hvd.size(),
                                              rank=hvd.rank())
             else:
-                print('----------------- SAMPLER --------------------')
                 world_size = {
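Patch 20 is the substantive change in this stretch: `world_size.get(self.distributed_backend, 0)` becomes `world_size[self.distributed_backend]`. With the defaulted lookup, a backend missing from the mapping handed DistributedSampler a `num_replicas` of 0 and the failure surfaced far from its cause; strict indexing fails at the lookup itself. A tiny illustration (the unmapped backend name is hypothetical):

    world_size = {'ddp': 2, 'ddp2': 1, 'ddp_cpu': 2}

    backend = 'some_new_backend'
    print(world_size.get(backend, 0))   # 0 -> a zero-replica sampler, broken much later
    try:
        world_size[backend]
    except KeyError as err:
        print(f'fails fast at the source: {err}')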
From 92b6ca05ea8c217eb729dc3740e22d46aaedf30b Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:22:26 -0400
Subject: [PATCH 23/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 08e445763de0f..003b0e547f1dc 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -112,6 +112,8 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
         if not is_dataloader or is_iterable_ds:
             return dataloader
         need_dist_sampler = (self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu)
+        print(need_dist_sampler)
+        print(self.replace_sampler_ddp)
         if self.replace_sampler_ddp and need_dist_sampler:

             skip_keys = ['sampler', 'batch_sampler', 'dataset_kind']

From 348982a0c4265ec0f22c2e0b1056b60d99868403 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:25:41 -0400
Subject: [PATCH 24/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 003b0e547f1dc..6d0d0698e83ad 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -112,8 +112,7 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
         if not is_dataloader or is_iterable_ds:
             return dataloader
         need_dist_sampler = (self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu)
-        print(need_dist_sampler)
-        print(self.replace_sampler_ddp)
+        print(self.use_ddp, self.use_ddp2)
         if self.replace_sampler_ddp and need_dist_sampler:

             skip_keys = ['sampler', 'batch_sampler', 'dataset_kind']

From 284307307f382a391e467162c6a9024627effce7 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:29:03 -0400
Subject: [PATCH 25/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 82598932205dc..b8d2b43f4aecd 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -207,6 +207,7 @@ def set_distributed_mode(self, distributed_backend):
                            ' Setting distributed_backend=ddp for you.')
             self.distributed_backend = 'ddp'

+        import pdb; pdb.set_trace()
         if distributed_backend == "dp":
             # do nothing if num_gpus == 0
             if self.num_gpus == 1:
@@ -214,6 +215,7 @@ def set_distributed_mode(self, distributed_backend):
                 self.use_dp = True
             elif self.num_gpus > 1:
                 self.use_dp = True
+
         elif distributed_backend == "ddp":
             if self.num_gpus == 0:
                 if self.num_nodes > 1 or self.num_processes > 1:
From 4b40b52ed1d9fcf41a621626f77803f2af924bdb Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:30:01 -0400
Subject: [PATCH 26/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index b8d2b43f4aecd..ef6f5c37cbd2e 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -206,8 +206,8 @@ def set_distributed_mode(self, distributed_backend):
                            ' Trainer(distributed_backend=dp) (or ddp, ddp2).'
                            ' Setting distributed_backend=ddp for you.')
             self.distributed_backend = 'ddp'
+            distributed_backend = 'ddp'

-        import pdb; pdb.set_trace()
         if distributed_backend == "dp":
             # do nothing if num_gpus == 0
             if self.num_gpus == 1:

From 8ae37a8558817b07230c72db039931479767c639 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:31:31 -0400
Subject: [PATCH 27/37] debug

---
 pytorch_lightning/trainer/data_loading.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/data_loading.py b/pytorch_lightning/trainer/data_loading.py
index 6d0d0698e83ad..b84e2d27a667e 100644
--- a/pytorch_lightning/trainer/data_loading.py
+++ b/pytorch_lightning/trainer/data_loading.py
@@ -99,7 +99,6 @@ def _worker_check(self, dataloader: DataLoader, name: str) -> None:
                 ' in the `DataLoader` init to improve performance.')

     def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
-        print('----------------- SAMPLER --------------------')

         # don't do anything if it's not a dataloader
         # don't manipulate iterable datasets
@@ -112,9 +111,8 @@ def auto_add_sampler(self, dataloader: DataLoader, train: bool) -> DataLoader:
         if not is_dataloader or is_iterable_ds:
             return dataloader
         need_dist_sampler = (self.use_ddp or self.use_ddp2 or self.use_horovod or self.use_tpu)
-        print(self.use_ddp, self.use_ddp2)
-        if self.replace_sampler_ddp and need_dist_sampler:

+        if self.replace_sampler_ddp and need_dist_sampler:
             skip_keys = ['sampler', 'batch_sampler', 'dataset_kind']

From 24931c946e9e2403d9456d6649a29a9fccc906f2 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:35:21 -0400
Subject: [PATCH 28/37] debug

---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 9b27655675dc9..0af9ab6dd1aeb 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -121,7 +121,7 @@ def __init__(
             print_nan_grads: bool = False,  # backward compatible, todo: remove in v0.9.0
             weights_summary: Optional[str] = 'full',
             weights_save_path: Optional[str] = None,
-            num_sanity_val_steps: int = 5,
+            num_sanity_val_steps: int = 2,
             truncated_bptt_steps: Optional[int] = None,
             resume_from_checkpoint: Optional[str] = None,
             profiler: Optional[Union[BaseProfiler, bool]] = None,

From f1a969c8c43afbfc7ec2a17f05054984b712c470 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:38:40 -0400
Subject: [PATCH 29/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index ef6f5c37cbd2e..4c8066ba50307 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -304,6 +304,7 @@ def determine_ddp_node_rank(self):
         return int(rank)

     def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
+        import pdb; pdb.set_trace()
        if data_parallel_device_ids is None:
             return

From 8506a161b7fba5c86bcee7ca928f94e7cee200f3 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:40:02 -0400
Subject: [PATCH 30/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 4c8066ba50307..ef6f5c37cbd2e 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -304,7 +304,6 @@ def determine_ddp_node_rank(self):
         return int(rank)

     def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
-        import pdb; pdb.set_trace()
         if data_parallel_device_ids is None:
             return

From 75d0131669c1b926b2fc834ece4959312a683964 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:40:41 -0400
Subject: [PATCH 31/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index ef6f5c37cbd2e..1384abac1bf5b 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -319,7 +319,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
         os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

-        log.debug(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')
+        log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

     def ddp_train(self, process_idx, model):
         """

From 95aead215e90b2bf9ceeae5ef68fa93f52d26ece Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 16:40:57 -0400
Subject: [PATCH 32/37] debug

---
 pytorch_lightning/trainer/distrib_data_parallel.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 1384abac1bf5b..37f9fa9194db9 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -319,6 +319,7 @@ def set_nvidia_flags(self, is_slurm_managing_tasks, data_parallel_device_ids):
         gpu_str = ','.join([str(x) for x in data_parallel_device_ids])
         os.environ["CUDA_VISIBLE_DEVICES"] = gpu_str

+        # don't make this debug... this is good UX
         log.info(f'CUDA_VISIBLE_DEVICES: [{os.environ["CUDA_VISIBLE_DEVICES"]}]')

     def ddp_train(self, process_idx, model):
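Patches 29-32 leave set_nvidia_flags with two deliberate changes: the pdb probe is gone, and the CUDA_VISIBLE_DEVICES line is promoted from log.debug to log.info so users can see which GPUs a run claimed. The effect of the function, as a standalone sketch with logging replaced by print:

    import os

    def set_nvidia_flags(data_parallel_device_ids):
        # sketch of the mixin method; the ids come from parse_gpu_ids(gpus)
        if data_parallel_device_ids is None:
            return
        gpu_str = ','.join(str(x) for x in data_parallel_device_ids)
        os.environ['CUDA_VISIBLE_DEVICES'] = gpu_str
        # visible at default verbosity on purpose ("don't make this debug")
        print(f'CUDA_VISIBLE_DEVICES: [{gpu_str}]')

    set_nvidia_flags([0, 1])   # CUDA_VISIBLE_DEVICES: [0,1]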
From f54d2a1616146c3203919ec61fc99e4fa2da5c6f Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 17:00:41 -0400
Subject: [PATCH 33/37] debug

---
 pytorch_lightning/trainer/distrib_parts.py | 4 +++-
 pytorch_lightning/trainer/trainer.py       | 6 ++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py
index 842496402cf98..e8b45199a8e29 100644
--- a/pytorch_lightning/trainer/distrib_parts.py
+++ b/pytorch_lightning/trainer/distrib_parts.py
@@ -530,7 +530,9 @@ def tpu_train(self, tpu_core_idx, model):
         # continue training routine
         self.run_pretrain_routine(model)

-        self.save_spawn_weights(model)
+        # when training ends on these platforms dump weights to get out of the main process
+        if self.on_colab_kaggle:
+            self.save_spawn_weights(model)

     def dp_train(self, model):

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 0af9ab6dd1aeb..887812bdde2d7 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -526,6 +526,8 @@ def __init__(
         self.amp_level = amp_level
         self.init_amp(use_amp)

+        self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE')
+
         # Callback system
         self.on_init_end()

@@ -811,7 +813,7 @@ def fit(
             # train
             mp.spawn(self.ddp_train, nprocs=self.num_processes, args=(model,))
             # load weights if not interrupted
-            if os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE'):
+            if self.on_colab_kaggle:
                 self.load_spawn_weights(model)
             self.model = model

@@ -830,7 +832,7 @@ def fit(
             log.info(f'training on {self.num_tpu_cores} TPU cores')

             # COLAB_GPU is an env var available by default in Colab environments.
-            start_method = 'fork' if os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') else 'spawn'
+            start_method = 'fork' if self.on_colab_kaggle else 'spawn'

             # track for predict
             self.model = model

From dfb60509109fdef5126dc0b756298c454befded5 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 17:15:46 -0400
Subject: [PATCH 34/37] debug

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 1c340d8ccd612..969a6be5f250a 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -744,7 +744,7 @@ def test_gpu_choice(tmpdir):
     ),
     pytest.param(
         dict(distributed_backend=None, gpus=2),
-        dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1),
+        dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2),
         marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
     ),
     pytest.param(

From dbb5a59fc66d13a05f132e5e6a4ebefbe6485804 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 17:17:37 -0400
Subject: [PATCH 35/37] debug

---
 tests/trainer/test_trainer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 969a6be5f250a..ad03fd32ae9cf 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -747,6 +747,11 @@ def test_gpu_choice(tmpdir):
         dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2),
         marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
     ),
+    pytest.param(
+        dict(distributed_backend=None, gpus=4),
+        dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=4, on_gpu=True, single_gpu=False, num_processes=4),
+        marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
+    ),
     pytest.param(
         dict(distributed_backend="dp", gpus=2),
         dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1),

From 82aaf418c21d1e10942efd83877f4cceefc2df5e Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 17:26:52 -0400
Subject: [PATCH 36/37] debug

---
 tests/trainer/test_trainer.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index ad03fd32ae9cf..969a6be5f250a 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -747,11 +747,6 @@ def test_gpu_choice(tmpdir):
         dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=2),
         marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
     ),
-    pytest.param(
-        dict(distributed_backend=None, gpus=4),
-        dict(use_dp=False, use_ddp=True, use_ddp2=False, num_gpus=4, on_gpu=True, single_gpu=False, num_processes=4),
-        marks=[pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Multiple GPUs needed")]
-    ),
     pytest.param(
         dict(distributed_backend="dp", gpus=2),
         dict(use_dp=True, use_ddp=False, use_ddp2=False, num_gpus=2, on_gpu=True, single_gpu=False, num_processes=1),

From 5c5674705d98fd71f27a3bd43a3b561d3d32ad88 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 13 May 2020 18:21:06 -0400
Subject: [PATCH 37/37] debug

---
 pytorch_lightning/callbacks/model_checkpoint.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
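Patch 33 deduplicates the environment probe into a single self.on_colab_kaggle attribute, and patches 34-36 adjust the test_gpu_choice expectations to the new num_processes behaviour (the gpus=4 case is added and then dropped again, since the CI marker only guarantees two GPUs). The probe and its start-method consumer, sketched standalone; the fork-over-spawn rationale is an assumption drawn from how notebook kernels behave, not stated in the diff:

    import os

    # COLAB_GPU and KAGGLE_URL_BASE are set by those platforms' kernels
    on_colab_kaggle = bool(os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE'))

    # 'spawn' must re-import __main__, which notebook kernels cannot do,
    # so TPU training forks on Colab/Kaggle instead
    start_method = 'fork' if on_colab_kaggle else 'spawn'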
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 668b0e2b24a05..ec403ee6d1686 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -150,9 +150,8 @@ def check_monitor_top_k(self, current):

         if not isinstance(current, torch.Tensor):
             rank_zero_warn(
-                f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly'
-                f'. HINT: check the value of {self.monitor} in your validation loop'
-                , RuntimeWarning
+                f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly. '
+                f'HINT: check the value of {self.monitor} in your validation loop', RuntimeWarning
             )
             current = torch.tensor(current)

@@ -230,9 +229,8 @@ def on_validation_end(self, trainer, pl_module):

             if not isinstance(current, torch.Tensor):
                 rank_zero_warn(
-                    f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved'
-                    f' HINT: what is the value of {self.monitor} in validation_end()?'
-                    , RuntimeWarning
+                    f'The metric you returned {current} must be a Torch.Tensor instance, checkpoint not saved '
+                    f'HINT: what is the value of {self.monitor} in validation_end()?', RuntimeWarning
                 )

             if current is None:
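Net of all 37 patches, the durable changes are small but real: num_processes mirrors num_gpus when ddp is auto-selected (with both the attribute and the local distributed_backend updated to match), the sampler's world_size lookup is strict, num_sanity_val_steps defaults to 2, the Colab/Kaggle probe lives in one place as self.on_colab_kaggle, CUDA_VISIBLE_DEVICES is logged at info level, and a non-tensor monitored metric now emits a RuntimeWarning in ModelCheckpoint. The surviving form of the first guard, excerpted from the final state of check_monitor_top_k (not runnable on its own: self and rank_zero_warn belong to the callback's context):

    if not isinstance(current, torch.Tensor):
        rank_zero_warn(
            f'{current} is supposed to be a torch.Tensor. Saving checkpoint may not work correctly. '
            f'HINT: check the value of {self.monitor} in your validation loop', RuntimeWarning
        )
        current = torch.tensor(current)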