From 5c554a1131040f11e3afe38d65d5130aa6e39255 Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Sun, 8 Mar 2020 19:58:44 +0000 Subject: [PATCH 1/7] SA: for #958: set torch cuda device when finding root --- pytorch_lightning/trainer/distrib_parts.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 0a629a6f21c62..c318df9a2863b 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -640,4 +640,10 @@ def determine_root_gpu_device(gpus): # set root gpu root_gpu = gpus[0] + # set cuda device to root gpu + # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 + # Refer solution: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 + root_device = torch.device("cuda", root_gpu) + torch.cuda.set_device(root_device) + return root_gpu From 2f17b2f25f40a6c81328ede73056319155d16cbb Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Sun, 8 Mar 2020 20:06:05 +0000 Subject: [PATCH 2/7] SA: for #958: removing root gpu hack in trainer/evaluation_loop --- pytorch_lightning/trainer/evaluation_loop.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 1ca088ebbc720..c074af4bee55c 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -421,9 +421,13 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: # single GPU data transfer if self.single_gpu: # for single GPU put inputs on gpu manually - root_gpu = 0 + if isinstance(self.data_parallel_device_ids, list): root_gpu = self.data_parallel_device_ids[0] + else: + raise RuntimeError( + 'Expected `data_parallel_device_ids` as a list, cannot determine root gpu.' + ) batch = self.transfer_batch_to_gpu(batch, root_gpu) args[0] = batch From 6d895055987647c93ff42a54fc81702cad911a97 Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Sun, 8 Mar 2020 20:07:57 +0000 Subject: [PATCH 3/7] SA: setting torch cuda device --- pytorch_lightning/trainer/evaluation_loop.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index c074af4bee55c..071be073ba756 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -424,6 +424,12 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: if isinstance(self.data_parallel_device_ids, list): root_gpu = self.data_parallel_device_ids[0] + + # set cuda device to root gpu + # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 + # Refer solution: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 + root_device = torch.device("cuda", root_gpu) + torch.cuda.set_device(root_device) else: raise RuntimeError( 'Expected `data_parallel_device_ids` as a list, cannot determine root gpu.' From 54e9a5e1c6ca316576a2eecccb2e0c6ca4f685fe Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Sun, 8 Mar 2020 20:27:11 +0000 Subject: [PATCH 4/7] comment line too long --- pytorch_lightning/trainer/evaluation_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 071be073ba756..da0c16d0e4f52 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -427,7 +427,7 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: # set cuda device to root gpu # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 - # Refer solution: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 + # Refer: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 root_device = torch.device("cuda", root_gpu) torch.cuda.set_device(root_device) else: From 83c291dbc8bbc29c8666dc072ca1e1108cfb752c Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Mon, 9 Mar 2020 09:30:57 +0000 Subject: [PATCH 5/7] check if root gpu exists or available --- pytorch_lightning/trainer/distrib_parts.py | 3 ++- pytorch_lightning/trainer/evaluation_loop.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index c318df9a2863b..39cdf88b1f600 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -643,7 +643,8 @@ def determine_root_gpu_device(gpus): # set cuda device to root gpu # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 # Refer solution: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 - root_device = torch.device("cuda", root_gpu) + # root_device = torch.device("cuda", root_gpu) + root_device = (torch.device("cuda", root_gpu) if root_gpu >= 0 else torch.device("cpu")) torch.cuda.set_device(root_device) return root_gpu diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index da0c16d0e4f52..91037753b7637 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -428,7 +428,8 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: # set cuda device to root gpu # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 # Refer: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 - root_device = torch.device("cuda", root_gpu) + root_device = (torch.device("cuda", root_gpu) + if root_gpu >= 0 else torch.device("cpu")) torch.cuda.set_device(root_device) else: raise RuntimeError( From 6cba621496fd90f2f0f90980e9a425eddebc5c18 Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Tue, 31 Mar 2020 16:53:47 +0100 Subject: [PATCH 6/7] Incorporating suggestions on #1094 --- pytorch_lightning/trainer/distrib_parts.py | 7 ------- pytorch_lightning/trainer/evaluation_loop.py | 5 ----- pytorch_lightning/trainer/trainer.py | 3 +++ 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 39cdf88b1f600..0a629a6f21c62 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -640,11 +640,4 @@ def determine_root_gpu_device(gpus): # set root gpu root_gpu = gpus[0] - # set cuda device to root gpu - # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 - # Refer solution: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 - # root_device = torch.device("cuda", root_gpu) - root_device = (torch.device("cuda", root_gpu) if root_gpu >= 0 else torch.device("cpu")) - torch.cuda.set_device(root_device) - return root_gpu diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 91037753b7637..dbf157f0232b3 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -420,14 +420,9 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: # single GPU data transfer if self.single_gpu: - # for single GPU put inputs on gpu manually if isinstance(self.data_parallel_device_ids, list): root_gpu = self.data_parallel_device_ids[0] - - # set cuda device to root gpu - # related to https://github.com/PyTorchLightning/pytorch-lightning/issues/958 - # Refer: https://github.com/pytorch/pytorch/issues/9871#issuecomment-408304190 root_device = (torch.device("cuda", root_gpu) if root_gpu >= 0 else torch.device("cpu")) torch.cuda.set_device(root_device) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 8c5a906fc3573..b5d237dd11e71 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -365,6 +365,9 @@ def __init__( self.gpus = gpus self.data_parallel_device_ids = parse_gpu_ids(self.gpus) self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids) + root_device = (torch.device("cuda", self.root_gpu) + if self.root_gpu >= 0 else torch.device("cpu")) + torch.cuda.set_device(root_device) # tpu state flags self.use_tpu = False From 2c7b80265779f8cd702b10ef1386a1c263ad2681 Mon Sep 17 00:00:00 2001 From: Shubham Agarwal Date: Tue, 31 Mar 2020 17:50:28 +0100 Subject: [PATCH 7/7] since root gpu returns none instead of -1 for cpu --- pytorch_lightning/trainer/evaluation_loop.py | 2 +- pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index dbf157f0232b3..cf17a84ce175e 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -424,7 +424,7 @@ def evaluation_forward(self, model, batch, batch_idx, dataloader_idx, test_mode: if isinstance(self.data_parallel_device_ids, list): root_gpu = self.data_parallel_device_ids[0] root_device = (torch.device("cuda", root_gpu) - if root_gpu >= 0 else torch.device("cpu")) + if root_gpu else torch.device("cpu")) torch.cuda.set_device(root_device) else: raise RuntimeError( diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b5d237dd11e71..3b2219ec75ff4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -366,7 +366,7 @@ def __init__( self.data_parallel_device_ids = parse_gpu_ids(self.gpus) self.root_gpu = determine_root_gpu_device(self.data_parallel_device_ids) root_device = (torch.device("cuda", self.root_gpu) - if self.root_gpu >= 0 else torch.device("cpu")) + if self.root_gpu else torch.device("cpu")) torch.cuda.set_device(root_device) # tpu state flags