From 8995fbc8d7dcf23b768fa8ed7db50ccb37e822ec Mon Sep 17 00:00:00 2001 From: lezwon Date: Sun, 19 Apr 2020 21:04:08 +0530 Subject: [PATCH 01/24] added tpu_id added tpu_id to mixins --- pytorch_lightning/trainer/distrib_parts.py | 5 +++-- pytorch_lightning/trainer/evaluation_loop.py | 4 +++- pytorch_lightning/trainer/trainer.py | 3 +++ pytorch_lightning/trainer/training_loop.py | 3 ++- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index a9f4b6114522e..504ffc6bf7aa6 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -398,6 +398,7 @@ class TrainerDPMixin(ABC): data_parallel_device_ids: ... logger: Union[LightningLoggerBase, bool] progress_bar_callback: ... + tpu_id: int @property @abstractmethod @@ -443,7 +444,7 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None): if device == 'tpu' and XLA_AVAILABLE: # base case: object can be directly moved using `to` if callable(getattr(batch, 'to', None)): - return batch.to(xm.xla_device()) + return batch.to(xm.xla_device(self.tpu_id)) if device == 'gpu': # base case: object can be directly moved using `cuda` or `to` @@ -498,7 +499,7 @@ def single_gpu_train(self, model): def tpu_train(self, tpu_core_idx, model): # put model on tpu - model.to(xm.xla_device()) + model.to(xm.xla_device(self.tpu_id)) # get the appropriate tpu ranks self.tpu_local_core_rank = xm.get_local_ordinal() diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 0320bf35419ea..676344801da8e 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -175,6 +175,8 @@ class TrainerEvaluationLoopMixin(ABC): val_dataloaders: DataLoader use_tpu: bool reload_dataloaders_every_epoch: ... + progress_bar_refresh_rate: ... + tpu_id: int # Callback system on_validation_batch_start: Callable @@ -250,7 +252,7 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ # on TPU we have to wrap it under the ParallelLoader if self.use_tpu: - device = xm.xla_device() + device = xm.xla_device(self.tpu_id) dataloader = xla_pl.ParallelLoader(dataloader, [device]) dataloader = dataloader.per_device_loader(device) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7bfd97bfb83f1..d684548ab36e1 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -90,6 +90,7 @@ def __init__( gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, num_tpu_cores: Optional[int] = None, + tpu_id: Optional[int] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: int = 1, overfit_pct: float = 0.0, @@ -321,6 +322,8 @@ def __init__( self.num_tpu_cores = num_tpu_cores assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8' + self.tpu_id = tpu_id + if num_processes != 1 and distributed_backend != "ddp_cpu": rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.") self.num_processes = num_processes diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 37bac3d99727f..49d661cf92b53 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -231,6 +231,7 @@ class TrainerTrainLoopMixin(ABC): total_batch_idx: int checkpoint_callback: ... 
terminate_on_nan: bool + tpu_id: int # Callback system callbacks: List[Callback] @@ -394,7 +395,7 @@ def run_training_epoch(self): # on TPU we have to wrap it under the ParallelLoader if self.use_tpu: - device = xm.xla_device() + device = xm.xla_device(self.tpu_id) train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device]) train_dataloader = train_dataloader.per_device_loader(device) From bd9e88c6c54a8694b2e507c46e4c33c8cc27bf81 Mon Sep 17 00:00:00 2001 From: lezwon Date: Sun, 26 Apr 2020 13:23:17 +0530 Subject: [PATCH 02/24] train on individual tpu --- pytorch_lightning/trainer/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d684548ab36e1..9e1f31316ddf8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -778,7 +778,10 @@ def fit( self.model = model # train - xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method) + if self.tpu_id is not None: + self.tpu_train(self.tpu_id, model) + else: + xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method) # load weights if not interrupted self.load_spawn_weights(model) From 1daadfa4d0f749a1e1ded3f92c503a0b66b708a7 Mon Sep 17 00:00:00 2001 From: lezwon Date: Sun, 3 May 2020 21:41:55 +0530 Subject: [PATCH 03/24] parallel loader if tpu_id is None --- pytorch_lightning/trainer/evaluation_loop.py | 2 +- pytorch_lightning/trainer/training_loop.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 676344801da8e..0e153134e6ebf 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -251,7 +251,7 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ dl_outputs = [] # on TPU we have to wrap it under the ParallelLoader - if self.use_tpu: + if self.use_tpu and self.tpu_id is None: device = xm.xla_device(self.tpu_id) dataloader = xla_pl.ParallelLoader(dataloader, [device]) dataloader = dataloader.per_device_loader(device) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index 49d661cf92b53..d375202e4b50d 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -394,7 +394,7 @@ def run_training_epoch(self): train_dataloader = self.train_dataloader # on TPU we have to wrap it under the ParallelLoader - if self.use_tpu: + if self.use_tpu and self.tpu_id is None: device = xm.xla_device(self.tpu_id) train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device]) train_dataloader = train_dataloader.per_device_loader(device) From e4d49d0565909048b927f2cdaf33d6f3cc4d0a77 Mon Sep 17 00:00:00 2001 From: lezwon Date: Mon, 4 May 2020 21:33:48 +0530 Subject: [PATCH 04/24] removed progress_bar_refresh_rate --- pytorch_lightning/trainer/evaluation_loop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 0e153134e6ebf..4bd74f2f6c7e4 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -175,7 +175,6 @@ class TrainerEvaluationLoopMixin(ABC): val_dataloaders: DataLoader use_tpu: bool reload_dataloaders_every_epoch: ... - progress_bar_refresh_rate: ... 
tpu_id: int # Callback system From 0ed38cd4ec3b3f8ae1c7107a7ff9868f0c92e725 Mon Sep 17 00:00:00 2001 From: Jirka Date: Tue, 5 May 2020 21:40:05 +0200 Subject: [PATCH 05/24] chlog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5457a6e980318..9a8075240ff3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) +- Allow user to select individual TPU core to train on ([#1729](https://github.com/PyTorchLightning/pytorch-lightning/pull/1729)) + ### Changed - Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) From 725ef5d6561df221791ed2b87bab352fe4a282e0 Mon Sep 17 00:00:00 2001 From: lezwon Date: Wed, 6 May 2020 18:47:07 +0530 Subject: [PATCH 06/24] replaced num_tpu_cores with tpu_cores --- pytorch_lightning/trainer/trainer.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9e1f31316ddf8..ef86763fa68ec 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -89,8 +89,7 @@ def __init__( num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, auto_select_gpus: bool = False, - num_tpu_cores: Optional[int] = None, - tpu_id: Optional[int] = None, + tpu_cores: Optional[Union[List[int], int]] = None, log_gpu_memory: Optional[str] = None, progress_bar_refresh_rate: int = 1, overfit_pct: float = 0.0, @@ -180,7 +179,7 @@ def __init__( GPUs are configured to be in "exclusive mode", such that only one process at a time can access them. - num_tpu_cores: How many TPU cores to train on (1 or 8). + tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] log_gpu_memory: None, 'min_max', 'all'. Might slow performance @@ -318,11 +317,12 @@ def __init__( self.on_gpu = True if (gpus and torch.cuda.is_available()) else False # tpu config - self.on_tpu = num_tpu_cores is not None - self.num_tpu_cores = num_tpu_cores - assert num_tpu_cores in [1, 8, None], 'num_tpu_cores can only be 1 or 8' + self.on_tpu = tpu_cores is not None + self.tpu_cores = tpu_cores + assert tpu_cores in [1, 8, None] or len(tpu_cores) == 1, 'tpu_cores can only be 1, 8 or [<1-8>]' - self.tpu_id = tpu_id + if isinstance(tpu_cores, list): + self.tpu_id = tpu_cores[0] if num_processes != 1 and distributed_backend != "ddp_cpu": rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.") @@ -454,7 +454,7 @@ def __init__( # override dist backend when using tpus if self.on_tpu: self.init_tpu() - self.current_tpu_idx = None + self.current_tpu_idx = self.tpu_id # init flags for SLURM+ddp to work self.proc_rank = 0 @@ -769,7 +769,7 @@ def fit( self.single_gpu_train(model) elif self.use_tpu: # pragma: no-cover - log.info(f'training on {self.num_tpu_cores} TPU cores') + log.info(f'training on {self.tpu_cores} TPU cores') # COLAB_GPU is an env var available by default in Colab environments. 
start_method = 'fork' if os.getenv('COLAB_GPU') or os.getenv('KAGGLE_URL_BASE') else 'spawn' @@ -781,7 +781,7 @@ def fit( if self.tpu_id is not None: self.tpu_train(self.tpu_id, model) else: - xmp.spawn(self.tpu_train, args=(model,), nprocs=self.num_tpu_cores, start_method=start_method) + xmp.spawn(self.tpu_train, args=(model,), nprocs=self.tpu_cores, start_method=start_method) # load weights if not interrupted self.load_spawn_weights(model) From c0a4f9d1247e459c5d0dd62fc1a3e857eda47d26 Mon Sep 17 00:00:00 2001 From: lezwon Date: Wed, 6 May 2020 18:57:31 +0530 Subject: [PATCH 07/24] set tpu_id to None if int --- pytorch_lightning/trainer/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index ef86763fa68ec..dffb254e3b2cb 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -321,8 +321,7 @@ def __init__( self.tpu_cores = tpu_cores assert tpu_cores in [1, 8, None] or len(tpu_cores) == 1, 'tpu_cores can only be 1, 8 or [<1-8>]' - if isinstance(tpu_cores, list): - self.tpu_id = tpu_cores[0] + self.tpu_id = tpu_cores[0] if isinstance(tpu_cores, list) else None if num_processes != 1 and distributed_backend != "ddp_cpu": rank_zero_warn("num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it.") From f25d5161c54b5f1798197bdc2cb07799fe337992 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 7 May 2020 00:54:23 +0530 Subject: [PATCH 08/24] changed num_tpu_cores to tpu_cores in docs --- README.md | 2 +- docs/source/apex.rst | 4 ++-- docs/source/introduction_guide.rst | 8 ++++---- docs/source/multi_gpu.rst | 2 +- docs/source/new-project.rst | 2 +- docs/source/tpu.rst | 4 ++-- pytorch_lightning/trainer/__init__.py | 12 ++++++------ 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index afe61c131e2ca..f65ab8cff7cfd 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,7 @@ trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) Or TPUs ```python -trainer = Trainer(num_tpu_cores=8) +trainer = Trainer(tpu_cores=8) ``` When you're done training, run the test accuracy diff --git a/docs/source/apex.rst b/docs/source/apex.rst index e1c7a1b2c8364..f371b134d6b62 100644 --- a/docs/source/apex.rst +++ b/docs/source/apex.rst @@ -53,7 +53,7 @@ TPU 16-bit .. code-block:: python # DEFAULT - trainer = Trainer(num_tpu_cores=8, precision=32) + trainer = Trainer(tpu_cores=8, precision=32) # turn on 16-bit - trainer = Trainer(num_tpu_cores=8, precision=16) + trainer = Trainer(tpu_cores=8, precision=16) diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index a7a406bbcb68d..d1d2092b349b9 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -589,7 +589,7 @@ Now we can train the LightningModule on a TPU without doing anything else! .. code-block:: python model = LitMNIST() - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) trainer.fit(model) You'll now see the TPU cores booting up. @@ -676,7 +676,7 @@ while checking the validation set. from pytorch_lightning import Trainer model = LitMNIST() - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) trainer.fit(model) You may have noticed the words `Validation sanity check` logged. This is because Lightning runs 5 batches @@ -727,7 +727,7 @@ Once you train your model simply call `.test()`. 
from pytorch_lightning import Trainer model = LitMNIST() - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) trainer.fit(model) # run test set @@ -749,7 +749,7 @@ You can also run the test from a saved lightning model .. code-block:: python model = LitMNIST.load_from_checkpoint(PATH) - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) trainer.test(model) .. note:: Lightning disables gradients, puts model in eval mode and does everything needed for testing. diff --git a/docs/source/multi_gpu.rst b/docs/source/multi_gpu.rst index 55d9fdb5faac2..a33cce831d280 100644 --- a/docs/source/multi_gpu.rst +++ b/docs/source/multi_gpu.rst @@ -122,7 +122,7 @@ Lightning allows multiple ways of training - DistributedDataParallel (`distributed_backend='ddp'`) (multiple-gpus across many machines). - DistributedDataParallel2 (`distributed_backend='ddp2'`) (dp in a machine, ddp across machines). - Horovod (`distributed_backend='horovod'`) (multi-machine, multi-gpu, configured at runtime) -- TPUs (`num_tpu_cores=8|x`) (tpu or TPU pod) +- TPUs (`tpu_cores=8|x`) (tpu or TPU pod) Data Parallel (dp) ^^^^^^^^^^^^^^^^^^ diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index e3f3a892d983f..5181bb7b0db52 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -177,7 +177,7 @@ However, this time you need to specifically call test (this is done so you don't # OPTION 2: # test after loading weights model = LitModel.load_from_checkpoint(PATH) - trainer = Trainer(num_tpu_cores=1) + trainer = Trainer(tpu_cores=1) trainer.test() Again, under the hood, lightning does the following in (pseudocode): diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst index b2fb6e8571e26..e3795a5a9c2e4 100644 --- a/docs/source/tpu.rst +++ b/docs/source/tpu.rst @@ -156,7 +156,7 @@ To use a full TPU pod skip to the TPU pod section. import pytorch_lightning as pl my_model = MyLightningModule() - trainer = pl.Trainer(num_tpu_cores=8) + trainer = pl.Trainer(tpu_cores=8) trainer.fit(my_model) That's it! Your model will train on all 8 TPU cores. @@ -195,7 +195,7 @@ set the 16-bit flag. import pytorch_lightning as pl my_model = MyLightningModule() - trainer = pl.Trainer(num_tpu_cores=8, precision=16) + trainer = pl.Trainer(tpu_cores=8, precision=16) trainer.fit(my_model) Under the hood the xla library will use the `bfloat16 type `_. diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 42f92979d6430..8d0a2dcc1488e 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -550,7 +550,7 @@ def on_train_end(self): Use `num_sanity_val_steps` instead. Will remove 0.8.0. -num_tpu_cores +tpu_cores ^^^^^^^^^^^^^ How many TPU cores to train on (1 or 8). @@ -569,21 +569,21 @@ def on_train_end(self): # your_trainer_file.py # default used by the Trainer (ie: train on CPU) - trainer = Trainer(num_tpu_cores=None) + trainer = Trainer(tpu_cores=None) # int: train on a single core - trainer = Trainer(num_tpu_cores=1) + trainer = Trainer(tpu_cores=1) # int: train on all cores few cores - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) # for 8+ cores must submit via xla script with # a max of 8 cores specified. 
The XLA script # will duplicate script onto each TPU in the POD - trainer = Trainer(num_tpu_cores=8) + trainer = Trainer(tpu_cores=8) # -1: train on all available TPUs - trainer = Trainer(num_tpu_cores=-1) + trainer = Trainer(tpu_cores=-1) To train on more than 8 cores (ie: a POD), submit this script using the xla_dist script. From b22f4853791570999717e7dc161cfc5037b8f2cf Mon Sep 17 00:00:00 2001 From: lezwon Date: Sat, 9 May 2020 13:01:43 +0530 Subject: [PATCH 09/24] updated docs --- README.md | 4 ++ docs/source/introduction_guide.rst | 48 ++---------------------- docs/source/new-project.rst | 2 +- docs/source/tpu.rst | 59 ++++++++---------------------- 4 files changed, 23 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index f65ab8cff7cfd..b8c3432f543d2 100644 --- a/README.md +++ b/README.md @@ -248,7 +248,11 @@ trainer = Trainer(max_epochs=1, gpus=8, num_nodes=32) Or TPUs ```python +# Distributes TPU core training trainer = Trainer(tpu_cores=8) + +# Single TPU core training +trainer = Trainer(tpu_cores=[1]) ``` When you're done training, run the test accuracy diff --git a/docs/source/introduction_guide.rst b/docs/source/introduction_guide.rst index d0050c77a0503..210dec4dbc14a 100644 --- a/docs/source/introduction_guide.rst +++ b/docs/source/introduction_guide.rst @@ -185,7 +185,7 @@ EXACTLY the same as you would a PyTorch Module. Out: - .. code-block:: none + .. code-block:: python torch.Size([1, 10]) @@ -519,50 +519,8 @@ First, change the runtime to TPU (and reinstall lightning). Next, install the required xla library (adds support for PyTorch on TPUs) -.. code-block:: python - - import collections - from datetime import datetime, timedelta - import os - import requests - import threading - - _VersionConfig = collections.namedtuple('_VersionConfig', 'wheels,server') - VERSION = "torch_xla==nightly" #@param ["xrt==1.15.0", "torch_xla==nightly"] - CONFIG = { - 'xrt==1.15.0': _VersionConfig('1.15', '1.15.0'), - 'torch_xla==nightly': _VersionConfig('nightly', 'XRT-dev{}'.format( - (datetime.today() - timedelta(1)).strftime('%Y%m%d'))), - }[VERSION] - DIST_BUCKET = 'gs://tpu-pytorch/wheels' - TORCH_WHEEL = 'torch-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - TORCH_XLA_WHEEL = 'torch_xla-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - TORCHVISION_WHEEL = 'torchvision-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - - # Update TPU XRT version - def update_server_xrt(): - print('Updating server-side XRT to {} ...'.format(CONFIG.server)) - url = 'http://{TPU_ADDRESS}:8475/requestversion/{XRT_VERSION}'.format( - TPU_ADDRESS=os.environ['COLAB_TPU_ADDR'].split(':')[0], - XRT_VERSION=CONFIG.server, - ) - print('Done updating server-side XRT: {}'.format(requests.post(url))) - - update = threading.Thread(target=update_server_xrt) - update.start() - -.. code-block:: - - # Install Colab TPU compat PyTorch/TPU wheels and dependencies - !pip uninstall -y torch torchvision - !gsutil cp "$DIST_BUCKET/$TORCH_WHEEL" . - !gsutil cp "$DIST_BUCKET/$TORCH_XLA_WHEEL" . - !gsutil cp "$DIST_BUCKET/$TORCHVISION_WHEEL" . 
- !pip install "$TORCH_WHEEL" - !pip install "$TORCH_XLA_WHEEL" - !pip install "$TORCHVISION_WHEEL" - !sudo apt-get install libomp5 - update.join() + !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py + !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev In distributed training (multiple GPUs and multiple TPU cores) each GPU or TPU core will run a copy of this program. This means that without taking any care you will download the dataset N times which diff --git a/docs/source/new-project.rst b/docs/source/new-project.rst index 6f42b32f9fc9e..e1efb898da3f5 100644 --- a/docs/source/new-project.rst +++ b/docs/source/new-project.rst @@ -236,7 +236,7 @@ Without changing a SINGLE line of your code, you can now do the following with t # train on TPUs using 16 bit precision with early stopping # using only half the training data and checking validation every quarter of a training epoch trainer = Trainer( - nb_tpu_cores=8, + tpu_cores=8, precision=16, early_stop_checkpoint=True, train_percent_check=0.5, diff --git a/docs/source/tpu.rst b/docs/source/tpu.rst index e3795a5a9c2e4..774af763c78f1 100644 --- a/docs/source/tpu.rst +++ b/docs/source/tpu.rst @@ -1,8 +1,8 @@ TPU support =========== -Lightning supports running on TPUs. At this moment, TPUs are only available -on Google Cloud (GCP). For more information on TPUs +Lightning supports running on TPUs. At this moment, TPUs are available +on Google Cloud (GCP), Google Colab and Kaggle Environments. For more information on TPUs `watch this video `_. --------------- @@ -31,6 +31,7 @@ To access TPUs there are two main ways. 1. Using google colab. 2. Using Google Cloud (GCP). +3. Using Kaggle. --------------- @@ -51,50 +52,10 @@ To get a TPU on colab, follow these steps: 4. Next, insert this code into the first cell and execute. This will install the xla library that interfaces between PyTorch and the TPU. - .. code-block:: python - - import collections - from datetime import datetime, timedelta - import os - import requests - import threading - - _VersionConfig = collections.namedtuple('_VersionConfig', 'wheels,server') - VERSION = "xrt==1.15.0" #@param ["xrt==1.15.0", "torch_xla==nightly"] - CONFIG = { - 'xrt==1.15.0': _VersionConfig('1.15', '1.15.0'), - 'torch_xla==nightly': _VersionConfig('nightly', 'XRT-dev{}'.format( - (datetime.today() - timedelta(1)).strftime('%Y%m%d'))), - }[VERSION] - DIST_BUCKET = 'gs://tpu-pytorch/wheels' - TORCH_WHEEL = 'torch-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - TORCH_XLA_WHEEL = 'torch_xla-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - TORCHVISION_WHEEL = 'torchvision-{}-cp36-cp36m-linux_x86_64.whl'.format(CONFIG.wheels) - - # Update TPU XRT version - def update_server_xrt(): - print('Updating server-side XRT to {} ...'.format(CONFIG.server)) - url = 'http://{TPU_ADDRESS}:8475/requestversion/{XRT_VERSION}'.format( - TPU_ADDRESS=os.environ['COLAB_TPU_ADDR'].split(':')[0], - XRT_VERSION=CONFIG.server, - ) - print('Done updating server-side XRT: {}'.format(requests.post(url))) - - update = threading.Thread(target=update_server_xrt) - update.start() - .. code-block:: - # Install Colab TPU compat PyTorch/TPU wheels and dependencies - !pip uninstall -y torch torchvision - !gsutil cp "$DIST_BUCKET/$TORCH_WHEEL" . - !gsutil cp "$DIST_BUCKET/$TORCH_XLA_WHEEL" . - !gsutil cp "$DIST_BUCKET/$TORCHVISION_WHEEL" . 
- !pip install "$TORCH_WHEEL" - !pip install "$TORCH_XLA_WHEEL" - !pip install "$TORCHVISION_WHEEL" - !sudo apt-get install libomp5 - update.join() + !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py + !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev 5. Once the above is done, install PyTorch Lightning (v 0.7.0+). @@ -163,6 +124,16 @@ That's it! Your model will train on all 8 TPU cores. --------------- +Single TPU core training +---------------------------- +Lightning supports training on a single TPU core. Just pass the TPU core ID [1-8] in a list. + +.. code-block:: python + + trainer = pl.Trainer(tpu_cores=[1]) + +--------------- + Distributed Backend with TPU ---------------------------- The ```distributed_backend``` option used for GPUs does not apply to TPUs. From 0669ad2b301b8acf8fb09973ac0a648565bc03da Mon Sep 17 00:00:00 2001 From: lezwon Date: Sat, 9 May 2020 13:30:10 +0530 Subject: [PATCH 10/24] updated __init__.py removed self.tpu_id for ParallelLoader --- pytorch_lightning/trainer/__init__.py | 6 +++++- pytorch_lightning/trainer/evaluation_loop.py | 2 +- pytorch_lightning/trainer/training_loop.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 8d0a2dcc1488e..41cc73368fae9 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -552,7 +552,8 @@ def on_train_end(self): tpu_cores ^^^^^^^^^^^^^ -How many TPU cores to train on (1 or 8). +- How many TPU cores to train on (1 or 8). +- Which TPU core to train on [1-8] A single TPU v2 or v3 has 8 cores. A TPU pod has up to 2048 cores. A slice of a POD means you get as many cores @@ -574,6 +575,9 @@ def on_train_end(self): # int: train on a single core trainer = Trainer(tpu_cores=1) + # list: train on a single selected core + trainer = Trainer(tpu_cores=[2]) + # int: train on all cores few cores trainer = Trainer(tpu_cores=8) diff --git a/pytorch_lightning/trainer/evaluation_loop.py b/pytorch_lightning/trainer/evaluation_loop.py index 4c72b5bd68715..94383bcd25811 100644 --- a/pytorch_lightning/trainer/evaluation_loop.py +++ b/pytorch_lightning/trainer/evaluation_loop.py @@ -251,7 +251,7 @@ def _evaluate(self, model: LightningModule, dataloaders, max_batches: int, test_ # on TPU we have to wrap it under the ParallelLoader if self.use_tpu and self.tpu_id is None: - device = xm.xla_device(self.tpu_id) + device = xm.xla_device() dataloader = xla_pl.ParallelLoader(dataloader, [device]) dataloader = dataloader.per_device_loader(device) diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py index f58d271566ea1..cbe1186480c28 100644 --- a/pytorch_lightning/trainer/training_loop.py +++ b/pytorch_lightning/trainer/training_loop.py @@ -395,7 +395,7 @@ def run_training_epoch(self): # on TPU we have to wrap it under the ParallelLoader if self.use_tpu and self.tpu_id is None: - device = xm.xla_device(self.tpu_id) + device = xm.xla_device() train_dataloader = xla_pl.ParallelLoader(train_dataloader, [device]) train_dataloader = train_dataloader.per_device_loader(device) From 2253b9f1ac759dfa7a309dc2a3bded12bc701db1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 10 May 2020 16:51:21 +0200 Subject: [PATCH 11/24] Update pytorch_lightning/trainer/__init__.py --- pytorch_lightning/trainer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 41cc73368fae9..5e49a98d31c7d 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -551,7 +551,7 @@ def on_train_end(self): Use `num_sanity_val_steps` instead. Will remove 0.8.0. tpu_cores -^^^^^^^^^^^^^ +^^^^^^^^^ - How many TPU cores to train on (1 or 8). - Which TPU core to train on [1-8] From 67c56889db7ee9c87c9441ebf1cac9e79cb59431 Mon Sep 17 00:00:00 2001 From: Lezwon Castelino Date: Thu, 14 May 2020 00:56:57 +0530 Subject: [PATCH 12/24] check if tpu_cores is a list Co-authored-by: Jirka Borovec --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b5cb33c3866cd..918e6dc39f13f 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -326,7 +326,7 @@ def __init__( # tpu config self.on_tpu = tpu_cores is not None self.tpu_cores = tpu_cores - assert tpu_cores in [1, 8, None] or len(tpu_cores) == 1, 'tpu_cores can only be 1, 8 or [<1-8>]' + assert self.tpu_cores in (1, 8, None) or (isinstance(self.tpu_cores, (list, tuple, set)) and len(self.tpu_cores) == 1), '`tpu_cores` can only be 1, 8 or [<1-8>]' self.tpu_id = tpu_cores[0] if isinstance(tpu_cores, list) else None From ec278d1ec930b8b6523e5d616026a10d646bff8b Mon Sep 17 00:00:00 2001 From: lezwon Date: Sun, 10 May 2020 09:34:50 +0530 Subject: [PATCH 13/24] xla device conditional --- pytorch_lightning/trainer/distrib_parts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index acd3956b12605..d909014701363 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -444,7 +444,8 @@ def __transfer_data_to_device(self, batch, device, gpu_id=None): if device == 'tpu' and XLA_AVAILABLE: # base case: object can be directly moved using `to` if callable(getattr(batch, 'to', None)): - return batch.to(xm.xla_device(self.tpu_id)) + xla_device = xm.xla_device(self.tpu_id) if self.tpu_id is not None else xm.xla_device() + return batch.to(xla_device) if device == 'gpu': # base case: object can be directly moved using `cuda` or `to` @@ -499,7 +500,8 @@ def single_gpu_train(self, model): def tpu_train(self, tpu_core_idx, model): # put model on tpu - model.to(xm.xla_device(self.tpu_id)) + xla_device = xm.xla_device(self.tpu_id) if self.tpu_id is not None else xm.xla_device() + model.to(xla_device) # get the appropriate tpu ranks self.tpu_local_core_rank = xm.get_local_ordinal() From 100071b7553f62cbfa1bec726cf1618686191927 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 00:33:58 +0530 Subject: [PATCH 14/24] num_tpu_cores deprecation --- pytorch_lightning/trainer/__init__.py | 17 ++++++++++++++--- pytorch_lightning/trainer/deprecated_api.py | 7 +++++++ pytorch_lightning/trainer/trainer.py | 12 +++++++++++- tests/test_deprecated.py | 4 ++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py index 5e49a98d31c7d..479bffe088d4c 100644 --- a/pytorch_lightning/trainer/__init__.py +++ b/pytorch_lightning/trainer/__init__.py @@ -550,6 +550,20 @@ def on_train_end(self): Use `num_sanity_val_steps` instead. Will remove 0.8.0. +num_tpu_cores +^^^^^^^^^^^^^ +.. warning:: .. 
deprecated:: 0.7.6 + + Use `tpu_cores` instead. Will remove 0.9.0. + +Example:: + + python -m torch_xla.distributed.xla_dist + --tpu=$TPU_POD_NAME + --conda-env=torch-xla-nightly + --env=XLA_USE_BF16=1 + -- python your_trainer_file.py + tpu_cores ^^^^^^^^^ - How many TPU cores to train on (1 or 8). @@ -586,9 +600,6 @@ def on_train_end(self): # will duplicate script onto each TPU in the POD trainer = Trainer(tpu_cores=8) - # -1: train on all available TPUs - trainer = Trainer(tpu_cores=-1) - To train on more than 8 cores (ie: a POD), submit this script using the xla_dist script. diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 2705c4f160464..30aa2526cd839 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -135,3 +135,10 @@ def training_tqdm_dict(self): rank_zero_warn("`training_tqdm_dict` was renamed to `progress_bar_dict` in v0.7.3" " and this method will be removed in v0.9.0", DeprecationWarning) return self.progress_bar_dict + + @property + def num_tpu_cores(self): + """Back compatibility, will be removed in v0.9.0""" + rank_zero_warn("`num_tpu_cores` is now set by `tpu_cores` in v0.7.6" + " and this method will be removed in v0.9.0", DeprecationWarning) + return self.num_tpu_cores diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 918e6dc39f13f..bec5303bd5d51 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -81,7 +81,7 @@ class Trainer( 'gradient_clip', 'nb_gpu_nodes', 'max_nb_epochs', 'min_nb_epochs', 'add_row_log_interval', 'nb_sanity_val_steps', 'tng_tqdm_dic', ) - DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict') + DEPRECATED_IN_0_9 = ('use_amp', 'show_progress_bar', 'training_tqdm_dict', 'num_tpu_cores') def __init__( self, @@ -95,6 +95,7 @@ def __init__( num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, + num_tpu_cores: Optional[int] = None, # backward compatible, todo: remove in v0.9.0 auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], int]] = None, log_gpu_memory: Optional[str] = None, @@ -188,6 +189,9 @@ def __init__( tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1] + num_tpu_cores: How many TPU cores to train on (1 or 8) + .. warning:: .. deprecated:: 0.7.6. Will remove 0.9.0. + log_gpu_memory: None, 'min_max', 'all'. 
Might slow performance show_progress_bar: @@ -324,6 +328,12 @@ def __init__( self.on_gpu = True if (gpus and torch.cuda.is_available()) else False # tpu config + if num_tpu_cores is not None: + rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" + " and this argument will be removed in v0.9.0", DeprecationWarning) + + if tpu_cores is None: + tpu_cores = num_tpu_cores self.on_tpu = tpu_cores is not None self.tpu_cores = tpu_cores assert self.tpu_cores in (1, 8, None) or (isinstance(self.tpu_cores, (list, tuple, set)) and len(self.tpu_cores) == 1), '`tpu_cores` can only be 1, 8 or [<1-8>]' diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index 437e5f35ab77f..15f33aed5717b 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -96,6 +96,10 @@ def test_tbd_remove_in_v0_9_0_trainer(): trainer = Trainer(progress_bar_refresh_rate=50, show_progress_bar=False) assert getattr(trainer, 'show_progress_bar') + with pytest.deprecated_call(match='v0.9.0'): + trainer = Trainer(num_tpu_cores=8) + assert not getattr(trainer, 'num_tpu_cores') + def test_tbd_remove_in_v0_9_0_module_imports(): _soft_unimport_module("pytorch_lightning.core.decorators") From 8adb0a9baf44793f93f7958f5e2da8c9c6392474 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 00:50:46 +0530 Subject: [PATCH 15/24] removed duplicate warning --- pytorch_lightning/trainer/trainer.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bec5303bd5d51..050713204250d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -327,11 +327,6 @@ def __init__( self.track_grad_norm = track_grad_norm self.on_gpu = True if (gpus and torch.cuda.is_available()) else False - # tpu config - if num_tpu_cores is not None: - rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" - " and this argument will be removed in v0.9.0", DeprecationWarning) - if tpu_cores is None: tpu_cores = num_tpu_cores self.on_tpu = tpu_cores is not None From f779d013fef6c68464c3f13695b5616a26c23a18 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 01:20:49 +0530 Subject: [PATCH 16/24] fixed pep8 error --- pytorch_lightning/trainer/trainer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 75c9074564997..520ba69ed32f4 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -349,7 +349,10 @@ def __init__( tpu_cores = num_tpu_cores self.on_tpu = tpu_cores is not None self.tpu_cores = tpu_cores - assert self.tpu_cores in (1, 8, None) or (isinstance(self.tpu_cores, (list, tuple, set)) and len(self.tpu_cores) == 1), '`tpu_cores` can only be 1, 8 or [<1-8>]' + assert self.tpu_cores in (1, 8, None) or ( + isinstance(self.tpu_cores, (list, tuple, set)) + and len(self.tpu_cores) == 1 + ), '`tpu_cores` can only be 1, 8 or [<1-8>]' self.tpu_id = tpu_cores[0] if isinstance(tpu_cores, list) else None From dafe1745ba896253bd8f6c7f769a5fb70cfae299 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 08:37:36 +0530 Subject: [PATCH 17/24] Revert "removed duplicate warning" This reverts commit 8adb0a9b --- pytorch_lightning/trainer/trainer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 520ba69ed32f4..d5fd30a36a071 100644 --- 
a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -345,6 +345,11 @@ def __init__( self.track_grad_norm = track_grad_norm self.on_gpu = True if (gpus and torch.cuda.is_available()) else False + # tpu config + if num_tpu_cores is not None: + rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" + " and this argument will be removed in v0.9.0", DeprecationWarning) + if tpu_cores is None: tpu_cores = num_tpu_cores self.on_tpu = tpu_cores is not None From 4c6958e7cbe202461b0765ec82f843eabf0c4e3f Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 08:39:00 +0530 Subject: [PATCH 18/24] deprecated api update --- pytorch_lightning/trainer/deprecated_api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 30aa2526cd839..9aba51853ee5c 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -139,6 +139,7 @@ def training_tqdm_dict(self): @property def num_tpu_cores(self): """Back compatibility, will be removed in v0.9.0""" - rank_zero_warn("`num_tpu_cores` is now set by `tpu_cores` in v0.7.6" - " and this method will be removed in v0.9.0", DeprecationWarning) + rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" + " and this argument will be removed in v0.9.0", DeprecationWarning) + return self.num_tpu_cores From 5c0db30b6a6211b2b4d921fc4ed5f5c1ff2afad6 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 08:53:32 +0530 Subject: [PATCH 19/24] fixed recursion error --- pytorch_lightning/trainer/deprecated_api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 9aba51853ee5c..3f5911b972a1f 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -141,5 +141,4 @@ def num_tpu_cores(self): """Back compatibility, will be removed in v0.9.0""" rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" " and this argument will be removed in v0.9.0", DeprecationWarning) - - return self.num_tpu_cores + return self.tpu_cores \ No newline at end of file From c7a9b4e27f2bf7a26a87d73f967af81388cdb02e Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 11:19:33 +0530 Subject: [PATCH 20/24] fixed tests --- tests/test_deprecated.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index cb544d8f2e08b..df541b623e4ad 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -97,8 +97,7 @@ def test_tbd_remove_in_v0_9_0_trainer(): assert getattr(trainer, 'show_progress_bar') with pytest.deprecated_call(match='v0.9.0'): - trainer = Trainer(num_tpu_cores=8) - assert not getattr(trainer, 'num_tpu_cores') + _ = Trainer(num_tpu_cores=8) def test_tbd_remove_in_v0_9_0_module_imports(): From 83e5d99345f666924249328ade4fb19443beab90 Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 11:39:35 +0530 Subject: [PATCH 21/24] fixed flake errors --- pytorch_lightning/trainer/deprecated_api.py | 2 +- pytorch_lightning/trainer/trainer.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index 3f5911b972a1f..a3446dcfa1185 100644 --- 
a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -141,4 +141,4 @@ def num_tpu_cores(self): """Back compatibility, will be removed in v0.9.0""" rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" " and this argument will be removed in v0.9.0", DeprecationWarning) - return self.tpu_cores \ No newline at end of file + return self.tpu_cores diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index d5fd30a36a071..8efc8489d1b3d 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -35,7 +35,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities import rank_zero_warn, parsing - try: from apex import amp except ImportError: @@ -96,7 +95,7 @@ def __init__( num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, - num_tpu_cores: Optional[int] = None, # backward compatible, todo: remove in v0.9.0 + num_tpu_cores: Optional[int] = None, # backward compatible, todo: remove in v0.9.0 auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], int]] = None, log_gpu_memory: Optional[str] = None, @@ -355,8 +354,7 @@ def __init__( self.on_tpu = tpu_cores is not None self.tpu_cores = tpu_cores assert self.tpu_cores in (1, 8, None) or ( - isinstance(self.tpu_cores, (list, tuple, set)) - and len(self.tpu_cores) == 1 + isinstance(self.tpu_cores, (list, tuple, set)) and len(self.tpu_cores) == 1 ), '`tpu_cores` can only be 1, 8 or [<1-8>]' self.tpu_id = tpu_cores[0] if isinstance(tpu_cores, list) else None From 59e0b49b3e03f34db4f20614378e089f8351730c Mon Sep 17 00:00:00 2001 From: lezwon Date: Thu, 14 May 2020 16:27:42 +0530 Subject: [PATCH 22/24] removed current_tpu_index --- pytorch_lightning/trainer/deprecated_api.py | 1 - pytorch_lightning/trainer/distrib_parts.py | 3 --- pytorch_lightning/trainer/trainer.py | 1 - 3 files changed, 5 deletions(-) diff --git a/pytorch_lightning/trainer/deprecated_api.py b/pytorch_lightning/trainer/deprecated_api.py index a3446dcfa1185..5b615ebafaa09 100644 --- a/pytorch_lightning/trainer/deprecated_api.py +++ b/pytorch_lightning/trainer/deprecated_api.py @@ -141,4 +141,3 @@ def num_tpu_cores(self): """Back compatibility, will be removed in v0.9.0""" rank_zero_warn("Argument `num_tpu_cores` is now set by `tpu_cores` since v0.7.6" " and this argument will be removed in v0.9.0", DeprecationWarning) - return self.tpu_cores diff --git a/pytorch_lightning/trainer/distrib_parts.py b/pytorch_lightning/trainer/distrib_parts.py index 4080875e5b3a1..9e350525efaee 100644 --- a/pytorch_lightning/trainer/distrib_parts.py +++ b/pytorch_lightning/trainer/distrib_parts.py @@ -389,7 +389,6 @@ class TrainerDPMixin(ABC): root_gpu: ... amp_level: str precision: ... - current_tpu_idx: ... 
proc_rank: int tpu_local_core_rank: int tpu_global_core_rank: int @@ -513,8 +512,6 @@ def tpu_train(self, tpu_core_idx, model): if self.tpu_global_core_rank != 0 and self.progress_bar_callback is not None: self.progress_bar_callback.disable() - # track current tpu - self.current_tpu_idx = tpu_core_idx self.proc_rank = self.tpu_local_core_rank rank_zero_only.rank = self.proc_rank diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 5164cae7e0b2d..c1f4511b77b6b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -491,7 +491,6 @@ def __init__( # override dist backend when using tpus if self.on_tpu: self.init_tpu() - self.current_tpu_idx = self.tpu_id # init flags for SLURM+ddp to work self.proc_rank = 0 From 940f70ba2dc9a27134195e3fa5637bd3d28b65af Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 17 May 2020 17:06:55 +0200 Subject: [PATCH 23/24] Update CHANGELOG.md --- CHANGELOG.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c142c179b24e8..3a1265fdc3111 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,15 +8,16 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). + ### Changed +- Allow user to select individual TPU core to train on ([#1729](https://github.com/PyTorchLightning/pytorch-lightning/pull/1729)) + ### Deprecated ### Removed -- Allow user to select individual TPU core to train on ([#1729](https://github.com/PyTorchLightning/pytorch-lightning/pull/1729)) - -- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). ### Fixed ## [0.7.6] - 2020-05-16 From ec300ee2a44b1cba04c0a9021a976b877937e019 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 17 May 2020 17:20:11 +0200 Subject: [PATCH 24/24] Update trainer.py --- pytorch_lightning/trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 105480fd825e1..45b0cb0f3d9bd 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -95,7 +95,6 @@ def __init__( num_nodes: int = 1, num_processes: int = 1, gpus: Optional[Union[List[int], str, int]] = None, - num_tpu_cores: Optional[int] = None, # backward compatible, todo: remove in v0.9.0 auto_select_gpus: bool = False, tpu_cores: Optional[Union[List[int], int]] = None, log_gpu_memory: Optional[str] = None, @@ -133,6 +132,7 @@ def __init__( progress_bar_callback: Optional[Union[ProgressBarBase, bool]] = True, terminate_on_nan: bool = False, auto_scale_batch_size: Union[str, bool] = False, + num_tpu_cores: Optional[int] = None, # backward compatible, todo: remove in v0.9.0 amp_level: str = 'O1', # backward compatible, todo: remove in v0.8.0 default_save_path=None, # backward compatible, todo: remove in v0.8.0 gradient_clip=None, # backward compatible, todo: remove in v0.8.0
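Taken together, the user-facing surface of this series is the new `tpu_cores` Trainer argument. A minimal usage sketch, with argument values taken from the docs changes above (`LitModel` stands in for any LightningModule):

.. code-block:: python

    import pytorch_lightning as pl

    model = LitModel()  # placeholder for any LightningModule

    # spawn training across all 8 cores of the TPU
    trainer = pl.Trainer(tpu_cores=8)

    # or train on one specific core by passing its index in a list
    trainer = pl.Trainer(tpu_cores=[1])

    # the old argument is still accepted but emits a DeprecationWarning (removal planned for v0.9.0)
    trainer = pl.Trainer(num_tpu_cores=8)

    trainer.fit(model)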