This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

PTL 1.2 Compatibility #15

Merged · 27 commits · Mar 23, 2021
112 changes: 56 additions & 56 deletions .github/workflows/test.yaml
@@ -47,34 +47,34 @@ jobs:
python -m pytest -v --durations=0 -x test_horovod.py
python -m pytest -v --durations=0 -x test_tune.py

# test_linux_ray_master_examples:
# runs-on: ubuntu-latest
# timeout-minutes: 12
# steps:
# - uses: actions/checkout@v2
# - name: Set up Python 3.7
# uses: actions/setup-python@v2
# with:
# python-version: 3.7
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# python -m pip install --upgrade setuptools
# python -m pip install codecov
# python -m pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
# if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
# HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install git+https://github.com/horovod/horovod.git
# - name: Install package
# run: |
# python -m pip install -e .
# - name: Run Examples
# run: |
# pushd examples/
# echo "running ray_ddp_example.py" && python ray_ddp_example.py --smoke-test
# echo "running ray_ddp_example.py with Tune" && python ray_ddp_example.py --smoke-test --tune
# echo "running ray_ddp_tune.py" && python ray_ddp_tune.py --smoke-test
# echo "running ray_horovod_example.py" && python ray_horovod_example.py --smoke-test
# echo "running ray_horovod_example.py with Tune" && python ray_horovod_example.py --smoke-test --tune
test_linux_ray_master_examples:
runs-on: ubuntu-latest
timeout-minutes: 12
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade setuptools
python -m pip install codecov
python -m pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install git+https://github.com/horovod/horovod.git
- name: Install package
run: |
python -m pip install -e .
- name: Run Examples
run: |
pushd examples/
# echo "running ray_ddp_example.py" && python ray_ddp_example.py --smoke-test
# echo "running ray_ddp_example.py with Tune" && python ray_ddp_example.py --smoke-test --tune
# echo "running ray_ddp_tune.py" && python ray_ddp_tune.py --smoke-test
# echo "running ray_horovod_example.py" && python ray_horovod_example.py --smoke-test
# echo "running ray_horovod_example.py with Tune" && python ray_horovod_example.py --smoke-test --tune
Comment on lines +73 to +77
Contributor

do we plan to not run any of these?

Collaborator Author

These all use the MNIST dataset from torchvision which is failing right now due to this error https://discuss.pytorch.org/t/mnist-server-down/114433. After the next torchvision release we can re-enable these tests (and the ones on the Ray repo).
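For context, one workaround used elsewhere while the upstream MNIST host was down was to repoint torchvision's download URLs at a mirror before the examples run. A minimal sketch follows; the mirror URL is an assumption and is not part of this PR:

```python
# Hypothetical workaround: rewrite torchvision's MNIST URLs to a mirror,
# keeping the original filenames and checksums. Verify the mirror before use.
from torchvision import datasets

MIRROR = "https://ossci-datasets.s3.amazonaws.com/mnist/"  # assumed mirror

datasets.MNIST.resources = [
    (MIRROR + url.rpartition("/")[2], md5)
    for url, md5 in datasets.MNIST.resources
]
```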


test_linux_ray_release:
runs-on: ubuntu-latest
@@ -104,31 +104,31 @@ jobs:
python -m pytest -v --durations=0 -x test_tune.py


# test_linux_ray_release_examples:
# runs-on: ubuntu-latest
# timeout-minutes: 12
# steps:
# - uses: actions/checkout@v2
# - name: Set up Python 3.7
# uses: actions/setup-python@v2
# with:
# python-version: 3.7
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# python -m pip install --upgrade setuptools
# python -m pip install codecov
# python -m pip install -U ray
# if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
# HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git
# - name: Install package
# run: |
# python -m pip install -e .
# - name: Run Examples
# run: |
# pushd examples/
# echo "running ray_ddp_example.py" && python ray_ddp_example.py --smoke-test
# echo "running ray_ddp_example.py with Tune" && python ray_ddp_example.py --smoke-test --tune
# echo "running ray_ddp_tune.py" && python ray_ddp_tune.py --smoke-test
# echo "running ray_horovod_example.py" && python ray_horovod_example.py --smoke-test
# echo "running ray_horovod_example.py with Tune" && python ray_horovod_example.py --smoke-test --tune
test_linux_ray_release_examples:
runs-on: ubuntu-latest
timeout-minutes: 12
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install --upgrade setuptools
python -m pip install codecov
python -m pip install -U ray
if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_MXNET=1 pip install -U git+https://github.com/horovod/horovod.git
- name: Install package
run: |
python -m pip install -e .
- name: Run Examples
run: |
pushd examples/
# echo "running ray_ddp_example.py" && python ray_ddp_example.py --smoke-test
# echo "running ray_ddp_example.py with Tune" && python ray_ddp_example.py --smoke-test --tune
# echo "running ray_ddp_tune.py" && python ray_ddp_tune.py --smoke-test
# echo "running ray_horovod_example.py" && python ray_horovod_example.py --smoke-test
# echo "running ray_horovod_example.py with Tune" && python ray_horovod_example.py --smoke-test --tune
Comment on lines +130 to +134
Contributor

do we plan to not run any?

Collaborator Author

See above comment.

34 changes: 17 additions & 17 deletions README.md
@@ -1,9 +1,9 @@
# Distributed PyTorch Lightning Training on Ray
This library adds new PyTorch Lightning accelerators for distributed training using the Ray distributed computing framework.
This library adds new PyTorch Lightning plugins for distributed training using the Ray distributed computing framework.

These PyTorch Lightning Accelerators on Ray enable quick and easy parallel training while still leveraging all the benefits of PyTorch Lightning and using your desired training protocol, either [PyTorch Distributed Data Parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) or [Horovod](https://github.com/horovod/horovod).
These PyTorch Lightning Plugins on Ray enable quick and easy parallel training while still leveraging all the benefits of PyTorch Lightning and using your desired training protocol, either [PyTorch Distributed Data Parallel](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) or [Horovod](https://github.com/horovod/horovod).

Once you add your accelerator to the PyTorch Lightning Trainer, you can parallelize training to all the cores in your laptop, or across a massive multi-node, multi-GPU cluster with no additional code changes.
Once you add your plugin to the PyTorch Lightning Trainer, you can parallelize training to all the cores in your laptop, or across a massive multi-node, multi-GPU cluster with no additional code changes.

This library also comes with an integration with [Ray Tune](tune.io) for distributed hyperparameter tuning experiments.

@@ -12,45 +12,45 @@ You can install the master branch of ray_lightning_accelerators like so:

`pip install git+https://github.com/ray-project/ray_lightning_accelerators#ray_lightning`

## PyTorch Distributed Data Parallel Accelerator on Ray
The `RayAccelerator` provides Distributed Data Parallel training on a Ray cluster. PyTorch DDP is used as the distributed training protocol, and Ray is used to launch and manage the training worker processes.
## PyTorch Distributed Data Parallel Plugin on Ray
The `RayPlugin` provides Distributed Data Parallel training on a Ray cluster. PyTorch DDP is used as the distributed training protocol, and Ray is used to launch and manage the training worker processes.

Here is a simplified example:

```python
import pytorch_lightning as pl
from ray_lightning import RayAccelerator
from ray_lightning import RayPlugin

# Create your PyTorch Lightning model here.
ptl_model = MNISTClassifier(...)
accelerator = RayAccelerator(num_workers=4, cpus_per_worker=1, use_gpu=True)
plugin = RayPlugin(num_workers=4, cpus_per_worker=1, use_gpu=True)

# If using GPUs, set the ``gpus`` arg to a value > 0.
# The actual number of GPUs is determined by ``num_workers``.
trainer = pl.Trainer(..., gpus=1, accelerator=accelerator)
trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
trainer.fit(ptl_model)
```

Because Ray is used to launch processes, instead of the same script being called multiple times, you CAN use this accelerator even in cases when you cannot use the standard `DDPAccelerator` such as
Because Ray is used to launch processes, instead of the same script being called multiple times, you CAN use this plugin even in cases when you cannot use the standard `DDPPlugin` such as
- Jupyter Notebooks, Google Colab, Kaggle
- Calling `fit` or `test` multiple times in the same script
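
To illustrate the second case, here is a minimal sketch that reuses `ptl_model` and `plugin` from the snippet above (the `max_epochs` value is arbitrary): the same `Trainer` can call `fit` and then `test` in a single script or notebook session, because Ray launches the worker processes rather than re-invoking the script.

```python
# Hypothetical follow-on to the snippet above: with RayPlugin, fit and test
# can run back to back in the same process.
trainer = pl.Trainer(max_epochs=4, plugins=[plugin])
trainer.fit(ptl_model)
trainer.test(ptl_model)
```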

## Horovod Accelerator on Ray
Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayAccelerator` instead.
## Horovod Plugin on Ray
Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayPlugin` instead.

```python
import pytorch_lightning as pl
from ray.util.lightning_accelerators import HorovodRayAccelerator
from ray_lightning import HorovodRayPlugin

# Create your PyTorch Lightning model here.
ptl_model = MNISTClassifier(...)

# 2 nodes, 4 workers per node, each using 1 CPU and 1 GPU.
accelerator = HorovodRayAccelerator(num_hosts=2, num_slots=4, use_gpu=True)
plugin = HorovodRayPlugin(num_hosts=2, num_slots=4, use_gpu=True)

# If using GPUs, set the ``gpus`` arg to a value > 0.
# The actual number of GPUs is determined by ``num_slots``.
trainer = pl.Trainer(..., gpus=1, accelerator=accelerator)
trainer = pl.Trainer(..., gpus=1, plugins=[plugin])
trainer.fit(ptl_model)
```

@@ -78,7 +78,7 @@ def train_mnist(config):
trainer = pl.Trainer(
max_epochs=4,
callbacks=callbacks,
accelerator=RayAccelerator(num_workers=4, use_gpu=False))
plugins=[RayPlugin(num_workers=4, use_gpu=False)])
trainer.fit(model)

config = {
Expand Down Expand Up @@ -110,10 +110,10 @@ The key difference is which Trainer you'll be interacting with. In this library,

With RaySGD's integration, you'll be converting your `LightningModule` to be RaySGD compatible, and will be interacting with RaySGD's `TorchTrainer`. RaySGD's `TorchTrainer` is not as feature-rich or as easy to use as PyTorch Lightning's `Trainer` (no built-in support for logging, early stopping, etc.). However, it does have built-in support for fault-tolerant and elastic training. If these are hard requirements for you, then RaySGD's integration with PTL might be a better option.

> I see that `RayAccelerator` is based on PyTorch Lightning's `DDPSpawnAccelerator`. However, doesn't the PTL team discourage the use of spawn?
> I see that `RayPlugin` is based on PyTorch Lightning's `DDPSpawnPlugin`. However, doesn't the PTL team discourage the use of spawn?

As discussed [here](https://github.com/pytorch/pytorch/issues/51688#issuecomment-773539003), using a spawn approach instead of launch is not all that detrimental. The original factors for discouraging spawn were:
1. not being able to use 'spawn' in a Jupyter or Colab notebook, and
2. not being able to use multiple workers for data loading.

Neither of these should be an issue with the `RayAccelerator` due to Ray's serialization mechanisms. The only thing to keep in mind is that when using this accelerator, your model does have to be serializable/pickleable.
Neither of these should be an issue with the `RayPlugin` due to Ray's serialization mechanisms. The only thing to keep in mind is that when using this plugin, your model does have to be serializable/pickleable.
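
A quick way to check this ahead of time is to try serializing the model before launching training. A minimal sketch, assuming the `MNISTClassifier` from the examples above and using `ray.cloudpickle`, the serializer Ray ships with:

```python
# Hypothetical pre-flight check: if this raises, the model will also fail to
# ship to the Ray worker processes that RayPlugin creates.
from ray import cloudpickle

ptl_model = MNISTClassifier(...)  # model from the examples above
try:
    cloudpickle.dumps(ptl_model)
except Exception as exc:
    print(f"Model is not serializable: {exc}")
```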
4 changes: 2 additions & 2 deletions examples/ray_ddp_example.py
@@ -12,7 +12,7 @@
from ray import tune
from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier
from ray_lightning.tune import TuneReportCallback
from ray_lightning import RayAccelerator
from ray_lightning import RayPlugin


class MNISTClassifier(LightningMNISTClassifier):
@@ -72,7 +72,7 @@ def train_mnist(config,
max_epochs=num_epochs,
gpus=int(use_gpu),
callbacks=callbacks,
accelerator=RayAccelerator(num_workers=num_workers, use_gpu=use_gpu))
plugins=[RayPlugin(num_workers=num_workers, use_gpu=use_gpu)])
trainer.fit(model)


12 changes: 7 additions & 5 deletions examples/ray_ddp_tune.py
@@ -9,9 +9,7 @@
from ray import tune
from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier
from ray_lightning.tune import TuneReportCallback
from ray_lightning import RayAccelerator


from ray_lightning import RayPlugin


def train_mnist(config,
@@ -35,8 +33,12 @@ def download_data():
gpus=int(use_gpu),
callbacks=callbacks,
progress_bar_refresh_rate=0,
accelerator=RayAccelerator(
num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data))
plugins=[
RayPlugin(
num_workers=num_workers,
use_gpu=use_gpu,
init_hook=download_data)
])
dm = MNISTDataModule(
data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
trainer.fit(model, dm)
8 changes: 5 additions & 3 deletions examples/ray_horovod_example.py
@@ -12,7 +12,7 @@
from ray import tune
from ray.tune.examples.mnist_ptl_mini import LightningMNISTClassifier
from ray_lightning.tune import TuneReportCallback
from ray_lightning import HorovodRayAccelerator
from ray_lightning import HorovodRayPlugin


class MNISTClassifier(LightningMNISTClassifier):
@@ -75,8 +75,10 @@ def train_mnist(config,
max_epochs=num_epochs,
gpus=int(use_gpu),
callbacks=callbacks,
accelerator=HorovodRayAccelerator(
num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu))
plugins=[
HorovodRayPlugin(
num_hosts=num_hosts, num_slots=num_slots, use_gpu=use_gpu)
])
trainer.fit(model)


6 changes: 3 additions & 3 deletions ray_lightning/__init__.py
@@ -1,4 +1,4 @@
from ray_lightning.ray_ddp import RayAccelerator
from ray_lightning.ray_horovod import HorovodRayAccelerator
from ray_lightning.ray_ddp import RayPlugin
from ray_lightning.ray_horovod import HorovodRayPlugin

__all__ = ["RayAccelerator", "HorovodRayAccelerator"]
__all__ = ["RayPlugin", "HorovodRayPlugin"]