docs: adding horovod #17039

Closed
wants to merge 5 commits into from
2 changes: 2 additions & 0 deletions docs/source-pytorch/advanced/model_parallel.rst
@@ -57,8 +57,10 @@ Cutting-edge and third-party Strategies
=======================================

Cutting-edge Lightning strategies are being developed by third parties outside of Lightning.

If you want to try some of the latest and greatest features for model-parallel training, check out the :doc:`Colossal-AI Strategy <./third_party/colossalai>` integration.

If you want the same training script to run on single-GPU, multi-GPU, and multi-node setups, check out the :doc:`Horovod Strategy <./third_party/horovod>` integration.

----

41 changes: 41 additions & 0 deletions docs/source-pytorch/advanced/third_party/horovod.rst
@@ -0,0 +1,41 @@
:orphan:

#######
Horovod
#######

The `Horovod strategy <https://github.com/Lightning-AI/lightning-horovod>`_ allows the same training script to be used for single-GPU, multi-GPU, and multi-node training.

.. warning:: This is an :ref:`experimental <versioning:Experimental API>` feature.

Like Distributed Data Parallel, every process in Horovod operates on a single GPU with a fixed subset of the data. Gradients are averaged across all GPUs in parallel during the backward pass, then synchronously applied before beginning the next step.
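
To make the mechanism concrete, the snippet below sketches the raw Horovod pattern that the ``HorovodStrategy`` automates for you. It is illustrative only and assumes ``horovod.torch`` and a CUDA device are available:

.. code-block:: python

    import horovod.torch as hvd
    import torch

    hvd.init()  # rank / size are provided by the driver (horovodrun or mpirun)
    torch.cuda.set_device(hvd.local_rank())  # pin each worker process to one GPU

    model = torch.nn.Linear(32, 2).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

    # wrap the optimizer so gradients are averaged across all workers via allreduce
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # make sure every worker starts from the same weights and optimizer state
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # one illustrative step with random data; each worker normally sees its own data shard
    x, y = torch.randn(16, 32).cuda(), torch.randn(16, 2).cuda()
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()  # DistributedOptimizer hooks launch the gradient allreduce here
    optimizer.step()  # every worker applies the same averaged update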

The number of worker processes is configured by a driver application (``horovodrun`` or ``mpirun``). In the training script, Horovod will detect the number of workers from the environment and automatically scale the learning rate to compensate for the increased total batch size.
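
For example, with a per-worker batch size of 32 and four workers, the effective batch size is 128 and the linear scaling rule multiplies the base learning rate by the number of workers (the numbers here are assumptions, for illustration only):

.. code-block:: python

    per_worker_batch_size = 32
    num_workers = 4  # what hvd.size() would report for the runs shown below
    base_lr = 0.01

    effective_batch_size = per_worker_batch_size * num_workers  # 128
    scaled_lr = base_lr * num_workers  # 0.04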

You can install the Horovod integration by running

.. code-block:: bash

    pip install lightning-horovod

This installs both the `Horovod <https://github.com/horovod/horovod#install>`_ package and the ``HorovodStrategy`` for the Lightning Trainer.
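
To double-check that your Horovod build includes the framework and controller support you need, Horovod ships a small diagnostic command (shown here as an optional sanity check, not part of the Lightning integration itself):

.. code-block:: bash

    # lists which frameworks (PyTorch, TensorFlow, ...) and controllers (MPI, Gloo) this build supports
    horovodrun --check-build
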
Horovod can be configured in the training script to run with any number of GPUs / processes as follows:

.. code-block:: python

    from lightning.pytorch import Trainer
    from lightning_horovod import HorovodStrategy

    # the number of worker processes / machines is provided on the command line by the driver
    trainer = Trainer(strategy=HorovodStrategy())

When starting the training job, the driver application will then be used to specify the total number of worker processes:


.. code-block:: bash

    # run training with 4 GPUs on a single machine
    horovodrun -np 4 python train.py

    # run training with 8 GPUs on two machines (4 GPUs each)
    horovodrun -np 8 -H hostname1:4,hostname2:4 python train.py
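
Since ``mpirun`` can also act as the driver, a roughly equivalent Open MPI invocation might look like the following. Treat it as a sketch; the exact flags depend on your MPI installation:

.. code-block:: bash

    # single machine, 4 processes (one per GPU)
    mpirun -np 4 python train.py

    # two machines, 4 processes each
    mpirun -np 8 -H hostname1:4,hostname2:4 python train.py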


See the official `Horovod documentation <https://horovod.readthedocs.io/en/stable>`_ for details on installation and performance tuning.
3 changes: 3 additions & 0 deletions docs/source-pytorch/extensions/strategy.rst
@@ -116,6 +116,9 @@ There are powerful third-party strategies that integrate well with Lightning but
* - colossalai
- `Lightning-AI/lightning-colossalai <https://github.com/Lightning-AI/lightning-colossalai>`_
- Colossal-AI provides a collection of parallel components for you. It aims to let you write distributed deep learning models the same way you write models on your laptop. `Learn more. <https://www.colossalai.org/>`__
* - horovod
- `Lightning-AI/lightning-horovod <https://github.com/Lightning-AI/lightning-horovod>`_
- Horovod allows the same training script to be used for single-GPU, multi-GPU, and multi-node training. `Learn more. <http://horovod.ai>`__


----
3 changes: 2 additions & 1 deletion requirements/pytorch/strategies.txt
@@ -2,4 +2,5 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

deepspeed>=0.6.0, <0.8.0 # TODO: Include 0.8.x after https://github.com/microsoft/DeepSpeed/commit/b587c7e85470329ac25df7c7c2521ff9b2833db7 gets released
lightning-colossalai==0.1.0dev1
lightning-colossalai>=0.1.0dev1
lightning-horovod>=0.1.0dev0
@@ -25,7 +25,7 @@
import lightning.pytorch as pl
from lightning.pytorch.callbacks.callback import Callback
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE, _LIGHTNING_HOROVOD_AVAILABLE
from lightning.pytorch.utilities.model_helpers import is_overridden
from lightning.pytorch.utilities.rank_zero import rank_zero_warn

@@ -131,6 +131,11 @@ def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule")

unsupported_strategies.append(ColossalAIStrategy)

if _LIGHTNING_HOROVOD_AVAILABLE:
    from lightning_horovod import HorovodStrategy

    unsupported_strategies.append(HorovodStrategy)

if isinstance(trainer.accelerator, unsupported_accelerators):
    raise RuntimeError(
        f"The `{type(trainer.accelerator).__name__}` does not support `accumulate_grad_batches` changing"
@@ -68,7 +68,7 @@
)
from lightning.pytorch.strategies.ddp import _DDP_FORK_ALIASES
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE
from lightning.pytorch.utilities.imports import _LIGHTNING_COLOSSALAI_AVAILABLE, _LIGHTNING_HOROVOD_AVAILABLE
from lightning.pytorch.utilities.rank_zero import rank_zero_info, rank_zero_warn

log = logging.getLogger(__name__)
@@ -650,3 +650,10 @@ def _register_external_accelerators_and_strategies() -> None:
# TODO: Prevent registering multiple times
if "colossalai" not in StrategyRegistry:
    ColossalAIStrategy.register_strategies(StrategyRegistry)

if _LIGHTNING_HOROVOD_AVAILABLE:
    from lightning_horovod import HorovodStrategy

    # TODO: Prevent registering multiple times
    if "horovod" not in StrategyRegistry:
        HorovodStrategy.register_strategies(StrategyRegistry)
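
Once this registration has run, the strategy can also be selected by its string alias rather than by instantiating the class. A minimal usage sketch, assuming ``lightning-horovod`` is installed and that ``register_strategies`` registers the ``"horovod"`` name checked above:

.. code-block:: python

    from lightning.pytorch import Trainer

    # resolved through the StrategyRegistry entry added in this PR
    trainer = Trainer(strategy="horovod")
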
1 change: 1 addition & 0 deletions src/lightning/pytorch/utilities/imports.py
@@ -30,6 +30,7 @@
_RICH_AVAILABLE = package_available("rich") and compare_version("rich", operator.ge, "10.2.2")
_TORCHVISION_AVAILABLE = RequirementCache("torchvision")
_LIGHTNING_COLOSSALAI_AVAILABLE = RequirementCache("lightning-colossalai")
_LIGHTNING_HOROVOD_AVAILABLE = RequirementCache("lightning-horovod")

if _POPTORCH_AVAILABLE:
    import poptorch