From fe55d6e5bedd371de5b900947ae1cbb6ac6bfd0b Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 12:41:27 +0200 Subject: [PATCH 001/106] resolve issues --- pytorch_lightning/loops/base.py | 90 +++++++++++++++++++++-- pytorch_lightning/trainer/progress.py | 14 ++-- tests/loops/test_loops.py | 100 +++++++++++++++++++++++++- 3 files changed, 189 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 3293b3eba29ab..61fef73982e20 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -13,12 +13,17 @@ # limitations under the License. from abc import ABC, abstractmethod -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, OrderedDict from deprecate import void import pytorch_lightning as pl +from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.warnings import WarningCache + +warning_cache = WarningCache() class Loop(ABC): @@ -47,6 +52,40 @@ def __init__(self) -> None: self.iteration_count: int = 0 self.trainer: Optional['pl.Trainer'] = None self.restarting = False + self._loops = OrderedDict() + self._progress = OrderedDict() + + def __setattr__(self, name: str, value: Any) -> None: + if isinstance(value, Loop): + self._loops[name] = value + elif isinstance(value, BaseProgress): + self._progress[name] = value + else: + object.__setattr__(self, name, value) + + def __getattr__(self, name) -> Any: + loops = self.__dict__.get('_loops') + + if loops is not None and name in loops: + return loops[name] + + progress = self.__dict__.get('_progress') + + if progress is not None and name in progress: + return progress[name] + + if name not in self.__dict__: + raise AttributeError(f"{self.__class__.__name__} Loop doesn't have attribute {name}.") + + return self.__dict__[name] + + def __delattr__(self, name) -> None: + if name in self._loops: + del self._loops[name] + elif name in self._progress: + del self._progress[name] + else: + object.__delattr__(self, name) @property @abstractmethod @@ -89,6 +128,8 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: return self.on_skip() if self.restarting: + if not is_overridden("restore", self, Loop): + warning_cache.warn(f"{self.__class__.__name__} Loop doesn't override the restore function.") self.restore() self.restarting = False else: @@ -108,7 +149,7 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: output = self.on_run_end() return output - def restore(self) -> None: + def restore(self, state: Optional[Dict] = None) -> None: """Restore the internal state of the loop the beginning of run if restarting is ``True``.""" @abstractmethod @@ -142,9 +183,46 @@ def on_run_end(self) -> Any: def teardown(self) -> None: """Use to release memory etc.""" - def load_state_dict(self, state_dict: Dict) -> None: - """Restore the loop state from the provided state_dict.""" - def state_dict(self) -> Dict: - """Return the loop current states.""" + """Current Loop state""" return {} + + def load_state_dict(self, state_dict: Dict) -> None: + """Reload Loop state""" + + def get_state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional[str] = '') -> OrderedDict: + if destination is None: + destination = OrderedDict() + + destination[prefix + "state_dict"] = self.state_dict() + + for name, progress in self._progress.items(): + 
destination[prefix + name] = progress.state_dict() + + for name, loop in self._loops.items(): + loop.get_state_dict(destination, prefix + name + '.') + return destination + + def _load_from_state_dict(self, state_dict, prefix, strict, missing_keys, unexpected_keys, error_msgs): + self.load_state_dict(state_dict[prefix + "state_dict"]) + + for name, progress in self._progress.items(): + progress.load_state_dict(state_dict[prefix + name]) + + def _load_state_dict(self, state_dict: Dict, strict: bool = True): + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + + state_dict = state_dict.copy() + + def load(loop, prefix=''): + loop._load_from_state_dict(state_dict, prefix, True, missing_keys, unexpected_keys, error_msgs) + loop.restarting = True + for name, loop_children in loop._loops.items(): + if loop_children is not None: + load(loop_children, prefix + name + '.') + + load(self) + load = None # break load->load reference cycle diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 25f76ad085cc6..3acae2485cea0 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -16,7 +16,7 @@ @dataclass -class _DataclassStateDictMixin: +class BaseProgress: def state_dict(self) -> dict: return asdict(self) @@ -25,14 +25,14 @@ def load_state_dict(self, state_dict: dict) -> None: self.__dict__.update(state_dict) @classmethod - def from_state_dict(cls, state_dict: dict) -> "_DataclassStateDictMixin": + def from_state_dict(cls, state_dict: dict) -> "BaseProgress": obj = cls() obj.load_state_dict(state_dict) return obj @dataclass -class Tracker(_DataclassStateDictMixin): +class Tracker(BaseProgress): """ Track an event's progress. @@ -72,7 +72,7 @@ def __repr__(self): @dataclass -class Progress(_DataclassStateDictMixin): +class Progress(BaseProgress): """ Track aggregated and current progress. @@ -150,7 +150,7 @@ def load_state_dict(self, state_dict: dict) -> None: @dataclass -class OptimizerProgress(_DataclassStateDictMixin): +class OptimizerProgress(BaseProgress): """ Track optimizer progress. @@ -172,7 +172,7 @@ def load_state_dict(self, state_dict: dict) -> None: @dataclass -class OptimizationProgress(_DataclassStateDictMixin): +class OptimizationProgress(BaseProgress): """ Track optimization progress. @@ -203,7 +203,7 @@ def load_state_dict(self, state_dict: dict) -> None: @dataclass -class EpochLoopProgress(_DataclassStateDictMixin): +class EpochLoopProgress(BaseProgress): """ Tracks epoch loop progress. These counters are local to a trainer rank. By default, they are not globally synced across all ranks. diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index af5801d2b4552..9f553247ab9a3 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -11,10 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from typing import Dict, Iterator +from collections import OrderedDict +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Dict, Iterator from pytorch_lightning.loops.base import Loop +from pytorch_lightning.trainer.progress import BaseProgress def test_loop_restore(): @@ -72,3 +75,96 @@ def load_state_dict(self, state_dict: Dict) -> None: assert not loop.restarting assert loop.outputs == list(range(10)) + + +def test_loop_hierarchy(): + + @dataclass + class SimpleProgress(BaseProgress): + + increment: int = 0 + + def state_dict(self): + return {"increment": self.increment} + + def load_state_dict(self, state_dict): + self.increment = state_dict["increment"] + + class Simple(Loop): + + def __init__(self, a): + super().__init__() + self.a = a + self.progress = SimpleProgress() + + def advance(self, *args: Any, **kwargs: Any) -> None: + for loop in self._loops.values(): + loop.run() + self.progress.increment += 1 + self.progress.increment += 1 + + @property + def done(self) -> bool: + return self.iteration_count > 0 + + def reset(self) -> None: + pass + + def restore(self) -> None: + pass + + def state_dict(self) -> Dict: + return {"a": self.a} + + def load_state_dict(self, state_dict: Dict) -> None: + self.a = state_dict["a"] + + loop_parent = Simple(1) + loop_child = Simple(2) + loop_parent.loop_child = loop_child + state_dict = loop_parent.get_state_dict() + assert state_dict == OrderedDict([('state_dict', { + 'a': 1 + }), ('progress', { + 'increment': 0 + }), ('loop_child.state_dict', { + 'a': 2 + }), ('loop_child.progress', { + 'increment': 0 + })]) + + state_dict["loop_child.state_dict"]["a"] = 3 + loop_parent._load_state_dict(state_dict) + assert loop_parent.restarting + + loop_parent.run() + + loop_parent_copy = deepcopy(loop_parent) + assert loop_parent_copy.get_state_dict() == loop_parent.get_state_dict() + + assert loop_parent_copy.state_dict() == {'a': 1} + assert loop_parent_copy.loop_child.state_dict() == {'a': 3} + + assert not loop_parent.restarting + + state_dict = loop_parent.get_state_dict() + assert state_dict == OrderedDict([('state_dict', { + 'a': 1 + }), ('progress', { + 'increment': 2 + }), ('loop_child.state_dict', { + 'a': 3 + }), ('loop_child.progress', { + 'increment': 1 + })]) + + loop_parent = Simple(1) + loop_child = Simple(2) + loop_parent.loop_child = loop_child + loop_parent._load_state_dict(state_dict) + assert loop_parent.progress.increment == 2 + assert loop_parent.loop_child.progress.increment == 1 + + del loop_parent.loop_child + state_dict = loop_parent.get_state_dict() + assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 2})]) From 4ee4a7301b0b18dca7434ea5a050b49b78d9cf73 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 15:02:20 +0200 Subject: [PATCH 002/106] update --- pytorch_lightning/loops/base.py | 24 +++++++++++++++++++++++- pytorch_lightning/trainer/progress.py | 11 ++++++++++- tests/loops/test_loops.py | 24 +++++++++++++++++++++++- 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 61fef73982e20..59101e4b98ee8 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -18,7 +18,7 @@ from deprecate import void import pytorch_lightning as pl -from pytorch_lightning.trainer.progress import BaseProgress +from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict from pytorch_lightning.utilities.exceptions import MisconfigurationException 
from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache @@ -55,8 +55,30 @@ def __init__(self) -> None: self._loops = OrderedDict() self._progress = OrderedDict() + @property + def is_leaf(self) -> bool: + loops = self.__dict__.get('_loops') + return len(loops) == 0 + + @property + def loop_progress(self) -> Dict[str, Any]: + progress = {} + for n, p in self.__dict__.get('_progress').items(): + progress[n] = p + + loops = self.__dict__.get('_loops') + + if loops is not None: + for name, loop in loops.items(): + progress[name] = ProgressDict(**loop.loop_progress) + return ProgressDict(**progress) + def __setattr__(self, name: str, value: Any) -> None: if isinstance(value, Loop): + if getattr(self, "__children__loops__", None) is not None and name not in self.__children__loops__: + raise MisconfigurationException( + f"The current loop accept only {self.__children__loops__} as children attribute names." + ) self._loops[name] = value elif isinstance(value, BaseProgress): self._progress[name] = value diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 3acae2485cea0..54b85273d9c0a 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -12,7 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import asdict, dataclass, field -from typing import Optional +from typing import Dict, Optional + + +class ProgressDict(Dict): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + for k, v in kwargs.items(): + setattr(self, k, v) @dataclass diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 9f553247ab9a3..1061d7e2d0cee 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -16,8 +16,11 @@ from dataclasses import dataclass from typing import Any, Dict, Iterator +import pytest + from pytorch_lightning.loops.base import Loop -from pytorch_lightning.trainer.progress import BaseProgress +from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict +from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_loop_restore(): @@ -92,6 +95,8 @@ def load_state_dict(self, state_dict): class Simple(Loop): + __children__loops__ = ("loop_child") + def __init__(self, a): super().__init__() self.a = a @@ -123,6 +128,21 @@ def load_state_dict(self, state_dict: Dict) -> None: loop_child = Simple(2) loop_parent.loop_child = loop_child state_dict = loop_parent.get_state_dict() + + with pytest.raises(MisconfigurationException, match="The current loop accept only loop_child"): + loop_parent.wrong_name = loop_child + + loop_progress: ProgressDict = loop_parent.loop_progress + assert loop_progress["progress"] == loop_parent.progress + assert loop_progress["loop_child"]["progress"] == loop_child.progress + + assert loop_progress.progress == loop_parent.progress + assert loop_progress.loop_child.progress == loop_child.progress + + loop_progress = loop_child.loop_progress + assert loop_progress["progress"] == loop_child.progress + assert loop_progress.progress == loop_child.progress + assert state_dict == OrderedDict([('state_dict', { 'a': 1 }), ('progress', { @@ -133,6 +153,8 @@ def load_state_dict(self, state_dict: Dict) -> None: 'increment': 0 })]) + loop_parent.progress + state_dict["loop_child.state_dict"]["a"] = 3 loop_parent._load_state_dict(state_dict) assert loop_parent.restarting From 
12914183842760407c054a648f06834fef25b3b5 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 15:03:35 +0200 Subject: [PATCH 003/106] update --- pytorch_lightning/loops/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 59101e4b98ee8..9c8f6365685ac 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -13,7 +13,8 @@ # limitations under the License. from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, OrderedDict +from collections import OrderedDict +from typing import Any, Dict, Optional from deprecate import void From 0e69bea98a779fc66a26cabea6c811a234036e90 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 15:25:46 +0200 Subject: [PATCH 004/106] update --- pytorch_lightning/loops/base.py | 30 ++++++++++++++++++++++++++---- tests/loops/test_loops.py | 17 +++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 9c8f6365685ac..5ae5bfe244de1 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -55,11 +55,20 @@ def __init__(self) -> None: self.restarting = False self._loops = OrderedDict() self._progress = OrderedDict() + self._num_parents: int = 0 @property - def is_leaf(self) -> bool: + def has_parent(self) -> Optional[bool]: + return self._num_parents > 0 + + @property + def has_children(self) -> bool: loops = self.__dict__.get('_loops') - return len(loops) == 0 + return len(loops) > 0 + + @property + def is_leaf(self) -> bool: + return not self.has_children and self.has_parent @property def loop_progress(self) -> Dict[str, Any]: @@ -75,12 +84,21 @@ def loop_progress(self) -> Dict[str, Any]: return ProgressDict(**progress) def __setattr__(self, name: str, value: Any) -> None: - if isinstance(value, Loop): + if isinstance(value, pl.Trainer): + # when assigning a Trainer to a loop, it will assign to its children too. + object.__setattr__(self, name, value) + for loop in self._loops.values(): + object.__setattr__(loop, name, value) + elif isinstance(value, Loop): if getattr(self, "__children__loops__", None) is not None and name not in self.__children__loops__: raise MisconfigurationException( f"The current loop accept only {self.__children__loops__} as children attribute names." 
) - self._loops[name] = value + if value not in self._loops.values(): + self._loops[name] = value + value._num_parents += 1 + else: + raise MisconfigurationException("This loop has already been assigned.") elif isinstance(value, BaseProgress): self._progress[name] = value else: @@ -104,6 +122,7 @@ def __getattr__(self, name) -> Any: def __delattr__(self, name) -> None: if name in self._loops: + self._loops[name]._num_parents -= 1 del self._loops[name] elif name in self._progress: del self._progress[name] @@ -147,6 +166,9 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: Returns: the output of :attr:`on_run_end` (often outputs collected from each step of the loop) """ + if self.trainer is None: + raise MisconfigurationException(f"The {self.__class__.__name__} Loop hasn't been attached to any Trainer.") + if self.skip: return self.on_skip() diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 1061d7e2d0cee..23e62faddaa55 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -20,6 +20,7 @@ from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict +from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -124,9 +125,21 @@ def state_dict(self) -> Dict: def load_state_dict(self, state_dict: Dict) -> None: self.a = state_dict["a"] + grand_loop_parent = Simple(0) loop_parent = Simple(1) loop_child = Simple(2) loop_parent.loop_child = loop_child + + with pytest.raises(MisconfigurationException, match="Loop hasn't been attached to any Trainer."): + grand_loop_parent.run() + + grand_loop_parent.loop_child = loop_child + assert loop_child._num_parents == 2 + del grand_loop_parent.loop_child + assert loop_child._num_parents == 1 + assert loop_child.has_parent + assert loop_parent.has_children + state_dict = loop_parent.get_state_dict() with pytest.raises(MisconfigurationException, match="The current loop accept only loop_child"): @@ -143,6 +156,9 @@ def load_state_dict(self, state_dict: Dict) -> None: assert loop_progress["progress"] == loop_child.progress assert loop_progress.progress == loop_child.progress + loop_parent.trainer = Trainer() + assert loop_child.trainer == loop_parent.trainer + assert state_dict == OrderedDict([('state_dict', { 'a': 1 }), ('progress', { @@ -188,5 +204,6 @@ def load_state_dict(self, state_dict: Dict) -> None: assert loop_parent.loop_child.progress.increment == 1 del loop_parent.loop_child + assert loop_child._num_parents == 0 state_dict = loop_parent.get_state_dict() assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 2})]) From 368e17916ac60bd9ba78ad044d69c9505e5fca13 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 15:36:17 +0200 Subject: [PATCH 005/106] add more exceptions --- pytorch_lightning/loops/base.py | 17 ++++++++++++----- tests/loops/test_loops.py | 7 +++++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 5ae5bfe244de1..5a9e3d6e80346 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -59,19 +59,23 @@ def __init__(self) -> None: @property def has_parent(self) -> Optional[bool]: + """Whether the number of loop parents is not null""" return self._num_parents > 0 @property def has_children(self) -> bool: + """Whether this loop has any children""" loops = self.__dict__.get('_loops') return len(loops) > 0 
@property def is_leaf(self) -> bool: + """Whether this loop is a children and has no children itself.""" return not self.has_children and self.has_parent @property def loop_progress(self) -> Dict[str, Any]: + """Return the progress for the current loop and children loop.""" progress = {} for n, p in self.__dict__.get('_progress').items(): progress[n] = p @@ -94,11 +98,14 @@ def __setattr__(self, name: str, value: Any) -> None: raise MisconfigurationException( f"The current loop accept only {self.__children__loops__} as children attribute names." ) - if value not in self._loops.values(): - self._loops[name] = value - value._num_parents += 1 - else: - raise MisconfigurationException("This loop has already been assigned.") + for loop_name, loop in self._loops.items(): + if loop == value and name != loop_name: + raise MisconfigurationException( + f"The {self.__class__.__name__} already contains the provided loop " + f"{loop} under the attribute_name {loop_name}." + ) + self._loops[name] = value + value._num_parents += 1 elif isinstance(value, BaseProgress): self._progress[name] = value else: diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 23e62faddaa55..b19c387a501a1 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -96,7 +96,7 @@ def load_state_dict(self, state_dict): class Simple(Loop): - __children__loops__ = ("loop_child") + __children__loops__ = ("loop_child", "something") def __init__(self, a): super().__init__() @@ -130,6 +130,9 @@ def load_state_dict(self, state_dict: Dict) -> None: loop_child = Simple(2) loop_parent.loop_child = loop_child + with pytest.raises(MisconfigurationException, match="The Simple already contains the provided loop"): + loop_parent.something = loop_child + with pytest.raises(MisconfigurationException, match="Loop hasn't been attached to any Trainer."): grand_loop_parent.run() @@ -142,7 +145,7 @@ def load_state_dict(self, state_dict: Dict) -> None: state_dict = loop_parent.get_state_dict() - with pytest.raises(MisconfigurationException, match="The current loop accept only loop_child"): + with pytest.raises(MisconfigurationException, match="The current loop accept only"): loop_parent.wrong_name = loop_child loop_progress: ProgressDict = loop_parent.loop_progress From eb4475cf222715fa140b633959abb1eacb142a91 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 15:38:04 +0200 Subject: [PATCH 006/106] resolve bug --- pytorch_lightning/loops/base.py | 18 +++++++++++------- tests/loops/test_loops.py | 3 +++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 5a9e3d6e80346..bf3534f397f8f 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -98,14 +98,18 @@ def __setattr__(self, name: str, value: Any) -> None: raise MisconfigurationException( f"The current loop accept only {self.__children__loops__} as children attribute names." ) + is_contained = False for loop_name, loop in self._loops.items(): - if loop == value and name != loop_name: - raise MisconfigurationException( - f"The {self.__class__.__name__} already contains the provided loop " - f"{loop} under the attribute_name {loop_name}." - ) - self._loops[name] = value - value._num_parents += 1 + if loop == value: + is_contained = True + if name != loop_name: + raise MisconfigurationException( + f"The {self.__class__.__name__} already contains the provided loop " + f"{loop} under the attribute_name {loop_name}." 
+ ) + if not is_contained: + self._loops[name] = value + value._num_parents += 1 elif isinstance(value, BaseProgress): self._progress[name] = value else: diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index b19c387a501a1..e4c832eeb7f89 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -129,6 +129,9 @@ def load_state_dict(self, state_dict: Dict) -> None: loop_parent = Simple(1) loop_child = Simple(2) loop_parent.loop_child = loop_child + assert loop_child._num_parents == 1 + loop_parent.loop_child = loop_child + assert loop_child._num_parents == 1 with pytest.raises(MisconfigurationException, match="The Simple already contains the provided loop"): loop_parent.something = loop_child From 449ca622f0250f24b4b83f28f08348bb7441787d Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 16:36:29 +0200 Subject: [PATCH 007/106] update --- pytorch_lightning/loops/base.py | 37 ++++++++++++++------------------ tests/loops/test_loops.py | 38 +++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index bf3534f397f8f..2b822911411d6 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -55,12 +55,13 @@ def __init__(self) -> None: self.restarting = False self._loops = OrderedDict() self._progress = OrderedDict() - self._num_parents: int = 0 + self._has_parent: bool = False + self.__parent_loop: Optional['Loop'] = None @property def has_parent(self) -> Optional[bool]: - """Whether the number of loop parents is not null""" - return self._num_parents > 0 + """Whether this loop has been attached to another loop""" + return self._has_parent @property def has_children(self) -> bool: @@ -94,22 +95,18 @@ def __setattr__(self, name: str, value: Any) -> None: for loop in self._loops.values(): object.__setattr__(loop, name, value) elif isinstance(value, Loop): + if name == "_Loop__parent_loop": + object.__setattr__(self, name, value) + return if getattr(self, "__children__loops__", None) is not None and name not in self.__children__loops__: raise MisconfigurationException( - f"The current loop accept only {self.__children__loops__} as children attribute names." + f"The current loop accept only {self.__children__loops__} as children attribute names. Found {name}" ) - is_contained = False - for loop_name, loop in self._loops.items(): - if loop == value: - is_contained = True - if name != loop_name: - raise MisconfigurationException( - f"The {self.__class__.__name__} already contains the provided loop " - f"{loop} under the attribute_name {loop_name}." - ) - if not is_contained: - self._loops[name] = value - value._num_parents += 1 + if value._has_parent: + raise MisconfigurationException(f"This provided loop {value} already has a parent. 
") + self._loops[name] = value + value._has_parent = True + value.__parent_loop = self elif isinstance(value, BaseProgress): self._progress[name] = value else: @@ -126,14 +123,12 @@ def __getattr__(self, name) -> Any: if progress is not None and name in progress: return progress[name] - if name not in self.__dict__: - raise AttributeError(f"{self.__class__.__name__} Loop doesn't have attribute {name}.") - - return self.__dict__[name] + return object.__getattribute__(self, name) def __delattr__(self, name) -> None: if name in self._loops: - self._loops[name]._num_parents -= 1 + self._loops[name]._has_parent = False + self._loops[name]._Loop__parent_loop = None del self._loops[name] elif name in self._progress: del self._progress[name] diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index e4c832eeb7f89..da9708f525feb 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -35,6 +35,10 @@ def __init__(self, dataset: Iterator): super().__init__() self.dataset = dataset + @property + def skip(self) -> bool: + return False + def restore(self) -> None: self.iter_dataset = iter(self.dataset) for _ in range(self.iteration_count): @@ -64,8 +68,11 @@ def load_state_dict(self, state_dict: Dict) -> None: self.iteration_count = state_dict["iteration_count"] self.outputs = state_dict["outputs"] + trainer = Trainer() + data = range(10) loop = Simple(data) + loop.trainer = trainer try: loop.run() state_dict = {} @@ -73,6 +80,7 @@ def load_state_dict(self, state_dict: Dict) -> None: state_dict = loop.state_dict() loop = Simple(data) + loop.trainer = trainer loop.load_state_dict(state_dict) loop.restarting = True loop.run() @@ -109,6 +117,10 @@ def advance(self, *args: Any, **kwargs: Any) -> None: self.progress.increment += 1 self.progress.increment += 1 + @property + def skip(self) -> bool: + return False + @property def done(self) -> bool: return self.iteration_count > 0 @@ -128,21 +140,28 @@ def load_state_dict(self, state_dict: Dict) -> None: grand_loop_parent = Simple(0) loop_parent = Simple(1) loop_child = Simple(2) + + assert not loop_child.has_parent loop_parent.loop_child = loop_child - assert loop_child._num_parents == 1 - loop_parent.loop_child = loop_child - assert loop_child._num_parents == 1 - with pytest.raises(MisconfigurationException, match="The Simple already contains the provided loop"): + assert loop_child._Loop__parent_loop == loop_parent + + assert loop_child.has_parent + + with pytest.raises(MisconfigurationException, match="already has a parent"): + loop_parent.loop_child = loop_child + + assert not loop_parent.skip + + with pytest.raises(MisconfigurationException, match="already has a parent"): loop_parent.something = loop_child with pytest.raises(MisconfigurationException, match="Loop hasn't been attached to any Trainer."): grand_loop_parent.run() - grand_loop_parent.loop_child = loop_child - assert loop_child._num_parents == 2 - del grand_loop_parent.loop_child - assert loop_child._num_parents == 1 + with pytest.raises(MisconfigurationException, match="already has a parent"): + grand_loop_parent.loop_child = loop_child + assert loop_child.has_parent assert loop_parent.has_children @@ -210,6 +229,7 @@ def load_state_dict(self, state_dict: Dict) -> None: assert loop_parent.loop_child.progress.increment == 1 del loop_parent.loop_child - assert loop_child._num_parents == 0 + assert not loop_child.has_parent + assert loop_child._Loop__parent_loop is None state_dict = loop_parent.get_state_dict() assert state_dict == OrderedDict([('state_dict', {'a': 
1}), ('progress', {'increment': 2})]) From cdf38f0b79d094998c1e499c9d40f14dd3d56831 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 16:38:28 +0200 Subject: [PATCH 008/106] update --- pytorch_lightning/loops/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 2b822911411d6..e9a0d0affd2de 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -71,8 +71,8 @@ def has_children(self) -> bool: @property def is_leaf(self) -> bool: - """Whether this loop is a children and has no children itself.""" - return not self.has_children and self.has_parent + """This loop is a leaf if it doesn't possess any loops.""" + return not self.has_children @property def loop_progress(self) -> Dict[str, Any]: From 88bafafa1a5d4177a16f3655e0c82b3dcde68a8f Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 16:39:49 +0200 Subject: [PATCH 009/106] update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f344f490de6c1..002e01098ab16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -256,6 +256,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - `Trainer(resume_from_checkpoint=...)` now restores the model directly after `LightningModule.setup()`, which is before `LightningModule.configure_sharded_model()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652)) +- Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) + + ### Deprecated From 0981e949c38ab1fd8ea6f74b8b2bbe90116adf85 Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 8 Jul 2021 17:12:31 +0200 Subject: [PATCH 010/106] resolve bug --- pytorch_lightning/loops/base.py | 2 +- tests/loops/test_loops.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index e9a0d0affd2de..3f98cf4214523 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -93,7 +93,7 @@ def __setattr__(self, name: str, value: Any) -> None: # when assigning a Trainer to a loop, it will assign to its children too. 
object.__setattr__(self, name, value) for loop in self._loops.values(): - object.__setattr__(loop, name, value) + loop.__setattr__(name, value) elif isinstance(value, Loop): if name == "_Loop__parent_loop": object.__setattr__(self, name, value) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index da9708f525feb..5b1b12ebdf054 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -233,3 +233,12 @@ def load_state_dict(self, state_dict: Dict) -> None: assert loop_child._Loop__parent_loop is None state_dict = loop_parent.get_state_dict() assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 2})]) + + grand_loop_parent = Simple(0) + loop_parent = Simple(1) + loop_child = Simple(2) + grand_loop_parent.loop_child = loop_parent + loop_parent.loop_child = loop_child + + grand_loop_parent.trainer = Trainer() + assert loop_child.trainer is not None From 7f8000f0bc453761d9e4c6fa092853b4f696dd34 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 11:59:30 +0200 Subject: [PATCH 011/106] resolve comments --- pytorch_lightning/loops/base.py | 187 +++++++++++++------------------- tests/loops/test_loops.py | 88 +++++---------- 2 files changed, 106 insertions(+), 169 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 3f98cf4214523..2405ec49ed704 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -17,11 +17,11 @@ from typing import Any, Dict, Optional from deprecate import void +from torch.nn.modules.module import _IncompatibleKeys import pytorch_lightning as pl from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() @@ -51,89 +51,35 @@ class Loop(ABC): def __init__(self) -> None: self.iteration_count: int = 0 - self.trainer: Optional['pl.Trainer'] = None + self._trainer: Optional['pl.Trainer'] = None self.restarting = False - self._loops = OrderedDict() - self._progress = OrderedDict() - self._has_parent: bool = False - self.__parent_loop: Optional['Loop'] = None - - @property - def has_parent(self) -> Optional[bool]: - """Whether this loop has been attached to another loop""" - return self._has_parent - - @property - def has_children(self) -> bool: - """Whether this loop has any children""" - loops = self.__dict__.get('_loops') - return len(loops) > 0 - - @property - def is_leaf(self) -> bool: - """This loop is a leaf if it doesn't possess any loops.""" - return not self.has_children @property def loop_progress(self) -> Dict[str, Any]: """Return the progress for the current loop and children loop.""" progress = {} - for n, p in self.__dict__.get('_progress').items(): - progress[n] = p - - loops = self.__dict__.get('_loops') - - if loops is not None: - for name, loop in loops.items(): - progress[name] = ProgressDict(**loop.loop_progress) + for k, v in self.__dict__.items(): + if isinstance(v, BaseProgress): + progress[k] = v + elif isinstance(v, Loop): + progress[k] = ProgressDict(**v.loop_progress) return ProgressDict(**progress) - def __setattr__(self, name: str, value: Any) -> None: - if isinstance(value, pl.Trainer): - # when assigning a Trainer to a loop, it will assign to its children too. 
- object.__setattr__(self, name, value) - for loop in self._loops.values(): - loop.__setattr__(name, value) - elif isinstance(value, Loop): - if name == "_Loop__parent_loop": - object.__setattr__(self, name, value) - return - if getattr(self, "__children__loops__", None) is not None and name not in self.__children__loops__: - raise MisconfigurationException( - f"The current loop accept only {self.__children__loops__} as children attribute names. Found {name}" - ) - if value._has_parent: - raise MisconfigurationException(f"This provided loop {value} already has a parent. ") - self._loops[name] = value - value._has_parent = True - value.__parent_loop = self - elif isinstance(value, BaseProgress): - self._progress[name] = value - else: - object.__setattr__(self, name, value) - - def __getattr__(self, name) -> Any: - loops = self.__dict__.get('_loops') - - if loops is not None and name in loops: - return loops[name] - - progress = self.__dict__.get('_progress') - - if progress is not None and name in progress: - return progress[name] - - return object.__getattribute__(self, name) - - def __delattr__(self, name) -> None: - if name in self._loops: - self._loops[name]._has_parent = False - self._loops[name]._Loop__parent_loop = None - del self._loops[name] - elif name in self._progress: - del self._progress[name] - else: - object.__delattr__(self, name) + @property + def trainer(self) -> Optional['pl.Trainer']: + return self._trainer + + @trainer.setter + def trainer(self, trainer: 'pl.Trainer'): + """Connect the Trainer to itself and all sub-children loops""" + if not isinstance(trainer, pl.Trainer): + raise MisconfigurationException( + f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." + ) + self._trainer = trainer + for v in self.__dict__.values(): + if isinstance(v, Loop): + v.trainer = trainer @property @abstractmethod @@ -148,10 +94,6 @@ def skip(self) -> bool: def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None: """Connects Loop with all the necessary things like connectors and accelerators.""" # TODO(@justusschock): Make the trainer a weakref/proxy - if not isinstance(trainer, pl.Trainer): - raise MisconfigurationException( - f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." - ) self.trainer = trainer def on_skip(self) -> Optional[Any]: @@ -178,13 +120,7 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: if self.skip: return self.on_skip() - if self.restarting: - if not is_overridden("restore", self, Loop): - warning_cache.warn(f"{self.__class__.__name__} Loop doesn't override the restore function.") - self.restore() - self.restarting = False - else: - self.reset() + self.reset() self.on_run_start(*args, **kwargs) @@ -200,9 +136,6 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: output = self.on_run_end() return output - def restore(self, state: Optional[Dict] = None) -> None: - """Restore the internal state of the loop the beginning of run if restarting is ``True``.""" - @abstractmethod def reset(self) -> None: """Resets the internal state of the loop at the beginning of each call to :attr:`run`.""" @@ -234,33 +167,46 @@ def on_run_end(self) -> Any: def teardown(self) -> None: """Use to release memory etc.""" - def state_dict(self) -> Dict: - """Current Loop state""" + def on_save_checkpoint(self) -> Dict: + """ + Called when saving a model checkpoint, use to persist loop state. + + Returns: + The current loop state. 
+ """ return {} - def load_state_dict(self, state_dict: Dict) -> None: - """Reload Loop state""" + def on_load_checkpoint(self, state_dict: Dict): + """Called when loading a model checkpoint, use to reload loop state.""" - def get_state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional[str] = '') -> OrderedDict: + def state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional[str] = '') -> Dict: if destination is None: destination = OrderedDict() - destination[prefix + "state_dict"] = self.state_dict() + destination[prefix + "state_dict"] = self.on_save_checkpoint() - for name, progress in self._progress.items(): - destination[prefix + name] = progress.state_dict() + for k, v in self.__dict__.items(): + if isinstance(v, BaseProgress): + destination[prefix + k] = v.state_dict() + elif isinstance(v, Loop): + v.state_dict(destination, prefix + k + '.') - for name, loop in self._loops.items(): - loop.get_state_dict(destination, prefix + name + '.') return destination - def _load_from_state_dict(self, state_dict, prefix, strict, missing_keys, unexpected_keys, error_msgs): - self.load_state_dict(state_dict[prefix + "state_dict"]) + def _load_from_state_dict( + self, state_dict, prefix, strict, restart_progress, missing_keys, unexpected_keys, error_msgs + ): + print(state_dict, prefix) + for k, v in self.__dict__.items(): + if isinstance(v, BaseProgress): + v.load_state_dict(state_dict[prefix + k]) - for name, progress in self._progress.items(): - progress.load_state_dict(state_dict[prefix + name]) + self.on_load_checkpoint(state_dict[prefix + "state_dict"]) - def _load_state_dict(self, state_dict: Dict, strict: bool = True): + def load_state_dict(self, state_dict: Dict, restart_progress: bool = True, strict: bool = True): + """ + This function is highly inspired from ``PyTorch nn.Module``. + """ missing_keys = [] unexpected_keys = [] @@ -269,11 +215,32 @@ def _load_state_dict(self, state_dict: Dict, strict: bool = True): state_dict = state_dict.copy() def load(loop, prefix=''): - loop._load_from_state_dict(state_dict, prefix, True, missing_keys, unexpected_keys, error_msgs) + if loop.restarting: + return + loop._load_from_state_dict( + state_dict, prefix, True, restart_progress, missing_keys, unexpected_keys, error_msgs + ) loop.restarting = True - for name, loop_children in loop._loops.items(): - if loop_children is not None: - load(loop_children, prefix + name + '.') + for k, v in self.__dict__.items(): + if isinstance(v, Loop): + load(v, prefix + k + '.') load(self) - load = None # break load->load reference cycle + + if strict: + if len(unexpected_keys) > 0: + error_msgs.insert( + 0, 'Unexpected key(s) in state_dict: {}. '.format( + ', '.join('"{}"'.format(k) for k in unexpected_keys) + ) + ) + if len(missing_keys) > 0: + error_msgs.insert( + 0, 'Missing key(s) in state_dict: {}. 
'.format(', '.join('"{}"'.format(k) for k in missing_keys)) + ) + + if len(error_msgs) > 0: + raise RuntimeError( + 'Error(s) in loading state_dict for {}:\n\t{}'.format(self.__class__.__name__, "\n\t".join(error_msgs)) + ) + return _IncompatibleKeys(missing_keys, unexpected_keys) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 5b1b12ebdf054..5ce182c59a476 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -16,12 +16,9 @@ from dataclasses import dataclass from typing import Any, Dict, Iterator -import pytest - from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict from pytorch_lightning.trainer.trainer import Trainer -from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_loop_restore(): @@ -39,19 +36,20 @@ def __init__(self, dataset: Iterator): def skip(self) -> bool: return False - def restore(self) -> None: - self.iter_dataset = iter(self.dataset) - for _ in range(self.iteration_count): - next(self.iter_dataset) - self.iteration_count += 1 - @property def done(self) -> bool: return self.iteration_count > len(self.dataset) def reset(self) -> None: self.iter_dataset = iter(self.dataset) - self.outputs = [] + + if self.restarting: + for _ in range(self.iteration_count): + next(self.iter_dataset) + self.iteration_count += 1 + self.restarting = False + else: + self.outputs = [] def advance(self) -> None: value = next(self.iter_dataset) @@ -104,17 +102,16 @@ def load_state_dict(self, state_dict): class Simple(Loop): - __children__loops__ = ("loop_child", "something") - def __init__(self, a): super().__init__() self.a = a self.progress = SimpleProgress() def advance(self, *args: Any, **kwargs: Any) -> None: - for loop in self._loops.values(): - loop.run() - self.progress.increment += 1 + loop = getattr(self, "loop_child", None) + if not loop: + return + loop.run() self.progress.increment += 1 @property @@ -126,49 +123,23 @@ def done(self) -> bool: return self.iteration_count > 0 def reset(self) -> None: - pass + self.restarting = False - def restore(self) -> None: - pass - - def state_dict(self) -> Dict: + def on_save_checkpoint(self) -> Dict: return {"a": self.a} - def load_state_dict(self, state_dict: Dict) -> None: + def on_load_checkpoint(self, state_dict: Dict) -> None: self.a = state_dict["a"] grand_loop_parent = Simple(0) loop_parent = Simple(1) loop_child = Simple(2) - assert not loop_child.has_parent loop_parent.loop_child = loop_child - assert loop_child._Loop__parent_loop == loop_parent - - assert loop_child.has_parent - - with pytest.raises(MisconfigurationException, match="already has a parent"): - loop_parent.loop_child = loop_child - assert not loop_parent.skip - with pytest.raises(MisconfigurationException, match="already has a parent"): - loop_parent.something = loop_child - - with pytest.raises(MisconfigurationException, match="Loop hasn't been attached to any Trainer."): - grand_loop_parent.run() - - with pytest.raises(MisconfigurationException, match="already has a parent"): - grand_loop_parent.loop_child = loop_child - - assert loop_child.has_parent - assert loop_parent.has_children - - state_dict = loop_parent.get_state_dict() - - with pytest.raises(MisconfigurationException, match="The current loop accept only"): - loop_parent.wrong_name = loop_child + state_dict = loop_parent.state_dict() loop_progress: ProgressDict = loop_parent.loop_progress assert loop_progress["progress"] == loop_parent.progress @@ -197,42 +168,41 @@ def 
load_state_dict(self, state_dict: Dict) -> None: loop_parent.progress state_dict["loop_child.state_dict"]["a"] = 3 - loop_parent._load_state_dict(state_dict) + + loop_parent.load_state_dict(state_dict) assert loop_parent.restarting loop_parent.run() loop_parent_copy = deepcopy(loop_parent) - assert loop_parent_copy.get_state_dict() == loop_parent.get_state_dict() + assert loop_parent_copy.state_dict() == loop_parent.state_dict() - assert loop_parent_copy.state_dict() == {'a': 1} - assert loop_parent_copy.loop_child.state_dict() == {'a': 3} + assert loop_parent_copy.on_save_checkpoint() == {'a': 1} + assert loop_parent_copy.loop_child.on_save_checkpoint() == {'a': 3} assert not loop_parent.restarting - state_dict = loop_parent.get_state_dict() + state_dict = loop_parent.state_dict() assert state_dict == OrderedDict([('state_dict', { 'a': 1 }), ('progress', { - 'increment': 2 + 'increment': 1 }), ('loop_child.state_dict', { 'a': 3 }), ('loop_child.progress', { - 'increment': 1 + 'increment': 0 })]) loop_parent = Simple(1) loop_child = Simple(2) loop_parent.loop_child = loop_child - loop_parent._load_state_dict(state_dict) - assert loop_parent.progress.increment == 2 - assert loop_parent.loop_child.progress.increment == 1 + loop_parent.load_state_dict(state_dict) + assert loop_parent.progress.increment == 1 + assert loop_parent.loop_child.progress.increment == 0 del loop_parent.loop_child - assert not loop_child.has_parent - assert loop_child._Loop__parent_loop is None - state_dict = loop_parent.get_state_dict() - assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 2})]) + state_dict = loop_parent.state_dict() + assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 1})]) grand_loop_parent = Simple(0) loop_parent = Simple(1) From 4153b8106a46d26261bebc7558e5d3b7cb1cfc47 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 12:07:07 +0200 Subject: [PATCH 012/106] update --- pytorch_lightning/loops/base.py | 7 +++---- pytorch_lightning/trainer/progress.py | 11 +---------- tests/loops/test_loops.py | 8 ++------ 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 2405ec49ed704..629e443b788a7 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -20,7 +20,7 @@ from torch.nn.modules.module import _IncompatibleKeys import pytorch_lightning as pl -from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict +from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.warnings import WarningCache @@ -62,8 +62,8 @@ def loop_progress(self) -> Dict[str, Any]: if isinstance(v, BaseProgress): progress[k] = v elif isinstance(v, Loop): - progress[k] = ProgressDict(**v.loop_progress) - return ProgressDict(**progress) + progress[k] = v.loop_progress + return progress @property def trainer(self) -> Optional['pl.Trainer']: @@ -196,7 +196,6 @@ def state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional def _load_from_state_dict( self, state_dict, prefix, strict, restart_progress, missing_keys, unexpected_keys, error_msgs ): - print(state_dict, prefix) for k, v in self.__dict__.items(): if isinstance(v, BaseProgress): v.load_state_dict(state_dict[prefix + k]) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 54b85273d9c0a..3acae2485cea0 
100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -12,16 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import asdict, dataclass, field -from typing import Dict, Optional - - -class ProgressDict(Dict): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - for k, v in kwargs.items(): - setattr(self, k, v) +from typing import Optional @dataclass diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 5ce182c59a476..03418d5e70430 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -17,7 +17,7 @@ from typing import Any, Dict, Iterator from pytorch_lightning.loops.base import Loop -from pytorch_lightning.trainer.progress import BaseProgress, ProgressDict +from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.trainer.trainer import Trainer @@ -141,16 +141,12 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: state_dict = loop_parent.state_dict() - loop_progress: ProgressDict = loop_parent.loop_progress + loop_progress = loop_parent.loop_progress assert loop_progress["progress"] == loop_parent.progress assert loop_progress["loop_child"]["progress"] == loop_child.progress - assert loop_progress.progress == loop_parent.progress - assert loop_progress.loop_child.progress == loop_child.progress - loop_progress = loop_child.loop_progress assert loop_progress["progress"] == loop_child.progress - assert loop_progress.progress == loop_child.progress loop_parent.trainer = Trainer() assert loop_child.trainer == loop_parent.trainer From d6280e0677aa3e975ed6a8ae97caf2742a92a08f Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 12:08:16 +0200 Subject: [PATCH 013/106] update --- pytorch_lightning/loops/base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 629e443b788a7..709556356701c 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -22,9 +22,6 @@ import pytorch_lightning as pl from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warnings import WarningCache - -warning_cache = WarningCache() class Loop(ABC): From c499c241335a249314cb43b3cb9e79c42ac6a94b Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 12:10:01 +0200 Subject: [PATCH 014/106] update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 002e01098ab16..42dbdb2f8d277 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -342,6 +342,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `optimizer` argument in `LightningModule.manual_backward()`; Toggling optimizers in manual optimization should be done using `LightningModule.{un}toggle_optimizer()` ([#8287](https://github.com/PyTorchLightning/pytorch-lightning/pull/8287)) +- Removed `Loop restore` function to give more control for loop restart ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) + + ### Fixed - Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/PyTorchLightning/pytorch-lightning/pull/7877)) @@ -401,6 +404,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed missing call to `LightningModule.untoggle_optimizer` in training loop when running gradient accumulation with multiple optimizers ([#8284](https://github.com/PyTorchLightning/pytorch-lightning/pull/8284)) + ## [1.3.8] - 2021-07-01 ### Fixed From 3cb6df2a4cc87386dccc4855de1d546d0c067d6d Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 12:45:09 +0200 Subject: [PATCH 015/106] update --- pytorch_lightning/loops/base.py | 5 +- tests/loops/test_loop_state_dict.py | 206 +++++++++++++++++++++++++++- 2 files changed, 202 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 709556356701c..731192858a308 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -13,7 +13,6 @@ # limitations under the License. from abc import ABC, abstractmethod -from collections import OrderedDict from typing import Any, Dict, Optional from deprecate import void @@ -176,9 +175,9 @@ def on_save_checkpoint(self) -> Dict: def on_load_checkpoint(self, state_dict: Dict): """Called when loading a model checkpoint, use to reload loop state.""" - def state_dict(self, destination: Optional[OrderedDict] = None, prefix: Optional[str] = '') -> Dict: + def state_dict(self, destination: Optional[Dict] = None, prefix: Optional[str] = '') -> Dict: if destination is None: - destination = OrderedDict() + destination = {} destination[prefix + "state_dict"] = self.on_save_checkpoint() diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 1930dc46566fd..eed23a89a8b36 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -40,15 +40,209 @@ def test_loops_state_dict_structure(): "test_loop": trainer.test_loop.state_dict(), "predict_loop": trainer.predict_loop.state_dict(), } + # todo (tchaton) Update this once new progress as been added. 
+ # yapf: disable expected = { "fit_loop": { - 'epoch_loop': { - 'batch_loop': {}, - 'val_loop': {}, + "epoch_loop": { + "batch_loop": { + "state_dict": {}, + "progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + "optim_progress": { + "optimizer": { + "step": { + "total": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, + }, + }, + "zero_grad": { + "total": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, + }, + }, + }, + "scheduler": { + "total": { + "ready": 0, + "started": None, + "processed": None, + "completed": 0, + }, + "current": { + "ready": 0, + "started": None, + "processed": None, + "completed": 0, + }, + }, + }, + }, + "val_loop": { + "state_dict": {}, + "progress": { + "epoch": { + "total": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "batch": { + "total": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + } + }, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "batch": { + "total": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + }, + }, } }, - "validate_loop": {}, - "test_loop": {}, - "predict_loop": {}, + "validate_loop": { + "state_dict": {}, + "progress": { + "epoch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + } + }, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + }, + }, + "test_loop": { + "state_dict": {}, + "progress": { + "epoch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + } + }, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + }, + }, + "predict_loop": { + "state_dict": {}, + "progress": { + "epoch": { + "total": {"ready": 0, "started": 0, 
"processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + } + }, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "batch": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + }, + }, } + # yapf: enable assert state_dict == expected From e8c12e95d64447919a17cb4e246b4ce35a7ab7eb Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 13:19:05 +0200 Subject: [PATCH 016/106] update --- pytorch_lightning/loops/base.py | 60 +++++++-------------------- pytorch_lightning/trainer/progress.py | 13 ++++++ 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 731192858a308..0f718ff2ce1be 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -16,10 +16,10 @@ from typing import Any, Dict, Optional from deprecate import void -from torch.nn.modules.module import _IncompatibleKeys import pytorch_lightning as pl -from pytorch_lightning.trainer.progress import BaseProgress +from pytorch_lightning.trainer.progress import BaseProgress, Tracker +from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -189,53 +189,25 @@ def state_dict(self, destination: Optional[Dict] = None, prefix: Optional[str] = return destination - def _load_from_state_dict( - self, state_dict, prefix, strict, restart_progress, missing_keys, unexpected_keys, error_msgs - ): + def _load_from_state_dict(self, state_dict, prefix, restart_progress): for k, v in self.__dict__.items(): if isinstance(v, BaseProgress): v.load_state_dict(state_dict[prefix + k]) + if restart_progress: - self.on_load_checkpoint(state_dict[prefix + "state_dict"]) + def restart(v: Tracker): + v.reset_on_restart() - def load_state_dict(self, state_dict: Dict, restart_progress: bool = True, strict: bool = True): - """ - This function is highly inspired from ``PyTorch nn.Module``. - """ + apply_to_collection(v, Tracker, restart) - missing_keys = [] - unexpected_keys = [] - error_msgs = [] + self.on_load_checkpoint(state_dict[prefix + "state_dict"]) + self.restarting = True - state_dict = state_dict.copy() + def __load(self, state_dict, restart_progress, prefix=''): + self._load_from_state_dict(state_dict, prefix, restart_progress) + for k, v in self.__dict__.items(): + if isinstance(v, Loop): + v.__load(state_dict.copy(), restart_progress, prefix + k + '.') - def load(loop, prefix=''): - if loop.restarting: - return - loop._load_from_state_dict( - state_dict, prefix, True, restart_progress, missing_keys, unexpected_keys, error_msgs - ) - loop.restarting = True - for k, v in self.__dict__.items(): - if isinstance(v, Loop): - load(v, prefix + k + '.') - - load(self) - - if strict: - if len(unexpected_keys) > 0: - error_msgs.insert( - 0, 'Unexpected key(s) in state_dict: {}. '.format( - ', '.join('"{}"'.format(k) for k in unexpected_keys) - ) - ) - if len(missing_keys) > 0: - error_msgs.insert( - 0, 'Missing key(s) in state_dict: {}. 
'.format(', '.join('"{}"'.format(k) for k in missing_keys)) - ) - - if len(error_msgs) > 0: - raise RuntimeError( - 'Error(s) in loading state_dict for {}:\n\t{}'.format(self.__class__.__name__, "\n\t".join(error_msgs)) - ) - return _IncompatibleKeys(missing_keys, unexpected_keys) + def load_state_dict(self, state_dict: Dict, restart_progress: bool = True): + self.__load(state_dict.copy(), restart_progress) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 3acae2485cea0..1098957033855 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -70,6 +70,19 @@ def __repr__(self): args = [f"{k}={v}" for k, v in self.__dict__.items() if v is not None] return f"{self.__class__.__name__}({', '.join(args)})" + def reset_on_restart(self): + """Reset the progress on restart""" + value = self.completed if self.processed is None else self.processed + + if self.ready is not None: + self.ready = value + if self.started is not None: + self.started = value + if self.processed is not None: + self.processed = value + if self.completed is not None: + self.completed = value + @dataclass class Progress(BaseProgress): From df4b1ba3b5b8be6ef8efc528c61cda9524439f82 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 13:19:54 +0200 Subject: [PATCH 017/106] remove space --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42dbdb2f8d277..14db839d0d6e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -404,7 +404,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed missing call to `LightningModule.untoggle_optimizer` in training loop when running gradient accumulation with multiple optimizers ([#8284](https://github.com/PyTorchLightning/pytorch-lightning/pull/8284)) - ## [1.3.8] - 2021-07-01 ### Fixed From ee8d9b80368eacd48d1f10ec99df635efdb9ea44 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 13:20:57 +0200 Subject: [PATCH 018/106] update --- pytorch_lightning/loops/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 0f718ff2ce1be..98248a5e631de 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -67,7 +67,7 @@ def trainer(self) -> Optional['pl.Trainer']: @trainer.setter def trainer(self, trainer: 'pl.Trainer'): - """Connect the Trainer to itself and all sub-children loops""" + """Connect the Trainer to itself and all its children loops""" if not isinstance(trainer, pl.Trainer): raise MisconfigurationException( f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." 
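The two patches above replace the nested loop state dict with a flat, prefix-keyed layout and reset the progress counters on restart via `Tracker.reset_on_restart`. A minimal standalone sketch of both ideas, assuming invented `DemoTracker` and `DemoLoop` classes for illustration only (they are not the real Lightning classes):

from dataclasses import asdict, dataclass
from typing import Dict, Optional


@dataclass
class DemoTracker:
    # same four counters the loops track: ready -> started -> processed -> completed
    ready: Optional[int] = 0
    started: Optional[int] = 0
    processed: Optional[int] = 0
    completed: Optional[int] = 0

    def reset_on_restart(self) -> None:
        # fall back to the last stage that is known to have fully finished
        value = self.completed if self.processed is None else self.processed
        for name in ("ready", "started", "processed", "completed"):
            if getattr(self, name) is not None:
                setattr(self, name, value)


class DemoLoop:
    # a loop owns one progress tracker and any number of child loops
    def __init__(self, **children: "DemoLoop") -> None:
        self.progress = DemoTracker()
        self.children = children

    def state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Dict:
        # flat layout: one "<prefix>progress" entry per loop, children use dotted prefixes
        if destination is None:
            destination = {}
        destination[prefix + "progress"] = asdict(self.progress)
        for name, child in self.children.items():
            child.state_dict(destination, prefix + name + ".")
        return destination


fit_loop = DemoLoop(epoch_loop=DemoLoop(batch_loop=DemoLoop()))
epoch_progress = fit_loop.children["epoch_loop"].progress
epoch_progress.ready = epoch_progress.started = 3
epoch_progress.processed = epoch_progress.completed = 2

print(sorted(fit_loop.state_dict()))
# ['epoch_loop.batch_loop.progress', 'epoch_loop.progress', 'progress']

epoch_progress.reset_on_restart()
print(epoch_progress)
# DemoTracker(ready=2, started=2, processed=2, completed=2)

Loading from such a flat dict only needs the same dotted prefixes walked recursively, which is what the simplified `_load_from_state_dict` and `load_state_dict` in the patch above do.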
From 65540a88624106bb070b6353913668edc3331ad2 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 14:14:42 +0200 Subject: [PATCH 019/106] add progress tracking to loops --- pytorch_lightning/core/optimizer.py | 1 + pytorch_lightning/loops/base.py | 1 - .../loops/batch/training_batch_loop.py | 58 +++-- .../loops/dataloader/evaluation_loop.py | 33 ++- .../loops/dataloader/prediction_loop.py | 2 - .../loops/epoch/evaluation_epoch_loop.py | 25 +- .../loops/epoch/training_epoch_loop.py | 43 ++-- pytorch_lightning/loops/fit_loop.py | 19 +- .../connectors/checkpoint_connector.py | 26 ++ .../trainer/connectors/optimizer_connector.py | 7 + pytorch_lightning/trainer/progress.py | 77 +++--- pytorch_lightning/trainer/trainer.py | 9 +- pytorch_lightning/utilities/imports.py | 5 + tests/trainer/test_progress.py | 241 +++++++++++++----- 14 files changed, 356 insertions(+), 191 deletions(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 25e4519eb39fc..33b44a35d31f1 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -207,6 +207,7 @@ def closure_dis(): profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}" self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs) + self._trainer.fit_loop.epoch_loop.batch_loop.optim_progress.optimizer.step.increment_processed() self._total_optimizer_step_calls += 1 def __repr__(self): diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 98248a5e631de..0b5df30003ba8 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -186,7 +186,6 @@ def state_dict(self, destination: Optional[Dict] = None, prefix: Optional[str] = destination[prefix + k] = v.state_dict() elif isinstance(v, Loop): v.state_dict(destination, prefix + k + '.') - return destination def _load_from_state_dict(self, state_dict, prefix, restart_progress): diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 41ad9280ffaf7..89ea775977817 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -28,7 +28,7 @@ from pytorch_lightning.loops.base import Loop from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import BatchProgress, OptimizationProgress +from pytorch_lightning.trainer.progress import OptimizationProgress, Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, grad_norm from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -50,29 +50,17 @@ def __init__(self) -> None: self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) self.batch_idx: int = 0 self.split_idx: Optional[int] = None - self.progress = BatchProgress() + self.progress = Progress() self.optim_progress = OptimizationProgress() - self._warning_cache: WarningCache = WarningCache() self._hiddens: Optional[Tensor] = None self._optimizer_freq_cumsum: Optional[int] = None self._remaining_splits: Optional[List[Any]] = None self._skip_backward: bool = False - def connect( - self, - trainer: 'pl.Trainer', - *args: Any, - progress: Optional[BatchProgress] = None, - optim_progress: Optional[OptimizationProgress] = None, - **kwargs: Any - ) -> None: + 
def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - if optim_progress is not None: - self.optim_progress = optim_progress @property def done(self) -> bool: @@ -98,6 +86,8 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict: self._warning_cache.warn("train_dataloader yielded None. If this was on purpose, ignore this warning...") return AttributeDict(signal=0, training_step_output=[[]]) + self.progress.increment_ready() + # hook self.trainer.logger_connector.on_batch_start() response = self.trainer.call_hook("on_batch_start") @@ -120,6 +110,8 @@ def reset(self) -> None: self.batch_idx = 0 self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))] + self.optim_progress.optimizer.reset_on_epoch() + def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): """Splits the data into tbptt splits @@ -131,6 +123,10 @@ def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): void(batch_idx, dataloader_idx) self._remaining_splits = list(enumerate(self._tbptt_split_batch(batch))) + def on_advance_start(self, *args: Any, **kwargs: Any) -> None: + super().on_advance_start(*args, **kwargs) + self.progress.increment_started() + def advance(self, batch, batch_idx, dataloader_idx): """Runs the train step together with optimization (if necessary) on the current batch split @@ -148,7 +144,18 @@ def advance(self, batch, batch_idx, dataloader_idx): self.trainer.logger_connector.on_train_split_start(batch_idx, split_idx, split_batch) if self.trainer.lightning_module.automatic_optimization: - for opt_idx, optimizer in self.get_active_optimizers(batch_idx): + active_optimizers = self.get_active_optimizers(batch_idx) + for opt_idx, optimizer in active_optimizers: + + # handle optimization restart + if self.restarting: + if len(active_optimizers) > 1 and opt_idx < self.progress.current.completed: + continue + self.restarting = False + + # track optimizer_idx + self.optim_progress.optimizer_idx = opt_idx + result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer) if result: self.batch_outputs[opt_idx].append(result.training_step_output) @@ -158,6 +165,12 @@ def advance(self, batch, batch_idx, dataloader_idx): if result: self.batch_outputs[0].append(result.training_step_output) + self.progress.increment_processed() + + def on_advance_end(self) -> None: + super().on_advance_end() + self.progress.increment_completed() + def teardown(self) -> None: # release memory self._remaining_splits = None @@ -240,6 +253,11 @@ def _training_step_and_backward_closure( result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) if result is not None: return_result.update(result) + + # this should be done only if result.loss exists and ``optimizer step`` is being run + if not self.should_accumulate(): + self.optim_progress.optimizer.step.increment_started() + return return_result.loss def _make_closure(self, *closure_args: Any, **closure_kwargs: Any) -> Callable: @@ -409,6 +427,8 @@ def _optimizer_step( # wraps into LightningOptimizer only for running step optimizer = LightningOptimizer._to_lightning_optimizer(optimizer, self.trainer, opt_idx) + self.optim_progress.optimizer.step.increment_ready() + # model hook model_ref.optimizer_step( self.trainer.current_epoch, @@ -421,13 +441,17 @@ def _optimizer_step( 
using_lbfgs=is_lbfgs, ) + self.optim_progress.optimizer.step.increment_completed() + def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: """Calls the ``on_before_zero_grad`` hook. Args: optimizer: the current optimizer """ + self.optim_progress.optimizer.zero_grad.increment_started() self.trainer.call_hook('on_before_zero_grad', optimizer) + self.optim_progress.optimizer.zero_grad.increment_ready() def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int) -> None: """Zeroes out all gradients of parameters optimized by the current optimizer. @@ -439,6 +463,8 @@ def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, """ self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) + self.optim_progress.optimizer.zero_grad.increment_completed() + def _track_and_norm_grad(self, optimizer: torch.optim.Optimizer) -> Dict[str, Tensor]: """Tracks gradient norms and clips the gradients of all parameters optimized by the current optimizer. diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 2f6e14b93b767..5dc0270f58774 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -21,7 +21,7 @@ from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import EpochLoopProgress +from pytorch_lightning.trainer.progress import DataLoaderProgress from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import EPOCH_OUTPUT @@ -33,10 +33,8 @@ class EvaluationLoop(DataLoaderLoop): def __init__(self): super().__init__() self.outputs = [] - self.progress = EpochLoopProgress() - + self.progress = DataLoaderProgress() self.epoch_loop = EvaluationEpochLoop() - self._results = ResultCollection(training=False) self._max_batches: Optional[Union[int, Sequence[int]]] = None self._has_run: bool = False @@ -66,14 +64,10 @@ def predictions(self): """Returns the predictions from all dataloaders""" return self.epoch_loop.predictions - def connect( - self, trainer: "pl.Trainer", *args: Any, progress: Optional[EpochLoopProgress] = None, **kwargs: Any - ) -> None: + def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - self.epoch_loop.connect(trainer, progress=self.progress.epoch) + self.epoch_loop.connect(trainer) @property def done(self) -> bool: @@ -96,18 +90,31 @@ def reset(self) -> None: if isinstance(self._max_batches, int): self._max_batches = [self._max_batches] * len(self.dataloaders) + if self.restarting: + self.iteration_count = self.progress.dataloader_idx + self.restarting = False + else: + self.iteration_count = 0 + # reset batch / epoch progress tracking + self.progress.current.reset() + def on_skip(self) -> List: return [] def on_run_start(self, *args: Any, **kwargs: Any) -> None: """Runs the ``on_evaluation_model_eval``, ``on_evaluation_start`` and ``on_evaluation_epoch_start`` hooks""" void(*args, **kwargs) + + self.progress.increment_started() + # hook 
self.on_evaluation_model_eval() self.trainer.lightning_module.zero_grad() self.on_evaluation_start() self.on_evaluation_epoch_start() + self.progress.increment_ready() + def advance(self, *args: Any, **kwargs: Any) -> None: """Performs evaluation on one single dataloader""" void(*args, **kwargs) @@ -115,6 +122,8 @@ def advance(self, *args: Any, **kwargs: Any) -> None: dataloader_iter = enumerate(dataloader) dl_max_batches = self._max_batches[self.current_dataloader_idx] + self.progress.dataloader_idx = self.iteration_count + dl_outputs = self.epoch_loop.run( dataloader_iter, self.current_dataloader_idx, @@ -141,6 +150,8 @@ def on_run_end(self) -> Any: if len(outputs) > 0 and self.num_dataloaders == 1: outputs = outputs[0] + self.progress.increment_processed() + # lightning module method self.evaluation_epoch_end(outputs) @@ -159,6 +170,8 @@ def on_run_end(self) -> Any: # enable train mode again self.on_evaluation_model_train() + self.progress.increment_completed() + return eval_loop_results def get_max_batches(self) -> List[Union[int, float]]: diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 55647e5d7f2a3..1bdd38ed950b0 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -20,9 +20,7 @@ def __init__(self): self.predictions: Optional[List[List[Any]]] = None self.epoch_batch_indices: Optional[List[List[int]]] = None self.progress = EpochLoopProgress() - self.epoch_loop = PredictionEpochLoop() - self._results = None # for `trainer._results` access self._return_predictions: bool = False diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index c01b20a5f84e2..c56b4a7f097d1 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -21,7 +21,7 @@ import pytorch_lightning as pl from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import EpochProgress +from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.trainer.supporters import PredictionCollection from pytorch_lightning.utilities.memory import recursive_detach from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -41,15 +41,11 @@ def __init__(self) -> None: self.dataloader_idx: Optional[int] = None self.num_dataloaders: Optional[int] = None self.outputs: List[STEP_OUTPUT] = [] - self.progress = EpochProgress() + self.progress = Progress() - def connect( - self, trainer: "pl.Trainer", *args: Any, progress: Optional[EpochProgress] = None, **kwargs: Any - ) -> None: + def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress @property def done(self) -> bool: @@ -65,6 +61,13 @@ def reset(self) -> None: self.num_dataloaders = None self.outputs = [] + if self.restarting: + self.iteration_count = self.progress.current.completed + self.restarting = False + else: + self.iteration_count = 0 + self.progress.current.reset() + def on_run_start( self, dataloader_iter: Iterator, @@ -114,9 +117,13 @@ def advance( with self.trainer.profiler.profile("evaluation_batch_to_device"): batch = 
self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) + self.progress.increment_started() + # hook self.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) + self.progress.increment_ready() + # lightning module methods with self.trainer.profiler.profile("evaluation_step_and_end"): output = self.evaluation_step(batch, batch_idx, dataloader_idx) @@ -131,6 +138,10 @@ def advance( # track epoch level outputs self.outputs = self._track_output_for_epoch_end(self.outputs, output) + self.progress.increment_processed() + + self.progress.increment_completed() + def on_run_end(self) -> List[STEP_OUTPUT]: """Returns the outputs of the whole run""" outputs = self.outputs diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index bc378c6bed0fb..af4e4fc52d63f 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -69,19 +69,11 @@ def done(self) -> bool: max_steps_reached = self.max_steps is not None and self.global_step >= self.max_steps return max_steps_reached or self.trainer.should_stop or self._num_training_batches_reached(self.is_last_batch) - def connect( - self, - trainer: 'pl.Trainer', - *args: Any, - progress: Optional[TrainingEpochProgress] = None, - **kwargs: Any - ) -> None: + def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - self.batch_loop.connect(trainer, progress=self.progress.batch, optim_progress=self.progress.optim) - self.val_loop.connect(trainer, progress=self.progress.val) + self.batch_loop.connect(trainer) + self.val_loop.connect(trainer) def reset(self) -> None: """Resets the internal state of the loop for a new run""" @@ -93,12 +85,25 @@ def reset(self) -> None: # track epoch output self._epoch_output = [[] for _ in range(self.batch_loop.num_active_optimizers(self.total_batch_idx))] + if self.restarting: + self.iteration_count = self.batch_loop.current_batch_completed + self.batches_seen = self.batch_loop.current_batch_completed + # restarting is finished. + self.restarting = False + else: + # todo (tchaton) the batch_loop should be responsible for that. + self.batch_loop.progress.current.reset() + def on_run_start(self, *args: Any, **kwargs: Any) -> None: + self.progress.increment_ready() + # hook self.trainer.logger_connector.on_epoch_start() self.trainer.call_hook("on_epoch_start") self.trainer.call_hook("on_train_epoch_start") + self.progress.increment_started() + def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: """Runs a single training batch. 
@@ -158,7 +163,10 @@ def on_advance_end(self): # ----------------------------------------- # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- - should_check_val = self._should_check_val_fx(self.iteration_count, self.is_last_batch) + self.progress.should_check_val = should_check_val = self._should_check_val_fx( + self.iteration_count, self.is_last_batch + ) + if should_check_val: self.trainer.validating = True self._run_validation() @@ -216,11 +224,15 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]: 'HINT: remove the return statement in training_epoch_end' ) + self.progress.increment_processed() + # call train epoch end hooks self._on_train_epoch_end_hook(processed_outputs) self.trainer.call_hook('on_epoch_end') self.trainer.logger_connector.on_epoch_end() + self.progress.increment_completed() + epoch_output = self._epoch_output # free memory self._epoch_output = None @@ -430,10 +442,3 @@ def _save_loggers_on_train_batch_end(self) -> None: should_flush_logs = self.trainer.logger_connector.should_flush_logs if should_flush_logs and self.trainer.is_global_zero and self.trainer.logger is not None: self.trainer.logger.save() - - def state_dict(self) -> Dict: - return {"batch_loop": self.batch_loop.state_dict(), "val_loop": self.val_loop.state_dict()} - - def load_state_dict(self, state_dict: Dict) -> None: - self.batch_loop.load_state_dict(state_dict["batch_loop"]) - self.val_loop.load_state_dict(state_dict["val_loop"]) diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index a8eb44923a241..6963f4b3f2c4a 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -14,13 +14,12 @@ import logging from contextlib import suppress -from typing import Any, Dict, Optional +from typing import Any, Optional import pytorch_lightning as pl from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import FitLoopProgress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import rank_zero_info @@ -51,8 +50,6 @@ def __init__( super().__init__() self.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs - self.progress = FitLoopProgress() - self.epoch_loop = TrainingEpochLoop(min_steps, max_steps) @property @@ -169,14 +166,10 @@ def skip(self) -> bool: """Whether we should skip the training and immediately return from the call to :meth:`run`.""" return self.done or self.trainer.num_training_batches == 0 - def connect( - self, trainer: 'pl.Trainer', *args: Any, progress: Optional[FitLoopProgress] = None, **kwargs: Any - ) -> None: + def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - self.epoch_loop.connect(trainer, progress=self.progress.epoch) + self.epoch_loop.connect(trainer) def reset(self) -> None: """Resets the internal state of this loop""" @@ -289,11 +282,5 @@ def _check_checkpoint_callback(self, should_update: bool, is_last: bool = False) for cb in callbacks: cb.on_validation_end(self.trainer, model) - def state_dict(self) -> Dict: - return {"epoch_loop": 
self.epoch_loop.state_dict()} - - def load_state_dict(self, state_dict: Dict) -> None: - self.epoch_loop.load_state_dict(state_dict["epoch_loop"]) - def teardown(self) -> None: self.epoch_loop.teardown() diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index ab74c3bccfc8d..df1328d668305 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -23,6 +23,7 @@ from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import fault_tolerant_enabled from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS if _OMEGACONF_AVAILABLE: @@ -165,6 +166,8 @@ def restore_training_state(self) -> None: self.restore_optimizers_and_schedulers() + self.restore_loops() + def restore_callbacks(self) -> None: """ Restores all callbacks from the pre-loaded checkpoint. """ if not self._loaded_checkpoint: @@ -249,6 +252,18 @@ def restore_lr_schedulers(self) -> None: for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers): scheduler['scheduler'].load_state_dict(lrs_state) + def restore_loops(self) -> None: + """ Calls hooks on the loops to give it a chance to restore its state from the checkpoint. """ + if not self._loaded_checkpoint: + return + + state_dict = self._loaded_checkpoint.get("loops", None) + if state_dict: + self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"]) + self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"]) + self.trainer.test_loop.load_state_dict(state_dict["test_loop"]) + self.trainer.predict_loop.load_state_dict(state_dict["predict_loop"]) + # ---------------------------------- # PRIVATE OPS # ---------------------------------- @@ -332,6 +347,9 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: 'state_dict': self.trainer.accelerator.lightning_module_state_dict(), } + if fault_tolerant_enabled(): + checkpoint.update({"loops": self.get_loops_state_dict()}) + if not weights_only: # dump callbacks checkpoint['callbacks'] = self.trainer.on_save_checkpoint(checkpoint) @@ -370,6 +388,14 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: return checkpoint + def get_loops_state_dict(self): + return { + "fit_loop": self.trainer.fit_loop.state_dict(), + "validate_loop": self.trainer.validate_loop.state_dict(), + "test_loop": self.trainer.test_loop.state_dict(), + "predict_loop": self.trainer.predict_loop.state_dict(), + } + def hpc_load(self, checkpoint_path: str) -> None: """ Attempts to restore the full training and model state from a HPC checkpoint file. 
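The checkpoint connector changes above persist loop state only when fault-tolerant training is opted in, and they store one entry per top-level trainer loop under a new "loops" key. A minimal sketch of that behaviour with a stand-in `dump_checkpoint` helper (not the real connector); the environment variable name comes from the `fault_tolerant_enabled()` utility introduced further down in this patch:

import os
from typing import Any, Dict


def fault_tolerant_enabled() -> bool:
    # same opt-in switch used by the patch: loop state is only persisted when set to "1"
    return os.getenv("PL_FAULT_TOLERANT_TRAINING", "0") == "1"


def dump_checkpoint(model_state: Dict[str, Any], loops_state: Dict[str, Any]) -> Dict[str, Any]:
    # stand-in for the connector's dump_checkpoint: weights always, loop state only when enabled
    checkpoint = {"state_dict": model_state}
    if fault_tolerant_enabled():
        checkpoint["loops"] = loops_state
    return checkpoint


os.environ["PL_FAULT_TOLERANT_TRAINING"] = "1"
checkpoint = dump_checkpoint(
    model_state={"layer.weight": [0.0]},
    loops_state={"fit_loop": {}, "validate_loop": {}, "test_loop": {}, "predict_loop": {}},
)
assert set(checkpoint["loops"]) == {"fit_loop", "validate_loop", "test_loop", "predict_loop"}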
diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 06ae55a1ca672..a71356710b5a7 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -15,6 +15,7 @@ from weakref import proxy import pytorch_lightning as pl +from pytorch_lightning.trainer.progress import OptimizationProgress from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -48,6 +49,8 @@ def update_learning_rates( if opt_indices is None: opt_indices = [] + progress: OptimizationProgress = self.trainer.fit_loop.epoch_loop.batch_loop.optim_progress + for scheduler_idx, lr_scheduler in enumerate(self.trainer.lr_schedulers): if isinstance(lr_scheduler['opt_idx'], int) and lr_scheduler['opt_idx'] not in opt_indices: continue @@ -83,11 +86,15 @@ def update_learning_rates( # update LR old_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] + progress.scheduler.increment_ready() + if lr_scheduler['reduce_on_plateau']: lr_scheduler['scheduler'].step(monitor_val) else: lr_scheduler['scheduler'].step() + progress.scheduler.increment_completed() + new_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] if self.trainer.dev_debugger.enabled: diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 1098957033855..db2321365bfa6 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -35,13 +35,11 @@ def from_state_dict(cls, state_dict: dict) -> "BaseProgress": class Tracker(BaseProgress): """ Track an event's progress. - Args: ready: Intended to track the number of events ready to start. started: Intended to be incremented after the event is started (e.g. after ``on_*_start`` runs). processed: Intended to be incremented after the event is processed. completed: Intended to be incremented after the event completes (e.g. after ``on_*_end`` runs). - Attributes set to ``None`` are treated as unused and are restricted. """ @@ -88,7 +86,6 @@ def reset_on_restart(self): class Progress(BaseProgress): """ Track aggregated and current progress. - Args: total: Intended to track the total progress of an event current: Intended to track the current progress of an event @@ -130,14 +127,39 @@ def load_state_dict(self, state_dict: dict) -> None: self.current.load_state_dict(state_dict["current"]) +@dataclass class BatchProgress(Progress): """ Tracks the batch progress + Args: + total: Tracks the total epoch progress + current: Tracks the current epoch progress + """ + +@dataclass +class TrainingEpochProgress(Progress): + """ + Tracks the batch progress Args: total: Tracks the total epoch progress current: Tracks the current epoch progress """ + should_check_val: bool = False + + def load_state_dict(self, state_dict: dict) -> None: + super().load_state_dict(state_dict) + self.should_check_val = state_dict["should_check_val"] + + +@dataclass +class DataLoaderProgress(Progress): + + dataloader_idx: int = 0 + + def load_state_dict(self, state_dict: dict) -> None: + super().load_state_dict(state_dict) + self.dataloader_idx = state_dict["dataloader_idx"] @dataclass @@ -145,13 +167,12 @@ class EpochProgress(Progress): """ Tracks the epoch progress These counters are local to a trainer rank. By default, they are not globally synced across all ranks. 
- Args: total: Tracks the total epoch progress current: Tracks the current epoch progress batch: Tracks batch progress. """ - + dataloader_idx: int = 0 batch: BatchProgress = field(default_factory=BatchProgress) def reset_on_epoch(self) -> None: @@ -160,13 +181,13 @@ def reset_on_epoch(self) -> None: def load_state_dict(self, state_dict: dict) -> None: super().load_state_dict(state_dict) self.batch.load_state_dict(state_dict["batch"]) + self.dataloader_idx = state_dict["dataloader_idx"] @dataclass class OptimizerProgress(BaseProgress): """ Track optimizer progress. - Args: step: Tracks ``optimizer.step`` calls. zero_grad: Tracks ``optimizer.zero_grad`` calls. @@ -188,13 +209,13 @@ def load_state_dict(self, state_dict: dict) -> None: class OptimizationProgress(BaseProgress): """ Track optimization progress. - Args: optimizer: Tracks optimizer progress. scheduler: Tracks scheduler progress. """ # TODO: support for multiple optimizers + optimizer_idx: int = 0 optimizer: OptimizerProgress = field(default_factory=OptimizerProgress) scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) @@ -213,6 +234,7 @@ def reset_on_epoch(self) -> None: def load_state_dict(self, state_dict: dict) -> None: self.optimizer.load_state_dict(state_dict["optimizer"]) self.scheduler.load_state_dict(state_dict["scheduler"]) + self.optimizer_idx = state_dict["optimizer_idx"] @dataclass @@ -220,11 +242,9 @@ class EpochLoopProgress(BaseProgress): """ Tracks epoch loop progress. These counters are local to a trainer rank. By default, they are not globally synced across all ranks. - Args: epoch: Tracks epochs progress. """ - epoch: EpochProgress = field(default_factory=EpochProgress) def increment_epoch_completed(self) -> None: @@ -237,42 +257,3 @@ def reset_on_epoch(self) -> None: def load_state_dict(self, state_dict: dict) -> None: self.epoch.load_state_dict(state_dict["epoch"]) - - -@dataclass -class TrainingEpochProgress(EpochProgress): - """ - Extends ``EpochProgress`` with training specific attributes - - Args: - total: Tracks the total epoch progress. - current: Tracks the current epoch progress. - batch: Tracks batch progress. - optim: Tracks optimization progress. - val: Tracks val_loop progress. - """ - - optim: OptimizationProgress = field(default_factory=OptimizationProgress) - val: EpochLoopProgress = field(default_factory=EpochLoopProgress) - - def load_state_dict(self, state_dict: dict) -> None: - super().load_state_dict(state_dict) - self.optim.load_state_dict(state_dict["optim"]) - self.val.load_state_dict(state_dict["val"]) - - -@dataclass -class FitLoopProgress(EpochLoopProgress): - """ - Extends ``EpochLoopProgress`` with fit specific attributes - - Args: - epoch: Tracks epochs progress. 
- """ - - epoch: TrainingEpochProgress = field(default_factory=TrainingEpochProgress) - - def reset_on_epoch(self) -> None: - # do not reset `epoch.current` as it should track the number of epochs this `fit` call - self.epoch.reset_on_epoch() - self.epoch.optim.reset_on_epoch() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 7475cd9c81326..32b61992166a0 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -57,7 +57,6 @@ from pytorch_lightning.trainer.logging import TrainerLoggingMixin from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin -from pytorch_lightning.trainer.progress import EpochLoopProgress, FitLoopProgress from pytorch_lightning.trainer.properties import TrainerProperties from pytorch_lightning.trainer.states import TrainerFn, TrainerState, TrainerStatus from pytorch_lightning.trainer.training_tricks import TrainerTrainingTricksMixin @@ -359,10 +358,10 @@ def __init__( self.validate_loop = EvaluationLoop() self.test_loop = EvaluationLoop() self.predict_loop = PredictionLoop() - self.fit_loop.connect(self, progress=FitLoopProgress()) - self.validate_loop.connect(self, progress=EpochLoopProgress()) - self.test_loop.connect(self, progress=EpochLoopProgress()) - self.predict_loop.connect(self, progress=EpochLoopProgress()) + self.fit_loop.connect(self) + self.validate_loop.connect(self) + self.test_loop.connect(self) + self.predict_loop.connect(self) # training state if weights_summary is not None and weights_summary not in ModelSummary.MODES: diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index 3125a2d38f15e..fdd5382ca751d 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -14,6 +14,7 @@ """General utilities""" import importlib import operator +import os import platform import sys from importlib.util import find_spec @@ -101,3 +102,7 @@ def _compare_version(package: str, op, version) -> bool: _IPU_AVAILABLE = poptorch.ipuHardwareIsAvailable() else: _IPU_AVAILABLE = False + + +def fault_tolerant_enabled(): + return os.getenv("PL_FAULT_TOLERANT_TRAINING", "0") == "1" diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index a3bbd5a36a2c1..ec203ae7cac76 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -11,19 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os from copy import deepcopy +from unittest import mock import pytest +import torch +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.progress import ( BatchProgress, EpochLoopProgress, EpochProgress, - FitLoopProgress, + OptimizationProgress, OptimizerProgress, Progress, Tracker, ) +from tests.helpers import BoringModel + + +class CustomException(BaseException): + pass def test_progress_geattr_setattr(): @@ -135,74 +145,9 @@ def test_optimizer_progress_default_factory(): assert p2.step.total.completed == 0 -def test_fit_loop_progress_serialization(): - fit_loop = FitLoopProgress() - _ = deepcopy(fit_loop) - fit_loop.epoch.increment_completed() # check `TrainingEpochProgress.load_state_dict` calls `super` - - state_dict = fit_loop.state_dict() - # yapf: disable - assert state_dict == { - 'epoch': { - # number of epochs across `fit` calls - 'total': {'completed': 1, 'processed': 0, 'ready': 0, 'started': 0}, - # number of epochs this `fit` call - 'current': {'completed': 1, 'processed': 0, 'ready': 0, 'started': 0}, - 'batch': { - # number of batches across `fit` calls - 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - # number of batches this epoch - 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - }, - # `fit` optimization progress - 'optim': { - # optimizers progress - 'optimizer': { - 'step': { - # `optimizer.step` calls across `fit` calls - 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, - # `optimizer.step` calls this epoch - 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, - }, - 'zero_grad': { - # `optimizer.zero_grad` calls across `fit` calls - 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, - # `optimizer.zero_grad` calls this epoch - 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': 0}, - }, - }, - 'scheduler': { - # `scheduler.step` calls across `fit` calls - 'total': {'completed': 0, 'processed': None, 'ready': 0, 'started': None}, - # `scheduler.step` calls this epoch - 'current': {'completed': 0, 'processed': None, 'ready': 0, 'started': None}, - }, - }, - # `fit` validation progress - 'val': { - 'epoch': { - # number of `validation` calls across `fit` calls - 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - # number of `validation` calls this `fit` call - 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - 'batch': { - # number of batches across `fit` `validation` calls - 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - # number of batches this `fit` `validation` call - 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - }, - } - }, - } - } - # yapf: enable - - new_loop = FitLoopProgress.from_state_dict(state_dict) - assert fit_loop == new_loop - - def test_epoch_loop_progress_serialization(): loop = EpochLoopProgress() + loop.epoch.dataloader_idx = 1 _ = deepcopy(loop) state_dict = loop.state_dict() @@ -219,9 +164,171 @@ def test_epoch_loop_progress_serialization(): # number of batches this `validate` call 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, }, + 'dataloader_idx': 1 } } # yapf: enable new_loop = EpochLoopProgress.from_state_dict(state_dict) assert loop == new_loop + + +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@pytest.mark.parametrize("use_multiple_optimizers", [False, True]) 
+@pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) +def test_progress_tracking(use_multiple_optimizers, accumulate_grad_batches, tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + if use_multiple_optimizers: + self.configure_optimizers = self.configure_optimizers_3 + self.should_fail = True + + def training_step(self, batch, batch_idx, optimizer_idx: int = None): + # breaking on global_step 4 + if self.should_fail and self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( + 1 if use_multiple_optimizers else None + ): + raise CustomException + return super().training_step(batch, batch_idx) + + def configure_optimizers_3(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + optimizer_1 = torch.optim.Adam(self.layer.parameters(), lr=0.1) + lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) + optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) + return [optimizer, optimizer_1, optimizer_2], \ + [lr_scheduler, {"scheduler": lr_scheduler_1, "interval": "step"}] + + model = TestModel() + model.training_epoch_end = None + + chk = ModelCheckpoint(dirpath=tmpdir, filename=str(use_multiple_optimizers), save_last=True) + chk.last_model_path = None + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=3, + limit_val_batches=0, + callbacks=chk, + accumulate_grad_batches=accumulate_grad_batches, + resume_from_checkpoint=None, + ) + + # simulate random failure in training_step + try: + trainer.fit(model) + except CustomException: + pass + + assert isinstance(trainer.fit_loop.epoch_loop.batch_loop.optim_progress, OptimizationProgress) + + pr = trainer.fit_loop.epoch_loop.progress + + assert pr.total == Tracker(ready=2, started=2, processed=1, completed=1) + assert pr.current == Tracker(ready=2, started=2, processed=1, completed=1) + + pr = trainer.fit_loop.epoch_loop.batch_loop.progress + + assert pr.total == Tracker(ready=5, started=5, processed=4, completed=4) + assert pr.current == Tracker(ready=2, started=2, processed=1, completed=1) + + num_optimizers = 3 if use_multiple_optimizers else 1 + + optim = trainer.fit_loop.epoch_loop.batch_loop.optim_progress + + # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) + total = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches + + # we raised expection on the first optimizer + current = (1 if use_multiple_optimizers else 0) + + if accumulate_grad_batches == 2 and use_multiple_optimizers: + total += 1 + + assert optim.optimizer.step.total == Tracker(ready=total + 1, started=total, processed=None, completed=total) + assert optim.optimizer.step.current == Tracker( + ready=current + 1, started=current, processed=None, completed=current + ) + + if accumulate_grad_batches == 2: + # that's weird ! todo (tchaton) investigate this + total = (9 if use_multiple_optimizers else 3) + current = 0 # same there. 
+ + assert optim.optimizer.zero_grad.total == Tracker(ready=total, started=total, processed=None, completed=total) + assert optim.optimizer.zero_grad.current == Tracker( + ready=current, started=current, processed=None, completed=current + ) + + # for multiple optimizers: 4 batches + 1 on epoch + total = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches + + if accumulate_grad_batches == 2: + total += 1 + + assert optim.scheduler.total == Tracker(ready=total, started=None, processed=None, completed=total) + # assert optim.scheduler.current == Tracker(ready=0, started=None, processed=None, completed=0) + + assert optim.optimizer_idx == (1 if use_multiple_optimizers else 0) + + checkpoint = torch.load(trainer.checkpoint_callback.last_model_path) + assert "loops" in checkpoint + + +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def test_progress_tracking_validation_multiple_datasets(tmpdir): + + class ValidationModel(BoringModel): + + def __init__(self): + super().__init__() + + def validation_step(self, batch, batch_idx, dataloader_idx): + if self.trainer.fit_loop.epoch_loop.batch_idx == 3 and batch_idx == 1 and dataloader_idx == 1: + raise CustomException + return super().validation_step(batch, batch_idx) + + def val_dataloader(self): + return [super().val_dataloader(), super().val_dataloader(), super().val_dataloader()] + + model = ValidationModel() + model.validation_epoch_end = None + + chk = ModelCheckpoint(dirpath=tmpdir, save_last=True) + chk.last_model_path = None + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=5, + limit_val_batches=3, + callbacks=chk, + resume_from_checkpoint=None, + val_check_interval=2, + num_sanity_val_steps=0, + ) + + # simulate random failure in training_step + try: + trainer.fit(model) + except CustomException: + pass + + pr = trainer.fit_loop.epoch_loop.val_loop.progress + + assert pr.total == Tracker(ready=2, started=2, processed=1, completed=1) + assert pr.current == Tracker(ready=1, started=1, processed=0, completed=0) + assert pr.dataloader_idx == 1 + + assert trainer.fit_loop.epoch_loop.progress.should_check_val + + pr = trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress + + # 3 dataloaders with 3 samples for batch_idx == 1 + first dataloader on batch_idx == 1 + failure on batch_idx = 1 + current = 2 + total = 3 * 3 + 3 + current + assert pr.total == Tracker(ready=total, started=total, processed=total - 1, completed=total - 1) + assert pr.current == Tracker(ready=current, started=current, processed=current - 1, completed=current - 1) From 22fa5fb2903220ce469352e06ad626e3b69728a7 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 14:46:52 +0200 Subject: [PATCH 020/106] validate json --- .../loops/batch/training_batch_loop.py | 2 +- pytorch_lightning/trainer/trainer.py | 7 + tests/trainer/test_progress.py | 124 +++++++++++++----- 3 files changed, 98 insertions(+), 35 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 89ea775977817..55244e7bfa1c0 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -110,7 +110,7 @@ def reset(self) -> None: self.batch_idx = 0 self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))] - self.optim_progress.optimizer.reset_on_epoch() + self.optim_progress.reset_on_epoch() def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): """Splits the data 
into tbptt splits diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 32b61992166a0..d4df26941f919 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -76,6 +76,7 @@ from pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import fault_tolerant_enabled from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT, EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -996,6 +997,7 @@ def _run_train(self) -> None: self.training_type_plugin.reconciliate_processes(traceback.format_exc()) # give accelerators a chance to finish self.accelerator.on_train_end() + self.on_expection() # reset bookkeeping self.state.stage = None raise @@ -1235,3 +1237,8 @@ def _log_device_info(self) -> None: "IPU available but not used. Set the `ipus` flag in your trainer" " `Trainer(ipus=8)` or script `--ipus=8`." ) + + def on_expection(self): + if fault_tolerant_enabled(): + # save a checkpoint for fault tolerant training + self.fit_loop._check_checkpoint_callback(True) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index ec203ae7cac76..187b10e9dd0df 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -24,7 +24,6 @@ BatchProgress, EpochLoopProgress, EpochProgress, - OptimizationProgress, OptimizerProgress, Progress, Tracker, @@ -224,59 +223,116 @@ def configure_optimizers_3(self): except CustomException: pass - assert isinstance(trainer.fit_loop.epoch_loop.batch_loop.optim_progress, OptimizationProgress) + ####################### + # VALIDATE CHECKPOINT # + ####################### - pr = trainer.fit_loop.epoch_loop.progress - - assert pr.total == Tracker(ready=2, started=2, processed=1, completed=1) - assert pr.current == Tracker(ready=2, started=2, processed=1, completed=1) - - pr = trainer.fit_loop.epoch_loop.batch_loop.progress - - assert pr.total == Tracker(ready=5, started=5, processed=4, completed=4) - assert pr.current == Tracker(ready=2, started=2, processed=1, completed=1) + checkpoint = torch.load(trainer.checkpoint_callback.last_model_path) num_optimizers = 3 if use_multiple_optimizers else 1 - optim = trainer.fit_loop.epoch_loop.batch_loop.optim_progress - # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) - total = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches + total_optimizer_step = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches # we raised expection on the first optimizer - current = (1 if use_multiple_optimizers else 0) + current_optimize_step = (1 if use_multiple_optimizers else 0) if accumulate_grad_batches == 2 and use_multiple_optimizers: - total += 1 + total_optimizer_step += 1 - assert optim.optimizer.step.total == Tracker(ready=total + 1, started=total, processed=None, completed=total) - assert optim.optimizer.step.current == Tracker( - ready=current + 1, started=current, processed=None, completed=current - ) + total_optimizer_zero_grad = total_optimizer_step + current_optimizer_zero_grad = current_optimize_step if accumulate_grad_batches == 2: # that's weird ! 
todo (tchaton) investigate this - total = (9 if use_multiple_optimizers else 3) - current = 0 # same there. + total_optimizer_zero_grad = (9 if use_multiple_optimizers else 3) + current_optimizer_zero_grad = 0 # same there. - assert optim.optimizer.zero_grad.total == Tracker(ready=total, started=total, processed=None, completed=total) - assert optim.optimizer.zero_grad.current == Tracker( - ready=current, started=current, processed=None, completed=current - ) + total_scheduler_step = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches - # for multiple optimizers: 4 batches + 1 on epoch - total = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches + current_scheduler_step = 0 if accumulate_grad_batches == 2: - total += 1 + total_scheduler_step += 1 - assert optim.scheduler.total == Tracker(ready=total, started=None, processed=None, completed=total) - # assert optim.scheduler.current == Tracker(ready=0, started=None, processed=None, completed=0) + optimizer_idx = (1 if use_multiple_optimizers else 0) - assert optim.optimizer_idx == (1 if use_multiple_optimizers else 0) + # yapf: disable + expected = { + "state_dict": {}, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, + "current": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, + "should_check_val": False, + }, + "epoch_loop.batch_loop.state_dict": {}, + "epoch_loop.batch_loop.progress": { + "total": {"ready": 5, "started": 5, "processed": 4, "completed": 4}, + "current": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, + }, + "epoch_loop.batch_loop.optim_progress": { + "optimizer_idx": optimizer_idx, + "optimizer": { + "step": { + "total": { + "ready": total_optimizer_step + 1, + "started": total_optimizer_step, + "processed": None, + "completed": total_optimizer_step + }, + "current": { + "ready": current_optimize_step + 1, + "started": current_optimize_step, + "processed": None, + "completed": current_optimize_step, + }, + }, + "zero_grad": { + "total": { + "ready": total_optimizer_zero_grad, + "started": total_optimizer_zero_grad, + "processed": None, + "completed": total_optimizer_zero_grad + }, + "current": { + "ready": current_optimizer_zero_grad, + "started": current_optimizer_zero_grad, + "processed": None, + "completed": current_optimizer_zero_grad, + }, + }, + }, + "scheduler": { + "total": { + "ready": total_scheduler_step, + "started": None, + "processed": None, + "completed": total_scheduler_step + }, + "current": { + "ready": current_scheduler_step, + "started": None, + "processed": None, + "completed": current_scheduler_step + }, + }, + }, + "epoch_loop.val_loop.state_dict": {}, + "epoch_loop.val_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, + }, + "epoch_loop.val_loop.epoch_loop.state_dict": {}, + "epoch_loop.val_loop.epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + } + # yapf: enable - checkpoint = torch.load(trainer.checkpoint_callback.last_model_path) - assert "loops" in checkpoint + assert checkpoint["loops"]["fit_loop"] == expected @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) From 6d45fe26e3542a9523e792263b1e968e28f16fd1 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 14:49:45 +0200 Subject: [PATCH 
021/106] update --- tests/trainer/test_progress.py | 35 +++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 187b10e9dd0df..043233532379a 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -205,12 +205,14 @@ def configure_optimizers_3(self): model = TestModel() model.training_epoch_end = None + limit_train_batches = 3 + chk = ModelCheckpoint(dirpath=tmpdir, filename=str(use_multiple_optimizers), save_last=True) chk.last_model_path = None trainer = Trainer( default_root_dir=tmpdir, max_epochs=2, - limit_train_batches=3, + limit_train_batches=limit_train_batches, limit_val_batches=0, callbacks=chk, accumulate_grad_batches=accumulate_grad_batches, @@ -229,6 +231,9 @@ def configure_optimizers_3(self): checkpoint = torch.load(trainer.checkpoint_callback.last_model_path) + num_epochs = 1 + num_batches = 4 + num_optimizers = 3 if use_multiple_optimizers else 1 # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) @@ -262,14 +267,34 @@ def configure_optimizers_3(self): "state_dict": {}, "epoch_loop.state_dict": {}, "epoch_loop.progress": { - "total": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, - "current": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, + "total": { + "ready": num_epochs + 1, + "started": num_epochs + 1, + "processed": 1, + "completed": 1 + }, + "current": { + "ready": num_epochs + 1, + "started": num_epochs + 1, + "processed": 1, + "completed": 1 + }, "should_check_val": False, }, "epoch_loop.batch_loop.state_dict": {}, "epoch_loop.batch_loop.progress": { - "total": {"ready": 5, "started": 5, "processed": 4, "completed": 4}, - "current": {"ready": 2, "started": 2, "processed": 1, "completed": 1}, + "total": { + "ready": num_batches + 1, + "started": num_batches + 1, + "processed": num_batches, + "completed": num_batches + }, + "current": { + "ready": num_batches - limit_train_batches + 1, + "started": num_batches - limit_train_batches + 1, + "processed": num_batches - limit_train_batches, + "completed": num_batches - limit_train_batches + }, }, "epoch_loop.batch_loop.optim_progress": { "optimizer_idx": optimizer_idx, From 71d01d696b984bba161316b0d01d8549cfe52d7f Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 14:55:53 +0200 Subject: [PATCH 022/106] convert to dict for better readability --- tests/trainer/test_progress.py | 46 ++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 043233532379a..053373f354989 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -398,18 +398,48 @@ def val_dataloader(self): except CustomException: pass - pr = trainer.fit_loop.epoch_loop.val_loop.progress + ####################### + # VALIDATE CHECKPOINT # + ####################### - assert pr.total == Tracker(ready=2, started=2, processed=1, completed=1) - assert pr.current == Tracker(ready=1, started=1, processed=0, completed=0) - assert pr.dataloader_idx == 1 + checkpoint = torch.load(trainer.checkpoint_callback.last_model_path)["loops"]["fit_loop"] - assert trainer.fit_loop.epoch_loop.progress.should_check_val + checkpoint = torch.load(trainer.checkpoint_callback.last_model_path)["loops"]["fit_loop"] - pr = trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress + expected = { + "total": { + "ready": 2, + "started": 2, + "processed": 1, + 
"completed": 1 + }, + "current": { + "ready": 1, + "started": 1, + "processed": 0, + "completed": 0 + }, + "dataloader_idx": 1, + } + + assert checkpoint["epoch_loop.val_loop.progress"] == expected # 3 dataloaders with 3 samples for batch_idx == 1 + first dataloader on batch_idx == 1 + failure on batch_idx = 1 current = 2 total = 3 * 3 + 3 + current - assert pr.total == Tracker(ready=total, started=total, processed=total - 1, completed=total - 1) - assert pr.current == Tracker(ready=current, started=current, processed=current - 1, completed=current - 1) + expected = { + "total": { + "ready": total, + "started": total, + "processed": total - 1, + "completed": total - 1 + }, + "current": { + "ready": current, + "started": current, + "processed": current - 1, + "completed": current - 1 + }, + } + + assert checkpoint["epoch_loop.val_loop.epoch_loop.progress"] == expected From 1c6c5661e29f74619b16de8362661a2a9755fe47 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 15:14:57 +0200 Subject: [PATCH 023/106] validate reload --- tests/trainer/test_progress.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 053373f354989..6322ab5be33bb 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -359,6 +359,15 @@ def configure_optimizers_3(self): assert checkpoint["loops"]["fit_loop"] == expected + trainer = Trainer() + trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"], restart_progress=False) + assert trainer.fit_loop.state_dict() == checkpoint["loops"]["fit_loop"] + + trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"]) + state_dict = trainer.fit_loop.state_dict() + assert state_dict != checkpoint["loops"]["fit_loop"] + assert state_dict['epoch_loop.progress']["total"]["started"] == num_epochs + @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) def test_progress_tracking_validation_multiple_datasets(tmpdir): @@ -443,3 +452,10 @@ def val_dataloader(self): } assert checkpoint["epoch_loop.val_loop.epoch_loop.progress"] == expected + + trainer = Trainer() + trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) + assert trainer.fit_loop.state_dict() == checkpoint + + trainer.fit_loop.load_state_dict(checkpoint) + assert trainer.fit_loop.state_dict() != checkpoint From bc49cc72829cdd96c1c94af42df42948595a80ca Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 15:16:03 +0200 Subject: [PATCH 024/106] update --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14db839d0d6e6..399795869209f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -140,6 +140,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307)) +- Added `progress` tracking on loops ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) + + ### Changed From 0a0b5e35eff02ddedf5d9998a5a2665eec09d564 Mon Sep 17 00:00:00 2001 From: tchaton Date: Sat, 10 Jul 2021 19:27:16 +0200 Subject: [PATCH 025/106] update --- tests/loops/test_loop_progress_integration.py | 20 +- tests/loops/test_loop_state_dict.py | 201 ++++++------------ 2 files changed, 72 insertions(+), 149 deletions(-) diff --git a/tests/loops/test_loop_progress_integration.py b/tests/loops/test_loop_progress_integration.py index 986ea2543d6d8..82d56fb5dd872 100644 --- a/tests/loops/test_loop_progress_integration.py +++ b/tests/loops/test_loop_progress_integration.py @@ -3,20 +3,16 @@ def test_loop_progress_integration(): trainer = Trainer() - fit_loop = trainer.fit_loop - # check identities inside the fit loop - assert fit_loop.progress.epoch is fit_loop.epoch_loop.progress - assert fit_loop.epoch_loop.progress.batch is fit_loop.epoch_loop.batch_loop.progress - assert fit_loop.epoch_loop.progress.optim is fit_loop.epoch_loop.batch_loop.optim_progress - assert fit_loop.epoch_loop.progress.val is fit_loop.epoch_loop.val_loop.progress - assert fit_loop.epoch_loop.val_loop.progress.epoch is fit_loop.epoch_loop.val_loop.epoch_loop.progress - # check identities inside the evaluation and predict loops - assert trainer.validate_loop.progress.epoch is trainer.validate_loop.epoch_loop.progress - assert trainer.test_loop.progress.epoch is trainer.test_loop.epoch_loop.progress - assert trainer.predict_loop.progress.epoch is trainer.predict_loop.epoch_loop.progress # check no progresses are shared - assert trainer.fit_loop.progress is not trainer.validate_loop.progress assert trainer.validate_loop.progress is not trainer.test_loop.progress assert trainer.test_loop.progress is not trainer.predict_loop.progress # check the validation progresses are not shared assert trainer.fit_loop.epoch_loop.val_loop.progress is not trainer.validate_loop.progress + expected = trainer.fit_loop.loop_progress["epoch_loop"]["progress"] + assert expected == trainer.fit_loop.epoch_loop.progress + expected = trainer.fit_loop.loop_progress["epoch_loop"]["batch_loop"]["progress"] + assert expected == trainer.fit_loop.epoch_loop.batch_loop.progress + expected = trainer.fit_loop.loop_progress["epoch_loop"]["val_loop"]["progress"] + assert expected == trainer.fit_loop.epoch_loop.val_loop.progress + expected = trainer.fit_loop.loop_progress["epoch_loop"]["val_loop"]["epoch_loop"]["progress"] + assert expected == trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index eed23a89a8b36..591f0c0f297b8 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -44,176 +44,101 @@ def test_loops_state_dict_structure(): # yapf: disable expected = { "fit_loop": { - "epoch_loop": { - "batch_loop": { - "state_dict": {}, - "progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "state_dict": {}, + "epoch_loop.state_dict": {}, + "epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "should_check_val": False, + }, + "epoch_loop.batch_loop.state_dict": {}, + "epoch_loop.batch_loop.progress": { + "total": {"ready": 
0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + "epoch_loop.batch_loop.optim_progress": { + "optimizer_idx": 0, + "optimizer": { + "step": { + "total": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, + }, "current": { "ready": 0, "started": 0, - "processed": 0, + "processed": None, "completed": 0, }, }, - "optim_progress": { - "optimizer": { - "step": { - "total": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - }, - "zero_grad": { - "total": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - }, - }, - "scheduler": { - "total": { - "ready": 0, - "started": None, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": None, - "processed": None, - "completed": 0, - }, + "zero_grad": { + "total": { + "ready": 0, + "started": 0, + "processed": None, + "completed": 0, }, - }, - }, - "val_loop": { - "state_dict": {}, - "progress": { - "epoch": { - "total": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "batch": { - "total": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, - } - }, - "epoch_loop.state_dict": {}, - "epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": { "ready": 0, "started": 0, - "processed": 0, + "processed": None, "completed": 0, }, - "batch": { - "total": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, }, }, - } + "scheduler": { + "total": { + "ready": 0, + "started": None, + "processed": None, + "completed": 0, + }, + "current": { + "ready": 0, + "started": None, + "processed": None, + "completed": 0, + }, + }, + }, + "epoch_loop.val_loop.state_dict": {}, + "epoch_loop.val_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, + }, + "epoch_loop.val_loop.epoch_loop.state_dict": {}, + "epoch_loop.val_loop.epoch_loop.progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, }, "validate_loop": { "state_dict": {}, "progress": { - "epoch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, - } + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, }, "epoch_loop.state_dict": {}, "epoch_loop.progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, 
- "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - }, }, }, "test_loop": { "state_dict": {}, "progress": { - "epoch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, - } + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, }, "epoch_loop.state_dict": {}, "epoch_loop.progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - }, }, }, "predict_loop": { @@ -222,6 +147,7 @@ def test_loops_state_dict_structure(): "epoch": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, "batch": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": { @@ -237,6 +163,7 @@ def test_loops_state_dict_structure(): "epoch_loop.progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "dataloader_idx": 0, "batch": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, From 45fb6576c234220e0b6ffa9babf1bf1c1157af40 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 11:58:45 +0200 Subject: [PATCH 026/106] update on comments --- pytorch_lightning/core/optimizer.py | 1 - pytorch_lightning/loops/batch/training_batch_loop.py | 11 +++++++++-- pytorch_lightning/loops/epoch/training_epoch_loop.py | 1 + .../trainer/connectors/optimizer_connector.py | 9 +++------ 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/core/optimizer.py b/pytorch_lightning/core/optimizer.py index 33b44a35d31f1..25e4519eb39fc 100644 --- a/pytorch_lightning/core/optimizer.py +++ b/pytorch_lightning/core/optimizer.py @@ -207,7 +207,6 @@ def closure_dis(): profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}" self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs) - self._trainer.fit_loop.epoch_loop.batch_loop.optim_progress.optimizer.step.increment_processed() self._total_optimizer_step_calls += 1 def __repr__(self): diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 55244e7bfa1c0..8b4ed1140144c 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -28,7 +28,7 @@ from pytorch_lightning.loops.base import Loop from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import OptimizationProgress, Progress +from pytorch_lightning.trainer.progress import BatchProgress, OptimizationProgress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, 
grad_norm from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -50,7 +50,7 @@ def __init__(self) -> None: self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) self.batch_idx: int = 0 self.split_idx: Optional[int] = None - self.progress = Progress() + self.progress = BatchProgress() self.optim_progress = OptimizationProgress() self._warning_cache: WarningCache = WarningCache() self._hiddens: Optional[Tensor] = None @@ -441,6 +441,7 @@ def _optimizer_step( using_lbfgs=is_lbfgs, ) + self.optim_progress.optimizer.step.increment_processed() self.optim_progress.optimizer.step.increment_completed() def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: @@ -724,3 +725,9 @@ def _truncated_bptt_steps(self) -> int: if lightning_module.truncated_bptt_steps > 0: return lightning_module.truncated_bptt_steps return self.trainer.truncated_bptt_steps or 0 + + def increment_scheduler_ready(self): + self.optim_progress.scheduler.increment_ready() + + def increment_scheduler_completed(self): + self.optim_progress.scheduler.increment_completed() diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index af4e4fc52d63f..8f6cc13e64fd5 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -394,6 +394,7 @@ def update_lr_schedulers(self, interval: str, update_plateau_schedulers: bool) - """updates the lr schedulers based on the given interval""" if interval == "step" and self.batch_loop.should_accumulate(): return + self.trainer.optimizer_connector.update_learning_rates( interval=interval, update_plateau_schedulers=update_plateau_schedulers, diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index a71356710b5a7..16b751e7db4b9 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -15,7 +15,6 @@ from weakref import proxy import pytorch_lightning as pl -from pytorch_lightning.trainer.progress import OptimizationProgress from pytorch_lightning.utilities import rank_zero_warn from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -49,8 +48,6 @@ def update_learning_rates( if opt_indices is None: opt_indices = [] - progress: OptimizationProgress = self.trainer.fit_loop.epoch_loop.batch_loop.optim_progress - for scheduler_idx, lr_scheduler in enumerate(self.trainer.lr_schedulers): if isinstance(lr_scheduler['opt_idx'], int) and lr_scheduler['opt_idx'] not in opt_indices: continue @@ -86,17 +83,17 @@ def update_learning_rates( # update LR old_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] - progress.scheduler.increment_ready() + self.trainer.fit_loop.epoch_loop.batch_loop.increment_scheduler_ready() if lr_scheduler['reduce_on_plateau']: lr_scheduler['scheduler'].step(monitor_val) else: lr_scheduler['scheduler'].step() - progress.scheduler.increment_completed() - new_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] + self.trainer.fit_loop.epoch_loop.batch_loop.increment_scheduler_completed() + if self.trainer.dev_debugger.enabled: self.trainer.dev_debugger.track_lr_schedulers_update( self.trainer.fit_loop.batch_idx, From 65821c9f79b02b044c61a6cd623c9048b32aa826 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 13:32:47 +0200 Subject: [PATCH 027/106] remove deadcode --- 
pytorch_lightning/loops/base.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 61d445fff760f..9997baac79cc5 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -54,33 +54,6 @@ def __init__(self) -> None: def trainer(self) -> Optional['pl.Trainer']: return self._trainer - @trainer.setter - def trainer(self, trainer: 'pl.Trainer'): - """Connect the Trainer to this loop and all children.""" - if not isinstance(trainer, pl.Trainer) and trainer is not None: - raise MisconfigurationException( - f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." - ) - self._trainer = trainer - for v in self.__dict__.values(): - if isinstance(v, Loop): - v.trainer = trainer - - @property - def loop_progress(self) -> Dict[str, Any]: - """Return the progress for the current loop and children loop.""" - progress = {} - for k, v in self.__dict__.items(): - if isinstance(v, BaseProgress): - progress[k] = v - elif isinstance(v, Loop): - progress[k] = v.loop_progress - return progress - - @property - def trainer(self) -> Optional['pl.Trainer']: - return self._trainer - @trainer.setter def trainer(self, trainer: 'pl.Trainer'): """Connect the Trainer to itself and all its children loops""" @@ -126,9 +99,6 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: Returns: the output of :attr:`on_run_end` (often outputs collected from each step of the loop) """ - if self.trainer is None: - raise MisconfigurationException(f"The {self.__class__.__name__} Loop hasn't been attached to any Trainer.") - if self.skip: return self.on_skip() From d0492b519d1915473a2acc4bade4c6d5cf0b3c3c Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 13:33:58 +0200 Subject: [PATCH 028/106] clean changelog --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47c5c5e8a866c..4fe41a10fa655 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -205,7 +205,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). * Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/PyTorchLightning/pytorch-lightning/pull/7700), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/PyTorchLightning/pytorch-lightning/pull/8094)) * Moved result teardown to the loops ([#8245](https://github.com/PyTorchLightning/pytorch-lightning/pull/8245)) - * Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) - Refactored logging * Renamed and moved `core/step_result.py` to `trainer/connectors/logger_connector/result.py` ([#7736](https://github.com/PyTorchLightning/pytorch-lightning/pull/7736)) From 462b35718032012a2988d82d0f9140c63d3165df Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 13:34:44 +0200 Subject: [PATCH 029/106] clean changelog --- CHANGELOG.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4fe41a10fa655..ef98267575d0d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -205,6 +205,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
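The `loop_progress` property deleted above walked a loop's children recursively to build a nested view of every progress object; the `_collect_loop_progress` test helper referenced later in this series plays the same role from the test side. A standalone sketch of that traversal pattern, with invented toy classes standing in for `Loop` and `BaseProgress`:

    class ToyProgress:
        def __init__(self) -> None:
            self.completed = 0

    class ToyLoop:
        def __init__(self, **children: "ToyLoop") -> None:
            self.progress = ToyProgress()
            self.children = children

        def collect_progress(self) -> dict:
            # one entry for this loop's own progress, one nested dict per child loop
            out = {"progress": self.progress}
            for name, child in self.children.items():
                out[name] = child.collect_progress()
            return out

    fit_loop = ToyLoop(epoch_loop=ToyLoop(batch_loop=ToyLoop()))
    tree = fit_loop.collect_progress()
    assert tree["epoch_loop"]["batch_loop"]["progress"] is \
        fit_loop.children["epoch_loop"].children["batch_loop"].progress

The returned structure keeps references to the live progress objects, which is what lets the tests assert identity between the collected tree and the loop attributes.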
* Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/PyTorchLightning/pytorch-lightning/pull/7700), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/PyTorchLightning/pytorch-lightning/pull/8094)) * Moved result teardown to the loops ([#8245](https://github.com/PyTorchLightning/pytorch-lightning/pull/8245)) + * Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) - Refactored logging * Renamed and moved `core/step_result.py` to `trainer/connectors/logger_connector/result.py` ([#7736](https://github.com/PyTorchLightning/pytorch-lightning/pull/7736)) @@ -291,9 +292,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - `Trainer(resume_from_checkpoint=...)` now restores the model directly after `LightningModule.setup()`, which is before `LightningModule.configure_sharded_model()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652)) -- Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) - - ### Deprecated From 8c0426b8e82882e7209b4fae6bed14efe7803e23 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 13:35:31 +0200 Subject: [PATCH 030/106] update --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef98267575d0d..0fb37fa52af79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -375,6 +375,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `optimizer` argument in `LightningModule.manual_backward()`; Toggling optimizers in manual optimization should be done using `LightningModule.{un}toggle_optimizer()` ([#8287](https://github.com/PyTorchLightning/pytorch-lightning/pull/8287)) + + ### Fixed - Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/PyTorchLightning/pytorch-lightning/pull/7877)) From b7c411325d968518285b81acffc944a9f2e97a1b Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 14:05:50 +0200 Subject: [PATCH 031/106] update on comments --- tests/loops/test_loop_progress_integration.py | 9 ++-- tests/loops/test_loops.py | 49 +++++++++++-------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/tests/loops/test_loop_progress_integration.py b/tests/loops/test_loop_progress_integration.py index 82d56fb5dd872..465ec7ad15655 100644 --- a/tests/loops/test_loop_progress_integration.py +++ b/tests/loops/test_loop_progress_integration.py @@ -1,4 +1,5 @@ from pytorch_lightning import Trainer +from tests.loops.test_loops import _collect_loop_progress def test_loop_progress_integration(): @@ -8,11 +9,11 @@ def test_loop_progress_integration(): assert trainer.test_loop.progress is not trainer.predict_loop.progress # check the validation progresses are not shared assert trainer.fit_loop.epoch_loop.val_loop.progress is not trainer.validate_loop.progress - expected = trainer.fit_loop.loop_progress["epoch_loop"]["progress"] + expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["progress"] assert expected == trainer.fit_loop.epoch_loop.progress - expected = trainer.fit_loop.loop_progress["epoch_loop"]["batch_loop"]["progress"] + expected = 
_collect_loop_progress(trainer.fit_loop)["epoch_loop"]["batch_loop"]["progress"] assert expected == trainer.fit_loop.epoch_loop.batch_loop.progress - expected = trainer.fit_loop.loop_progress["epoch_loop"]["val_loop"]["progress"] + expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["val_loop"]["progress"] assert expected == trainer.fit_loop.epoch_loop.val_loop.progress - expected = trainer.fit_loop.loop_progress["epoch_loop"]["val_loop"]["epoch_loop"]["progress"] + expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["val_loop"]["epoch_loop"]["progress"] assert expected == trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index aa1a0a74750a3..70e2ca7a62d3e 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from collections import OrderedDict from copy import deepcopy from dataclasses import dataclass from typing import Any, Dict, Iterator @@ -162,15 +161,20 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: loop_parent.trainer = Trainer() assert loop_child.trainer == loop_parent.trainer - assert state_dict == OrderedDict([('state_dict', { - 'a': 1 - }), ('progress', { - 'increment': 0 - }), ('loop_child.state_dict', { - 'a': 2 - }), ('loop_child.progress', { - 'increment': 0 - })]) + assert state_dict == { + 'state_dict': { + 'a': 1 + }, + 'progress': { + 'increment': 0 + }, + 'loop_child.state_dict': { + 'a': 2 + }, + 'loop_child.progress': { + 'increment': 0 + } + } loop_parent.progress @@ -190,15 +194,20 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: assert not loop_parent.restarting state_dict = loop_parent.state_dict() - assert state_dict == OrderedDict([('state_dict', { - 'a': 1 - }), ('progress', { - 'increment': 1 - }), ('loop_child.state_dict', { - 'a': 3 - }), ('loop_child.progress', { - 'increment': 0 - })]) + assert state_dict == { + 'state_dict': { + 'a': 1 + }, + 'progress': { + 'increment': 1 + }, + 'loop_child.state_dict': { + 'a': 3 + }, + 'loop_child.progress': { + 'increment': 0 + } + } loop_parent = Simple(1) loop_child = Simple(2) @@ -209,7 +218,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: del loop_parent.loop_child state_dict = loop_parent.state_dict() - assert state_dict == OrderedDict([('state_dict', {'a': 1}), ('progress', {'increment': 1})]) + assert state_dict == {'state_dict': {'a': 1}, 'progress': {'increment': 1}} grand_loop_parent = Simple(0) loop_parent = Simple(1) From 7e0456b23c75d3fac2b78cb6e7619a8ab717efac Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 12 Jul 2021 15:00:38 +0200 Subject: [PATCH 032/106] CHANGELOG --- CHANGELOG.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0fb37fa52af79..14c9adc46d25a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Progress tracking * Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) * Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) - * Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/PyTorchLightning/pytorch-lightning/pull/8244)) + * Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/PyTorchLightning/pytorch-lightning/pull/8244), [#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - Added support for passing a `LightningDataModule` positionally as the second argument to `trainer.{validate,test,predict}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431)) @@ -146,10 +146,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247)) -- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307)) - - -- Added `progress` tracking on loops ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) +- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307))`` - Added support for `save_hyperparameters` in `LightningDataModule` ([#3792](https://github.com/PyTorchLightning/pytorch-lightning/pull/3792)) @@ -375,8 +372,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `optimizer` argument in `LightningModule.manual_backward()`; Toggling optimizers in manual optimization should be done using `LightningModule.{un}toggle_optimizer()` ([#8287](https://github.com/PyTorchLightning/pytorch-lightning/pull/8287)) - - ### Fixed - Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/PyTorchLightning/pytorch-lightning/pull/7877)) From c2665328ee258702373d944c3d442a16b4eb97df Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Mon, 12 Jul 2021 15:01:58 +0200 Subject: [PATCH 033/106] CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14c9adc46d25a..2a9cd56df7478 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -146,7 +146,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247)) -- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307))`` +- Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307)) - Added support for `save_hyperparameters` in `LightningDataModule` ([#3792](https://github.com/PyTorchLightning/pytorch-lightning/pull/3792)) From 30ddd1030f9d0fc779a03b243927089bbdbb64ec Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 12 Jul 2021 15:35:07 +0200 Subject: [PATCH 034/106] Update pytorch_lightning/loops/base.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- pytorch_lightning/loops/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 9997baac79cc5..a8173d523de3d 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -56,7 +56,7 @@ def trainer(self) -> Optional['pl.Trainer']: @trainer.setter def trainer(self, trainer: 'pl.Trainer'): - """Connect the Trainer to itself and all its children loops""" + """Connects this loop's trainer and it's children""" if not isinstance(trainer, pl.Trainer): raise MisconfigurationException( f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." From ffc6ca71f6938a9594592a2897d4331a01303c90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Jul 2021 16:21:32 +0200 Subject: [PATCH 035/106] whitespace suggestions --- pytorch_lightning/loops/batch/training_batch_loop.py | 1 + pytorch_lightning/loops/dataloader/evaluation_loop.py | 1 + pytorch_lightning/loops/dataloader/prediction_loop.py | 1 + 3 files changed, 3 insertions(+) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index e76ebf704cf38..d27b2a34987b5 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -52,6 +52,7 @@ def __init__(self) -> None: self.split_idx: Optional[int] = None self.progress = BatchProgress() self.optim_progress = OptimizationProgress() + self._warning_cache: WarningCache = WarningCache() self._hiddens: Optional[Tensor] = None self._optimizer_freq_cumsum: Optional[int] = None diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index 5dc0270f58774..ba554bf9c1a29 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -35,6 +35,7 @@ def __init__(self): self.outputs = [] self.progress = DataLoaderProgress() self.epoch_loop = EvaluationEpochLoop() + self._results = ResultCollection(training=False) self._max_batches: Optional[Union[int, Sequence[int]]] = None self._has_run: bool = False diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 1bdd38ed950b0..6a58a2c78f4b1 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -21,6 +21,7 @@ def __init__(self): self.epoch_batch_indices: Optional[List[List[int]]] = None self.progress = EpochLoopProgress() self.epoch_loop = 
PredictionEpochLoop() + self._results = None # for `trainer._results` access self._return_predictions: bool = False From 9ac0b61967619eb81c493ca7b5479c2c2601bfa4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Jul 2021 14:22:45 +0000 Subject: [PATCH 036/106] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/loops/dataloader/prediction_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 6a58a2c78f4b1..345a6296578f5 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -21,7 +21,7 @@ def __init__(self): self.epoch_batch_indices: Optional[List[List[int]]] = None self.progress = EpochLoopProgress() self.epoch_loop = PredictionEpochLoop() - + self._results = None # for `trainer._results` access self._return_predictions: bool = False From 8ddb020530277bfc2462cbeed0d6c8e821d15b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Jul 2021 16:23:07 +0200 Subject: [PATCH 037/106] make fault_tolerant_enabled protected --- pytorch_lightning/loops/dataloader/prediction_loop.py | 2 +- pytorch_lightning/trainer/connectors/checkpoint_connector.py | 4 ++-- pytorch_lightning/trainer/trainer.py | 4 ++-- pytorch_lightning/utilities/imports.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 6a58a2c78f4b1..345a6296578f5 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -21,7 +21,7 @@ def __init__(self): self.epoch_batch_indices: Optional[List[List[int]]] = None self.progress = EpochLoopProgress() self.epoch_loop = PredictionEpochLoop() - + self._results = None # for `trainer._results` access self._return_predictions: bool = False diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index df1328d668305..40b59f8c93f54 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -23,7 +23,7 @@ from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.cloud_io import atomic_save, get_filesystem from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import fault_tolerant_enabled +from pytorch_lightning.utilities.imports import _fault_tolerant_enabled from pytorch_lightning.utilities.upgrade_checkpoint import KEYS_MAPPING as DEPRECATED_CHECKPOINT_KEYS if _OMEGACONF_AVAILABLE: @@ -347,7 +347,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: 'state_dict': self.trainer.accelerator.lightning_module_state_dict(), } - if fault_tolerant_enabled(): + if _fault_tolerant_enabled(): checkpoint.update({"loops": self.get_loops_state_dict()}) if not weights_only: diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index a9e316bfc5a2f..c7e5224593744 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -76,7 +76,7 @@ from 
pytorch_lightning.utilities.debugging import InternalDebugger from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import fault_tolerant_enabled +from pytorch_lightning.utilities.imports import _fault_tolerant_enabled from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import _EVALUATE_OUTPUT, _PREDICT_OUTPUT, EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -1252,6 +1252,6 @@ def _log_device_info(self) -> None: ) def on_expection(self): - if fault_tolerant_enabled(): + if _fault_tolerant_enabled(): # save a checkpoint for fault tolerant training self.fit_loop._check_checkpoint_callback(True) diff --git a/pytorch_lightning/utilities/imports.py b/pytorch_lightning/utilities/imports.py index fdd5382ca751d..347bcd1ecf544 100644 --- a/pytorch_lightning/utilities/imports.py +++ b/pytorch_lightning/utilities/imports.py @@ -104,5 +104,5 @@ def _compare_version(package: str, op, version) -> bool: _IPU_AVAILABLE = False -def fault_tolerant_enabled(): +def _fault_tolerant_enabled(): return os.getenv("PL_FAULT_TOLERANT_TRAINING", "0") == "1" From 50b6f49c1801955c71c31fcd9ea081e7dadd7efc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 12 Jul 2021 16:24:10 +0200 Subject: [PATCH 038/106] whitespace fixes around Args --- pytorch_lightning/trainer/progress.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index db2321365bfa6..e5746fa05b283 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -35,6 +35,7 @@ def from_state_dict(cls, state_dict: dict) -> "BaseProgress": class Tracker(BaseProgress): """ Track an event's progress. + Args: ready: Intended to track the number of events ready to start. started: Intended to be incremented after the event is started (e.g. after ``on_*_start`` runs). @@ -86,6 +87,7 @@ def reset_on_restart(self): class Progress(BaseProgress): """ Track aggregated and current progress. + Args: total: Intended to track the total progress of an event current: Intended to track the current progress of an event @@ -131,6 +133,7 @@ def load_state_dict(self, state_dict: dict) -> None: class BatchProgress(Progress): """ Tracks the batch progress + Args: total: Tracks the total epoch progress current: Tracks the current epoch progress @@ -141,6 +144,7 @@ class BatchProgress(Progress): class TrainingEpochProgress(Progress): """ Tracks the batch progress + Args: total: Tracks the total epoch progress current: Tracks the current epoch progress @@ -167,6 +171,7 @@ class EpochProgress(Progress): """ Tracks the epoch progress These counters are local to a trainer rank. By default, they are not globally synced across all ranks. + Args: total: Tracks the total epoch progress current: Tracks the current epoch progress @@ -188,6 +193,7 @@ def load_state_dict(self, state_dict: dict) -> None: class OptimizerProgress(BaseProgress): """ Track optimizer progress. + Args: step: Tracks ``optimizer.step`` calls. zero_grad: Tracks ``optimizer.zero_grad`` calls. @@ -209,6 +215,7 @@ def load_state_dict(self, state_dict: dict) -> None: class OptimizationProgress(BaseProgress): """ Track optimization progress. + Args: optimizer: Tracks optimizer progress. scheduler: Tracks scheduler progress. 
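Fault-tolerant checkpointing of loop state is gated behind an environment variable in the patch above; the check itself is a simple opt-in helper. A standalone sketch of that pattern (local function mirroring the behaviour, not an import from the library):

    import os
    from unittest import mock

    def fault_tolerant_enabled() -> bool:
        # off unless explicitly opted in with "1"
        return os.getenv("PL_FAULT_TOLERANT_TRAINING", "0") == "1"

    with mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "0"}):
        assert not fault_tolerant_enabled()
    with mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}):
        assert fault_tolerant_enabled()

Patching `os.environ` in a context manager (or with the decorator form used by the tests in this series) keeps the flag from leaking into other tests.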
@@ -242,6 +249,7 @@ class EpochLoopProgress(BaseProgress): """ Tracks epoch loop progress. These counters are local to a trainer rank. By default, they are not globally synced across all ranks. + Args: epoch: Tracks epochs progress. """ From 8e9682ed8e3abda025179aa58dc3f5073f3d0c2a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Jul 2021 14:25:40 +0000 Subject: [PATCH 039/106] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/trainer/progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index e5746fa05b283..e37f14960220d 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -35,7 +35,7 @@ def from_state_dict(cls, state_dict: dict) -> "BaseProgress": class Tracker(BaseProgress): """ Track an event's progress. - + Args: ready: Intended to track the number of events ready to start. started: Intended to be incremented after the event is started (e.g. after ``on_*_start`` runs). From 0838d7a7fa957f1f20cbd5afe9128400a94101b0 Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 12 Jul 2021 20:21:39 +0200 Subject: [PATCH 040/106] update --- tests/loops/test_loop_progress_integration.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/loops/test_loop_progress_integration.py b/tests/loops/test_loop_progress_integration.py index 465ec7ad15655..4395cb5cdcf3b 100644 --- a/tests/loops/test_loop_progress_integration.py +++ b/tests/loops/test_loop_progress_integration.py @@ -9,11 +9,8 @@ def test_loop_progress_integration(): assert trainer.test_loop.progress is not trainer.predict_loop.progress # check the validation progresses are not shared assert trainer.fit_loop.epoch_loop.val_loop.progress is not trainer.validate_loop.progress - expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["progress"] - assert expected == trainer.fit_loop.epoch_loop.progress - expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["batch_loop"]["progress"] - assert expected == trainer.fit_loop.epoch_loop.batch_loop.progress - expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["val_loop"]["progress"] - assert expected == trainer.fit_loop.epoch_loop.val_loop.progress - expected = _collect_loop_progress(trainer.fit_loop)["epoch_loop"]["val_loop"]["epoch_loop"]["progress"] - assert expected == trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress + generated = _collect_loop_progress(trainer.fit_loop)["epoch_loop"] + assert generated["progress"] is trainer.fit_loop.epoch_loop.progress + assert generated["batch_loop"]["progress"] is trainer.fit_loop.epoch_loop.batch_loop.progress + assert generated["val_loop"]["progress"] is trainer.fit_loop.epoch_loop.val_loop.progress + assert generated["val_loop"]["epoch_loop"]["progress"] is trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress From 107e1437dd6ae90e161ac365d38bae06f0e52a01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Jul 2021 00:38:50 +0200 Subject: [PATCH 041/106] typo it's -> its --- pytorch_lightning/loops/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index a8173d523de3d..9209dcb993284 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -56,7 +56,7 @@ 
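The integration test above asserts with `is`/`is not` rather than `==` on purpose: two freshly constructed progress dataclasses compare equal, so only object identity can prove whether a counter is shared between loops. A small illustration with an invented dataclass:

    from dataclasses import dataclass

    @dataclass
    class ToyCounter:
        completed: int = 0

    a, b = ToyCounter(), ToyCounter()
    assert a == b        # equal field values
    assert a is not b    # yet independent objects
    alias = a
    alias.completed += 1
    assert a.completed == 1 and alias is a  # a shared counter is the same object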
def trainer(self) -> Optional['pl.Trainer']: @trainer.setter def trainer(self, trainer: 'pl.Trainer'): - """Connects this loop's trainer and it's children""" + """Connects this loop's trainer and its children""" if not isinstance(trainer, pl.Trainer): raise MisconfigurationException( f"Loop {self.__class__.__name__} should be connected to a `Trainer`, found: {trainer}." From e49cd508ec3dcb4b09733d0bc2b80e3f7f545146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Jul 2021 00:39:13 +0200 Subject: [PATCH 042/106] fix copy-paste typo in progress docstring --- pytorch_lightning/trainer/progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index e37f14960220d..66921ccc94a4d 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -143,7 +143,7 @@ class BatchProgress(Progress): @dataclass class TrainingEpochProgress(Progress): """ - Tracks the batch progress + Tracks the epoch progress Args: total: Tracks the total epoch progress From 2e0423a046a8f55cafacd886e1ec834f17d9b1cf Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 13 Jul 2021 14:11:23 +0200 Subject: [PATCH 043/106] Delete classes --- .../loops/batch/training_batch_loop.py | 8 +-- .../loops/dataloader/prediction_loop.py | 12 ++--- .../loops/epoch/prediction_epoch_loop.py | 13 +---- pytorch_lightning/trainer/progress.py | 54 ++----------------- 4 files changed, 13 insertions(+), 74 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index d27b2a34987b5..334a80241fe59 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -28,7 +28,7 @@ from pytorch_lightning.loops.base import Loop from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import BatchProgress, OptimizationProgress +from pytorch_lightning.trainer.progress import OptimizationProgress, Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, grad_norm from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -50,7 +50,7 @@ def __init__(self) -> None: self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) self.batch_idx: int = 0 self.split_idx: Optional[int] = None - self.progress = BatchProgress() + self.progress = Progress() self.optim_progress = OptimizationProgress() self._warning_cache: WarningCache = WarningCache() @@ -437,9 +437,9 @@ def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: Args: optimizer: the current optimizer """ - self.optim_progress.optimizer.zero_grad.increment_started() - self.trainer.call_hook('on_before_zero_grad', optimizer) self.optim_progress.optimizer.zero_grad.increment_ready() + self.trainer.call_hook('on_before_zero_grad', optimizer) + self.optim_progress.optimizer.zero_grad.increment_started() def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: int) -> None: """Zeroes out all gradients of parameters optimized by the current optimizer. 
diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 345a6296578f5..51eccdf202051 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -7,7 +7,7 @@ from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop from pytorch_lightning.plugins import DDPSpawnPlugin -from pytorch_lightning.trainer.progress import EpochLoopProgress +from pytorch_lightning.trainer.progress import DataLoaderProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PREDICT_OUTPUT @@ -19,7 +19,7 @@ def __init__(self): super().__init__() self.predictions: Optional[List[List[Any]]] = None self.epoch_batch_indices: Optional[List[List[int]]] = None - self.progress = EpochLoopProgress() + self.progress = DataLoaderProgress() self.epoch_loop = PredictionEpochLoop() self._results = None # for `trainer._results` access @@ -75,14 +75,10 @@ def done(self) -> bool: def skip(self) -> bool: return sum(self.max_batches) == 0 - def connect( - self, trainer: "pl.Trainer", *args: Any, progress: Optional[EpochLoopProgress] = None, **kwargs: Any - ) -> None: + def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - self.epoch_loop.connect(trainer, progress=self.progress.epoch) + self.epoch_loop.connect(trainer) def reset(self) -> None: """Resets the internal state of the loop for a new run""" diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index ea03be5ef0096..f94e106a8c444 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -3,10 +3,9 @@ from deprecate import void -import pytorch_lightning as pl from pytorch_lightning.loops.base import Loop from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper -from pytorch_lightning.trainer.progress import EpochProgress +from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.utilities.warnings import WarningCache @@ -18,21 +17,13 @@ def __init__(self) -> None: self.return_predictions: bool = False self.predictions: List[Any] = [] self.current_batch_indices: List[int] = [] - self.progress = EpochProgress() + self.progress = Progress() self._dl_max_batches: Optional[int] = None self._num_dataloaders: Optional[int] = None self._warning_cache = WarningCache() self._all_batch_indices: List[int] = [] - def connect( - self, trainer: "pl.Trainer", *args: Any, progress: Optional[EpochProgress] = None, **kwargs: Any - ) -> None: - """Connects the loop with necessary arguments like the trainer""" - super().connect(trainer, *args, **kwargs) - if progress is not None: - self.progress = progress - @property def done(self) -> bool: """Ends prediction when the iteration count exceeds the total number of available batches""" diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 66921ccc94a4d..78215031a76e8 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -41,6 +41,7 @@ class Tracker(BaseProgress): started: Intended to be 
incremented after the event is started (e.g. after ``on_*_start`` runs). processed: Intended to be incremented after the event is processed. completed: Intended to be incremented after the event completes (e.g. after ``on_*_end`` runs). + Attributes set to ``None`` are treated as unused and are restricted. """ @@ -129,17 +130,6 @@ def load_state_dict(self, state_dict: dict) -> None: self.current.load_state_dict(state_dict["current"]) -@dataclass -class BatchProgress(Progress): - """ - Tracks the batch progress - - Args: - total: Tracks the total epoch progress - current: Tracks the current epoch progress - """ - - @dataclass class TrainingEpochProgress(Progress): """ @@ -158,34 +148,19 @@ def load_state_dict(self, state_dict: dict) -> None: @dataclass class DataLoaderProgress(Progress): - - dataloader_idx: int = 0 - - def load_state_dict(self, state_dict: dict) -> None: - super().load_state_dict(state_dict) - self.dataloader_idx = state_dict["dataloader_idx"] - - -@dataclass -class EpochProgress(Progress): """ - Tracks the epoch progress + Tracks the data-loader progress These counters are local to a trainer rank. By default, they are not globally synced across all ranks. Args: total: Tracks the total epoch progress current: Tracks the current epoch progress - batch: Tracks batch progress. + dataloader_idx: The index of the current dataloader. """ dataloader_idx: int = 0 - batch: BatchProgress = field(default_factory=BatchProgress) - - def reset_on_epoch(self) -> None: - self.batch.current.reset() def load_state_dict(self, state_dict: dict) -> None: super().load_state_dict(state_dict) - self.batch.load_state_dict(state_dict["batch"]) self.dataloader_idx = state_dict["dataloader_idx"] @@ -242,26 +217,3 @@ def load_state_dict(self, state_dict: dict) -> None: self.optimizer.load_state_dict(state_dict["optimizer"]) self.scheduler.load_state_dict(state_dict["scheduler"]) self.optimizer_idx = state_dict["optimizer_idx"] - - -@dataclass -class EpochLoopProgress(BaseProgress): - """ - Tracks epoch loop progress. - These counters are local to a trainer rank. By default, they are not globally synced across all ranks. - - Args: - epoch: Tracks epochs progress. 
- """ - epoch: EpochProgress = field(default_factory=EpochProgress) - - def increment_epoch_completed(self) -> None: - self.epoch.increment_completed() - self.reset_on_epoch() - - def reset_on_epoch(self) -> None: - self.epoch.reset_on_epoch() - self.epoch.current.reset() - - def load_state_dict(self, state_dict: dict) -> None: - self.epoch.load_state_dict(state_dict["epoch"]) From 7caca875f18e02d8fd0725942561d027ae987687 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 13 Jul 2021 14:18:35 +0200 Subject: [PATCH 044/106] Minor change --- pytorch_lightning/trainer/connectors/checkpoint_connector.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 40b59f8c93f54..c4114c554962b 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -257,7 +257,7 @@ def restore_loops(self) -> None: if not self._loaded_checkpoint: return - state_dict = self._loaded_checkpoint.get("loops", None) + state_dict = self._loaded_checkpoint.get("loops") if state_dict: self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"]) self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"]) @@ -346,9 +346,8 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: 'pytorch-lightning_version': pl.__version__, 'state_dict': self.trainer.accelerator.lightning_module_state_dict(), } - if _fault_tolerant_enabled(): - checkpoint.update({"loops": self.get_loops_state_dict()}) + checkpoint["loops"] = self.get_loops_state_dict() if not weights_only: # dump callbacks From 2800eaec82a28373a6c9966fce3384868c7a8d11 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 13 Jul 2021 14:19:31 +0200 Subject: [PATCH 045/106] docs --- pytorch_lightning/trainer/progress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 78215031a76e8..5f153f45002c0 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -194,12 +194,13 @@ class OptimizationProgress(BaseProgress): Args: optimizer: Tracks optimizer progress. scheduler: Tracks scheduler progress. + optimizer_idx: The index of the current optimizer. 
""" # TODO: support for multiple optimizers - optimizer_idx: int = 0 optimizer: OptimizerProgress = field(default_factory=OptimizerProgress) scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) + optimizer_idx: int = 0 @property def optimizer_steps(self) -> int: From feec34fba58007cc4beea6463c21ad389853fa13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Jul 2021 14:30:38 +0200 Subject: [PATCH 046/106] protected get_loops_state --- .../trainer/connectors/checkpoint_connector.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index c4114c554962b..21bd347958237 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -347,7 +347,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: 'state_dict': self.trainer.accelerator.lightning_module_state_dict(), } if _fault_tolerant_enabled(): - checkpoint["loops"] = self.get_loops_state_dict() + checkpoint["loops"] = self._get_loops_state_dict() if not weights_only: # dump callbacks @@ -387,14 +387,6 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict: return checkpoint - def get_loops_state_dict(self): - return { - "fit_loop": self.trainer.fit_loop.state_dict(), - "validate_loop": self.trainer.validate_loop.state_dict(), - "test_loop": self.trainer.test_loop.state_dict(), - "predict_loop": self.trainer.predict_loop.state_dict(), - } - def hpc_load(self, checkpoint_path: str) -> None: """ Attempts to restore the full training and model state from a HPC checkpoint file. @@ -453,3 +445,11 @@ def save_checkpoint(self, filepath, weights_only: bool = False) -> None: """ _checkpoint = self.dump_checkpoint(weights_only) self.trainer.accelerator.save_checkpoint(_checkpoint, filepath) + + def _get_loops_state_dict(self): + return { + "fit_loop": self.trainer.fit_loop.state_dict(), + "validate_loop": self.trainer.validate_loop.state_dict(), + "test_loop": self.trainer.test_loop.state_dict(), + "predict_loop": self.trainer.predict_loop.state_dict(), + } \ No newline at end of file From ccdd09d700124b242dbfb9410115ac16b3d24d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 13 Jul 2021 14:34:35 +0200 Subject: [PATCH 047/106] merge restore_loops with restore_progress --- .../connectors/checkpoint_connector.py | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 21bd347958237..7c09804add72b 100644 --- a/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -161,13 +161,11 @@ def restore_training_state(self) -> None: # restore precision plugin (scaler etc.) self.trainer.precision_plugin.on_load_checkpoint(self._loaded_checkpoint) - # restore progress (loops etc.) - self.restore_progress() + # restore loops and their progress + self.restore_loops() self.restore_optimizers_and_schedulers() - self.restore_loops() - def restore_callbacks(self) -> None: """ Restores all callbacks from the pre-loaded checkpoint. 
""" if not self._loaded_checkpoint: @@ -182,10 +180,10 @@ def restore_callbacks(self) -> None: ) self.trainer.on_load_checkpoint(self._loaded_checkpoint) - def restore_progress(self) -> None: + def restore_loops(self) -> None: """ - Restores the training progress from the pre-loaded checkpoint. This currently includes only the global step - and current epoch. + Restores the loop progress from the pre-loaded checkpoint. + Calls hooks on the loops to give it a chance to restore its state from the checkpoint. """ if not self._loaded_checkpoint: return @@ -212,6 +210,13 @@ def restore_progress(self) -> None: " consider using an end of epoch checkpoint." ) + state_dict = self._loaded_checkpoint.get("loops") + if state_dict: + self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"]) + self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"]) + self.trainer.test_loop.load_state_dict(state_dict["test_loop"]) + self.trainer.predict_loop.load_state_dict(state_dict["predict_loop"]) + def restore_optimizers_and_schedulers(self) -> None: """ Restores the optimizers and learning rate scheduler states from the pre-loaded checkpoint. """ if not self._loaded_checkpoint: @@ -252,18 +257,6 @@ def restore_lr_schedulers(self) -> None: for scheduler, lrs_state in zip(self.trainer.lr_schedulers, lr_schedulers): scheduler['scheduler'].load_state_dict(lrs_state) - def restore_loops(self) -> None: - """ Calls hooks on the loops to give it a chance to restore its state from the checkpoint. """ - if not self._loaded_checkpoint: - return - - state_dict = self._loaded_checkpoint.get("loops") - if state_dict: - self.trainer.fit_loop.load_state_dict(state_dict["fit_loop"]) - self.trainer.validate_loop.load_state_dict(state_dict["validate_loop"]) - self.trainer.test_loop.load_state_dict(state_dict["test_loop"]) - self.trainer.predict_loop.load_state_dict(state_dict["predict_loop"]) - # ---------------------------------- # PRIVATE OPS # ---------------------------------- @@ -452,4 +445,4 @@ def _get_loops_state_dict(self): "validate_loop": self.trainer.validate_loop.state_dict(), "test_loop": self.trainer.test_loop.state_dict(), "predict_loop": self.trainer.predict_loop.state_dict(), - } \ No newline at end of file + } From 01768cb23a28daf00c684e80f8f337765e8bd723 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Tue, 13 Jul 2021 14:41:25 +0200 Subject: [PATCH 048/106] Fix tests after removals --- tests/trainer/test_progress.py | 72 ++++++---------------------------- 1 file changed, 11 insertions(+), 61 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 6322ab5be33bb..32d97be167b52 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -21,12 +21,12 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.progress import ( - BatchProgress, - EpochLoopProgress, - EpochProgress, + DataLoaderProgress, + OptimizationProgress, OptimizerProgress, Progress, Tracker, + TrainingEpochProgress, ) from tests.helpers import BoringModel @@ -79,20 +79,9 @@ def test_base_progress_from_defaults(): assert actual == expected -def test_epoch_loop_progress_increment_epoch(): - p = EpochLoopProgress() - p.increment_epoch_completed() - p.increment_epoch_completed() - assert p.epoch.total == Tracker(completed=2) - assert p.epoch.current == Tracker() - assert p.epoch.batch.current == Tracker() - - def test_epoch_loop_progress_increment_sequence(): """Test sequences for incrementing 
batches reads and epochs.""" - batch = BatchProgress(total=Tracker(started=None)) - epoch = EpochProgress(batch=batch) - loop = EpochLoopProgress(epoch=epoch) + batch = Progress(total=Tracker(started=None)) batch.increment_ready() assert batch.total == Tracker(ready=1, started=None) @@ -110,26 +99,6 @@ def test_epoch_loop_progress_increment_sequence(): assert batch.total == Tracker(ready=1, started=None, processed=1, completed=1) assert batch.current == Tracker(ready=1, processed=1, completed=1) - assert epoch.total == Tracker() - assert epoch.current == Tracker() - loop.increment_epoch_completed() - assert batch.total == Tracker(ready=1, started=None, processed=1, completed=1) - assert batch.current == Tracker() - assert epoch.total == Tracker(completed=1) - assert epoch.current == Tracker() - - batch.increment_ready() - assert batch.total == Tracker(ready=2, started=None, processed=1, completed=1) - assert batch.current == Tracker(ready=1) - assert epoch.total == Tracker(completed=1) - assert epoch.current == Tracker() - - loop.reset_on_epoch() - assert batch.total == Tracker(ready=2, started=None, processed=1, completed=1) - assert batch.current == Tracker() - assert epoch.total == Tracker(completed=1) - assert epoch.current == Tracker() - def test_optimizer_progress_default_factory(): """ @@ -144,32 +113,13 @@ def test_optimizer_progress_default_factory(): assert p2.step.total.completed == 0 -def test_epoch_loop_progress_serialization(): - loop = EpochLoopProgress() - loop.epoch.dataloader_idx = 1 - _ = deepcopy(loop) - state_dict = loop.state_dict() - - # yapf: disable - assert state_dict == { - 'epoch': { - # number of times `validate` has been called - 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - # either 0 or 1 as `max_epochs` does not apply to the `validate` loop - 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - 'batch': { - # number of batches across `validate` calls - 'total': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - # number of batches this `validate` call - 'current': {'completed': 0, 'processed': 0, 'ready': 0, 'started': 0}, - }, - 'dataloader_idx': 1 - } - } - # yapf: enable - - new_loop = EpochLoopProgress.from_state_dict(state_dict) - assert loop == new_loop +def test_deepcopy(): + _ = deepcopy(Tracker()) + _ = deepcopy(Progress()) + _ = deepcopy(TrainingEpochProgress()) + _ = deepcopy(DataLoaderProgress()) + _ = deepcopy(OptimizerProgress()) + _ = deepcopy(OptimizationProgress()) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) From 71e05d3553b480fcfb7813d4ed8e3effe34b7d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 10:04:18 +0200 Subject: [PATCH 049/106] explicit save with trainer.save_checkpoint() --- pytorch_lightning/trainer/trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index c7e5224593744..bb65ff47817e7 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Trainer to automate the training.""" import logging +import os import traceback import warnings from datetime import timedelta @@ -1010,7 +1011,7 @@ def _run_train(self) -> None: self.training_type_plugin.reconciliate_processes(traceback.format_exc()) # give accelerators a chance to finish self.accelerator.on_train_end() - self.on_expection() + self._on_expection() # reset bookkeeping self.state.stage = None raise @@ -1251,7 +1252,10 @@ def _log_device_info(self) -> None: " `Trainer(ipus=8)` or script `--ipus=8`." ) - def on_expection(self): - if _fault_tolerant_enabled(): - # save a checkpoint for fault tolerant training - self.fit_loop._check_checkpoint_callback(True) + def _on_expection(self): + if not self.is_global_zero or not _fault_tolerant_enabled(): + return + + # save a checkpoint for fault tolerant training + file_path = os.path.join(self.default_root_dir, ".pl_auto_save.ckpt") + self.save_checkpoint(file_path) From 3d13b645954ca4eefcbdf5a5304a5514a29910fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 12:48:38 +0200 Subject: [PATCH 050/106] handle optimization restart based on optimizer_idx --- pytorch_lightning/loops/batch/training_batch_loop.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 334a80241fe59..30bf3a3dc482c 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -150,7 +150,7 @@ def advance(self, batch, batch_idx, dataloader_idx): # handle optimization restart if self.restarting: - if len(active_optimizers) > 1 and opt_idx < self.progress.current.completed: + if opt_idx < self.optim_progress.optimizer_idx: continue self.restarting = False From 78d13e2c09dab959cd0da9045984ac4d146a0464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 13:54:51 +0200 Subject: [PATCH 051/106] update increments --- .../loops/batch/training_batch_loop.py | 17 +++---------- .../loops/epoch/training_epoch_loop.py | 25 +++++++++++-------- pytorch_lightning/loops/fit_loop.py | 6 +++++ 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 30bf3a3dc482c..912c4f8fbe548 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -50,7 +50,8 @@ def __init__(self) -> None: self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) self.batch_idx: int = 0 self.split_idx: Optional[int] = None - self.progress = Progress() + # TODO: add progress updates for batch splits + self.split_progress = Progress() self.optim_progress = OptimizationProgress() self._warning_cache: WarningCache = WarningCache() @@ -87,8 +88,6 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict: self._warning_cache.warn("train_dataloader yielded None. 
If this was on purpose, ignore this warning...") return AttributeDict(signal=0, training_step_output=[[]]) - self.progress.increment_ready() - # hook self.trainer.logger_connector.on_batch_start() response = self.trainer.call_hook("on_batch_start") @@ -100,6 +99,8 @@ def run(self, batch: Any, batch_idx: int, dataloader_idx: int) -> AttributeDict: if response == -1: return AttributeDict(signal=-1) + self.trainer.fit_loop.epoch_loop.batch_progress.increment_started() + super().run(batch, batch_idx, dataloader_idx) output = AttributeDict(signal=0, training_step_output=self.batch_outputs) self.batch_outputs = None # free memory @@ -124,10 +125,6 @@ def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): void(batch_idx, dataloader_idx) self._remaining_splits = list(enumerate(self._tbptt_split_batch(batch))) - def on_advance_start(self, *args: Any, **kwargs: Any) -> None: - super().on_advance_start(*args, **kwargs) - self.progress.increment_started() - def advance(self, batch, batch_idx, dataloader_idx): """Runs the train step together with optimization (if necessary) on the current batch split @@ -166,12 +163,6 @@ def advance(self, batch, batch_idx, dataloader_idx): if result: self.batch_outputs[0].append(result.training_step_output) - self.progress.increment_processed() - - def on_advance_end(self) -> None: - super().on_advance_end() - self.progress.increment_completed() - def teardown(self) -> None: # release memory self._remaining_splits = None diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 8f6cc13e64fd5..0ff4613162c4f 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -20,7 +20,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import TrainingEpochProgress +from pytorch_lightning.trainer.progress import TrainingEpochProgress, Progress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature @@ -45,7 +45,7 @@ def __init__(self, min_steps: int, max_steps: int): # the number of batches seen this run, updates immediately after batch_loop.run() self.batches_seen: int = 0 self.is_last_batch: Optional[bool] = None - self.progress = TrainingEpochProgress() + self.batch_progress = Progress() self.batch_loop = TrainingBatchLoop() self.val_loop = loops.EvaluationLoop() @@ -92,17 +92,14 @@ def reset(self) -> None: self.restarting = False else: # todo (tchaton) the batch_loop should be responsible for that. - self.batch_loop.progress.current.reset() + self.batch_loop.split_progress.current.reset() def on_run_start(self, *args: Any, **kwargs: Any) -> None: - self.progress.increment_ready() - # hook self.trainer.logger_connector.on_epoch_start() self.trainer.call_hook("on_epoch_start") self.trainer.call_hook("on_train_epoch_start") - - self.progress.increment_started() + self.trainer.fit_loop.epoch_progress.increment_started() def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: """Runs a single training batch. 
@@ -122,10 +119,16 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: with self.trainer.profiler.profile("training_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=self._dataloader_idx) + self.batch_progress.increment_ready() + with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx) + + # TODO: remove with progress tracking self.batches_seen += 1 + self.batch_progress.increment_processed() + # when returning -1 from train_step, we end epoch early if batch_output.signal == -1: raise StopIteration @@ -146,6 +149,8 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: self.trainer.call_hook('on_batch_end') self.trainer.logger_connector.on_batch_end() + self.batch_progress.increment_completed() + # figure out what to track for epoch end self._track_epoch_end_reduce_metrics(self._epoch_output, batch_end_outputs) @@ -163,7 +168,7 @@ def on_advance_end(self): # ----------------------------------------- # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- - self.progress.should_check_val = should_check_val = self._should_check_val_fx( + should_check_val = self._should_check_val_fx( self.iteration_count, self.is_last_batch ) @@ -224,15 +229,13 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]: 'HINT: remove the return statement in training_epoch_end' ) - self.progress.increment_processed() + self.trainer.fit_loop.epoch_progress.increment_processed() # call train epoch end hooks self._on_train_epoch_end_hook(processed_outputs) self.trainer.call_hook('on_epoch_end') self.trainer.logger_connector.on_epoch_end() - self.progress.increment_completed() - epoch_output = self._epoch_output # free memory self._epoch_output = None diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 6963f4b3f2c4a..7af7b52cbea05 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -20,6 +20,7 @@ from pytorch_lightning.loops import Loop from pytorch_lightning.loops.epoch import TrainingEpochLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection +from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import rank_zero_info @@ -51,6 +52,7 @@ def __init__( self.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs self.epoch_loop = TrainingEpochLoop(min_steps, max_steps) + self.epoch_progress = Progress() @property def current_epoch(self) -> int: @@ -200,6 +202,8 @@ def on_advance_start(self) -> None: window_length=self.trainer.accumulate_grad_batches ) + self.epoch_progress.increment_ready() + def advance(self) -> None: """Runs one whole epoch.""" train_dataloader = self.trainer.accelerator.process_dataloader(self.trainer.train_dataloader) @@ -234,6 +238,8 @@ def on_advance_end(self) -> None: self._check_checkpoint_callback(True) self.global_step += 1 + self.epoch_progress.increment_completed() + def on_run_end(self) -> None: """Calls the ``on_train_end`` hook""" # NOTE: the iteration_count/current_epoch is already incremented From 1048259c81394227d5ac0bb22f3aa38762f8c0ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 14:07:30 +0200 Subject: [PATCH 052/106] update 
val batch progress and remove iteration count --- .../loops/epoch/evaluation_epoch_loop.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index c56b4a7f097d1..9591672a7d3c7 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -41,7 +41,7 @@ def __init__(self) -> None: self.dataloader_idx: Optional[int] = None self.num_dataloaders: Optional[int] = None self.outputs: List[STEP_OUTPUT] = [] - self.progress = Progress() + self.batch_progress = Progress() def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: """Connects the loop with necessary arguments like the trainer""" @@ -50,11 +50,10 @@ def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: @property def done(self) -> bool: """Returns ``True`` if the current iteration count reaches the number of dataloader batches.""" - return self.iteration_count >= self.dl_max_batches + return self.batch_progress.current.completed >= self.dl_max_batches def reset(self) -> None: """Resets the loop's internal state.""" - self.iteration_count = 0 self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) self.dl_max_batches = None self.dataloader_idx = None @@ -62,11 +61,9 @@ def reset(self) -> None: self.outputs = [] if self.restarting: - self.iteration_count = self.progress.current.completed self.restarting = False else: - self.iteration_count = 0 - self.progress.current.reset() + self.batch_progress.current.reset() def on_run_start( self, @@ -117,31 +114,31 @@ def advance( with self.trainer.profiler.profile("evaluation_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) - self.progress.increment_started() + self.batch_progress.increment_ready() # hook self.on_evaluation_batch_start(batch, batch_idx, dataloader_idx) - self.progress.increment_ready() + self.batch_progress.increment_started() # lightning module methods with self.trainer.profiler.profile("evaluation_step_and_end"): output = self.evaluation_step(batch, batch_idx, dataloader_idx) output = self.evaluation_step_end(output) + self.batch_progress.increment_processed() + # hook + store predictions self.on_evaluation_batch_end(output, batch, batch_idx, dataloader_idx) + self.batch_progress.increment_completed() + # log batch metrics self.trainer.logger_connector.update_eval_step_metrics() # track epoch level outputs self.outputs = self._track_output_for_epoch_end(self.outputs, output) - self.progress.increment_processed() - - self.progress.increment_completed() - def on_run_end(self) -> List[STEP_OUTPUT]: """Returns the outputs of the whole run""" outputs = self.outputs From 668a4cfaac1c524ce54fa54af7d148802ec14f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 14:37:05 +0200 Subject: [PATCH 053/106] update progress tracking for dataloader loops --- .../loops/dataloader/dataloader_loop.py | 23 ++++++++++++++---- .../loops/dataloader/evaluation_loop.py | 24 ++----------------- .../loops/dataloader/prediction_loop.py | 7 ------ pytorch_lightning/trainer/progress.py | 12 ++++------ 4 files changed, 25 insertions(+), 41 deletions(-) diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py index ce255b73d0bba..1b5bf6a2402fe 100644 --- 
a/pytorch_lightning/loops/dataloader/dataloader_loop.py +++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py @@ -13,16 +13,21 @@ # limitations under the License. from abc import abstractmethod -from typing import Sequence +from typing import Sequence, Any from torch.utils.data import DataLoader from pytorch_lightning.loops.base import Loop +from pytorch_lightning.trainer.progress import DataLoaderProgress class DataLoaderLoop(Loop): """Base class to loop over all dataloaders""" + def __init__(self): + super().__init__() + self.dataloader_progress = DataLoaderProgress() + @property @abstractmethod def dataloaders(self) -> Sequence[DataLoader]: @@ -31,7 +36,7 @@ def dataloaders(self) -> Sequence[DataLoader]: @property def current_dataloader_idx(self) -> int: """Returns the index of the current dataloader""" - return self.iteration_count + return self.dataloader_progress.current.ready - 1 @property def current_dataloader(self) -> DataLoader: @@ -46,8 +51,18 @@ def num_dataloaders(self) -> int: @property def done(self) -> bool: """Returns whether all dataloaders have been processed""" - return self.current_dataloader_idx >= self.num_dataloaders + return self.dataloader_progress.current.completed >= self.num_dataloaders def reset(self) -> None: """Resets the internal state""" - self.iteration_count = 0 + if self.restarting: + self.restarting = False + else: + # reset batch / epoch progress tracking + self.dataloader_progress.current.reset() + + def on_advance_start(self, *args: Any, **kwargs: Any) -> None: + self.dataloader_progress.increment_ready() + + def on_advance_end(self) -> None: + self.dataloader_progress.increment_completed() diff --git a/pytorch_lightning/loops/dataloader/evaluation_loop.py b/pytorch_lightning/loops/dataloader/evaluation_loop.py index ba554bf9c1a29..eab89eaf415b8 100644 --- a/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -21,7 +21,6 @@ from pytorch_lightning.loops.dataloader import DataLoaderLoop from pytorch_lightning.loops.epoch import EvaluationEpochLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import DataLoaderProgress from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.types import EPOCH_OUTPUT @@ -33,7 +32,6 @@ class EvaluationLoop(DataLoaderLoop): def __init__(self): super().__init__() self.outputs = [] - self.progress = DataLoaderProgress() self.epoch_loop = EvaluationEpochLoop() self._results = ResultCollection(training=False) @@ -73,7 +71,7 @@ def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: @property def done(self) -> bool: """Returns whether all dataloaders are processed or evaluation should be skipped altogether""" - return (self.current_dataloader_idx >= len(self.dataloaders)) or self.skip + return super().done or self.skip @property def skip(self) -> bool: @@ -83,7 +81,6 @@ def skip(self) -> bool: def reset(self) -> None: """Resets the internal state of the loop""" - self.iteration_count = 0 self._max_batches = self.get_max_batches() # bookkeeping self.outputs = [] @@ -91,13 +88,7 @@ def reset(self) -> None: if isinstance(self._max_batches, int): self._max_batches = [self._max_batches] * len(self.dataloaders) - if self.restarting: - self.iteration_count = self.progress.dataloader_idx - self.restarting = False - else: - self.iteration_count = 0 - # reset 
batch / epoch progress tracking - self.progress.current.reset() + super().reset() def on_skip(self) -> List: return [] @@ -105,17 +96,12 @@ def on_skip(self) -> List: def on_run_start(self, *args: Any, **kwargs: Any) -> None: """Runs the ``on_evaluation_model_eval``, ``on_evaluation_start`` and ``on_evaluation_epoch_start`` hooks""" void(*args, **kwargs) - - self.progress.increment_started() - # hook self.on_evaluation_model_eval() self.trainer.lightning_module.zero_grad() self.on_evaluation_start() self.on_evaluation_epoch_start() - self.progress.increment_ready() - def advance(self, *args: Any, **kwargs: Any) -> None: """Performs evaluation on one single dataloader""" void(*args, **kwargs) @@ -123,8 +109,6 @@ def advance(self, *args: Any, **kwargs: Any) -> None: dataloader_iter = enumerate(dataloader) dl_max_batches = self._max_batches[self.current_dataloader_idx] - self.progress.dataloader_idx = self.iteration_count - dl_outputs = self.epoch_loop.run( dataloader_iter, self.current_dataloader_idx, @@ -151,8 +135,6 @@ def on_run_end(self) -> Any: if len(outputs) > 0 and self.num_dataloaders == 1: outputs = outputs[0] - self.progress.increment_processed() - # lightning module method self.evaluation_epoch_end(outputs) @@ -171,8 +153,6 @@ def on_run_end(self) -> Any: # enable train mode again self.on_evaluation_model_train() - self.progress.increment_completed() - return eval_loop_results def get_max_batches(self) -> List[Union[int, float]]: diff --git a/pytorch_lightning/loops/dataloader/prediction_loop.py b/pytorch_lightning/loops/dataloader/prediction_loop.py index 51eccdf202051..e1de8669ddf68 100644 --- a/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -7,7 +7,6 @@ from pytorch_lightning.loops.dataloader.dataloader_loop import DataLoaderLoop from pytorch_lightning.loops.epoch.prediction_epoch_loop import PredictionEpochLoop from pytorch_lightning.plugins import DDPSpawnPlugin -from pytorch_lightning.trainer.progress import DataLoaderProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import _PREDICT_OUTPUT @@ -19,7 +18,6 @@ def __init__(self): super().__init__() self.predictions: Optional[List[List[Any]]] = None self.epoch_batch_indices: Optional[List[List[int]]] = None - self.progress = DataLoaderProgress() self.epoch_loop = PredictionEpochLoop() self._results = None # for `trainer._results` access @@ -66,11 +64,6 @@ def dataloaders(self) -> Sequence[DataLoader]: """Returns all prediction dataloaders""" return self.trainer.predict_dataloaders - @property - def done(self) -> bool: - """Whether prediction is finished: Max batches run or all dataloaders processed""" - return self.current_dataloader_idx >= len(self.dataloaders) - @property def skip(self) -> bool: return sum(self.max_batches) == 0 diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 5f153f45002c0..b34b448d2a0e9 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -153,15 +153,11 @@ class DataLoaderProgress(Progress): These counters are local to a trainer rank. By default, they are not globally synced across all ranks. Args: - total: Tracks the total epoch progress - current: Tracks the current epoch progress - dataloader_idx: The index of the current dataloader. 
+ total: Tracks the total dataloader progress + current: Tracks the current dataloader progress """ - dataloader_idx: int = 0 - - def load_state_dict(self, state_dict: dict) -> None: - super().load_state_dict(state_dict) - self.dataloader_idx = state_dict["dataloader_idx"] + total: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) + current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) @dataclass From ad8b342b593bf0de93b0766582df8d144311efaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 14:42:29 +0200 Subject: [PATCH 054/106] remove self.dataloader_idx from eval_epoch_loop --- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 9591672a7d3c7..757df0fb29ef6 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -38,7 +38,6 @@ def __init__(self) -> None: self.predictions: Optional[PredictionCollection] = None self.dataloader: Optional[Iterator] = None self.dl_max_batches: Optional[int] = None - self.dataloader_idx: Optional[int] = None self.num_dataloaders: Optional[int] = None self.outputs: List[STEP_OUTPUT] = [] self.batch_progress = Progress() @@ -56,7 +55,6 @@ def reset(self) -> None: """Resets the loop's internal state.""" self.predictions = PredictionCollection(self.trainer.global_rank, self.trainer.world_size) self.dl_max_batches = None - self.dataloader_idx = None self.num_dataloaders = None self.outputs = [] @@ -80,10 +78,8 @@ def on_run_start( dl_max_batches: maximum number of batches the dataloader can produce num_dataloaders: the total number of dataloaders """ - void(dataloader_iter) - + void(dataloader_iter, dataloader_idx) self.dl_max_batches = dl_max_batches - self.dataloader_idx = dataloader_idx self.num_dataloaders = num_dataloaders def advance( From 512ee0d51c8cf3098fb88c7b9c13610ba862edbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 14:54:27 +0200 Subject: [PATCH 055/106] add batch progress to predict loop --- .../loops/epoch/prediction_epoch_loop.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index f94e106a8c444..da1aa0e42f210 100644 --- a/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -17,7 +17,7 @@ def __init__(self) -> None: self.return_predictions: bool = False self.predictions: List[Any] = [] self.current_batch_indices: List[int] = [] - self.progress = Progress() + self.batch_progress = Progress() self._dl_max_batches: Optional[int] = None self._num_dataloaders: Optional[int] = None @@ -27,7 +27,7 @@ def __init__(self) -> None: @property def done(self) -> bool: """Ends prediction when the iteration count exceeds the total number of available batches""" - return self.iteration_count >= self._dl_max_batches + return self.batch_progress.current.completed >= self._dl_max_batches @property def should_store_predictions(self) -> bool: @@ -37,9 +37,9 @@ def should_store_predictions(self) -> bool: def reset(self) -> None: """Resets the loops internal state""" - self.iteration_count = 0 self._all_batch_indices: List[int] = [] self.predictions: 
List[Any] = [] + self.batch_progress.current.reset() def on_run_start( self, @@ -89,6 +89,8 @@ def advance( with self.trainer.profiler.profile("predict_batch_to_device"): batch = self.trainer.accelerator.batch_to_device(batch, dataloader_idx=dataloader_idx) + self.batch_progress.increment_ready() + with self.trainer.profiler.profile("predict_step"): self._predict_step(batch, batch_idx, dataloader_idx) @@ -120,14 +122,20 @@ def _predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None self.trainer.call_hook("on_predict_batch_start", batch, batch_idx, dataloader_idx) + self.batch_progress.increment_started() + model_ref._current_fx_name = "predict_step" predictions = self.trainer.accelerator.predict_step(step_kwargs) + self.batch_progress.increment_processed() + if predictions is None: self._warning_cache.warn("predict returned None if it was on purpose, ignore this warning...") self.trainer.call_hook("on_predict_batch_end", predictions, batch, batch_idx, dataloader_idx) + self.batch_progress.increment_completed() + if self.should_store_predictions: self.predictions.append(predictions) From 2633d515673201a4f2f805c3d1f0c82f92b6e19a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jul 2021 12:56:18 +0000 Subject: [PATCH 056/106] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pytorch_lightning/loops/dataloader/dataloader_loop.py | 2 +- pytorch_lightning/loops/epoch/training_epoch_loop.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py index 1b5bf6a2402fe..d8bdd67b41c17 100644 --- a/pytorch_lightning/loops/dataloader/dataloader_loop.py +++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py @@ -13,7 +13,7 @@ # limitations under the License. 
from abc import abstractmethod -from typing import Sequence, Any +from typing import Any, Sequence from torch.utils.data import DataLoader diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 0ff4613162c4f..393dbd02ec824 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -20,7 +20,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import TrainingEpochProgress, Progress +from pytorch_lightning.trainer.progress import Progress, TrainingEpochProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature @@ -168,9 +168,7 @@ def on_advance_end(self): # ----------------------------------------- # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- - should_check_val = self._should_check_val_fx( - self.iteration_count, self.is_last_batch - ) + should_check_val = self._should_check_val_fx(self.iteration_count, self.is_last_batch) if should_check_val: self.trainer.validating = True From 4bbc7acf1062c25efb81d36b20d5958669e79c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 15:34:38 +0200 Subject: [PATCH 057/106] incorporate progress tracking for current_epoch --- pytorch_lightning/loops/base.py | 1 + .../loops/epoch/training_epoch_loop.py | 3 +-- pytorch_lightning/loops/fit_loop.py | 21 +++++++++---------- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 9209dcb993284..f67447fc19a03 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -46,6 +46,7 @@ class Loop(ABC): """ def __init__(self) -> None: + # TODO: replace by progress tracking self.iteration_count: int = 0 self.restarting = False self._trainer: Optional['pl.Trainer'] = None diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 393dbd02ec824..ee0e399a6c98b 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -38,11 +38,10 @@ def __init__(self, min_steps: int, max_steps: int): self.global_step: int = 0 # the total batch index across all epochs self.total_batch_idx: int = 0 - # the current batch index in the loop that runs over the dataloader(s) - self.iteration_count: int = 0 # the current split index when the batch gets split into chunks in truncated backprop through time self.split_idx: Optional[int] = None # the number of batches seen this run, updates immediately after batch_loop.run() + # TODO: replace by progress tracking self.batches_seen: int = 0 self.is_last_batch: Optional[bool] = None self.batch_progress = Progress() diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 7af7b52cbea05..21087d73a4662 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -57,12 +57,12 @@ def __init__( @property def current_epoch(self) -> int: """Return the current epoch""" - return 
self.iteration_count + return self.epoch_progress.current.completed @current_epoch.setter def current_epoch(self, value: int) -> None: """Setter for the current epoch""" - self.iteration_count = value + self.epoch_progress.current.completed = value @property def global_step(self) -> int: @@ -82,7 +82,7 @@ def total_batch_idx(self) -> int: @property def batch_idx(self) -> int: """Returns the number of batches already run within this epoch""" - return self.epoch_loop.iteration_count + return self.epoch_loop.batch_progress.current.ready - 1 @property def split_idx(self) -> int: @@ -227,16 +227,15 @@ def advance(self) -> None: def on_advance_end(self) -> None: """Updates the LR schedulers and does some internal bookkeeping""" - if self.epoch_loop.batches_seen == 0: - return - self.epoch_loop.update_lr_schedulers('epoch', update_plateau_schedulers=True) + if self.epoch_loop.batches_seen != 0: + self.epoch_loop.update_lr_schedulers('epoch', update_plateau_schedulers=True) - did_train_only = not self.trainer.enable_validation or self.epoch_loop.val_loop.skip - if did_train_only: - self.global_step -= 1 - self._check_checkpoint_callback(True) - self.global_step += 1 + did_train_only = not self.trainer.enable_validation or self.epoch_loop.val_loop.skip + if did_train_only: + self.global_step -= 1 + self._check_checkpoint_callback(True) + self.global_step += 1 self.epoch_progress.increment_completed() From 01f87145ce30bc63c06ffc9927d40cf56a4bdb70 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 17:20:40 +0200 Subject: [PATCH 058/106] Fix test --- tests/loops/test_loop_progress_integration.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/loops/test_loop_progress_integration.py b/tests/loops/test_loop_progress_integration.py index 4395cb5cdcf3b..32eac6d037a87 100644 --- a/tests/loops/test_loop_progress_integration.py +++ b/tests/loops/test_loop_progress_integration.py @@ -5,12 +5,16 @@ def test_loop_progress_integration(): trainer = Trainer() # check no progresses are shared - assert trainer.validate_loop.progress is not trainer.test_loop.progress - assert trainer.test_loop.progress is not trainer.predict_loop.progress + assert trainer.fit_loop.epoch_progress is not trainer.validate_loop.dataloader_progress + assert trainer.validate_loop.dataloader_progress is not trainer.test_loop.dataloader_progress + assert trainer.test_loop.dataloader_progress is not trainer.predict_loop.dataloader_progress # check the validation progresses are not shared - assert trainer.fit_loop.epoch_loop.val_loop.progress is not trainer.validate_loop.progress - generated = _collect_loop_progress(trainer.fit_loop)["epoch_loop"] - assert generated["progress"] is trainer.fit_loop.epoch_loop.progress - assert generated["batch_loop"]["progress"] is trainer.fit_loop.epoch_loop.batch_loop.progress - assert generated["val_loop"]["progress"] is trainer.fit_loop.epoch_loop.val_loop.progress - assert generated["val_loop"]["epoch_loop"]["progress"] is trainer.fit_loop.epoch_loop.val_loop.epoch_loop.progress + assert trainer.fit_loop.epoch_loop.val_loop.dataloader_progress is not trainer.validate_loop.dataloader_progress + # check recursive collection of progresses + progresses = _collect_loop_progress(trainer.fit_loop) + assert progresses["epoch_progress"] is trainer.fit_loop.epoch_progress + assert progresses["epoch_loop"]["batch_progress"] is trainer.fit_loop.epoch_loop.batch_progress + assert progresses["epoch_loop"]["val_loop"]["dataloader_progress" + ] is 
trainer.fit_loop.epoch_loop.val_loop.dataloader_progress + assert progresses["epoch_loop"]["val_loop"]["epoch_loop"][ + "batch_progress"] is trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress From 65405b806a116316b49b7eac69865043b4fc94db Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 17:25:24 +0200 Subject: [PATCH 059/106] Actually remove it --- tests/loops/test_loop_progress_integration.py | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 tests/loops/test_loop_progress_integration.py diff --git a/tests/loops/test_loop_progress_integration.py b/tests/loops/test_loop_progress_integration.py deleted file mode 100644 index 32eac6d037a87..0000000000000 --- a/tests/loops/test_loop_progress_integration.py +++ /dev/null @@ -1,20 +0,0 @@ -from pytorch_lightning import Trainer -from tests.loops.test_loops import _collect_loop_progress - - -def test_loop_progress_integration(): - trainer = Trainer() - # check no progresses are shared - assert trainer.fit_loop.epoch_progress is not trainer.validate_loop.dataloader_progress - assert trainer.validate_loop.dataloader_progress is not trainer.test_loop.dataloader_progress - assert trainer.test_loop.dataloader_progress is not trainer.predict_loop.dataloader_progress - # check the validation progresses are not shared - assert trainer.fit_loop.epoch_loop.val_loop.dataloader_progress is not trainer.validate_loop.dataloader_progress - # check recursive collection of progresses - progresses = _collect_loop_progress(trainer.fit_loop) - assert progresses["epoch_progress"] is trainer.fit_loop.epoch_progress - assert progresses["epoch_loop"]["batch_progress"] is trainer.fit_loop.epoch_loop.batch_progress - assert progresses["epoch_loop"]["val_loop"]["dataloader_progress" - ] is trainer.fit_loop.epoch_loop.val_loop.dataloader_progress - assert progresses["epoch_loop"]["val_loop"]["epoch_loop"][ - "batch_progress"] is trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress From 6dd2182a97e91984435208e96457b3c6813962b7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 17:38:14 +0200 Subject: [PATCH 060/106] Remove unused TrainingEpochProgress --- .../loops/epoch/training_epoch_loop.py | 2 +- pytorch_lightning/trainer/progress.py | 16 ---------------- tests/trainer/test_progress.py | 2 -- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index ee0e399a6c98b..eb0f4040a6102 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -20,7 +20,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import Progress, TrainingEpochProgress +from pytorch_lightning.trainer.progress import Progress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index b34b448d2a0e9..32e2ba0ea9b98 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -130,22 +130,6 @@ def load_state_dict(self, 
state_dict: dict) -> None: self.current.load_state_dict(state_dict["current"]) -@dataclass -class TrainingEpochProgress(Progress): - """ - Tracks the epoch progress - - Args: - total: Tracks the total epoch progress - current: Tracks the current epoch progress - """ - should_check_val: bool = False - - def load_state_dict(self, state_dict: dict) -> None: - super().load_state_dict(state_dict) - self.should_check_val = state_dict["should_check_val"] - - @dataclass class DataLoaderProgress(Progress): """ diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 32d97be167b52..7205b72ad3963 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -26,7 +26,6 @@ OptimizerProgress, Progress, Tracker, - TrainingEpochProgress, ) from tests.helpers import BoringModel @@ -116,7 +115,6 @@ def test_optimizer_progress_default_factory(): def test_deepcopy(): _ = deepcopy(Tracker()) _ = deepcopy(Progress()) - _ = deepcopy(TrainingEpochProgress()) _ = deepcopy(DataLoaderProgress()) _ = deepcopy(OptimizerProgress()) _ = deepcopy(OptimizationProgress()) From b71e1516653901e3a222a82d4c56520b77046f75 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 18:09:45 +0200 Subject: [PATCH 061/106] Fix optimization progress - missing scheduler --- pytorch_lightning/core/lightning.py | 3 +- .../loops/batch/training_batch_loop.py | 15 ++-------- .../trainer/connectors/optimizer_connector.py | 4 +-- pytorch_lightning/trainer/progress.py | 29 +++++++++++++------ 4 files changed, 26 insertions(+), 25 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index 735f8ab160c1f..aeed3c9304a76 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -31,6 +31,7 @@ from torch.optim.optimizer import Optimizer from torchmetrics import Metric +import pytorch_lightning as pl from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary @@ -89,7 +90,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: torch._C._log_api_usage_once(f"lightning.module.{self.__class__.__name__}") # pointer to the trainer object - self.trainer = None + self.trainer: Optional['pl.Trainer'] = None self._distrib_type = None self._device_type = None diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 912c4f8fbe548..26adb3234f44c 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -245,11 +245,6 @@ def _training_step_and_backward_closure( result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) if result is not None: return_result.update(result) - - # this should be done only if result.loss exists and ``optimizer step`` is being run - if not self.should_accumulate(): - self.optim_progress.optimizer.step.increment_started() - return return_result.loss def _make_closure(self, *closure_args: Any, **closure_kwargs: Any) -> Callable: @@ -419,7 +414,8 @@ def _optimizer_step( using_lbfgs=is_lbfgs, ) - self.optim_progress.optimizer.step.increment_processed() + # FIXME: why does it not fail? 
+ # self.optim_progress.optimizer.step.increment_processed() self.optim_progress.optimizer.step.increment_completed() def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: @@ -441,7 +437,6 @@ def _optimizer_zero_grad(self, batch_idx: int, optimizer: torch.optim.Optimizer, opt_idx: the index of the current optimizer """ self.trainer.accelerator.optimizer_zero_grad(self.trainer.current_epoch, batch_idx, optimizer, opt_idx) - self.optim_progress.optimizer.zero_grad.increment_completed() def _track_and_norm_grad(self, optimizer: torch.optim.Optimizer) -> Dict[str, Tensor]: @@ -701,9 +696,3 @@ def _truncated_bptt_steps(self) -> int: if lightning_module.truncated_bptt_steps > 0: return lightning_module.truncated_bptt_steps return self.trainer.truncated_bptt_steps or 0 - - def increment_scheduler_ready(self): - self.optim_progress.scheduler.increment_ready() - - def increment_scheduler_completed(self): - self.optim_progress.scheduler.increment_completed() diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 16b751e7db4b9..4c49b6e028cb4 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -83,7 +83,7 @@ def update_learning_rates( # update LR old_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] - self.trainer.fit_loop.epoch_loop.batch_loop.increment_scheduler_ready() + self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() if lr_scheduler['reduce_on_plateau']: lr_scheduler['scheduler'].step(monitor_val) @@ -92,7 +92,7 @@ def update_learning_rates( new_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] - self.trainer.fit_loop.epoch_loop.batch_loop.increment_scheduler_completed() + self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_completed() if self.trainer.dev_debugger.enabled: self.trainer.dev_debugger.track_lr_schedulers_update( diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 32e2ba0ea9b98..99895266e6b3c 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -154,7 +154,7 @@ class OptimizerProgress(BaseProgress): zero_grad: Tracks ``optimizer.zero_grad`` calls. """ - step: Progress = field(default_factory=lambda: Progress.from_defaults(processed=None)) + step: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) zero_grad: Progress = field(default_factory=lambda: Progress.from_defaults(processed=None)) def reset_on_epoch(self) -> None: @@ -173,28 +173,39 @@ class OptimizationProgress(BaseProgress): Args: optimizer: Tracks optimizer progress. - scheduler: Tracks scheduler progress. optimizer_idx: The index of the current optimizer. """ # TODO: support for multiple optimizers optimizer: OptimizerProgress = field(default_factory=OptimizerProgress) - scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) optimizer_idx: int = 0 @property def optimizer_steps(self) -> int: return self.optimizer.step.total.completed + def reset_on_epoch(self) -> None: + self.optimizer.current.reset() + self.optimizer_idx = 0 + + def load_state_dict(self, state_dict: dict) -> None: + self.optimizer.load_state_dict(state_dict["optimizer"]) + self.optimizer_idx = state_dict["optimizer_idx"] + + +class SchedulerProgress(BaseProgress): + """ + Track scheduler progress. 
+ + Args: + scheduler: Tracks scheduler progress. + """ + + scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) + @property def scheduler_steps(self) -> int: return self.scheduler.total.completed - def reset_on_epoch(self) -> None: - self.optimizer.reset_on_epoch() - self.scheduler.current.reset() - def load_state_dict(self, state_dict: dict) -> None: - self.optimizer.load_state_dict(state_dict["optimizer"]) self.scheduler.load_state_dict(state_dict["scheduler"]) - self.optimizer_idx = state_dict["optimizer_idx"] From e5a392a4ee30c7dd9115f1bbece66e7ea9e715d9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 18:14:03 +0200 Subject: [PATCH 062/106] Restarting changes --- pytorch_lightning/loops/base.py | 2 ++ pytorch_lightning/loops/batch/training_batch_loop.py | 1 - pytorch_lightning/loops/dataloader/dataloader_loop.py | 4 +--- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 4 +--- pytorch_lightning/loops/epoch/training_epoch_loop.py | 5 +---- tests/loops/test_loops.py | 6 +++--- 6 files changed, 8 insertions(+), 14 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index f67447fc19a03..66844a9dc5ddd 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -113,6 +113,8 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: self.advance(*args, **kwargs) self.on_advance_end() self.iteration_count += 1 + if self.restarting: + self.restarting = False except StopIteration: break diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 26adb3234f44c..bd82659fe5476 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -149,7 +149,6 @@ def advance(self, batch, batch_idx, dataloader_idx): if self.restarting: if opt_idx < self.optim_progress.optimizer_idx: continue - self.restarting = False # track optimizer_idx self.optim_progress.optimizer_idx = opt_idx diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py index d8bdd67b41c17..ed7f776fcad7d 100644 --- a/pytorch_lightning/loops/dataloader/dataloader_loop.py +++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py @@ -55,9 +55,7 @@ def done(self) -> bool: def reset(self) -> None: """Resets the internal state""" - if self.restarting: - self.restarting = False - else: + if not self.restarting: # reset batch / epoch progress tracking self.dataloader_progress.current.reset() diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 757df0fb29ef6..1c76d33acd404 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -58,9 +58,7 @@ def reset(self) -> None: self.num_dataloaders = None self.outputs = [] - if self.restarting: - self.restarting = False - else: + if not self.restarting: self.batch_progress.current.reset() def on_run_start( diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index eb0f4040a6102..cae9093291f44 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -85,10 +85,7 @@ def reset(self) -> None: self._epoch_output = [[] for _ in 
range(self.batch_loop.num_active_optimizers(self.total_batch_idx))] if self.restarting: - self.iteration_count = self.batch_loop.current_batch_completed - self.batches_seen = self.batch_loop.current_batch_completed - # restarting is finished. - self.restarting = False + self.iteration_count = self.batches_seen = self.batch_progress.current.completed else: # todo (tchaton) the batch_loop should be responsible for that. self.batch_loop.split_progress.current.reset() diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 70e2ca7a62d3e..34828bf8d59f1 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -57,7 +57,7 @@ def reset(self) -> None: for _ in range(self.iteration_count): next(self.iter_dataset) self.iteration_count += 1 - self.restarting = False + # self.restarting = False else: self.outputs = [] @@ -132,8 +132,8 @@ def skip(self) -> bool: def done(self) -> bool: return self.iteration_count > 0 - def reset(self) -> None: - self.restarting = False + # def reset(self) -> None: + # self.restarting = False def on_save_checkpoint(self) -> Dict: return {"a": self.a} From 49c511277f8275175024e232d225fc37be786219 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 18:39:59 +0200 Subject: [PATCH 063/106] Scheduler progress --- .../loops/batch/training_batch_loop.py | 2 +- .../loops/epoch/training_epoch_loop.py | 5 ++- pytorch_lightning/loops/fit_loop.py | 4 +- .../trainer/connectors/optimizer_connector.py | 2 +- pytorch_lightning/trainer/progress.py | 38 +++++++++---------- 5 files changed, 26 insertions(+), 25 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index bd82659fe5476..7cd7c76a2f6bd 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -150,7 +150,6 @@ def advance(self, batch, batch_idx, dataloader_idx): if opt_idx < self.optim_progress.optimizer_idx: continue - # track optimizer_idx self.optim_progress.optimizer_idx = opt_idx result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer) @@ -416,6 +415,7 @@ def _optimizer_step( # FIXME: why does it not fail? # self.optim_progress.optimizer.step.increment_processed() self.optim_progress.optimizer.step.increment_completed() + self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: """Calls the ``on_before_zero_grad`` hook. 
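For context on the counter semantics used throughout this series, here is a minimal, self-contained sketch of how the `Tracker`/`Progress` pair is expected to behave after these changes: `total` accumulates across the whole run (and is what ends up in the loop state dicts for fault-tolerant restarts), while `current` is reset by the owning loop at epoch boundaries via `current.reset()`. This is an illustrative re-implementation under those assumptions, not the actual `pytorch_lightning.trainer.progress` code.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Tracker:
    # the four counters mirror the hook sequence: ready -> started -> processed -> completed
    ready: Optional[int] = 0
    started: Optional[int] = 0
    processed: Optional[int] = 0
    completed: Optional[int] = 0

    def reset(self) -> None:
        # only reset counters that are tracked (None means "not tracked for this loop")
        for name in ("ready", "started", "processed", "completed"):
            if getattr(self, name) is not None:
                setattr(self, name, 0)


@dataclass
class Progress:
    total: Tracker = field(default_factory=Tracker)    # never reset, survives epoch boundaries
    current: Tracker = field(default_factory=Tracker)  # reset by the owning loop's reset()

    def increment_ready(self) -> None:
        self.total.ready += 1
        self.current.ready += 1

    def increment_completed(self) -> None:
        self.total.completed += 1
        self.current.completed += 1


# two batches processed in one epoch, then the loop resets its "current" counters
batch_progress = Progress()
for _ in range(2):
    batch_progress.increment_ready()
    batch_progress.increment_completed()
batch_progress.current.reset()

assert batch_progress.total.completed == 2    # run-wide count is preserved
assert batch_progress.current.completed == 0  # per-epoch count starts over

The same pattern applies to the `batch_progress`, `scheduler_progress`, `dataloader_progress` and `epoch_progress` attributes introduced in these commits; only the place where the `increment_*` helpers are called differs per loop.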
diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index cae9093291f44..0a3ba558e2f64 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -20,7 +20,7 @@ from pytorch_lightning import loops # import as loops to avoid circular imports from pytorch_lightning.loops.batch import TrainingBatchLoop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import Progress +from pytorch_lightning.trainer.progress import Progress, SchedulerProgress from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature @@ -45,6 +45,7 @@ def __init__(self, min_steps: int, max_steps: int): self.batches_seen: int = 0 self.is_last_batch: Optional[bool] = None self.batch_progress = Progress() + self.scheduler_progress = SchedulerProgress() self.batch_loop = TrainingBatchLoop() self.val_loop = loops.EvaluationLoop() @@ -230,6 +231,8 @@ def on_run_end(self) -> List[List[STEP_OUTPUT]]: self.trainer.call_hook('on_epoch_end') self.trainer.logger_connector.on_epoch_end() + self.update_lr_schedulers('epoch', update_plateau_schedulers=True) + epoch_output = self._epoch_output # free memory self._epoch_output = None diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 21087d73a4662..75dacdcec4ba9 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -222,15 +222,13 @@ def advance(self) -> None: # TODO(@carmocca): deprecate and rename so users don't get confused self.global_step -= 1 # log epoch metrics + # FIXME: was this wrong??? self.trainer.logger_connector.update_train_epoch_metrics() self.global_step += 1 def on_advance_end(self) -> None: """Updates the LR schedulers and does some internal bookkeeping""" - if self.epoch_loop.batches_seen != 0: - self.epoch_loop.update_lr_schedulers('epoch', update_plateau_schedulers=True) - did_train_only = not self.trainer.enable_validation or self.epoch_loop.val_loop.skip if did_train_only: self.global_step -= 1 diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 4c49b6e028cb4..9939901832c0e 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -83,7 +83,7 @@ def update_learning_rates( # update LR old_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] - self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() + self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_started() if lr_scheduler['reduce_on_plateau']: lr_scheduler['scheduler'].step(monitor_val) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 99895266e6b3c..043c6ebe9d4c4 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -133,7 +133,7 @@ def load_state_dict(self, state_dict: dict) -> None: @dataclass class DataLoaderProgress(Progress): """ - Tracks the data-loader progress + Tracks the dataloader progress These counters are local to a trainer rank. By default, they are not globally synced across all ranks. 
Args: @@ -144,6 +144,24 @@ class DataLoaderProgress(Progress): current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) +class SchedulerProgress(Progress): + """ + Tracks the scheduler progress + These counters are local to a trainer rank. By default, they are not globally synced across all ranks. + + Args: + total: Tracks the total scheduler progress + current: Tracks the current scheduler progress + """ + + total: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) + current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) + + @property + def scheduler_steps(self) -> int: + return self.total.completed + + @dataclass class OptimizerProgress(BaseProgress): """ @@ -191,21 +209,3 @@ def reset_on_epoch(self) -> None: def load_state_dict(self, state_dict: dict) -> None: self.optimizer.load_state_dict(state_dict["optimizer"]) self.optimizer_idx = state_dict["optimizer_idx"] - - -class SchedulerProgress(BaseProgress): - """ - Track scheduler progress. - - Args: - scheduler: Tracks scheduler progress. - """ - - scheduler: Progress = field(default_factory=lambda: Progress.from_defaults(started=None, processed=None)) - - @property - def scheduler_steps(self) -> int: - return self.scheduler.total.completed - - def load_state_dict(self, state_dict: dict) -> None: - self.scheduler.load_state_dict(state_dict["scheduler"]) From 018da6a530ab268295177dadcf54de96a25e1f12 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 18:44:08 +0200 Subject: [PATCH 064/106] Unused property, reset on epoch --- pytorch_lightning/trainer/progress.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 043c6ebe9d4c4..be78d7b8208a2 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -157,10 +157,6 @@ class SchedulerProgress(Progress): total: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) - @property - def scheduler_steps(self) -> int: - return self.total.completed - @dataclass class OptimizerProgress(BaseProgress): @@ -203,7 +199,7 @@ def optimizer_steps(self) -> int: return self.optimizer.step.total.completed def reset_on_epoch(self) -> None: - self.optimizer.current.reset() + self.optimizer.reset_on_epoch() self.optimizer_idx = 0 def load_state_dict(self, state_dict: dict) -> None: From 0b1834c6b0726f25144c64d9056d4f7648982de2 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 19:11:45 +0200 Subject: [PATCH 065/106] Resolve FIXME --- pytorch_lightning/loops/batch/training_batch_loop.py | 2 -- pytorch_lightning/trainer/progress.py | 8 -------- 2 files changed, 10 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 7cd7c76a2f6bd..60f43f5ba8825 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -412,8 +412,6 @@ def _optimizer_step( using_lbfgs=is_lbfgs, ) - # FIXME: why does it not fail? 
- # self.optim_progress.optimizer.step.increment_processed() self.optim_progress.optimizer.step.increment_completed() self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index be78d7b8208a2..4410cf3901ff2 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -98,26 +98,18 @@ class Progress(BaseProgress): current: Tracker = field(default_factory=Tracker) def increment_ready(self) -> None: - if self.total.ready is None or self.current.ready is None: - return self.total.ready += 1 self.current.ready += 1 def increment_started(self) -> None: - if self.total.started is None or self.current.started is None: - return self.total.started += 1 self.current.started += 1 def increment_processed(self) -> None: - if self.total.processed is None or self.current.processed is None: - return self.total.processed += 1 self.current.processed += 1 def increment_completed(self) -> None: - if self.total.completed is None or self.current.completed is None: - return self.total.completed += 1 self.current.completed += 1 From d7bcafa883d538b83b73db33fa6bf915937bba10 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 19:12:30 +0200 Subject: [PATCH 066/106] Remove FIXME --- pytorch_lightning/loops/fit_loop.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index 75dacdcec4ba9..d2f0dfe954a6f 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -222,7 +222,6 @@ def advance(self) -> None: # TODO(@carmocca): deprecate and rename so users don't get confused self.global_step -= 1 # log epoch metrics - # FIXME: was this wrong??? 
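# Editor's note (not part of the patch): the `Progress.increment_*` hunk a few lines above
# (patch 065) drops the early-return guards around counters whose stage is `None`. A
# standalone sketch of the consequence, using a made-up `_Tracker`: incrementing a stage
# that was declared untracked now fails loudly instead of being silently skipped, which is
# what resolves the "why does it not fail?" FIXME.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Tracker:
    ready: int = 0
    started: Optional[int] = None  # stage deliberately not tracked


t = _Tracker()
t.ready += 1          # tracked stage: fine
try:
    t.started += 1    # previously swallowed by the guard; now raises
except TypeError:
    pass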
self.trainer.logger_connector.update_train_epoch_metrics() self.global_step += 1 From e794fbe746b5fd09a7d1b89629a2c5ad3747a6b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 23:29:56 +0200 Subject: [PATCH 067/106] fix test_progress (wip) --- tests/trainer/test_progress.py | 109 +++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 47 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 7205b72ad3963..c6e4a5d266be0 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -34,7 +34,7 @@ class CustomException(BaseException): pass -def test_progress_geattr_setattr(): +def test_progress_getattr_setattr(): p = Tracker(ready=10, completed=None) # can read assert p.completed is None @@ -134,7 +134,7 @@ def __init__(self): self.should_fail = True def training_step(self, batch, batch_idx, optimizer_idx: int = None): - # breaking on global_step 4 + # simulate failure during the the 5-th training step, 2nd epoch (global_step = 4) if self.should_fail and self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( 1 if use_multiple_optimizers else None ): @@ -177,7 +177,7 @@ def configure_optimizers_3(self): # VALIDATE CHECKPOINT # ####################### - checkpoint = torch.load(trainer.checkpoint_callback.last_model_path) + checkpoint = torch.load(str(tmpdir / ".pl_auto_save.ckpt")) num_epochs = 1 num_batches = 4 @@ -188,13 +188,13 @@ def configure_optimizers_3(self): total_optimizer_step = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches # we raised expection on the first optimizer - current_optimize_step = (1 if use_multiple_optimizers else 0) + current_optimizer_step = (1 if use_multiple_optimizers else 0) if accumulate_grad_batches == 2 and use_multiple_optimizers: total_optimizer_step += 1 total_optimizer_zero_grad = total_optimizer_step - current_optimizer_zero_grad = current_optimize_step + current_optimizer_zero_grad = current_optimizer_step if accumulate_grad_batches == 2: # that's weird ! 
todo (tchaton) investigate this @@ -214,34 +214,49 @@ def configure_optimizers_3(self): expected = { "state_dict": {}, "epoch_loop.state_dict": {}, - "epoch_loop.progress": { + "epoch_loop.batch_progress": { "total": { - "ready": num_epochs + 1, - "started": num_epochs + 1, - "processed": 1, - "completed": 1 + "ready": 5, + "started": 5, + "processed": 4, + "completed": 4, }, "current": { - "ready": num_epochs + 1, - "started": num_epochs + 1, + "ready": 2, + "started": 2, "processed": 1, - "completed": 1 + "completed": 1, + }, + }, + "epoch_loop.scheduler_progress": { + "scheduler": { + "total": { + "ready": total_scheduler_step, + "started": None, + "processed": None, + "completed": total_scheduler_step, + }, + "current": { + "ready": current_scheduler_step, + "started": None, + "processed": None, + "completed": current_scheduler_step, + }, }, - "should_check_val": False, }, "epoch_loop.batch_loop.state_dict": {}, - "epoch_loop.batch_loop.progress": { + "epoch_loop.batch_loop.split_progress": { "total": { - "ready": num_batches + 1, - "started": num_batches + 1, - "processed": num_batches, - "completed": num_batches + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, }, "current": { - "ready": num_batches - limit_train_batches + 1, - "started": num_batches - limit_train_batches + 1, - "processed": num_batches - limit_train_batches, - "completed": num_batches - limit_train_batches + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, }, }, "epoch_loop.batch_loop.optim_progress": { @@ -250,15 +265,15 @@ def configure_optimizers_3(self): "step": { "total": { "ready": total_optimizer_step + 1, - "started": total_optimizer_step, + "started": None, "processed": None, - "completed": total_optimizer_step + "completed": total_optimizer_step, }, "current": { - "ready": current_optimize_step + 1, - "started": current_optimize_step, + "ready": current_optimizer_step + 1, + "started": None, "processed": None, - "completed": current_optimize_step, + "completed": current_optimizer_step, }, }, "zero_grad": { @@ -266,7 +281,7 @@ def configure_optimizers_3(self): "ready": total_optimizer_zero_grad, "started": total_optimizer_zero_grad, "processed": None, - "completed": total_optimizer_zero_grad + "completed": total_optimizer_zero_grad, }, "current": { "ready": current_optimizer_zero_grad, @@ -276,32 +291,32 @@ def configure_optimizers_3(self): }, }, }, - "scheduler": { - "total": { - "ready": total_scheduler_step, - "started": None, - "processed": None, - "completed": total_scheduler_step - }, - "current": { - "ready": current_scheduler_step, - "started": None, - "processed": None, - "completed": current_scheduler_step - }, - }, }, "epoch_loop.val_loop.state_dict": {}, - "epoch_loop.val_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_loop.val_loop.dataloader_progress": { + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, "dataloader_idx": 0, }, "epoch_loop.val_loop.epoch_loop.state_dict": {}, - "epoch_loop.val_loop.epoch_loop.progress": { + "epoch_loop.val_loop.epoch_loop.batch_progress": { "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, + "epoch_progress": { + "total": { + "ready": 2, + "started": 2, + "processed": 1, + "completed": 1, + }, + "current": { + 
"ready": 2, + "started": 2, + "processed": 1, + "completed": 1, + }, + }, } # yapf: enable From c98bd292a2c3e3330c8f314086f29aef1d509e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 14 Jul 2021 23:34:12 +0200 Subject: [PATCH 068/106] fix batch_progress.current.reset --- pytorch_lightning/loops/epoch/training_epoch_loop.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 0a3ba558e2f64..9a6f6bd59eac8 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -88,8 +88,7 @@ def reset(self) -> None: if self.restarting: self.iteration_count = self.batches_seen = self.batch_progress.current.completed else: - # todo (tchaton) the batch_loop should be responsible for that. - self.batch_loop.split_progress.current.reset() + self.batch_progress.current.reset() def on_run_start(self, *args: Any, **kwargs: Any) -> None: # hook From f90334cc7dc3e8fe98c4bc89dc6741103354c62a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 14 Jul 2021 23:39:36 +0200 Subject: [PATCH 069/106] Hold off on split progress. Out of scope of this PR --- pytorch_lightning/loops/batch/training_batch_loop.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 60f43f5ba8825..7976dfb5159f6 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -28,7 +28,7 @@ from pytorch_lightning.loops.base import Loop from pytorch_lightning.plugins import ParallelPlugin from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection -from pytorch_lightning.trainer.progress import OptimizationProgress, Progress +from pytorch_lightning.trainer.progress import OptimizationProgress from pytorch_lightning.trainer.supporters import TensorRunningAccum from pytorch_lightning.utilities import AMPType, AttributeDict, DeviceType, grad_norm from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -50,8 +50,6 @@ def __init__(self) -> None: self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20) self.batch_idx: int = 0 self.split_idx: Optional[int] = None - # TODO: add progress updates for batch splits - self.split_progress = Progress() self.optim_progress = OptimizationProgress() self._warning_cache: WarningCache = WarningCache() From 7fb78deba5e18b30ad828a495c371feecf83cdc0 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 00:07:53 +0200 Subject: [PATCH 070/106] Unnecessary if --- pytorch_lightning/loops/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 66844a9dc5ddd..6aa8ebefb60b1 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -113,8 +113,7 @@ def run(self, *args: Any, **kwargs: Any) -> Optional[Any]: self.advance(*args, **kwargs) self.on_advance_end() self.iteration_count += 1 - if self.restarting: - self.restarting = False + self.restarting = False except StopIteration: break From 8130a47b9beaa9b9d80ad44111736279d61fbd8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Jul 2021 00:09:34 +0200 Subject: [PATCH 071/106] fix structure in test_progress --- 
tests/trainer/test_progress.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index c6e4a5d266be0..442d446a8f7d1 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -229,19 +229,17 @@ def configure_optimizers_3(self): }, }, "epoch_loop.scheduler_progress": { - "scheduler": { - "total": { - "ready": total_scheduler_step, - "started": None, - "processed": None, - "completed": total_scheduler_step, - }, - "current": { - "ready": current_scheduler_step, - "started": None, - "processed": None, - "completed": current_scheduler_step, - }, + "total": { + "ready": total_scheduler_step, + "started": None, + "processed": None, + "completed": total_scheduler_step, + }, + "current": { + "ready": current_scheduler_step, + "started": None, + "processed": None, + "completed": current_scheduler_step, }, }, "epoch_loop.batch_loop.state_dict": {}, @@ -296,7 +294,6 @@ def configure_optimizers_3(self): "epoch_loop.val_loop.dataloader_progress": { "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, - "dataloader_idx": 0, }, "epoch_loop.val_loop.epoch_loop.state_dict": {}, "epoch_loop.val_loop.epoch_loop.batch_progress": { From b6b9ea4c1d4e220e918739ee5f961a381fc47ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Jul 2021 00:10:15 +0200 Subject: [PATCH 072/106] structure --- tests/trainer/test_progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 442d446a8f7d1..26fd16e43a8c1 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -326,7 +326,7 @@ def configure_optimizers_3(self): trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"]) state_dict = trainer.fit_loop.state_dict() assert state_dict != checkpoint["loops"]["fit_loop"] - assert state_dict['epoch_loop.progress']["total"]["started"] == num_epochs + assert state_dict["epoch_progress"]["total"]["started"] == 1 @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) From 4780b19fa74e2aec22a21a0b5ebba6824d9580e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Jul 2021 00:10:52 +0200 Subject: [PATCH 073/106] clean up unused variables in test_progress --- tests/trainer/test_progress.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 26fd16e43a8c1..6318c29c39701 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -131,11 +131,10 @@ def __init__(self): super().__init__() if use_multiple_optimizers: self.configure_optimizers = self.configure_optimizers_3 - self.should_fail = True def training_step(self, batch, batch_idx, optimizer_idx: int = None): # simulate failure during the the 5-th training step, 2nd epoch (global_step = 4) - if self.should_fail and self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( + if self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( 1 if use_multiple_optimizers else None ): raise CustomException @@ -155,16 +154,12 @@ def configure_optimizers_3(self): limit_train_batches = 3 - chk = ModelCheckpoint(dirpath=tmpdir, filename=str(use_multiple_optimizers), save_last=True) - chk.last_model_path = None trainer = Trainer( 
default_root_dir=tmpdir, max_epochs=2, limit_train_batches=limit_train_batches, limit_val_batches=0, - callbacks=chk, accumulate_grad_batches=accumulate_grad_batches, - resume_from_checkpoint=None, ) # simulate random failure in training_step @@ -179,9 +174,6 @@ def configure_optimizers_3(self): checkpoint = torch.load(str(tmpdir / ".pl_auto_save.ckpt")) - num_epochs = 1 - num_batches = 4 - num_optimizers = 3 if use_multiple_optimizers else 1 # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) From 7eee718d421ded8e93afcc30155223f753ef2752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 15 Jul 2021 00:11:24 +0200 Subject: [PATCH 074/106] refactor naming and organization in test_progress --- tests/trainer/test_progress.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 6318c29c39701..919ded5da60c6 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -141,13 +141,17 @@ def training_step(self, batch, batch_idx, optimizer_idx: int = None): return super().training_step(batch, batch_idx) def configure_optimizers_3(self): - optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) - lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1) + optimizer_0 = torch.optim.SGD(self.layer.parameters(), lr=0.1) optimizer_1 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - return [optimizer, optimizer_1, optimizer_2], \ - [lr_scheduler, {"scheduler": lr_scheduler_1, "interval": "step"}] + optimizers = [optimizer_0, optimizer_1, optimizer_2] + + lr_scheduler_0 = torch.optim.lr_scheduler.StepLR(optimizer_0, step_size=1) + lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) + # no scheduler for optimizer_2 + lr_schedulers = [lr_scheduler_0, {"scheduler": lr_scheduler_1, "interval": "step"}] + + return optimizers, lr_schedulers model = TestModel() model.training_epoch_end = None @@ -177,15 +181,15 @@ def configure_optimizers_3(self): num_optimizers = 3 if use_multiple_optimizers else 1 # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) - total_optimizer_step = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches + completed_optimizer_steps = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches # we raised expection on the first optimizer current_optimizer_step = (1 if use_multiple_optimizers else 0) if accumulate_grad_batches == 2 and use_multiple_optimizers: - total_optimizer_step += 1 + completed_optimizer_steps += 1 - total_optimizer_zero_grad = total_optimizer_step + total_optimizer_zero_grad = completed_optimizer_steps current_optimizer_zero_grad = current_optimizer_step if accumulate_grad_batches == 2: @@ -254,10 +258,10 @@ def configure_optimizers_3(self): "optimizer": { "step": { "total": { - "ready": total_optimizer_step + 1, + "ready": completed_optimizer_steps + 1, "started": None, "processed": None, - "completed": total_optimizer_step, + "completed": completed_optimizer_steps, }, "current": { "ready": current_optimizer_step + 1, From a1bd9892a64d91bfc6a531e278d58dbd931de91b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 00:09:01 +0200 Subject: [PATCH 075/106] Unnecessary variable --- 
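# Editor's note (not part of the patch): the loop body touched below keeps the restart
# guard `if opt_idx < self.optim_progress.optimizer_idx: continue`. A tiny standalone
# sketch of that resume-skip pattern (function and argument names are made up):
def _run_optimizers(optimizer_ids, restored_optimizer_idx, restarting):
    ran = []
    for opt_idx in optimizer_ids:
        if restarting and opt_idx < restored_optimizer_idx:
            continue  # this optimizer already finished before the failure
        ran.append(opt_idx)
    return ran


# the failure happened while optimizer 1 was running, so optimizer 0 is skipped on resume
assert _run_optimizers([0, 1, 2], restored_optimizer_idx=1, restarting=True) == [1, 2]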
pytorch_lightning/loops/batch/training_batch_loop.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index 7976dfb5159f6..d28b5a2bd39de 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -140,9 +140,7 @@ def advance(self, batch, batch_idx, dataloader_idx): self.trainer.logger_connector.on_train_split_start(batch_idx, split_idx, split_batch) if self.trainer.lightning_module.automatic_optimization: - active_optimizers = self.get_active_optimizers(batch_idx) - for opt_idx, optimizer in active_optimizers: - + for opt_idx, optimizer in self.get_active_optimizers(batch_idx): # handle optimization restart if self.restarting: if opt_idx < self.optim_progress.optimizer_idx: From f6d3a5f3e59dbdc955959f71325a8e515552b839 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 00:13:32 +0200 Subject: [PATCH 076/106] Remove unnecessary diff --- pytorch_lightning/loops/dataloader/dataloader_loop.py | 1 - pytorch_lightning/loops/epoch/training_epoch_loop.py | 4 ---- pytorch_lightning/loops/fit_loop.py | 3 ++- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/loops/dataloader/dataloader_loop.py b/pytorch_lightning/loops/dataloader/dataloader_loop.py index ed7f776fcad7d..65521aea547d8 100644 --- a/pytorch_lightning/loops/dataloader/dataloader_loop.py +++ b/pytorch_lightning/loops/dataloader/dataloader_loop.py @@ -56,7 +56,6 @@ def done(self) -> bool: def reset(self) -> None: """Resets the internal state""" if not self.restarting: - # reset batch / epoch progress tracking self.dataloader_progress.current.reset() def on_advance_start(self, *args: Any, **kwargs: Any) -> None: diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 9a6f6bd59eac8..91b938404f1ef 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -119,8 +119,6 @@ def advance(self, dataloader_iter: Iterator, **kwargs: Any) -> None: with self.trainer.profiler.profile("run_training_batch"): batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx) - - # TODO: remove with progress tracking self.batches_seen += 1 self.batch_progress.increment_processed() @@ -165,7 +163,6 @@ def on_advance_end(self): # VALIDATE IF NEEDED + CHECKPOINT CALLBACK # ----------------------------------------- should_check_val = self._should_check_val_fx(self.iteration_count, self.is_last_batch) - if should_check_val: self.trainer.validating = True self._run_validation() @@ -393,7 +390,6 @@ def update_lr_schedulers(self, interval: str, update_plateau_schedulers: bool) - """updates the lr schedulers based on the given interval""" if interval == "step" and self.batch_loop.should_accumulate(): return - self.trainer.optimizer_connector.update_learning_rates( interval=interval, update_plateau_schedulers=update_plateau_schedulers, diff --git a/pytorch_lightning/loops/fit_loop.py b/pytorch_lightning/loops/fit_loop.py index d2f0dfe954a6f..7df0d1445e3b3 100644 --- a/pytorch_lightning/loops/fit_loop.py +++ b/pytorch_lightning/loops/fit_loop.py @@ -51,9 +51,10 @@ def __init__( super().__init__() self.max_epochs = 1000 if (max_epochs is None and max_steps is None) else max_epochs self.min_epochs = 1 if (min_epochs is None and min_steps is None) else min_epochs - 
self.epoch_loop = TrainingEpochLoop(min_steps, max_steps) self.epoch_progress = Progress() + self.epoch_loop = TrainingEpochLoop(min_steps, max_steps) + @property def current_epoch(self) -> int: """Return the current epoch""" From d57bddffb66101187ba8ab4404143463571b5859 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 00:15:42 +0200 Subject: [PATCH 077/106] Improve comment --- pytorch_lightning/trainer/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index bb65ff47817e7..80e0508601b46 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -1255,7 +1255,6 @@ def _log_device_info(self) -> None: def _on_expection(self): if not self.is_global_zero or not _fault_tolerant_enabled(): return - - # save a checkpoint for fault tolerant training + # save a checkpoint for fault tolerant training. we don't use `log_dir` to minimize the chances of failure. file_path = os.path.join(self.default_root_dir, ".pl_auto_save.ckpt") self.save_checkpoint(file_path) From 099edd01cc28bebca506242a8ac633cca7823af6 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 00:23:38 +0200 Subject: [PATCH 078/106] Undo typing change to avoid polluting everything with mypy fixes --- pytorch_lightning/core/lightning.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pytorch_lightning/core/lightning.py b/pytorch_lightning/core/lightning.py index aeed3c9304a76..735f8ab160c1f 100644 --- a/pytorch_lightning/core/lightning.py +++ b/pytorch_lightning/core/lightning.py @@ -31,7 +31,6 @@ from torch.optim.optimizer import Optimizer from torchmetrics import Metric -import pytorch_lightning as pl from pytorch_lightning.core.grads import GradInformation from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks from pytorch_lightning.core.memory import ModelSummary @@ -90,7 +89,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: torch._C._log_api_usage_once(f"lightning.module.{self.__class__.__name__}") # pointer to the trainer object - self.trainer: Optional['pl.Trainer'] = None + self.trainer = None self._distrib_type = None self._device_type = None From 9145c82c1cd6da4c175bc4d4e88f65ee206ed444 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 01:32:02 +0200 Subject: [PATCH 079/106] Fix and improve test_loops.py --- tests/loops/test_loops.py | 81 +++++++++------------------------------ 1 file changed, 18 insertions(+), 63 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 34828bf8d59f1..59f84b36cf3dd 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -20,17 +20,6 @@ from pytorch_lightning.trainer.trainer import Trainer -def _collect_loop_progress(loop: Loop) -> Dict[str, Any]: - """Return the progress for the current loop and its children.""" - progress = {} - for k, v in loop.__dict__.items(): - if isinstance(v, BaseProgress): - progress[k] = v - elif isinstance(v, Loop): - progress[k] = _collect_loop_progress(v) - return progress - - def test_loop_restore(): class CustomExpection(Exception): @@ -52,12 +41,10 @@ def done(self) -> bool: def reset(self) -> None: self.iter_dataset = iter(self.dataset) - if self.restarting: for _ in range(self.iteration_count): next(self.iter_dataset) self.iteration_count += 1 - # self.restarting = False else: self.outputs = [] @@ -101,15 +88,8 @@ def test_loop_hierarchy(): @dataclass class 
SimpleProgress(BaseProgress): - increment: int = 0 - def state_dict(self): - return {"increment": self.increment} - - def load_state_dict(self, state_dict): - self.increment = state_dict["increment"] - class Simple(Loop): def __init__(self, a): @@ -122,18 +102,16 @@ def advance(self, *args: Any, **kwargs: Any) -> None: if not loop: return loop.run() - self.progress.increment += 1 - @property - def skip(self) -> bool: - return False + def on_advance_end(self): + self.progress.increment += 1 @property def done(self) -> bool: - return self.iteration_count > 0 + return self.progress.increment > 0 - # def reset(self) -> None: - # self.restarting = False + def reset(self) -> None: + ... def on_save_checkpoint(self) -> Dict: return {"a": self.a} @@ -141,26 +119,15 @@ def on_save_checkpoint(self) -> Dict: def on_load_checkpoint(self, state_dict: Dict) -> None: self.a = state_dict["a"] - grand_loop_parent = Simple(0) loop_parent = Simple(1) loop_child = Simple(2) - loop_parent.loop_child = loop_child - assert not loop_parent.skip - - state_dict = loop_parent.state_dict() - - loop_progress = _collect_loop_progress(loop_parent) - assert loop_progress["progress"] == loop_parent.progress - assert loop_progress["loop_child"]["progress"] == loop_child.progress - - loop_progress = _collect_loop_progress(loop_child) - assert loop_progress["progress"] == loop_child.progress - + # check the trainer reference is propagated loop_parent.trainer = Trainer() - assert loop_child.trainer == loop_parent.trainer + assert loop_child.trainer is loop_parent.trainer + state_dict = loop_parent.state_dict() assert state_dict == { 'state_dict': { 'a': 1 @@ -176,23 +143,14 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: } } - loop_parent.progress - state_dict["loop_child.state_dict"]["a"] = 3 - + # check restarting after `load_state_dict` loop_parent.load_state_dict(state_dict) assert loop_parent.restarting loop_parent.run() - loop_parent_copy = deepcopy(loop_parent) - assert loop_parent_copy.state_dict() == loop_parent.state_dict() - - assert loop_parent_copy.on_save_checkpoint() == {'a': 1} - assert loop_parent_copy.loop_child.on_save_checkpoint() == {'a': 3} - - assert not loop_parent.restarting - + # check the new state after `run` state_dict = loop_parent.state_dict() assert state_dict == { 'state_dict': { @@ -205,26 +163,23 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: 'a': 3 }, 'loop_child.progress': { - 'increment': 0 + 'increment': 1 } } + loop_parent_copy = deepcopy(loop_parent) + assert loop_parent_copy.state_dict() == loop_parent.state_dict() + + assert loop_parent_copy.on_save_checkpoint() == state_dict['state_dict'] + assert loop_parent_copy.loop_child.on_save_checkpoint() == state_dict['loop_child.state_dict'] + loop_parent = Simple(1) loop_child = Simple(2) loop_parent.loop_child = loop_child loop_parent.load_state_dict(state_dict) assert loop_parent.progress.increment == 1 - assert loop_parent.loop_child.progress.increment == 0 + assert loop_parent.loop_child.progress.increment == 1 del loop_parent.loop_child state_dict = loop_parent.state_dict() assert state_dict == {'state_dict': {'a': 1}, 'progress': {'increment': 1}} - - grand_loop_parent = Simple(0) - loop_parent = Simple(1) - loop_child = Simple(2) - grand_loop_parent.loop_child = loop_parent - loop_parent.loop_child = loop_child - - grand_loop_parent.trainer = Trainer() - assert loop_child.trainer is not None From b0fc845f80ca7131ba2774c326284c3e5edd305f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 
01:48:58 +0200 Subject: [PATCH 080/106] Fix and organize `test_loop_state_dict` --- tests/loops/test_loop_state_dict.py | 138 +++++++++------------------- 1 file changed, 43 insertions(+), 95 deletions(-) diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index 7dc182e2df8fd..cb6ed55d71b31 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import pytest from pytorch_lightning.loops import FitLoop @@ -33,140 +32,89 @@ def test_loops_state_dict(): def test_loops_state_dict_structure(): trainer = Trainer() - # structure saved by the checkpoint connector - state_dict = { - "fit_loop": trainer.fit_loop.state_dict(), - "validate_loop": trainer.validate_loop.state_dict(), - "test_loop": trainer.test_loop.state_dict(), - "predict_loop": trainer.predict_loop.state_dict(), - } + state_dict = trainer.checkpoint_connector._get_loops_state_dict() # yapf: disable expected = { "fit_loop": { "state_dict": {}, - "epoch_loop.state_dict": {}, - "epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "should_check_val": False, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, - "epoch_loop.batch_loop.state_dict": {}, - "epoch_loop.batch_loop.progress": { + + "epoch_loop.state_dict": {}, + "epoch_loop.batch_progress": { + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + "epoch_loop.scheduler_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, + "epoch_loop.batch_loop.optim_progress": { - "optimizer_idx": 0, "optimizer": { "step": { - "total": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, "zero_grad": { - "total": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": None, - "completed": 0, - }, - }, - }, - "scheduler": { - "total": { - "ready": 0, - "started": None, - "processed": None, - "completed": 0, - }, - "current": { - "ready": 0, - "started": None, - "processed": None, - "completed": 0, + "current": {"ready": 0, "started": 0, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": 0, "processed": None, "completed": 0}, }, }, + "optimizer_idx": 0, }, + "epoch_loop.batch_loop.state_dict": {}, + "epoch_loop.val_loop.state_dict": {}, - "epoch_loop.val_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "dataloader_idx": 0, + "epoch_loop.val_loop.dataloader_progress": { + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, + "epoch_loop.val_loop.epoch_loop.state_dict": {}, - 
"epoch_loop.val_loop.epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_loop.val_loop.epoch_loop.batch_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, }, - "validate_loop": { + "predict_loop": { "state_dict": {}, - "progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "dataloader_idx": 0, + "dataloader_progress": { + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, + "epoch_loop.state_dict": {}, - "epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_loop.batch_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, }, "test_loop": { "state_dict": {}, - "progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "dataloader_idx": 0, + "dataloader_progress": { + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, "epoch_loop.state_dict": {}, - "epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_loop.batch_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, }, - "predict_loop": { + "validate_loop": { "state_dict": {}, - "progress": { - "epoch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "dataloader_idx": 0, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, - } + "dataloader_progress": { + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, + "epoch_loop.state_dict": {}, - "epoch_loop.progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "epoch_loop.batch_progress": { "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "dataloader_idx": 0, - "batch": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - }, + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, }, } From 1577aa8bd1dbcaf8733b23cd6f27c8646dd51bac Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 01:52:39 +0200 Subject: [PATCH 081/106] Remove unnecessary checks in test --- tests/trainer/test_progress.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 919ded5da60c6..280cf22e4c378 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -20,13 +20,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.trainer.progress import ( - DataLoaderProgress, - 
OptimizationProgress, - OptimizerProgress, - Progress, - Tracker, -) +from pytorch_lightning.trainer.progress import BaseProgress, OptimizerProgress, Progress, Tracker from tests.helpers import BoringModel @@ -113,11 +107,9 @@ def test_optimizer_progress_default_factory(): def test_deepcopy(): - _ = deepcopy(Tracker()) + _ = deepcopy(BaseProgress()) _ = deepcopy(Progress()) - _ = deepcopy(DataLoaderProgress()) - _ = deepcopy(OptimizerProgress()) - _ = deepcopy(OptimizationProgress()) + _ = deepcopy(Tracker()) @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) From 1f3ae633a7eb84032e2a9e2728a5c939d24e7da0 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 01:55:21 +0200 Subject: [PATCH 082/106] Update test after disallowing updates on None attributes --- tests/trainer/test_progress.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 280cf22e4c378..4fd484128a98e 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -74,23 +74,23 @@ def test_base_progress_from_defaults(): def test_epoch_loop_progress_increment_sequence(): """Test sequences for incrementing batches reads and epochs.""" - batch = Progress(total=Tracker(started=None)) + batch = Progress() batch.increment_ready() - assert batch.total == Tracker(ready=1, started=None) + assert batch.total == Tracker(ready=1) assert batch.current == Tracker(ready=1) batch.increment_started() - assert batch.total == Tracker(ready=1, started=None) - assert batch.current == Tracker(ready=1) + assert batch.total == Tracker(ready=1, started=1) + assert batch.current == Tracker(ready=1, started=1) batch.increment_processed() - assert batch.total == Tracker(ready=1, started=None, processed=1) - assert batch.current == Tracker(ready=1, processed=1) + assert batch.total == Tracker(ready=1, started=1, processed=1) + assert batch.current == Tracker(ready=1, started=1, processed=1) batch.increment_completed() - assert batch.total == Tracker(ready=1, started=None, processed=1, completed=1) - assert batch.current == Tracker(ready=1, processed=1, completed=1) + assert batch.total == Tracker(ready=1, started=1, processed=1, completed=1) + assert batch.current == Tracker(ready=1, started=1, processed=1, completed=1) def test_optimizer_progress_default_factory(): From ad8224ca4f1bf40c5dc3fbf23a40f52e29704633 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 02:02:29 +0200 Subject: [PATCH 083/106] Typing --- pytorch_lightning/loops/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pytorch_lightning/loops/base.py b/pytorch_lightning/loops/base.py index 6aa8ebefb60b1..1efd67bb26f8e 100644 --- a/pytorch_lightning/loops/base.py +++ b/pytorch_lightning/loops/base.py @@ -160,7 +160,7 @@ def on_save_checkpoint(self) -> Dict: """ return {} - def on_load_checkpoint(self, state_dict: Dict): + def on_load_checkpoint(self, state_dict: Dict) -> None: """Called when loading a model checkpoint, use to reload loop state.""" def state_dict(self, destination: Optional[Dict] = None, prefix: Optional[str] = "") -> Dict: @@ -185,14 +185,14 @@ def state_dict(self, destination: Optional[Dict] = None, prefix: Optional[str] = return destination - def load_state_dict(self, state_dict: Dict, prefix="", restart_progress: bool = True): + def load_state_dict(self, state_dict: Dict, prefix: str = "", restart_progress: bool = True) -> None: """ Loads the state of this loop and all its 
children. """ self._load_from_state_dict(state_dict.copy(), prefix, restart_progress) for k, v in self.__dict__.items(): if isinstance(v, Loop): v.load_state_dict(state_dict.copy(), prefix + k + ".", restart_progress) - def _load_from_state_dict(self, state_dict, prefix, restart_progress): + def _load_from_state_dict(self, state_dict: Dict, prefix: str, restart_progress: bool) -> None: for k, v in self.__dict__.items(): if isinstance(v, BaseProgress): v.load_state_dict(state_dict[prefix + k]) From 403ea9d1e7be6eb86ac6835a0fb4ccf4e9c46593 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 02:06:45 +0200 Subject: [PATCH 084/106] Minor test cleanup --- tests/trainer/test_progress.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 4fd484128a98e..b1b60580cf179 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -336,14 +336,12 @@ def val_dataloader(self): model = ValidationModel() model.validation_epoch_end = None - chk = ModelCheckpoint(dirpath=tmpdir, save_last=True) - chk.last_model_path = None trainer = Trainer( default_root_dir=tmpdir, max_epochs=1, limit_train_batches=5, limit_val_batches=3, - callbacks=chk, + callbacks=ModelCheckpoint(dirpath=tmpdir, save_last=True), resume_from_checkpoint=None, val_check_interval=2, num_sanity_val_steps=0, @@ -355,12 +353,6 @@ def val_dataloader(self): except CustomException: pass - ####################### - # VALIDATE CHECKPOINT # - ####################### - - checkpoint = torch.load(trainer.checkpoint_callback.last_model_path)["loops"]["fit_loop"] - checkpoint = torch.load(trainer.checkpoint_callback.last_model_path)["loops"]["fit_loop"] expected = { From 6492cde5350c4e49ccf3d6fdcc5c55a6231b678b Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 02:52:48 +0200 Subject: [PATCH 085/106] Fix and move loop test --- tests/loops/test_loops.py | 111 +++++++++++++++++++++++++++++++-- tests/trainer/test_progress.py | 85 ------------------------- 2 files changed, 106 insertions(+), 90 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 59f84b36cf3dd..9219a1f832db0 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -11,19 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
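# Editor's note (not part of the patch): the checkpoint keys asserted in the tests below
# ("epoch_loop.val_loop.dataloader_progress", "epoch_loop.batch_loop.optim_progress", ...)
# come from flattening the loop tree recursively with dotted prefixes. A minimal
# standalone sketch of that scheme (`_MiniLoop` is made up, not the real Loop API):
from typing import Dict, Optional


class _MiniLoop:

    def __init__(self, **children: "_MiniLoop") -> None:
        self.children = children

    def state_dict(self, destination: Optional[Dict] = None, prefix: str = "") -> Dict:
        destination = {} if destination is None else destination
        destination[prefix + "state_dict"] = {}
        for name, child in self.children.items():
            child.state_dict(destination, prefix + name + ".")
        return destination


fit_loop = _MiniLoop(epoch_loop=_MiniLoop(val_loop=_MiniLoop()))
assert set(fit_loop.state_dict()) == {
    "state_dict",
    "epoch_loop.state_dict",
    "epoch_loop.val_loop.state_dict",
}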
+import os from copy import deepcopy from dataclasses import dataclass from typing import Any, Dict, Iterator +from unittest import mock +import torch + +from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.trainer.trainer import Trainer +from tests.helpers import BoringModel -def test_loop_restore(): +class CustomException(Exception): + pass - class CustomExpection(Exception): - pass + +def test_loop_restore(): class Simple(Loop): @@ -52,7 +59,7 @@ def advance(self) -> None: value = next(self.iter_dataset) if self.iteration_count == 5: - raise CustomExpection + raise CustomException self.outputs.append(value) @@ -71,7 +78,7 @@ def load_state_dict(self, state_dict: Dict) -> None: try: loop.run() state_dict = {} - except CustomExpection: + except CustomException: state_dict = loop.state_dict() loop = Simple(data) @@ -183,3 +190,97 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: del loop_parent.loop_child state_dict = loop_parent.state_dict() assert state_dict == {'state_dict': {'a': 1}, 'progress': {'increment': 1}} + + +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +def test_loop_restart_progress_multiple_datasets(tmpdir): + stop_epoch = stop_batch = stop_dataloader = 1 + n_dataloaders = 3 + n_batches = 3 + n_epochs = 2 + + class ValidationModel(BoringModel): + + def __init__(self): + super().__init__() + + def validation_step(self, batch, batch_idx, dataloader_idx): + if self.current_epoch == stop_epoch and batch_idx == stop_batch and dataloader_idx == stop_dataloader: + raise CustomException + return super().validation_step(batch, batch_idx) + + def val_dataloader(self): + return [super().val_dataloader()] * n_dataloaders + + model = ValidationModel() + model.validation_epoch_end = None + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=n_epochs, + limit_train_batches=1, + limit_val_batches=n_batches, + callbacks=ModelCheckpoint(dirpath=tmpdir, save_last=True), + num_sanity_val_steps=0, + ) + + # simulate random failure in training_step + try: + trainer.fit(model) + except CustomException: + pass + + ckpt_path = str(tmpdir / '.pl_auto_save.ckpt') + checkpoint = torch.load(ckpt_path)["loops"]["fit_loop"] + + total = (n_epochs - 1) * n_dataloaders + stop_dataloader + expected = { + "total": { + "ready": total + 1, + "started": None, + "processed": None, + "completed": total + }, + "current": { + "ready": stop_dataloader + 1, + "started": None, + "processed": None, + "completed": stop_dataloader, + }, + } + assert checkpoint["epoch_loop.val_loop.dataloader_progress"] == expected + + trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) + total = n_dataloaders * n_batches + n_batches + stop_epoch + expected = { + "total": { + "ready": total + 1, + "started": total + 1, + "processed": total, + "completed": total + }, + "current": { + "ready": stop_batch + 1, + "started": stop_batch + 1, + "processed": stop_batch, + "completed": stop_batch + }, + } + assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected + + trainer.fit_loop.load_state_dict(checkpoint) + expected = { + "total": { + "ready": total, + "started": total, + "processed": total, + "completed": total + }, + "current": { + "ready": stop_batch, + "started": stop_batch, + "processed": stop_batch, + "completed": stop_batch + }, + } + assert 
trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index b1b60580cf179..889689a241612 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -19,7 +19,6 @@ import torch from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.trainer.progress import BaseProgress, OptimizerProgress, Progress, Tracker from tests.helpers import BoringModel @@ -315,87 +314,3 @@ def configure_optimizers_3(self): state_dict = trainer.fit_loop.state_dict() assert state_dict != checkpoint["loops"]["fit_loop"] assert state_dict["epoch_progress"]["total"]["started"] == 1 - - -@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -def test_progress_tracking_validation_multiple_datasets(tmpdir): - - class ValidationModel(BoringModel): - - def __init__(self): - super().__init__() - - def validation_step(self, batch, batch_idx, dataloader_idx): - if self.trainer.fit_loop.epoch_loop.batch_idx == 3 and batch_idx == 1 and dataloader_idx == 1: - raise CustomException - return super().validation_step(batch, batch_idx) - - def val_dataloader(self): - return [super().val_dataloader(), super().val_dataloader(), super().val_dataloader()] - - model = ValidationModel() - model.validation_epoch_end = None - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=5, - limit_val_batches=3, - callbacks=ModelCheckpoint(dirpath=tmpdir, save_last=True), - resume_from_checkpoint=None, - val_check_interval=2, - num_sanity_val_steps=0, - ) - - # simulate random failure in training_step - try: - trainer.fit(model) - except CustomException: - pass - - checkpoint = torch.load(trainer.checkpoint_callback.last_model_path)["loops"]["fit_loop"] - - expected = { - "total": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1 - }, - "current": { - "ready": 1, - "started": 1, - "processed": 0, - "completed": 0 - }, - "dataloader_idx": 1, - } - - assert checkpoint["epoch_loop.val_loop.progress"] == expected - - # 3 dataloaders with 3 samples for batch_idx == 1 + first dataloader on batch_idx == 1 + failure on batch_idx = 1 - current = 2 - total = 3 * 3 + 3 + current - expected = { - "total": { - "ready": total, - "started": total, - "processed": total - 1, - "completed": total - 1 - }, - "current": { - "ready": current, - "started": current, - "processed": current - 1, - "completed": current - 1 - }, - } - - assert checkpoint["epoch_loop.val_loop.epoch_loop.progress"] == expected - - trainer = Trainer() - trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) - assert trainer.fit_loop.state_dict() == checkpoint - - trainer.fit_loop.load_state_dict(checkpoint) - assert trainer.fit_loop.state_dict() != checkpoint From bc5544dcba75c4ff2f4c7d48cb7807d794a16e05 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 02:57:25 +0200 Subject: [PATCH 086/106] Move test from progress to loops --- tests/loops/test_loops.py | 206 +++++++++++++++++++++++++++++++ tests/trainer/test_progress.py | 214 --------------------------------- 2 files changed, 206 insertions(+), 214 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 9219a1f832db0..1792fbc41c497 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -17,6 +17,7 @@ from typing import Any, Dict, Iterator from unittest import mock +import pytest import torch from 
pytorch_lightning.callbacks import ModelCheckpoint @@ -284,3 +285,208 @@ def val_dataloader(self): }, } assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected + + +@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) +@pytest.mark.parametrize("use_multiple_optimizers", [False, True]) +@pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) +def test_progress_tracking(use_multiple_optimizers, accumulate_grad_batches, tmpdir): + + class TestModel(BoringModel): + + def __init__(self): + super().__init__() + if use_multiple_optimizers: + self.configure_optimizers = self.configure_optimizers_3 + + def training_step(self, batch, batch_idx, optimizer_idx: int = None): + # simulate failure during the the 5-th training step, 2nd epoch (global_step = 4) + if self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( + 1 if use_multiple_optimizers else None + ): + raise CustomException + return super().training_step(batch, batch_idx) + + def configure_optimizers_3(self): + optimizer_0 = torch.optim.SGD(self.layer.parameters(), lr=0.1) + optimizer_1 = torch.optim.Adam(self.layer.parameters(), lr=0.1) + optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) + optimizers = [optimizer_0, optimizer_1, optimizer_2] + + lr_scheduler_0 = torch.optim.lr_scheduler.StepLR(optimizer_0, step_size=1) + lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) + # no scheduler for optimizer_2 + lr_schedulers = [lr_scheduler_0, {"scheduler": lr_scheduler_1, "interval": "step"}] + + return optimizers, lr_schedulers + + model = TestModel() + model.training_epoch_end = None + + limit_train_batches = 3 + + trainer = Trainer( + default_root_dir=tmpdir, + max_epochs=2, + limit_train_batches=limit_train_batches, + limit_val_batches=0, + accumulate_grad_batches=accumulate_grad_batches, + ) + + # simulate random failure in training_step + try: + trainer.fit(model) + except CustomException: + pass + + ####################### + # VALIDATE CHECKPOINT # + ####################### + + checkpoint = torch.load(str(tmpdir / ".pl_auto_save.ckpt")) + + num_optimizers = 3 if use_multiple_optimizers else 1 + + # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) + completed_optimizer_steps = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches + + # we raised expection on the first optimizer + current_optimizer_step = (1 if use_multiple_optimizers else 0) + + if accumulate_grad_batches == 2 and use_multiple_optimizers: + completed_optimizer_steps += 1 + + total_optimizer_zero_grad = completed_optimizer_steps + current_optimizer_zero_grad = current_optimizer_step + + if accumulate_grad_batches == 2: + # that's weird ! todo (tchaton) investigate this + total_optimizer_zero_grad = (9 if use_multiple_optimizers else 3) + current_optimizer_zero_grad = 0 # same there. 
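# Editor's note (not part of the patch): worked example of the step count computed above
# for the multi-optimizer run with accumulate_grad_batches=1. The three full batches of
# epoch 0 plus one full batch of epoch 1 give 4 * 3 = 12 optimizer steps, and in the
# failing batch optimizer 0 still steps before optimizer 1 raises, hence the extra +1.
assert (4 * 3 + 1) // 1 == 13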
+ + total_scheduler_step = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches + + current_scheduler_step = 0 + + if accumulate_grad_batches == 2: + total_scheduler_step += 1 + + optimizer_idx = (1 if use_multiple_optimizers else 0) + + # yapf: disable + expected = { + "state_dict": {}, + "epoch_loop.state_dict": {}, + "epoch_loop.batch_progress": { + "total": { + "ready": 5, + "started": 5, + "processed": 4, + "completed": 4, + }, + "current": { + "ready": 2, + "started": 2, + "processed": 1, + "completed": 1, + }, + }, + "epoch_loop.scheduler_progress": { + "total": { + "ready": total_scheduler_step, + "started": None, + "processed": None, + "completed": total_scheduler_step, + }, + "current": { + "ready": current_scheduler_step, + "started": None, + "processed": None, + "completed": current_scheduler_step, + }, + }, + "epoch_loop.batch_loop.state_dict": {}, + "epoch_loop.batch_loop.split_progress": { + "total": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + "current": { + "ready": 0, + "started": 0, + "processed": 0, + "completed": 0, + }, + }, + "epoch_loop.batch_loop.optim_progress": { + "optimizer_idx": optimizer_idx, + "optimizer": { + "step": { + "total": { + "ready": completed_optimizer_steps + 1, + "started": None, + "processed": None, + "completed": completed_optimizer_steps, + }, + "current": { + "ready": current_optimizer_step + 1, + "started": None, + "processed": None, + "completed": current_optimizer_step, + }, + }, + "zero_grad": { + "total": { + "ready": total_optimizer_zero_grad, + "started": total_optimizer_zero_grad, + "processed": None, + "completed": total_optimizer_zero_grad, + }, + "current": { + "ready": current_optimizer_zero_grad, + "started": current_optimizer_zero_grad, + "processed": None, + "completed": current_optimizer_zero_grad, + }, + }, + }, + }, + "epoch_loop.val_loop.state_dict": {}, + "epoch_loop.val_loop.dataloader_progress": { + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + }, + "epoch_loop.val_loop.epoch_loop.state_dict": {}, + "epoch_loop.val_loop.epoch_loop.batch_progress": { + "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + }, + "epoch_progress": { + "total": { + "ready": 2, + "started": 2, + "processed": 1, + "completed": 1, + }, + "current": { + "ready": 2, + "started": 2, + "processed": 1, + "completed": 1, + }, + }, + } + # yapf: enable + + assert checkpoint["loops"]["fit_loop"] == expected + + trainer = Trainer() + trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"], restart_progress=False) + assert trainer.fit_loop.state_dict() == checkpoint["loops"]["fit_loop"] + + trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"]) + state_dict = trainer.fit_loop.state_dict() + assert state_dict != checkpoint["loops"]["fit_loop"] + assert state_dict["epoch_progress"]["total"]["started"] == 1 diff --git a/tests/trainer/test_progress.py b/tests/trainer/test_progress.py index 889689a241612..4057a2a686134 100644 --- a/tests/trainer/test_progress.py +++ b/tests/trainer/test_progress.py @@ -11,20 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
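# Editor's note (not part of the patch): what the final assertions of the moved test above
# encode. The checkpoint was written mid-epoch (epoch_progress.total has ready/started = 2
# but processed/completed = 1); reloading it with restarting enabled rolls the in-flight
# stages back to the last completed value so the interrupted epoch is replayed. A made-up
# helper just to illustrate the rollback idea:
def _rollback(tracker: dict) -> dict:
    # anything past "completed" was still in flight and will be redone after resume
    return {**tracker, "ready": tracker["completed"], "started": tracker["completed"]}


assert _rollback({"ready": 2, "started": 2, "processed": 1, "completed": 1})["started"] == 1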
-import os from copy import deepcopy -from unittest import mock import pytest -import torch -from pytorch_lightning import Trainer from pytorch_lightning.trainer.progress import BaseProgress, OptimizerProgress, Progress, Tracker -from tests.helpers import BoringModel - - -class CustomException(BaseException): - pass def test_progress_getattr_setattr(): @@ -109,208 +100,3 @@ def test_deepcopy(): _ = deepcopy(BaseProgress()) _ = deepcopy(Progress()) _ = deepcopy(Tracker()) - - -@mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.parametrize("use_multiple_optimizers", [False, True]) -@pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) -def test_progress_tracking(use_multiple_optimizers, accumulate_grad_batches, tmpdir): - - class TestModel(BoringModel): - - def __init__(self): - super().__init__() - if use_multiple_optimizers: - self.configure_optimizers = self.configure_optimizers_3 - - def training_step(self, batch, batch_idx, optimizer_idx: int = None): - # simulate failure during the the 5-th training step, 2nd epoch (global_step = 4) - if self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( - 1 if use_multiple_optimizers else None - ): - raise CustomException - return super().training_step(batch, batch_idx) - - def configure_optimizers_3(self): - optimizer_0 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_1 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizers = [optimizer_0, optimizer_1, optimizer_2] - - lr_scheduler_0 = torch.optim.lr_scheduler.StepLR(optimizer_0, step_size=1) - lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) - # no scheduler for optimizer_2 - lr_schedulers = [lr_scheduler_0, {"scheduler": lr_scheduler_1, "interval": "step"}] - - return optimizers, lr_schedulers - - model = TestModel() - model.training_epoch_end = None - - limit_train_batches = 3 - - trainer = Trainer( - default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=limit_train_batches, - limit_val_batches=0, - accumulate_grad_batches=accumulate_grad_batches, - ) - - # simulate random failure in training_step - try: - trainer.fit(model) - except CustomException: - pass - - ####################### - # VALIDATE CHECKPOINT # - ####################### - - checkpoint = torch.load(str(tmpdir / ".pl_auto_save.ckpt")) - - num_optimizers = 3 if use_multiple_optimizers else 1 - - # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) - completed_optimizer_steps = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches - - # we raised expection on the first optimizer - current_optimizer_step = (1 if use_multiple_optimizers else 0) - - if accumulate_grad_batches == 2 and use_multiple_optimizers: - completed_optimizer_steps += 1 - - total_optimizer_zero_grad = completed_optimizer_steps - current_optimizer_zero_grad = current_optimizer_step - - if accumulate_grad_batches == 2: - # that's weird ! todo (tchaton) investigate this - total_optimizer_zero_grad = (9 if use_multiple_optimizers else 3) - current_optimizer_zero_grad = 0 # same there. 
- - total_scheduler_step = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches - - current_scheduler_step = 0 - - if accumulate_grad_batches == 2: - total_scheduler_step += 1 - - optimizer_idx = (1 if use_multiple_optimizers else 0) - - # yapf: disable - expected = { - "state_dict": {}, - "epoch_loop.state_dict": {}, - "epoch_loop.batch_progress": { - "total": { - "ready": 5, - "started": 5, - "processed": 4, - "completed": 4, - }, - "current": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, - }, - }, - "epoch_loop.scheduler_progress": { - "total": { - "ready": total_scheduler_step, - "started": None, - "processed": None, - "completed": total_scheduler_step, - }, - "current": { - "ready": current_scheduler_step, - "started": None, - "processed": None, - "completed": current_scheduler_step, - }, - }, - "epoch_loop.batch_loop.state_dict": {}, - "epoch_loop.batch_loop.split_progress": { - "total": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, - "epoch_loop.batch_loop.optim_progress": { - "optimizer_idx": optimizer_idx, - "optimizer": { - "step": { - "total": { - "ready": completed_optimizer_steps + 1, - "started": None, - "processed": None, - "completed": completed_optimizer_steps, - }, - "current": { - "ready": current_optimizer_step + 1, - "started": None, - "processed": None, - "completed": current_optimizer_step, - }, - }, - "zero_grad": { - "total": { - "ready": total_optimizer_zero_grad, - "started": total_optimizer_zero_grad, - "processed": None, - "completed": total_optimizer_zero_grad, - }, - "current": { - "ready": current_optimizer_zero_grad, - "started": current_optimizer_zero_grad, - "processed": None, - "completed": current_optimizer_zero_grad, - }, - }, - }, - }, - "epoch_loop.val_loop.state_dict": {}, - "epoch_loop.val_loop.dataloader_progress": { - "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, - "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, - }, - "epoch_loop.val_loop.epoch_loop.state_dict": {}, - "epoch_loop.val_loop.epoch_loop.batch_progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - }, - "epoch_progress": { - "total": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, - }, - "current": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, - }, - }, - } - # yapf: enable - - assert checkpoint["loops"]["fit_loop"] == expected - - trainer = Trainer() - trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"], restart_progress=False) - assert trainer.fit_loop.state_dict() == checkpoint["loops"]["fit_loop"] - - trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"]) - state_dict = trainer.fit_loop.state_dict() - assert state_dict != checkpoint["loops"]["fit_loop"] - assert state_dict["epoch_progress"]["total"]["started"] == 1 From 098c7b5becc0b64d9784ea63553376ff47f79874 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 03:47:55 +0200 Subject: [PATCH 087/106] Reset the scheduler progress --- pytorch_lightning/loops/epoch/training_epoch_loop.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 91b938404f1ef..2e482a01132cb 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py 
+++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -89,6 +89,7 @@ def reset(self) -> None: self.iteration_count = self.batches_seen = self.batch_progress.current.completed else: self.batch_progress.current.reset() + self.scheduler_progress.current.reset() def on_run_start(self, *args: Any, **kwargs: Any) -> None: # hook From ef7c9e05059146e077bd32602f74804c428473b9 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 04:03:38 +0200 Subject: [PATCH 088/106] SchedulerProgress fix --- pytorch_lightning/loops/batch/training_batch_loop.py | 1 - pytorch_lightning/trainer/connectors/optimizer_connector.py | 2 +- pytorch_lightning/trainer/progress.py | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index d28b5a2bd39de..a4d76e3547126 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -409,7 +409,6 @@ def _optimizer_step( ) self.optim_progress.optimizer.step.increment_completed() - self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() def _on_before_zero_grad(self, optimizer: torch.optim.Optimizer) -> None: """Calls the ``on_before_zero_grad`` hook. diff --git a/pytorch_lightning/trainer/connectors/optimizer_connector.py b/pytorch_lightning/trainer/connectors/optimizer_connector.py index 9939901832c0e..4c49b6e028cb4 100644 --- a/pytorch_lightning/trainer/connectors/optimizer_connector.py +++ b/pytorch_lightning/trainer/connectors/optimizer_connector.py @@ -83,7 +83,7 @@ def update_learning_rates( # update LR old_lr = lr_scheduler['scheduler'].optimizer.param_groups[0]['lr'] - self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_started() + self.trainer.fit_loop.epoch_loop.scheduler_progress.increment_ready() if lr_scheduler['reduce_on_plateau']: lr_scheduler['scheduler'].step(monitor_val) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 4410cf3901ff2..1321cdd596fe7 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -136,6 +136,7 @@ class DataLoaderProgress(Progress): current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) +@dataclass class SchedulerProgress(Progress): """ Tracks the scheduler progress From 7938403a5e4cb6726e1739a35fdb36dcf144fc3f Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 04:04:37 +0200 Subject: [PATCH 089/106] Consistent whitespace --- pytorch_lightning/trainer/progress.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pytorch_lightning/trainer/progress.py b/pytorch_lightning/trainer/progress.py index 1321cdd596fe7..fe9f90613ea9c 100644 --- a/pytorch_lightning/trainer/progress.py +++ b/pytorch_lightning/trainer/progress.py @@ -132,6 +132,7 @@ class DataLoaderProgress(Progress): total: Tracks the total dataloader progress current: Tracks the current dataloader progress """ + total: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) current: Tracker = field(default_factory=lambda: Tracker(started=None, processed=None)) From 7799101b0ce2a4cfbf1a828ed1a1aa374caef3c7 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 04:15:51 +0200 Subject: [PATCH 090/106] Fix final test --- tests/loops/test_loops.py | 157 +++++++++++++++----------------------- 1 file changed, 61 insertions(+), 96 deletions(-) diff --git a/tests/loops/test_loops.py 
b/tests/loops/test_loops.py index 1792fbc41c497..0b6eb4205a802 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -16,6 +16,7 @@ from dataclasses import dataclass from typing import Any, Dict, Iterator from unittest import mock +from unittest.mock import ANY import pytest import torch @@ -225,7 +226,7 @@ def val_dataloader(self): num_sanity_val_steps=0, ) - # simulate random failure in training_step + # simulate a failure try: trainer.fit(model) except CustomException: @@ -290,7 +291,12 @@ def val_dataloader(self): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) @pytest.mark.parametrize("use_multiple_optimizers", [False, True]) @pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) -def test_progress_tracking(use_multiple_optimizers, accumulate_grad_batches, tmpdir): +def test_loop_state_on_exception(use_multiple_optimizers, accumulate_grad_batches, tmpdir): + stop_epoch = stop_batch = 1 + stop_optimizer = 1 if use_multiple_optimizers else 0 + n_optimizers = 3 if use_multiple_optimizers else 1 + n_epochs = 2 + n_batches = 3 class TestModel(BoringModel): @@ -299,11 +305,8 @@ def __init__(self): if use_multiple_optimizers: self.configure_optimizers = self.configure_optimizers_3 - def training_step(self, batch, batch_idx, optimizer_idx: int = None): - # simulate failure during the the 5-th training step, 2nd epoch (global_step = 4) - if self.trainer.current_epoch == 1 and batch_idx == 1 and optimizer_idx == ( - 1 if use_multiple_optimizers else None - ): + def training_step(self, batch, batch_idx, optimizer_idx=0): + if self.trainer.current_epoch == stop_epoch and batch_idx == stop_batch and optimizer_idx == stop_optimizer: raise CustomException return super().training_step(batch, batch_idx) @@ -323,118 +326,102 @@ def configure_optimizers_3(self): model = TestModel() model.training_epoch_end = None - limit_train_batches = 3 - trainer = Trainer( default_root_dir=tmpdir, - max_epochs=2, - limit_train_batches=limit_train_batches, + max_epochs=n_epochs, + limit_train_batches=n_batches, limit_val_batches=0, accumulate_grad_batches=accumulate_grad_batches, ) - # simulate random failure in training_step + # simulate a failure try: trainer.fit(model) except CustomException: pass - ####################### - # VALIDATE CHECKPOINT # - ####################### - - checkpoint = torch.load(str(tmpdir / ".pl_auto_save.ckpt")) - - num_optimizers = 3 if use_multiple_optimizers else 1 - - # 4 optimizer steps because breaking on the second batch of the second epoch (3 + 1) - completed_optimizer_steps = (4 * num_optimizers + (1 if use_multiple_optimizers else 0)) // accumulate_grad_batches - - # we raised expection on the first optimizer - current_optimizer_step = (1 if use_multiple_optimizers else 0) - - if accumulate_grad_batches == 2 and use_multiple_optimizers: - completed_optimizer_steps += 1 + ckpt_path = str(tmpdir / ".pl_auto_save.ckpt") + checkpoint = torch.load(ckpt_path) - total_optimizer_zero_grad = completed_optimizer_steps - current_optimizer_zero_grad = current_optimizer_step + batches_seen = (n_epochs - stop_epoch) * n_batches + stop_batch + total_optimizer_steps = batches_seen // accumulate_grad_batches * n_optimizers + stop_optimizer + total_optimizer_zero_grad = total_optimizer_steps + current_optimizer_zero_grad = stop_optimizer if accumulate_grad_batches == 2: - # that's weird ! todo (tchaton) investigate this + # FIXME: that's weird ! total_optimizer_zero_grad = (9 if use_multiple_optimizers else 3) current_optimizer_zero_grad = 0 # same there. 
- total_scheduler_step = (5 if use_multiple_optimizers else 1) // accumulate_grad_batches - - current_scheduler_step = 0 - - if accumulate_grad_batches == 2: - total_scheduler_step += 1 - - optimizer_idx = (1 if use_multiple_optimizers else 0) + total_scheduler_steps = n_epochs - stop_epoch + current_scheduler_steps = 0 # the current epoch did not complete + if use_multiple_optimizers: + # 1 for the epoch-interval scheduler and `batches_seen` for the batch-interval scheduler + total_scheduler_steps = 1 + batches_seen // accumulate_grad_batches + current_scheduler_steps = stop_batch // accumulate_grad_batches # yapf: disable expected = { "state_dict": {}, + "epoch_progress": { + "total": { + "ready": stop_epoch + 1, + "started": stop_epoch + 1, + "processed": stop_epoch, + "completed": stop_epoch, + }, + "current": { + "ready": stop_epoch + 1, + "started": stop_epoch + 1, + "processed": stop_epoch, + "completed": stop_epoch, + }, + }, "epoch_loop.state_dict": {}, "epoch_loop.batch_progress": { "total": { - "ready": 5, - "started": 5, - "processed": 4, - "completed": 4, + "ready": batches_seen + 1, + "started": batches_seen + 1, + "processed": batches_seen, + "completed": batches_seen, }, "current": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, + "ready": stop_batch + 1, + "started": stop_batch + 1, + "processed": stop_batch, + "completed": stop_batch, }, }, "epoch_loop.scheduler_progress": { "total": { - "ready": total_scheduler_step, + "ready": total_scheduler_steps, "started": None, "processed": None, - "completed": total_scheduler_step, + "completed": total_scheduler_steps, }, "current": { - "ready": current_scheduler_step, + "ready": current_scheduler_steps, "started": None, "processed": None, - "completed": current_scheduler_step, + "completed": current_scheduler_steps, }, }, "epoch_loop.batch_loop.state_dict": {}, - "epoch_loop.batch_loop.split_progress": { - "total": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - "current": { - "ready": 0, - "started": 0, - "processed": 0, - "completed": 0, - }, - }, "epoch_loop.batch_loop.optim_progress": { - "optimizer_idx": optimizer_idx, + "optimizer_idx": stop_optimizer, "optimizer": { "step": { "total": { - "ready": completed_optimizer_steps + 1, + "ready": total_optimizer_steps + 1, "started": None, "processed": None, - "completed": completed_optimizer_steps, + "completed": total_optimizer_steps, }, "current": { - "ready": current_optimizer_step + 1, + "ready": stop_optimizer + 1, "started": None, "processed": None, - "completed": current_optimizer_step, + "completed": stop_optimizer, }, }, "zero_grad": { @@ -453,36 +440,14 @@ def configure_optimizers_3(self): }, }, }, - "epoch_loop.val_loop.state_dict": {}, - "epoch_loop.val_loop.dataloader_progress": { - "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, - "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, - }, - "epoch_loop.val_loop.epoch_loop.state_dict": {}, - "epoch_loop.val_loop.epoch_loop.batch_progress": { - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - }, - "epoch_progress": { - "total": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, - }, - "current": { - "ready": 2, - "started": 2, - "processed": 1, - "completed": 1, - }, - }, + "epoch_loop.val_loop.state_dict": ANY, + "epoch_loop.val_loop.dataloader_progress": ANY, + "epoch_loop.val_loop.epoch_loop.state_dict": ANY, 
+ "epoch_loop.val_loop.epoch_loop.batch_progress": ANY, } # yapf: enable - assert checkpoint["loops"]["fit_loop"] == expected - trainer = Trainer() trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"], restart_progress=False) assert trainer.fit_loop.state_dict() == checkpoint["loops"]["fit_loop"] From a3756076fb7bd71441dacb8f02d63d21b35e6f0c Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 05:22:07 +0200 Subject: [PATCH 091/106] Minor test changes --- tests/loops/test_loop_state_dict.py | 4 ++-- tests/loops/test_loops.py | 15 ++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/loops/test_loop_state_dict.py b/tests/loops/test_loop_state_dict.py index cb6ed55d71b31..f014f8c619b54 100644 --- a/tests/loops/test_loop_state_dict.py +++ b/tests/loops/test_loop_state_dict.py @@ -48,8 +48,8 @@ def test_loops_state_dict_structure(): "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, }, "epoch_loop.scheduler_progress": { - "current": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, - "total": {"ready": 0, "started": 0, "processed": 0, "completed": 0}, + "current": {"ready": 0, "started": None, "processed": None, "completed": 0}, + "total": {"ready": 0, "started": None, "processed": None, "completed": 0}, }, "epoch_loop.batch_loop.optim_progress": { diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 0b6eb4205a802..ec9ad3d2b9257 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -212,7 +212,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx): return super().validation_step(batch, batch_idx) def val_dataloader(self): - return [super().val_dataloader()] * n_dataloaders + return [super(ValidationModel, self).val_dataloader() for _ in range(n_dataloaders)] model = ValidationModel() model.validation_epoch_end = None @@ -303,21 +303,18 @@ class TestModel(BoringModel): def __init__(self): super().__init__() if use_multiple_optimizers: - self.configure_optimizers = self.configure_optimizers_3 + self.configure_optimizers = self.configure_optimizers_multiple def training_step(self, batch, batch_idx, optimizer_idx=0): if self.trainer.current_epoch == stop_epoch and batch_idx == stop_batch and optimizer_idx == stop_optimizer: raise CustomException return super().training_step(batch, batch_idx) - def configure_optimizers_3(self): - optimizer_0 = torch.optim.SGD(self.layer.parameters(), lr=0.1) - optimizer_1 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizer_2 = torch.optim.Adam(self.layer.parameters(), lr=0.1) - optimizers = [optimizer_0, optimizer_1, optimizer_2] + def configure_optimizers_multiple(self): + optimizers = [torch.optim.Adam(self.layer.parameters(), lr=0.1) for _ in range(n_optimizers)] - lr_scheduler_0 = torch.optim.lr_scheduler.StepLR(optimizer_0, step_size=1) - lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizer_1, step_size=1) + lr_scheduler_0 = torch.optim.lr_scheduler.StepLR(optimizers[0], step_size=1) + lr_scheduler_1 = torch.optim.lr_scheduler.StepLR(optimizers[1], step_size=1) # no scheduler for optimizer_2 lr_schedulers = [lr_scheduler_0, {"scheduler": lr_scheduler_1, "interval": "step"}] From abb08a063120bcbe8d71b552b1c54519e4fe0c95 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 18:05:18 +0200 Subject: [PATCH 092/106] One test to rule them all --- .../loops/batch/training_batch_loop.py | 2 - .../loops/epoch/training_epoch_loop.py | 1 + tests/loops/test_loops.py | 128 ++++++++++++------ 3 files 
changed, 89 insertions(+), 42 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index a4d76e3547126..b7a5eceae916e 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -110,8 +110,6 @@ def reset(self) -> None: self.batch_idx = 0 self.batch_outputs = [[] for _ in range(len(self.trainer.optimizers))] - self.optim_progress.reset_on_epoch() - def on_run_start(self, batch: Any, batch_idx: int, dataloader_idx: int): """Splits the data into tbptt splits diff --git a/pytorch_lightning/loops/epoch/training_epoch_loop.py b/pytorch_lightning/loops/epoch/training_epoch_loop.py index 2e482a01132cb..d9a2e6bb8cbb3 100644 --- a/pytorch_lightning/loops/epoch/training_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/training_epoch_loop.py @@ -90,6 +90,7 @@ def reset(self) -> None: else: self.batch_progress.current.reset() self.scheduler_progress.current.reset() + self.batch_loop.optim_progress.reset_on_epoch() def on_run_start(self, *args: Any, **kwargs: Any) -> None: # hook diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index ec9ad3d2b9257..23e28338719e4 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -149,7 +149,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: }, 'loop_child.progress': { 'increment': 0 - } + }, } state_dict["loop_child.state_dict"]["a"] = 3 @@ -173,7 +173,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: }, 'loop_child.progress': { 'increment': 1 - } + }, } loop_parent_copy = deepcopy(loop_parent) @@ -195,7 +195,7 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -def test_loop_restart_progress_multiple_datasets(tmpdir): +def test_loop_restart_progress_multiple_dataloaders(tmpdir): stop_epoch = stop_batch = stop_dataloader = 1 n_dataloaders = 3 n_batches = 3 @@ -265,7 +265,7 @@ def val_dataloader(self): "ready": stop_batch + 1, "started": stop_batch + 1, "processed": stop_batch, - "completed": stop_batch + "completed": stop_batch, }, } assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected @@ -289,20 +289,21 @@ def val_dataloader(self): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.parametrize("use_multiple_optimizers", [False, True]) -@pytest.mark.parametrize("accumulate_grad_batches", [1, 2]) -def test_loop_state_on_exception(use_multiple_optimizers, accumulate_grad_batches, tmpdir): - stop_epoch = stop_batch = 1 - stop_optimizer = 1 if use_multiple_optimizers else 0 - n_optimizers = 3 if use_multiple_optimizers else 1 - n_epochs = 2 +@pytest.mark.parametrize("accumulate_grad_batches", (1, 2)) # FIXME: 3 is broken +@pytest.mark.parametrize("n_optimizers", (1, 3, 5)) +@pytest.mark.parametrize("stop_epoch", (1, 2)) +@pytest.mark.parametrize("stop_batch", (1, )) # FIXME: 2 is broken +@pytest.mark.parametrize("stop_optimizer", (1, 2)) +def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch, stop_optimizer, n_optimizers, tmpdir): + stop_optimizer = stop_optimizer if stop_optimizer < n_optimizers else 0 + n_epochs = 3 n_batches = 3 class TestModel(BoringModel): def __init__(self): super().__init__() - if use_multiple_optimizers: + if n_optimizers > 1: self.configure_optimizers = self.configure_optimizers_multiple def training_step(self, batch, batch_idx, optimizer_idx=0): @@ -329,6 +330,8 @@ def 
configure_optimizers_multiple(self): limit_train_batches=n_batches, limit_val_batches=0, accumulate_grad_batches=accumulate_grad_batches, + progress_bar_refresh_rate=0, + logger=False, ) # simulate a failure @@ -340,22 +343,65 @@ def configure_optimizers_multiple(self): ckpt_path = str(tmpdir / ".pl_auto_save.ckpt") checkpoint = torch.load(ckpt_path) - batches_seen = (n_epochs - stop_epoch) * n_batches + stop_batch - total_optimizer_steps = batches_seen // accumulate_grad_batches * n_optimizers + stop_optimizer + optim_progress = trainer.fit_loop.epoch_loop.batch_loop.optim_progress + scheduler_progress = trainer.fit_loop.epoch_loop.scheduler_progress - total_optimizer_zero_grad = total_optimizer_steps - current_optimizer_zero_grad = stop_optimizer - if accumulate_grad_batches == 2: - # FIXME: that's weird ! - total_optimizer_zero_grad = (9 if use_multiple_optimizers else 3) - current_optimizer_zero_grad = 0 # same there. + non_breaking_epoch_batches_completed = stop_epoch * n_batches + breaking_epoch_batches_completed = stop_batch + breaking_epoch_batches_ready = stop_batch + 1 + # lightning applies leftover accumulated gradients when the epoch ends + has_leftover_accumulation_batches = n_batches % accumulate_grad_batches != 0 - total_scheduler_steps = n_epochs - stop_epoch - current_scheduler_steps = 0 # the current epoch did not complete - if use_multiple_optimizers: - # 1 for the epoch-interval scheduler and `batches_seen` for the batch-interval scheduler - total_scheduler_steps = 1 + batches_seen // accumulate_grad_batches - current_scheduler_steps = stop_batch // accumulate_grad_batches + non_breaking_total_optimizer_steps = ( + non_breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers + + has_leftover_accumulation_batches * n_optimizers + ) + should_last_batch_step = breaking_epoch_batches_ready % accumulate_grad_batches == 0 + breaking_total_optimizer_steps = ( + breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers + + should_last_batch_step * stop_optimizer + ) + total_optimizer_steps = non_breaking_total_optimizer_steps + breaking_total_optimizer_steps + current_optimizer_steps = breaking_total_optimizer_steps + has_optimizer_step_in_breaking_epoch = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 + assert optim_progress.optimizer_steps == total_optimizer_steps + assert optim_progress.optimizer.step.current.completed == current_optimizer_steps + + non_breaking_total_zero_grad = ( + non_breaking_epoch_batches_completed // accumulate_grad_batches + has_leftover_accumulation_batches + ) * n_optimizers + # FIXME: What the hell + if accumulate_grad_batches > 1: + # FIXME: ready or completed? 0 or stop_optimizer? 
+ breaking_total_zero_grad = ( + n_optimizers + (breaking_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * + (n_optimizers - 1) + 0 + ) + # breaking_total_zero_grad = breaking_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 + else: + breaking_total_zero_grad = ( + breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers + stop_optimizer + ) + total_zero_grad = non_breaking_total_zero_grad + breaking_total_zero_grad + current_zero_grad = breaking_total_zero_grad + assert optim_progress.optimizer.zero_grad.total.completed == total_zero_grad + assert optim_progress.optimizer.zero_grad.current.completed == current_zero_grad + + non_breaking_scheduler_steps = stop_epoch + breaking_scheduler_steps = 0 # the current epoch did not complete + if n_optimizers > 1: + # assumes that the scheduler config is unchanged + # `* 1` because there is only one step-level scheduler + non_breaking_scheduler_steps = ( + stop_epoch + non_breaking_epoch_batches_completed // accumulate_grad_batches + + has_leftover_accumulation_batches * 1 + ) + # `0 +` for the epoch-level scheduler + breaking_scheduler_steps = 0 + breaking_epoch_batches_completed // accumulate_grad_batches + total_scheduler_steps = non_breaking_scheduler_steps + breaking_scheduler_steps + current_scheduler_steps = breaking_scheduler_steps + assert scheduler_progress.total.completed == total_scheduler_steps + assert scheduler_progress.current.completed == current_scheduler_steps # yapf: disable expected = { @@ -377,10 +423,10 @@ def configure_optimizers_multiple(self): "epoch_loop.state_dict": {}, "epoch_loop.batch_progress": { "total": { - "ready": batches_seen + 1, - "started": batches_seen + 1, - "processed": batches_seen, - "completed": batches_seen, + "ready": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed + 1, + "started": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed + 1, + "processed": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed, + "completed": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed, }, "current": { "ready": stop_batch + 1, @@ -409,30 +455,30 @@ def configure_optimizers_multiple(self): "optimizer": { "step": { "total": { - "ready": total_optimizer_steps + 1, + "ready": total_optimizer_steps + has_optimizer_step_in_breaking_epoch, "started": None, "processed": None, "completed": total_optimizer_steps, }, "current": { - "ready": stop_optimizer + 1, + "ready": current_optimizer_steps + has_optimizer_step_in_breaking_epoch, "started": None, "processed": None, - "completed": stop_optimizer, + "completed": current_optimizer_steps, }, }, "zero_grad": { "total": { - "ready": total_optimizer_zero_grad, - "started": total_optimizer_zero_grad, + "ready": total_zero_grad, + "started": total_zero_grad, "processed": None, - "completed": total_optimizer_zero_grad, + "completed": total_zero_grad, }, "current": { - "ready": current_optimizer_zero_grad, - "started": current_optimizer_zero_grad, + "ready": current_zero_grad, + "started": current_zero_grad, "processed": None, - "completed": current_optimizer_zero_grad, + "completed": current_zero_grad, }, }, }, @@ -451,4 +497,6 @@ def configure_optimizers_multiple(self): trainer.fit_loop.load_state_dict(checkpoint["loops"]["fit_loop"]) state_dict = trainer.fit_loop.state_dict() assert state_dict != checkpoint["loops"]["fit_loop"] - assert state_dict["epoch_progress"]["total"]["started"] == 1 + # TODO(@carmocca): do not reset for 
total + assert state_dict["epoch_progress"]["total"]["started"] == stop_epoch + assert state_dict["epoch_progress"]["current"]["started"] == stop_epoch From fc18c16adf508e9714e5bc513941a6ec82038ccb Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 19:10:28 +0200 Subject: [PATCH 093/106] Formatting --- tests/loops/test_loops.py | 99 ++++++++++----------------------------- 1 file changed, 26 insertions(+), 73 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 23e28338719e4..47a22512f2fd7 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -33,9 +33,7 @@ class CustomException(Exception): def test_loop_restore(): - class Simple(Loop): - def __init__(self, dataset: Iterator): super().__init__() self.dataset = dataset @@ -94,13 +92,11 @@ def load_state_dict(self, state_dict: Dict) -> None: def test_loop_hierarchy(): - @dataclass class SimpleProgress(BaseProgress): increment: int = 0 class Simple(Loop): - def __init__(self, a): super().__init__() self.a = a @@ -138,18 +134,10 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: state_dict = loop_parent.state_dict() assert state_dict == { - 'state_dict': { - 'a': 1 - }, - 'progress': { - 'increment': 0 - }, - 'loop_child.state_dict': { - 'a': 2 - }, - 'loop_child.progress': { - 'increment': 0 - }, + 'state_dict': {'a': 1}, + 'progress': {'increment': 0}, + 'loop_child.state_dict': {'a': 2}, + 'loop_child.progress': {'increment': 0}, } state_dict["loop_child.state_dict"]["a"] = 3 @@ -162,18 +150,10 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: # check the new state after `run` state_dict = loop_parent.state_dict() assert state_dict == { - 'state_dict': { - 'a': 1 - }, - 'progress': { - 'increment': 1 - }, - 'loop_child.state_dict': { - 'a': 3 - }, - 'loop_child.progress': { - 'increment': 1 - }, + 'state_dict': {'a': 1}, + 'progress': {'increment': 1}, + 'loop_child.state_dict': {'a': 3}, + 'loop_child.progress': {'increment': 1}, } loop_parent_copy = deepcopy(loop_parent) @@ -202,7 +182,6 @@ def test_loop_restart_progress_multiple_dataloaders(tmpdir): n_epochs = 2 class ValidationModel(BoringModel): - def __init__(self): super().__init__() @@ -237,12 +216,7 @@ def val_dataloader(self): total = (n_epochs - 1) * n_dataloaders + stop_dataloader expected = { - "total": { - "ready": total + 1, - "started": None, - "processed": None, - "completed": total - }, + "total": {"ready": total + 1, "started": None, "processed": None, "completed": total}, "current": { "ready": stop_dataloader + 1, "started": None, @@ -255,12 +229,7 @@ def val_dataloader(self): trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) total = n_dataloaders * n_batches + n_batches + stop_epoch expected = { - "total": { - "ready": total + 1, - "started": total + 1, - "processed": total, - "completed": total - }, + "total": {"ready": total + 1, "started": total + 1, "processed": total, "completed": total}, "current": { "ready": stop_batch + 1, "started": stop_batch + 1, @@ -272,18 +241,8 @@ def val_dataloader(self): trainer.fit_loop.load_state_dict(checkpoint) expected = { - "total": { - "ready": total, - "started": total, - "processed": total, - "completed": total - }, - "current": { - "ready": stop_batch, - "started": stop_batch, - "processed": stop_batch, - "completed": stop_batch - }, + "total": {"ready": total, "started": total, "processed": total, "completed": total}, + "current": {"ready": stop_batch, "started": stop_batch, "processed": stop_batch, "completed": 
stop_batch}, } assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected @@ -292,7 +251,7 @@ def val_dataloader(self): @pytest.mark.parametrize("accumulate_grad_batches", (1, 2)) # FIXME: 3 is broken @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @pytest.mark.parametrize("stop_epoch", (1, 2)) -@pytest.mark.parametrize("stop_batch", (1, )) # FIXME: 2 is broken +@pytest.mark.parametrize("stop_batch", (1,)) # FIXME: 2 is broken @pytest.mark.parametrize("stop_optimizer", (1, 2)) def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch, stop_optimizer, n_optimizers, tmpdir): stop_optimizer = stop_optimizer if stop_optimizer < n_optimizers else 0 @@ -300,7 +259,6 @@ def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch n_batches = 3 class TestModel(BoringModel): - def __init__(self): super().__init__() if n_optimizers > 1: @@ -351,37 +309,33 @@ def configure_optimizers_multiple(self): breaking_epoch_batches_ready = stop_batch + 1 # lightning applies leftover accumulated gradients when the epoch ends has_leftover_accumulation_batches = n_batches % accumulate_grad_batches != 0 + non_breaking_stepping_batches = non_breaking_epoch_batches_completed // accumulate_grad_batches + breaking_stepping_batches = breaking_epoch_batches_completed // accumulate_grad_batches non_breaking_total_optimizer_steps = ( - non_breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers - + has_leftover_accumulation_batches * n_optimizers - ) + non_breaking_stepping_batches + has_leftover_accumulation_batches + ) * n_optimizers should_last_batch_step = breaking_epoch_batches_ready % accumulate_grad_batches == 0 - breaking_total_optimizer_steps = ( - breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers - + should_last_batch_step * stop_optimizer - ) + breaking_total_optimizer_steps = breaking_stepping_batches * n_optimizers + should_last_batch_step * stop_optimizer total_optimizer_steps = non_breaking_total_optimizer_steps + breaking_total_optimizer_steps current_optimizer_steps = breaking_total_optimizer_steps has_optimizer_step_in_breaking_epoch = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 assert optim_progress.optimizer_steps == total_optimizer_steps assert optim_progress.optimizer.step.current.completed == current_optimizer_steps - non_breaking_total_zero_grad = ( - non_breaking_epoch_batches_completed // accumulate_grad_batches + has_leftover_accumulation_batches - ) * n_optimizers + non_breaking_total_zero_grad = (non_breaking_stepping_batches + has_leftover_accumulation_batches) * n_optimizers # FIXME: What the hell if accumulate_grad_batches > 1: # FIXME: ready or completed? 0 or stop_optimizer? 
breaking_total_zero_grad = ( - n_optimizers + (breaking_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * - (n_optimizers - 1) + 0 + n_optimizers + + (breaking_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) + * (n_optimizers - 1) + + 0 ) # breaking_total_zero_grad = breaking_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 else: - breaking_total_zero_grad = ( - breaking_epoch_batches_completed // accumulate_grad_batches * n_optimizers + stop_optimizer - ) + breaking_total_zero_grad = breaking_stepping_batches * n_optimizers + stop_optimizer total_zero_grad = non_breaking_total_zero_grad + breaking_total_zero_grad current_zero_grad = breaking_total_zero_grad assert optim_progress.optimizer.zero_grad.total.completed == total_zero_grad @@ -393,11 +347,10 @@ def configure_optimizers_multiple(self): # assumes that the scheduler config is unchanged # `* 1` because there is only one step-level scheduler non_breaking_scheduler_steps = ( - stop_epoch + non_breaking_epoch_batches_completed // accumulate_grad_batches - + has_leftover_accumulation_batches * 1 + stop_epoch + non_breaking_stepping_batches + has_leftover_accumulation_batches * 1 ) # `0 +` for the epoch-level scheduler - breaking_scheduler_steps = 0 + breaking_epoch_batches_completed // accumulate_grad_batches + breaking_scheduler_steps = 0 + breaking_stepping_batches total_scheduler_steps = non_breaking_scheduler_steps + breaking_scheduler_steps current_scheduler_steps = breaking_scheduler_steps assert scheduler_progress.total.completed == total_scheduler_steps From e550e6d03c8f55fbd57c7485d8d4906bca1214a0 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 19:47:08 +0200 Subject: [PATCH 094/106] Rename and clean variables --- tests/loops/test_loops.py | 173 ++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 71 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 47a22512f2fd7..49a9570649027 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -33,7 +33,9 @@ class CustomException(Exception): def test_loop_restore(): + class Simple(Loop): + def __init__(self, dataset: Iterator): super().__init__() self.dataset = dataset @@ -92,11 +94,13 @@ def load_state_dict(self, state_dict: Dict) -> None: def test_loop_hierarchy(): + @dataclass class SimpleProgress(BaseProgress): increment: int = 0 class Simple(Loop): + def __init__(self, a): super().__init__() self.a = a @@ -134,10 +138,18 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: state_dict = loop_parent.state_dict() assert state_dict == { - 'state_dict': {'a': 1}, - 'progress': {'increment': 0}, - 'loop_child.state_dict': {'a': 2}, - 'loop_child.progress': {'increment': 0}, + 'state_dict': { + 'a': 1 + }, + 'progress': { + 'increment': 0 + }, + 'loop_child.state_dict': { + 'a': 2 + }, + 'loop_child.progress': { + 'increment': 0 + }, } state_dict["loop_child.state_dict"]["a"] = 3 @@ -150,10 +162,18 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: # check the new state after `run` state_dict = loop_parent.state_dict() assert state_dict == { - 'state_dict': {'a': 1}, - 'progress': {'increment': 1}, - 'loop_child.state_dict': {'a': 3}, - 'loop_child.progress': {'increment': 1}, + 'state_dict': { + 'a': 1 + }, + 'progress': { + 'increment': 1 + }, + 'loop_child.state_dict': { + 'a': 3 + }, + 'loop_child.progress': { + 'increment': 1 + }, } loop_parent_copy = deepcopy(loop_parent) @@ -182,6 +202,7 @@ def 
test_loop_restart_progress_multiple_dataloaders(tmpdir): n_epochs = 2 class ValidationModel(BoringModel): + def __init__(self): super().__init__() @@ -216,7 +237,12 @@ def val_dataloader(self): total = (n_epochs - 1) * n_dataloaders + stop_dataloader expected = { - "total": {"ready": total + 1, "started": None, "processed": None, "completed": total}, + "total": { + "ready": total + 1, + "started": None, + "processed": None, + "completed": total + }, "current": { "ready": stop_dataloader + 1, "started": None, @@ -229,7 +255,12 @@ def val_dataloader(self): trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) total = n_dataloaders * n_batches + n_batches + stop_epoch expected = { - "total": {"ready": total + 1, "started": total + 1, "processed": total, "completed": total}, + "total": { + "ready": total + 1, + "started": total + 1, + "processed": total, + "completed": total + }, "current": { "ready": stop_batch + 1, "started": stop_batch + 1, @@ -241,8 +272,18 @@ def val_dataloader(self): trainer.fit_loop.load_state_dict(checkpoint) expected = { - "total": {"ready": total, "started": total, "processed": total, "completed": total}, - "current": {"ready": stop_batch, "started": stop_batch, "processed": stop_batch, "completed": stop_batch}, + "total": { + "ready": total, + "started": total, + "processed": total, + "completed": total + }, + "current": { + "ready": stop_batch, + "started": stop_batch, + "processed": stop_batch, + "completed": stop_batch + }, } assert trainer.fit_loop.epoch_loop.val_loop.epoch_loop.batch_progress.state_dict() == expected @@ -251,7 +292,7 @@ def val_dataloader(self): @pytest.mark.parametrize("accumulate_grad_batches", (1, 2)) # FIXME: 3 is broken @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @pytest.mark.parametrize("stop_epoch", (1, 2)) -@pytest.mark.parametrize("stop_batch", (1,)) # FIXME: 2 is broken +@pytest.mark.parametrize("stop_batch", (1, )) # FIXME: 2 is broken @pytest.mark.parametrize("stop_optimizer", (1, 2)) def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch, stop_optimizer, n_optimizers, tmpdir): stop_optimizer = stop_optimizer if stop_optimizer < n_optimizers else 0 @@ -259,6 +300,7 @@ def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch n_batches = 3 class TestModel(BoringModel): + def __init__(self): super().__init__() if n_optimizers > 1: @@ -304,57 +346,46 @@ def configure_optimizers_multiple(self): optim_progress = trainer.fit_loop.epoch_loop.batch_loop.optim_progress scheduler_progress = trainer.fit_loop.epoch_loop.scheduler_progress - non_breaking_epoch_batches_completed = stop_epoch * n_batches - breaking_epoch_batches_completed = stop_batch - breaking_epoch_batches_ready = stop_batch + 1 + # `nb_`: non-breaking, as in, no exception will be raised. 
`b_`: breaking + nb_epoch_batches_completed = stop_epoch * n_batches + b_epoch_batches_completed = stop_batch + b_epoch_batches_ready = stop_batch + 1 # lightning applies leftover accumulated gradients when the epoch ends has_leftover_accumulation_batches = n_batches % accumulate_grad_batches != 0 - non_breaking_stepping_batches = non_breaking_epoch_batches_completed // accumulate_grad_batches - breaking_stepping_batches = breaking_epoch_batches_completed // accumulate_grad_batches - - non_breaking_total_optimizer_steps = ( - non_breaking_stepping_batches + has_leftover_accumulation_batches - ) * n_optimizers - should_last_batch_step = breaking_epoch_batches_ready % accumulate_grad_batches == 0 - breaking_total_optimizer_steps = breaking_stepping_batches * n_optimizers + should_last_batch_step * stop_optimizer - total_optimizer_steps = non_breaking_total_optimizer_steps + breaking_total_optimizer_steps - current_optimizer_steps = breaking_total_optimizer_steps - has_optimizer_step_in_breaking_epoch = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 - assert optim_progress.optimizer_steps == total_optimizer_steps - assert optim_progress.optimizer.step.current.completed == current_optimizer_steps - - non_breaking_total_zero_grad = (non_breaking_stepping_batches + has_leftover_accumulation_batches) * n_optimizers + nb_stepping_batches = nb_epoch_batches_completed // accumulate_grad_batches + b_stepping_batches = b_epoch_batches_completed // accumulate_grad_batches + + nb_total_optimizer_steps = (nb_stepping_batches + has_leftover_accumulation_batches) * n_optimizers + should_last_batch_step = b_epoch_batches_ready % accumulate_grad_batches == 0 + b_total_optimizer_steps = b_stepping_batches * n_optimizers + should_last_batch_step * stop_optimizer + has_optimizer_step_in_b_epoch = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 + assert optim_progress.optimizer_steps == nb_total_optimizer_steps + b_total_optimizer_steps + assert optim_progress.optimizer.step.current.completed == b_total_optimizer_steps + + nb_total_zero_grad = (nb_stepping_batches + has_leftover_accumulation_batches) * n_optimizers # FIXME: What the hell if accumulate_grad_batches > 1: # FIXME: ready or completed? 0 or stop_optimizer? 
- breaking_total_zero_grad = ( - n_optimizers - + (breaking_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) - * (n_optimizers - 1) - + 0 + b_total_zero_grad = ( + n_optimizers + (b_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * + (n_optimizers - 1) + 0 ) - # breaking_total_zero_grad = breaking_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 + # b_total_zero_grad = b_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 else: - breaking_total_zero_grad = breaking_stepping_batches * n_optimizers + stop_optimizer - total_zero_grad = non_breaking_total_zero_grad + breaking_total_zero_grad - current_zero_grad = breaking_total_zero_grad - assert optim_progress.optimizer.zero_grad.total.completed == total_zero_grad - assert optim_progress.optimizer.zero_grad.current.completed == current_zero_grad - - non_breaking_scheduler_steps = stop_epoch - breaking_scheduler_steps = 0 # the current epoch did not complete + b_total_zero_grad = b_stepping_batches * n_optimizers + stop_optimizer + assert optim_progress.optimizer.zero_grad.total.completed == nb_total_zero_grad + b_total_zero_grad + assert optim_progress.optimizer.zero_grad.current.completed == b_total_zero_grad + + nb_scheduler_steps = stop_epoch + b_scheduler_steps = 0 # the current epoch did not complete if n_optimizers > 1: # assumes that the scheduler config is unchanged # `* 1` because there is only one step-level scheduler - non_breaking_scheduler_steps = ( - stop_epoch + non_breaking_stepping_batches + has_leftover_accumulation_batches * 1 - ) + nb_scheduler_steps = stop_epoch + nb_stepping_batches + has_leftover_accumulation_batches * 1 # `0 +` for the epoch-level scheduler - breaking_scheduler_steps = 0 + breaking_stepping_batches - total_scheduler_steps = non_breaking_scheduler_steps + breaking_scheduler_steps - current_scheduler_steps = breaking_scheduler_steps - assert scheduler_progress.total.completed == total_scheduler_steps - assert scheduler_progress.current.completed == current_scheduler_steps + b_scheduler_steps = 0 + b_stepping_batches + assert scheduler_progress.total.completed == nb_scheduler_steps + b_scheduler_steps + assert scheduler_progress.current.completed == b_scheduler_steps # yapf: disable expected = { @@ -376,10 +407,10 @@ def configure_optimizers_multiple(self): "epoch_loop.state_dict": {}, "epoch_loop.batch_progress": { "total": { - "ready": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed + 1, - "started": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed + 1, - "processed": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed, - "completed": non_breaking_epoch_batches_completed + breaking_epoch_batches_completed, + "ready": nb_epoch_batches_completed + b_epoch_batches_completed + 1, + "started": nb_epoch_batches_completed + b_epoch_batches_completed + 1, + "processed": nb_epoch_batches_completed + b_epoch_batches_completed, + "completed": nb_epoch_batches_completed + b_epoch_batches_completed, }, "current": { "ready": stop_batch + 1, @@ -390,16 +421,16 @@ def configure_optimizers_multiple(self): }, "epoch_loop.scheduler_progress": { "total": { - "ready": total_scheduler_steps, + "ready": nb_scheduler_steps + b_scheduler_steps, "started": None, "processed": None, - "completed": total_scheduler_steps, + "completed": nb_scheduler_steps + b_scheduler_steps, }, "current": { - "ready": current_scheduler_steps, + "ready": b_scheduler_steps, "started": None, 
"processed": None, - "completed": current_scheduler_steps, + "completed": b_scheduler_steps, }, }, "epoch_loop.batch_loop.state_dict": {}, @@ -408,30 +439,30 @@ def configure_optimizers_multiple(self): "optimizer": { "step": { "total": { - "ready": total_optimizer_steps + has_optimizer_step_in_breaking_epoch, + "ready": nb_total_optimizer_steps + b_total_optimizer_steps + has_optimizer_step_in_b_epoch, "started": None, "processed": None, - "completed": total_optimizer_steps, + "completed": nb_total_optimizer_steps + b_total_optimizer_steps, }, "current": { - "ready": current_optimizer_steps + has_optimizer_step_in_breaking_epoch, + "ready": b_total_optimizer_steps + has_optimizer_step_in_b_epoch, "started": None, "processed": None, - "completed": current_optimizer_steps, + "completed": b_total_optimizer_steps, }, }, "zero_grad": { "total": { - "ready": total_zero_grad, - "started": total_zero_grad, + "ready": nb_total_zero_grad + b_total_zero_grad, + "started": nb_total_zero_grad + b_total_zero_grad, "processed": None, - "completed": total_zero_grad, + "completed": nb_total_zero_grad + b_total_zero_grad, }, "current": { - "ready": current_zero_grad, - "started": current_zero_grad, + "ready": b_total_zero_grad, + "started": b_total_zero_grad, "processed": None, - "completed": current_zero_grad, + "completed": b_total_zero_grad, }, }, }, From 01a8a456969be25bec24a1b22f878ef5dba320a4 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 20:03:50 +0200 Subject: [PATCH 095/106] Shorter names --- tests/loops/test_loops.py | 93 ++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 49a9570649027..d4e096641d74f 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -332,6 +332,7 @@ def configure_optimizers_multiple(self): accumulate_grad_batches=accumulate_grad_batches, progress_bar_refresh_rate=0, logger=False, + checkpoint_callback=False, ) # simulate a failure @@ -346,46 +347,46 @@ def configure_optimizers_multiple(self): optim_progress = trainer.fit_loop.epoch_loop.batch_loop.optim_progress scheduler_progress = trainer.fit_loop.epoch_loop.scheduler_progress - # `nb_`: non-breaking, as in, no exception will be raised. `b_`: breaking - nb_epoch_batches_completed = stop_epoch * n_batches - b_epoch_batches_completed = stop_batch - b_epoch_batches_ready = stop_batch + 1 + # `nbe_`: non-breaking epoch, as in, no exception will be raised. 
`be_`: breaking epoch + nbe_batches_completed = stop_epoch * n_batches + be_batches_completed = stop_batch + be_batches_ready = stop_batch + 1 # lightning applies leftover accumulated gradients when the epoch ends has_leftover_accumulation_batches = n_batches % accumulate_grad_batches != 0 - nb_stepping_batches = nb_epoch_batches_completed // accumulate_grad_batches - b_stepping_batches = b_epoch_batches_completed // accumulate_grad_batches - - nb_total_optimizer_steps = (nb_stepping_batches + has_leftover_accumulation_batches) * n_optimizers - should_last_batch_step = b_epoch_batches_ready % accumulate_grad_batches == 0 - b_total_optimizer_steps = b_stepping_batches * n_optimizers + should_last_batch_step * stop_optimizer - has_optimizer_step_in_b_epoch = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 - assert optim_progress.optimizer_steps == nb_total_optimizer_steps + b_total_optimizer_steps - assert optim_progress.optimizer.step.current.completed == b_total_optimizer_steps - - nb_total_zero_grad = (nb_stepping_batches + has_leftover_accumulation_batches) * n_optimizers - # FIXME: What the hell + # number of batches that will call `optimizer.step()` during non-breaking and breaking epochs + nbe_stepping_batches = nbe_batches_completed // accumulate_grad_batches + be_stepping_batches = be_batches_completed // accumulate_grad_batches + + nbe_total_opt_steps = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers + is_last_batch_stepping = be_batches_ready % accumulate_grad_batches == 0 + be_total_opt_steps = be_stepping_batches * n_optimizers + is_last_batch_stepping * stop_optimizer + assert optim_progress.optimizer_steps == nbe_total_opt_steps + be_total_opt_steps + assert optim_progress.optimizer.step.current.completed == be_total_opt_steps + has_opt_stepped_in_be = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 + + nbe_total_zero_grad = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers if accumulate_grad_batches > 1: # FIXME: ready or completed? 0 or stop_optimizer? 
- b_total_zero_grad = ( - n_optimizers + (b_epoch_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * + be_total_zero_grad = ( + n_optimizers + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * (n_optimizers - 1) + 0 ) - # b_total_zero_grad = b_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 + # be_total_zero_grad = be_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 else: - b_total_zero_grad = b_stepping_batches * n_optimizers + stop_optimizer - assert optim_progress.optimizer.zero_grad.total.completed == nb_total_zero_grad + b_total_zero_grad - assert optim_progress.optimizer.zero_grad.current.completed == b_total_zero_grad + be_total_zero_grad = be_stepping_batches * n_optimizers + stop_optimizer + assert optim_progress.optimizer.zero_grad.total.completed == nbe_total_zero_grad + be_total_zero_grad + assert optim_progress.optimizer.zero_grad.current.completed == be_total_zero_grad - nb_scheduler_steps = stop_epoch - b_scheduler_steps = 0 # the current epoch did not complete + nbe_scheduler_steps = stop_epoch + be_scheduler_steps = 0 # the current epoch did not complete if n_optimizers > 1: # assumes that the scheduler config is unchanged # `* 1` because there is only one step-level scheduler - nb_scheduler_steps = stop_epoch + nb_stepping_batches + has_leftover_accumulation_batches * 1 + nbe_scheduler_steps = stop_epoch + nbe_stepping_batches + has_leftover_accumulation_batches * 1 # `0 +` for the epoch-level scheduler - b_scheduler_steps = 0 + b_stepping_batches - assert scheduler_progress.total.completed == nb_scheduler_steps + b_scheduler_steps - assert scheduler_progress.current.completed == b_scheduler_steps + be_scheduler_steps = 0 + be_stepping_batches + assert scheduler_progress.total.completed == nbe_scheduler_steps + be_scheduler_steps + assert scheduler_progress.current.completed == be_scheduler_steps # yapf: disable expected = { @@ -407,10 +408,10 @@ def configure_optimizers_multiple(self): "epoch_loop.state_dict": {}, "epoch_loop.batch_progress": { "total": { - "ready": nb_epoch_batches_completed + b_epoch_batches_completed + 1, - "started": nb_epoch_batches_completed + b_epoch_batches_completed + 1, - "processed": nb_epoch_batches_completed + b_epoch_batches_completed, - "completed": nb_epoch_batches_completed + b_epoch_batches_completed, + "ready": nbe_batches_completed + be_batches_completed + 1, + "started": nbe_batches_completed + be_batches_completed + 1, + "processed": nbe_batches_completed + be_batches_completed, + "completed": nbe_batches_completed + be_batches_completed, }, "current": { "ready": stop_batch + 1, @@ -421,16 +422,16 @@ def configure_optimizers_multiple(self): }, "epoch_loop.scheduler_progress": { "total": { - "ready": nb_scheduler_steps + b_scheduler_steps, + "ready": nbe_scheduler_steps + be_scheduler_steps, "started": None, "processed": None, - "completed": nb_scheduler_steps + b_scheduler_steps, + "completed": nbe_scheduler_steps + be_scheduler_steps, }, "current": { - "ready": b_scheduler_steps, + "ready": be_scheduler_steps, "started": None, "processed": None, - "completed": b_scheduler_steps, + "completed": be_scheduler_steps, }, }, "epoch_loop.batch_loop.state_dict": {}, @@ -439,30 +440,30 @@ def configure_optimizers_multiple(self): "optimizer": { "step": { "total": { - "ready": nb_total_optimizer_steps + b_total_optimizer_steps + has_optimizer_step_in_b_epoch, + "ready": nbe_total_opt_steps + be_total_opt_steps + has_opt_stepped_in_be, "started": None, 
"processed": None, - "completed": nb_total_optimizer_steps + b_total_optimizer_steps, + "completed": nbe_total_opt_steps + be_total_opt_steps, }, "current": { - "ready": b_total_optimizer_steps + has_optimizer_step_in_b_epoch, + "ready": be_total_opt_steps + has_opt_stepped_in_be, "started": None, "processed": None, - "completed": b_total_optimizer_steps, + "completed": be_total_opt_steps, }, }, "zero_grad": { "total": { - "ready": nb_total_zero_grad + b_total_zero_grad, - "started": nb_total_zero_grad + b_total_zero_grad, + "ready": nbe_total_zero_grad + be_total_zero_grad, + "started": nbe_total_zero_grad + be_total_zero_grad, "processed": None, - "completed": nb_total_zero_grad + b_total_zero_grad, + "completed": nbe_total_zero_grad + be_total_zero_grad, }, "current": { - "ready": b_total_zero_grad, - "started": b_total_zero_grad, + "ready": be_total_zero_grad, + "started": be_total_zero_grad, "processed": None, - "completed": b_total_zero_grad, + "completed": be_total_zero_grad, }, }, }, From 1a6c2a1d40380ad7d090866aa22c6b2a3cd62c63 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 20:07:15 +0200 Subject: [PATCH 096/106] Shorter scheduler name --- tests/loops/test_loops.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index d4e096641d74f..db46a974ce340 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -345,7 +345,7 @@ def configure_optimizers_multiple(self): checkpoint = torch.load(ckpt_path) optim_progress = trainer.fit_loop.epoch_loop.batch_loop.optim_progress - scheduler_progress = trainer.fit_loop.epoch_loop.scheduler_progress + sch_progress = trainer.fit_loop.epoch_loop.scheduler_progress # `nbe_`: non-breaking epoch, as in, no exception will be raised. `be_`: breaking epoch nbe_batches_completed = stop_epoch * n_batches @@ -368,8 +368,9 @@ def configure_optimizers_multiple(self): if accumulate_grad_batches > 1: # FIXME: ready or completed? 0 or stop_optimizer? 
be_total_zero_grad = ( - n_optimizers + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * - (n_optimizers - 1) + 0 + n_optimizers + + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * (n_optimizers - 1) + + 0 ) # be_total_zero_grad = be_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 else: @@ -377,16 +378,16 @@ def configure_optimizers_multiple(self): assert optim_progress.optimizer.zero_grad.total.completed == nbe_total_zero_grad + be_total_zero_grad assert optim_progress.optimizer.zero_grad.current.completed == be_total_zero_grad - nbe_scheduler_steps = stop_epoch - be_scheduler_steps = 0 # the current epoch did not complete + nbe_sch_steps = stop_epoch + be_sch_steps = 0 # the current epoch did not complete if n_optimizers > 1: # assumes that the scheduler config is unchanged # `* 1` because there is only one step-level scheduler - nbe_scheduler_steps = stop_epoch + nbe_stepping_batches + has_leftover_accumulation_batches * 1 + nbe_sch_steps = stop_epoch + nbe_stepping_batches + has_leftover_accumulation_batches * 1 # `0 +` for the epoch-level scheduler - be_scheduler_steps = 0 + be_stepping_batches - assert scheduler_progress.total.completed == nbe_scheduler_steps + be_scheduler_steps - assert scheduler_progress.current.completed == be_scheduler_steps + be_sch_steps = 0 + be_stepping_batches + assert sch_progress.total.completed == nbe_sch_steps + be_sch_steps + assert sch_progress.current.completed == be_sch_steps # yapf: disable expected = { @@ -422,16 +423,16 @@ def configure_optimizers_multiple(self): }, "epoch_loop.scheduler_progress": { "total": { - "ready": nbe_scheduler_steps + be_scheduler_steps, + "ready": nbe_sch_steps + be_sch_steps, "started": None, "processed": None, - "completed": nbe_scheduler_steps + be_scheduler_steps, + "completed": nbe_sch_steps + be_sch_steps, }, "current": { - "ready": be_scheduler_steps, + "ready": be_sch_steps, "started": None, "processed": None, - "completed": be_scheduler_steps, + "completed": be_sch_steps, }, }, "epoch_loop.batch_loop.state_dict": {}, From e1906b75fce7767dcc5fd2574fbfca68eb39a76a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 20:27:26 +0200 Subject: [PATCH 097/106] Fix optimizer step calculation for stop_batch=2 --- tests/loops/test_loops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index db46a974ce340..42e84f502473a 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -358,8 +358,8 @@ def configure_optimizers_multiple(self): be_stepping_batches = be_batches_completed // accumulate_grad_batches nbe_total_opt_steps = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers - is_last_batch_stepping = be_batches_ready % accumulate_grad_batches == 0 - be_total_opt_steps = be_stepping_batches * n_optimizers + is_last_batch_stepping * stop_optimizer + is_last_be_batch_stepping = be_batches_ready % accumulate_grad_batches == 0 or has_leftover_accumulation_batches + be_total_opt_steps = be_stepping_batches * n_optimizers + is_last_be_batch_stepping * stop_optimizer assert optim_progress.optimizer_steps == nbe_total_opt_steps + be_total_opt_steps assert optim_progress.optimizer.step.current.completed == be_total_opt_steps has_opt_stepped_in_be = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 From 5eaf5b3421d5aa6963eea21703d07d87fe574bd6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Jul 2021 18:35:20 +0000 Subject: [PATCH 098/106] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/loops/test_loops.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 42e84f502473a..f3beaf5332e44 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -368,9 +368,8 @@ def configure_optimizers_multiple(self): if accumulate_grad_batches > 1: # FIXME: ready or completed? 0 or stop_optimizer? be_total_zero_grad = ( - n_optimizers - + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * (n_optimizers - 1) - + 0 + n_optimizers + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * + (n_optimizers - 1) + 0 ) # be_total_zero_grad = be_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 else: From 29ce5528d572d7f005b29b936ff05a48c595b72a Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 20:37:11 +0200 Subject: [PATCH 099/106] Remove empty connects --- pytorch_lightning/loops/batch/training_batch_loop.py | 5 ----- pytorch_lightning/loops/epoch/evaluation_epoch_loop.py | 5 ----- 2 files changed, 10 deletions(-) diff --git a/pytorch_lightning/loops/batch/training_batch_loop.py b/pytorch_lightning/loops/batch/training_batch_loop.py index b7a5eceae916e..3e5a8081f9eca 100644 --- a/pytorch_lightning/loops/batch/training_batch_loop.py +++ b/pytorch_lightning/loops/batch/training_batch_loop.py @@ -23,7 +23,6 @@ from torch import Tensor from torch.optim import Optimizer -import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.loops.base import Loop from pytorch_lightning.plugins import ParallelPlugin @@ -58,10 +57,6 @@ def __init__(self) -> None: self._remaining_splits: Optional[List[Any]] = None self._skip_backward: bool = False - def connect(self, trainer: 'pl.Trainer', *args: Any, **kwargs: Any) -> None: - """Connects the loop with necessary arguments like the trainer""" - super().connect(trainer, *args, **kwargs) - @property def done(self) -> bool: """Returns if all batch splits have been processed already""" diff --git a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py index 1a0b0f9c8bd9b..bd697d8cc8653 100644 --- a/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py +++ b/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py @@ -18,7 +18,6 @@ from deprecate import void from torch import Tensor -import pytorch_lightning as pl from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.connectors.logger_connector.result import ResultCollection from pytorch_lightning.trainer.progress import Progress @@ -42,10 +41,6 @@ def __init__(self) -> None: self.outputs: List[STEP_OUTPUT] = [] self.batch_progress = Progress() - def connect(self, trainer: "pl.Trainer", *args: Any, **kwargs: Any) -> None: - """Connects the loop with necessary arguments like the trainer""" - super().connect(trainer, *args, **kwargs) - @property def done(self) -> bool: """Returns ``True`` if the current iteration count reaches the number of dataloader batches.""" From 398457896b75744b751f486aa80e5a9666b18929 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Thu, 15 Jul 2021 20:47:28 +0200 Subject: [PATCH 100/106] Update CHANGELOG --- CHANGELOG.md | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb1e20572eb01..e63657851449d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Progress tracking - * Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) + * Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140), [#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) * Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) * Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/PyTorchLightning/pytorch-lightning/pull/8244), [#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) @@ -92,6 +92,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fault-tolerant training * Added `{,load_}state_dict` to `ResultCollection` ([#7948](https://github.com/PyTorchLightning/pytorch-lightning/pull/7948)) * Added `{,load_}state_dict` to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197)) + * Set `Loop.restarting=False` at the end of the `run` in the first iteration ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) + * Save the loops state with the checkpoint (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) + * Save a checkpoint to restore the state on exception (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - Added `rank_zero_only` to `LightningModule.log` function ([#7966](https://github.com/PyTorchLightning/pytorch-lightning/pull/7966)) From 70a9bcac224cdd12bd5193e3feeee22b87f55ca5 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 16 Jul 2021 01:49:26 +0200 Subject: [PATCH 101/106] Holy shit finally got the formula right --- tests/loops/test_loops.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index f3beaf5332e44..0f8dd58cb364a 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -289,7 +289,7 @@ def val_dataloader(self): @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -@pytest.mark.parametrize("accumulate_grad_batches", (1, 2)) # FIXME: 3 is broken +@pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3)) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @pytest.mark.parametrize("stop_epoch", (1, 2)) @pytest.mark.parametrize("stop_batch", (1, )) # FIXME: 2 is broken @@ -358,22 +358,16 @@ def configure_optimizers_multiple(self): be_stepping_batches = be_batches_completed // accumulate_grad_batches nbe_total_opt_steps = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers - is_last_be_batch_stepping = be_batches_ready % accumulate_grad_batches == 0 or has_leftover_accumulation_batches - be_total_opt_steps = be_stepping_batches * n_optimizers + is_last_be_batch_stepping * stop_optimizer + does_last_be_batch_step = be_batches_ready % 
accumulate_grad_batches == 0 or has_leftover_accumulation_batches + be_total_opt_steps = be_stepping_batches * n_optimizers + does_last_be_batch_step * stop_optimizer assert optim_progress.optimizer_steps == nbe_total_opt_steps + be_total_opt_steps assert optim_progress.optimizer.step.current.completed == be_total_opt_steps has_opt_stepped_in_be = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 nbe_total_zero_grad = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers - if accumulate_grad_batches > 1: - # FIXME: ready or completed? 0 or stop_optimizer? - be_total_zero_grad = ( - n_optimizers + (be_batches_ready // accumulate_grad_batches - (accumulate_grad_batches > 1)) * - (n_optimizers - 1) + 0 - ) - # be_total_zero_grad = be_epoch_batches_ready // accumulate_grad_batches * n_optimizers + 0 - else: - be_total_zero_grad = be_stepping_batches * n_optimizers + stop_optimizer + does_last_be_batch_zero_grad = be_batches_completed % accumulate_grad_batches == 0 + # `max` because the first batch always zero-grads + be_total_zero_grad = max(1, be_stepping_batches) * n_optimizers + stop_optimizer * does_last_be_batch_zero_grad assert optim_progress.optimizer.zero_grad.total.completed == nbe_total_zero_grad + be_total_zero_grad assert optim_progress.optimizer.zero_grad.current.completed == be_total_zero_grad From ae94d7a34c96e4e51be95397cc069bc8c32378df Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 16 Jul 2021 02:22:14 +0200 Subject: [PATCH 102/106] Fix final thing!!! --- tests/loops/test_loops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 0f8dd58cb364a..2af9d941b3b34 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -292,7 +292,7 @@ def val_dataloader(self): @pytest.mark.parametrize("accumulate_grad_batches", (1, 2, 3)) @pytest.mark.parametrize("n_optimizers", (1, 3, 5)) @pytest.mark.parametrize("stop_epoch", (1, 2)) -@pytest.mark.parametrize("stop_batch", (1, )) # FIXME: 2 is broken +@pytest.mark.parametrize("stop_batch", (1, 2)) @pytest.mark.parametrize("stop_optimizer", (1, 2)) def test_loop_state_on_exception(accumulate_grad_batches, stop_epoch, stop_batch, stop_optimizer, n_optimizers, tmpdir): stop_optimizer = stop_optimizer if stop_optimizer < n_optimizers else 0 @@ -362,7 +362,7 @@ def configure_optimizers_multiple(self): be_total_opt_steps = be_stepping_batches * n_optimizers + does_last_be_batch_step * stop_optimizer assert optim_progress.optimizer_steps == nbe_total_opt_steps + be_total_opt_steps assert optim_progress.optimizer.step.current.completed == be_total_opt_steps - has_opt_stepped_in_be = accumulate_grad_batches == 1 or n_batches % accumulate_grad_batches != 0 + has_opt_stepped_in_be = stop_batch + 1 >= accumulate_grad_batches nbe_total_zero_grad = (nbe_stepping_batches + has_leftover_accumulation_batches) * n_optimizers does_last_be_batch_zero_grad = be_batches_completed % accumulate_grad_batches == 0 From 83b3dd60826707d594df7738ae62948df3426a44 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Fri, 16 Jul 2021 02:23:50 +0200 Subject: [PATCH 103/106] Do not check state dicts --- tests/loops/test_loops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 2af9d941b3b34..28edb52de055c 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -384,7 +384,7 @@ def configure_optimizers_multiple(self): # yapf: disable 
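# --- editor's sketch (illustrative, not part of the patch series) -------------
# The final breaking-epoch formulas from PATCH 101/102 above, worked through once.
# Assuming n_batches=3, accumulate_grad_batches=2, n_optimizers=3, stop_epoch=1,
# stop_batch=2, stop_optimizer=1 (the stop_batch=2 / accumulation combination the
# earlier FIXMEs did not cover):
#   be_batches_completed         = 2, be_batches_ready = 3
#   does_last_be_batch_step      = (3 % 2 == 0) or (3 % 2 != 0) -> True
#   be_stepping_batches          = 2 // 2 = 1
#   be_total_opt_steps           = 1 * 3 + True * 1 = 4
#   has_opt_stepped_in_be        = (2 + 1 >= 2) -> True
#   does_last_be_batch_zero_grad = (2 % 2 == 0) -> True
#   be_total_zero_grad           = max(1, 1) * 3 + 1 * True = 4   # the first batch always zero-grads
# -------------------------------------------------------------------------------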
expected = { - "state_dict": {}, + "state_dict": ANY, "epoch_progress": { "total": { "ready": stop_epoch + 1, @@ -399,7 +399,7 @@ def configure_optimizers_multiple(self): "completed": stop_epoch, }, }, - "epoch_loop.state_dict": {}, + "epoch_loop.state_dict": ANY, "epoch_loop.batch_progress": { "total": { "ready": nbe_batches_completed + be_batches_completed + 1, @@ -428,7 +428,7 @@ def configure_optimizers_multiple(self): "completed": be_sch_steps, }, }, - "epoch_loop.batch_loop.state_dict": {}, + "epoch_loop.batch_loop.state_dict": ANY, "epoch_loop.batch_loop.optim_progress": { "optimizer_idx": stop_optimizer, "optimizer": { From 5af97306111be7652826581eb9fd36612373bdb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Jul 2021 04:11:30 +0200 Subject: [PATCH 104/106] parametrize multiple_dataloader progress test --- tests/loops/test_loops.py | 40 +++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 28edb52de055c..5173736812e08 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -195,11 +195,12 @@ def on_load_checkpoint(self, state_dict: Dict) -> None: @mock.patch.dict(os.environ, {"PL_FAULT_TOLERANT_TRAINING": "1"}) -def test_loop_restart_progress_multiple_dataloaders(tmpdir): - stop_epoch = stop_batch = stop_dataloader = 1 - n_dataloaders = 3 - n_batches = 3 - n_epochs = 2 +@pytest.mark.parametrize("stop_epoch", (1, 2)) +@pytest.mark.parametrize("stop_batch", (1, 2)) +@pytest.mark.parametrize("n_dataloaders,stop_dataloader", [(2, 0), (2, 1), (3, 2)]) +def test_loop_restart_progress_multiple_dataloaders(tmpdir, n_dataloaders, stop_dataloader, stop_epoch, stop_batch): + n_batches = 5 + n_epochs = 3 class ValidationModel(BoringModel): @@ -222,7 +223,6 @@ def val_dataloader(self): max_epochs=n_epochs, limit_train_batches=1, limit_val_batches=n_batches, - callbacks=ModelCheckpoint(dirpath=tmpdir, save_last=True), num_sanity_val_steps=0, ) @@ -235,13 +235,13 @@ def val_dataloader(self): ckpt_path = str(tmpdir / '.pl_auto_save.ckpt') checkpoint = torch.load(ckpt_path)["loops"]["fit_loop"] - total = (n_epochs - 1) * n_dataloaders + stop_dataloader + total_dataloader = stop_epoch * n_dataloaders + stop_dataloader expected = { "total": { - "ready": total + 1, + "ready": total_dataloader + 1, "started": None, "processed": None, - "completed": total + "completed": total_dataloader }, "current": { "ready": stop_dataloader + 1, @@ -253,13 +253,17 @@ def val_dataloader(self): assert checkpoint["epoch_loop.val_loop.dataloader_progress"] == expected trainer.fit_loop.load_state_dict(checkpoint, restart_progress=False) - total = n_dataloaders * n_batches + n_batches + stop_epoch + + # `nbe_`: non-breaking epoch, as in, no exception will be raised. 
`be_`: breaking epoch + nbe_total_val_batch = stop_epoch * n_dataloaders * n_batches + be_total_val_batch = stop_dataloader * n_batches + stop_batch + total_val_batch = nbe_total_val_batch + be_total_val_batch expected = { "total": { - "ready": total + 1, - "started": total + 1, - "processed": total, - "completed": total + "ready": total_val_batch + 1, + "started": total_val_batch + 1, + "processed": total_val_batch, + "completed": total_val_batch }, "current": { "ready": stop_batch + 1, @@ -273,10 +277,10 @@ def val_dataloader(self): trainer.fit_loop.load_state_dict(checkpoint) expected = { "total": { - "ready": total, - "started": total, - "processed": total, - "completed": total + "ready": total_val_batch, + "started": total_val_batch, + "processed": total_val_batch, + "completed": total_val_batch }, "current": { "ready": stop_batch, From d1a8bc0e55503679e3dfc5c62a5d3401bfded7e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 16 Jul 2021 04:16:45 +0200 Subject: [PATCH 105/106] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e63657851449d..484c1362f9d83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,7 +92,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fault-tolerant training * Added `{,load_}state_dict` to `ResultCollection` ([#7948](https://github.com/PyTorchLightning/pytorch-lightning/pull/7948)) * Added `{,load_}state_dict` to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197)) - * Set `Loop.restarting=False` at the end of the `run` in the first iteration ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) + * Set `Loop.restarting=False` at the end of the first iteration ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) * Save the loops state with the checkpoint (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) * Save a checkpoint to restore the state on exception (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) From b510a96bd16cef6ba3487911a553da39449e57bd Mon Sep 17 00:00:00 2001 From: tchaton Date: Mon, 19 Jul 2021 09:55:16 +0200 Subject: [PATCH 106/106] resolve flake8 --- tests/loops/test_loops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/loops/test_loops.py b/tests/loops/test_loops.py index 5173736812e08..695a0c7be16a0 100644 --- a/tests/loops/test_loops.py +++ b/tests/loops/test_loops.py @@ -21,7 +21,6 @@ import pytest import torch -from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loops.base import Loop from pytorch_lightning.trainer.progress import BaseProgress from pytorch_lightning.trainer.trainer import Trainer
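Editor's note -- a minimal sketch of the validation-progress accounting that
PATCH 104 parametrizes, not part of the patch series. Taking one of the new
parameter combinations from the decorators (n_dataloaders=3, stop_dataloader=2,
n_batches=5, stop_epoch=1, stop_batch=2), the expected counters work out to:

    # dataloader progress saved in the checkpoint
    total_dataloader = stop_epoch * n_dataloaders + stop_dataloader      # 1 * 3 + 2 = 5
    # validation-batch progress saved in the checkpoint
    nbe_total_val_batch = stop_epoch * n_dataloaders * n_batches         # 1 * 3 * 5 = 15
    be_total_val_batch = stop_dataloader * n_batches + stop_batch        # 2 * 5 + 2 = 12
    total_val_batch = nbe_total_val_batch + be_total_val_batch           # 27

so, before restarting, the loaded state reports ready=28 / completed=27 for the
total batch counters (the interrupted batch was ready but never completed), and
after `fit_loop.load_state_dict(checkpoint)` the restarted loop settles at
ready=completed=27 with the current counters back at stop_batch=2.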