From d5472c0d33d37ce908f252f470f17216bb885ecc Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 15 Apr 2020 23:44:54 -0400
Subject: [PATCH 001/136] add state_dict for early stopping

---
 pytorch_lightning/callbacks/early_stopping.py | 22 ++++++++++++++-----
 pytorch_lightning/trainer/training_io.py      | 11 ++++------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 61abfb879a666..1b44d1b3b6d43 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -5,6 +5,7 @@
 Monitor a validation metric and stop training when it stops improving.
 
 """
+from copy import deepcopy
 
 import numpy as np
 import torch
@@ -61,6 +62,7 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.wait = 0
         self.stopped_epoch = 0
         self.mode = mode
+        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
         if mode not in self.mode_dict:
             if self.verbose > 0:
@@ -103,11 +105,20 @@ def _validate_condition_metric(self, logs):
     def monitor_op(self):
         return self.mode_dict[self.mode]
 
-    def on_train_start(self, trainer, pl_module):
-        # Allow instances to be re-used
-        self.wait = 0
-        self.stopped_epoch = 0
-        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
+    def state_dict(self):
+        return {
+            'wait': self.wait,
+            'stopped_epoch': self.stopped_epoch,
+            'best': self.best,
+            'patience': self.patience
+        }
+
+    def load_state_dict(self, state_dict):
+        state_dict = deepcopy(state_dict)
+        self.wait = state_dict['wait']
+        self.stopped_epoch = state_dict['stopped_epoch']
+        self.best = state_dict['best']
+        self.patience = state_dict['patience']
 
     def on_validation_end(self, trainer, pl_module):
         self._run_early_stopping_check(trainer, pl_module)
@@ -130,7 +141,6 @@ def _run_early_stopping_check(self, trainer, pl_module):
             if self.wait >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
-                self.on_train_end(trainer, pl_module)
 
         return stop_training
 
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index fd0385cde4b34..398bda20b8f38 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -334,21 +334,18 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
                 checkpoint['checkpoint_callback_best_model_path'] = self.checkpoint_callback.best_model_path
 
             if self.early_stop_callback:
-                checkpoint['early_stop_callback_wait'] = self.early_stop_callback.wait
-                checkpoint['early_stop_callback_patience'] = self.early_stop_callback.patience
+                checkpoint['early_stop_callback_state_dict'] = self.early_stop_callback.state_dict()
 
             # save optimizers
             optimizer_states = []
             for i, optimizer in enumerate(self.optimizers):
                 optimizer_states.append(optimizer.state_dict())
-
             checkpoint['optimizer_states'] = optimizer_states
 
             # save lr schedulers
             lr_schedulers = []
             for scheduler in self.lr_schedulers:
                 lr_schedulers.append(scheduler['scheduler'].state_dict())
-
             checkpoint['lr_schedulers'] = lr_schedulers
 
             # save native amp scaling
@@ -414,9 +411,9 @@ def restore_training_state(self, checkpoint):
                 self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best']
             self.checkpoint_callback.best_model_path = checkpoint['checkpoint_callback_best_model_path']
 
-        if self.early_stop_callback:
-            self.early_stop_callback.wait = checkpoint['early_stop_callback_wait']
-            self.early_stop_callback.patience = checkpoint['early_stop_callback_patience']
+        if self.early_stop_callback is not None and self.early_stop_callback is not False:
+            state = checkpoint['early_stop_callback_state_dict']
+            self.early_stop_callback.load_state_dict(state)
 
         self.global_step = checkpoint['global_step']
         self.current_epoch = checkpoint['epoch']

From 339506a6fbaef71b396e44125f930fe362c5543d Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 15 Apr 2020 23:57:59 -0400
Subject: [PATCH 002/136] move best attr after monitor_op defined

---
 pytorch_lightning/callbacks/early_stopping.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 1b44d1b3b6d43..b6d99c9043931 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -62,7 +62,6 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.wait = 0
         self.stopped_epoch = 0
         self.mode = mode
-        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
         if mode not in self.mode_dict:
             if self.verbose > 0:
@@ -77,7 +76,9 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+        self.monitor_op = self.mode_dict[mode]
+        self.min_delta *= 1 if self.monitor_op == np.greater else -1
+        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
     def _validate_condition_metric(self, logs):
         """

From a5a119801966a59033c2bac41564d780fce48afa Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 12:21:58 -0400
Subject: [PATCH 003/136] improve early stopping and model checkpoint callbacks

---
 docs/source/callbacks.rst                     | 12 ++++
 pytorch_lightning/callbacks/early_stopping.py | 48 +++++++++++++---
 pytorch_lightning/trainer/callback_config.py  | 56 +++++++++++--------
 pytorch_lightning/trainer/trainer.py          | 30 +++++++---
 pytorch_lightning/trainer/training_loop.py    | 43 +++++++-------
 5 files changed, 130 insertions(+), 59 deletions(-)

diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
index 744c1f0c5edd6..3a6b1e84acdd6 100644
--- a/docs/source/callbacks.rst
+++ b/docs/source/callbacks.rst
@@ -46,6 +46,18 @@ Example:
 We successfully extended functionality without polluting our super clean
 :class:`~pytorch_lightning.core.LightningModule` research code.
 
+
+Best Practices
+==============
+
+1. Callbacks should be isolated in their functionality. Your callback should not rely on 
+ the presence of other callbacks in order to work properly.
+2. Do not manually call methods from the callback. The callbacks are designed to be
+ invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
+ is strongly discouraged.
+3. Whenever possible, your callbacks should not depend on the order in which they are executed.
+
+
 ---------
 
 .. automodule:: pytorch_lightning.callbacks.base
diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index b6d99c9043931..77c3cc21b78d4 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,8 +26,10 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of validation epochs with no improvement
-            after which training will be stopped. Default: ``0``.
+        patience: number of passes through the validation set
+            with no improvement after which training will be stopped. 
+            This will usually correspond with epochs but may vary depending
+            on how often you have configured to check validation. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
             training will stop when the quantity
@@ -76,15 +78,29 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.monitor_op = self.mode_dict[mode]
         self.min_delta *= 1 if self.monitor_op == np.greater else -1
         self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
+    def state_dict(self):
+        return {
+            'wait': self.wait,
+            'stopped_epoch': self.stopped_epoch,
+            'best': self.best,
+            'patience': self.patience
+        }
+
+    def load_state_dict(self, state_dict):
+        state_dict = deepcopy(state_dict)
+        self.wait = state_dict['wait']
+        self.stopped_epoch = state_dict['stopped_epoch']
+        self.best = state_dict['best']
+        self.patience = state_dict['patience']
+
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
-        :param logs:
-        :return:
+        :param logs: callback metrics from validation output
+        :return: True if specified metric is available
         """
         monitor_val = logs.get(self.monitor)
         error_msg = (f'Early stopping conditioned on metric `{self.monitor}`'
@@ -121,14 +137,29 @@ def load_state_dict(self, state_dict):
         self.best = state_dict['best']
         self.patience = state_dict['patience']
 
+    def on_train_start(self, trainer, pl_module):
+        if not (
+            trainer.is_overriden("validation_step") and
+            trainer.is_overriden("validation_epoch_end")
+        ):
+            error_msg = (f'''
+                Early stopping is expecting metrics to be returned from
+                validation but the Lightning model does not have a validation loop
+                defined with logging. Please ensure that your LightningModule has
+                both `validation_step` and `validation_epoch_end` defined.
+            ''')
+            if self.strict:
+                raise RuntimeError(error_msg)
+            if self.verbose > 0:
+                rank_zero_warn(error_msg, RuntimeWarning)
+
     def on_validation_end(self, trainer, pl_module):
         self._run_early_stopping_check(trainer, pl_module)
 
     def _run_early_stopping_check(self, trainer, pl_module):
         logs = trainer.callback_metrics
-        stop_training = False
         if not self._validate_condition_metric(logs):
-            return stop_training
+            return  # short circuit if metric not present
 
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
@@ -143,7 +174,8 @@ def _run_early_stopping_check(self, trainer, pl_module):
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
 
-        return stop_training
+        if stop_training:
+            trainer.should_stop = True
 
     def on_train_end(self, trainer, pl_module):
         if self.stopped_epoch > 0 and self.verbose > 0:
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index cd94e7190e452..25a2e2ae7b79f 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -28,24 +28,28 @@ def slurm_job_id(self) -> int:
     def save_checkpoint(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
-    def configure_checkpoint_callback(self):
+    def configure_checkpoint_callback(self,
+                                      checkpoint_callback,
+                                      default_root_dir,
+                                      logger,
+                                      weights_save_path):
         """
         Weight path set in this priority:
         Checkpoint_callback's path (if passed in).
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = self.default_root_dir
-        if self.checkpoint_callback:
+        ckpt_path = default_root_dir
+        if checkpoint_callback:
             # init a default one
-            if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None) or
-                            getattr(self.logger, '_save_dir', None) or
-                            self.default_root_dir)
+            if logger is not None:
+                save_dir = (getattr(logger, 'save_dir', None) or
+                            getattr(logger, '_save_dir', None) or
+                            default_root_dir)
 
                 # weights_save_path overrides anything
-                if self.weights_save_path is not None:
-                    save_dir = self.weights_save_path
+                if weights_save_path is not None:
+                    save_dir = weights_save_path
 
                 version = self.logger.version if isinstance(
                     self.logger.version, str) else f'version_{self.logger.version}'
@@ -56,32 +60,32 @@ def configure_checkpoint_callback(self):
                     "checkpoints"
                 )
             else:
-                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
+                ckpt_path = os.path.join(default_root_dir, "checkpoints")
 
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
             monitor_key = 'loss' if train_step_only else 'val_loss'
 
-            if self.checkpoint_callback is True:
+            if checkpoint_callback is True:
                 os.makedirs(ckpt_path, exist_ok=True)
-                self.checkpoint_callback = ModelCheckpoint(
+                checkpoint_callback = ModelCheckpoint(
                     filepath=ckpt_path,
                     monitor=monitor_key
                 )
             # If user specified None in filepath, override with runtime default
-            elif isinstance(self.checkpoint_callback, ModelCheckpoint) \
-                    and self.checkpoint_callback.dirpath is None:
-                self.checkpoint_callback.dirpath = ckpt_path
-                self.checkpoint_callback.filename = '{epoch}'
-                os.makedirs(self.checkpoint_callback.dirpath, exist_ok=True)
-        elif self.checkpoint_callback is False:
-            self.checkpoint_callback = None
+            elif isinstance(checkpoint_callback, ModelCheckpoint) \
+                    and checkpoint_callback.dirpath is None:
+                checkpoint_callback.dirpath = ckpt_path
+                checkpoint_callback.filename = '{epoch}'
+                os.makedirs(checkpoint_callback.dirpath, exist_ok=True)
+        elif checkpoint_callback is False:
+            checkpoint_callback = None
 
         self.ckpt_path = ckpt_path
 
-        if self.checkpoint_callback:
+        if checkpoint_callback:
             # set the path for the callbacks
-            self.checkpoint_callback.save_function = self.save_checkpoint
+            checkpoint_callback.save_function = self.save_checkpoint
 
             # if checkpoint callback used, then override the weights path
             self.weights_save_path = self.checkpoint_callback.dirpath
@@ -90,22 +94,26 @@ def configure_checkpoint_callback(self):
         if self.weights_save_path is None:
             self.weights_save_path = self.default_root_dir
 
+        return checkpoint_callback
+
     def configure_early_stopping(self, early_stop_callback):
         if early_stop_callback is True or None:
-            self.early_stop_callback = EarlyStopping(
+            early_stop_callback = EarlyStopping(
                 monitor='val_loss',
                 patience=3,
                 strict=True,
                 verbose=True,
                 mode='min'
             )
+            # TODO remove this attribute
             self.enable_early_stop = True
         elif not early_stop_callback:
-            self.early_stop_callback = None
+            early_stop_callback = None
             self.enable_early_stop = False
         else:
-            self.early_stop_callback = early_stop_callback
+            early_stop_callback = early_stop_callback
             self.enable_early_stop = True
+        return early_stop_callback
 
     def configure_progress_bar(self, refresh_rate=1, process_position=0):
         progress_bars = [c for c in self.callbacks if isinstance(c, ProgressBarBase)]
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 6239e66cd541f..be6560be16845 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -320,6 +320,27 @@ def __init__(
         self.callbacks = callbacks or []
         self.on_init_start()
 
+        # configure early stop callback
+        # creates a default one if none passed in
+        early_stop_callback = self.configure_early_stopping(early_stop_callback)
+        if early_stop_callback:
+            self.callbacks.append(early_stop_callback)
+
+        # configure checkpoint callback
+        # it is important that this is the last callback to run
+        # pass through the required args to figure out defaults
+        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
+                                                                 default_root_dir,
+                                                                 logger,
+                                                                 weights_save_path)
+        if checkpoint_callback:
+            self.callbacks.append(checkpoint_callback)
+
+        # TODO clean this up and follow same pattern as early_stop_callback
+        # configure checkpoint callback
+        self.checkpoint_callback = checkpoint_callback
+        self.weights_save_path = weights_save_path
+
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -447,6 +468,7 @@ def __init__(
         self.global_step = 0
         self.current_epoch = 0
         self.interrupted = False
+        self.should_stop = True
 
         # configure logger
         self.configure_logger(logger)
@@ -456,14 +478,6 @@ def __init__(
             profiler = SimpleProfiler()
         self.profiler = profiler or PassThroughProfiler()
 
-        # configure early stop callback
-        # creates a default one if none passed in
-        self.configure_early_stopping(early_stop_callback)
-
-        # configure checkpoint callback
-        self.checkpoint_callback = checkpoint_callback
-        self.weights_save_path = weights_save_path
-
         # accumulated grads
         self.accumulate_grad_batches = accumulate_grad_batches
         self.configure_accumulated_gradients(accumulate_grad_batches)
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index ff3ed0e4fec6a..fed01e3a28346 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -372,15 +372,15 @@ def _signal_kill_handler(*args):
                 met_min_epochs = epoch >= self.min_epochs - 1
                 met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
 
-                # TODO wrap this logic into the callback
-                if self.enable_early_stop:
-                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
-                        should_stop = self.early_stop_callback.on_validation_end(self, self.get_model())
-                        # stop training
-                        stop = should_stop and met_min_epochs
-                        if stop:
-                            self.run_training_teardown()
-                            return
+                if self.should_stop:
+                    # Question: didn't understand the check about self.fast_dev_run
+                    if met_min_epochs and met_min_steps:
+                        self.run_training_teardown()
+                        return
+                    else:
+                        log.info(f'''Trainer was signaled to stop but required minimum epochs
+                                 ({self.min_epochs}) or minimum steps ({self.min_steps}) has
+                                 not been met. Training will continue...''')
 
             self.run_training_teardown()
 
@@ -435,8 +435,7 @@ def run_training_epoch(self):
             # ---------------
             # RUN TRAIN STEP
             # ---------------
-            _outputs = self.run_training_batch(batch, batch_idx)
-            batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs
+            batch_result, grad_norm_dic, batch_step_metrics, batch_output = self.run_training_batch(batch, batch_idx)
 
             # only track outputs when user implements training_epoch_end
             # otherwise we will build up unnecessary memory
@@ -444,7 +443,8 @@ def run_training_epoch(self):
                 outputs.append(batch_output)
 
             # when returning -1 from train_step, we end epoch early
-            early_stop_epoch = batch_result == -1
+            if batch_result == -1:
+                self.should_stop = True
 
             # TODO: consolidate all actions that need to take place only after
             # self.accumulate_grad_batches steps (optimizer step, lr update, global step increment)
@@ -458,26 +458,27 @@ def run_training_epoch(self):
             is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
             can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
             can_check_val = not self.disable_validation and can_check_epoch
-            should_check_val = is_val_check_batch or early_stop_epoch
+            should_check_val = is_val_check_batch or self.should_stop
             should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
             should_check_val = can_check_val and should_check_val
 
-            # ---------------
-            # CHECKPOINTING, EARLY STOPPING
-            # ---------------
             # fast_dev_run always forces val checking after train batch
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=self.testing)
                 self.call_checkpoint_callback()
 
+            # ---------------
+            # CHECKPOINTING, EARLY STOPPING
+            # ---------------
+
             # when logs should be saved
-            should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or early_stop_epoch
+            should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or self.should_stop
             if should_save_log or self.fast_dev_run:
                 if self.proc_rank == 0 and self.logger is not None:
                     self.logger.save()
 
             # when metrics should be logged
-            should_log_metrics = batch_idx % self.row_log_interval == 0 or early_stop_epoch
+            should_log_metrics = batch_idx % self.row_log_interval == 0 or self.should_stop
             if should_log_metrics or self.fast_dev_run:
                 # logs user requested information to logger
                 self.log_metrics(batch_step_metrics, grad_norm_dic)
@@ -494,7 +495,7 @@ def run_training_epoch(self):
             # end epoch early
             # stop when the flag is changed or we've gone past the amount
             # requested in the batches
-            if early_stop_epoch or self.fast_dev_run:
+            if self.fast_dev_run or self.should_stop:
                 break
 
         if self.use_horovod:
@@ -811,6 +812,10 @@ def call_checkpoint_callback(self):
         if self.checkpoint_callback is not None:
             self.checkpoint_callback.on_validation_end(self, self.get_model())
 
+    def call_early_stop_callback(self):
+        if self.early_stop_callback:
+            self.early_stop_callback.on_epoch_end(self, self.get_model())
+
 
 def _with_is_last(iterable):
     """Pass through values from the given iterable with an added boolean indicating if this is the last item.

From a07be00724d0a720cfe79329cf3e86414ec55caf Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 12:25:30 -0400
Subject: [PATCH 004/136] fix formatting

---
 pytorch_lightning/callbacks/early_stopping.py | 6 +++---
 pytorch_lightning/trainer/callback_config.py  | 6 +++---
 pytorch_lightning/trainer/training_loop.py    | 5 ++++-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 77c3cc21b78d4..0d64f52092de7 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -27,7 +27,7 @@ class EarlyStopping(Callback):
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
         patience: number of passes through the validation set
-            with no improvement after which training will be stopped. 
+            with no improvement after which training will be stopped.
             This will usually correspond with epochs but may vary depending
             on how often you have configured to check validation. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
@@ -139,8 +139,8 @@ def load_state_dict(self, state_dict):
 
     def on_train_start(self, trainer, pl_module):
         if not (
-            trainer.is_overriden("validation_step") and
-            trainer.is_overriden("validation_epoch_end")
+            trainer.is_overriden("validation_step") 
+            and trainer.is_overriden("validation_epoch_end")
         ):
             error_msg = (f'''
                 Early stopping is expecting metrics to be returned from
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 25a2e2ae7b79f..159fe58250b8a 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -43,9 +43,9 @@ def configure_checkpoint_callback(self,
         if checkpoint_callback:
             # init a default one
             if logger is not None:
-                save_dir = (getattr(logger, 'save_dir', None) or
-                            getattr(logger, '_save_dir', None) or
-                            default_root_dir)
+                save_dir = (getattr(logger, 'save_dir', None)
+                            or getattr(logger, '_save_dir', None)
+                            or default_root_dir)
 
                 # weights_save_path overrides anything
                 if weights_save_path is not None:
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index fed01e3a28346..ca3addb1f847a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -435,7 +435,10 @@ def run_training_epoch(self):
             # ---------------
             # RUN TRAIN STEP
             # ---------------
-            batch_result, grad_norm_dic, batch_step_metrics, batch_output = self.run_training_batch(batch, batch_idx)
+            (batch_result,
+             grad_norm_dic,
+             batch_step_metrics,
+             batch_output) = self.run_training_batch(batch, batch_idx)
 
             # only track outputs when user implements training_epoch_end
             # otherwise we will build up unnecessary memory

From 47fdd1654ca9d87753a9042cd45f078f9a8db5c2 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 13:20:44 -0400
Subject: [PATCH 005/136] fix attr init order

---
 pytorch_lightning/trainer/trainer.py | 38 +++++++++++++---------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index be6560be16845..87b716c3a29a8 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -316,6 +316,19 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
+        # configure logger
+        self.configure_logger(logger)
+
+        # set default save path if user didn't provide one
+        self.default_root_dir = default_root_dir
+
+        # Backward compatibility, TODO: remove in v0.8.0
+        if default_save_path is not None:
+            self.default_root_dir = default_save_path
+
+        if self.default_root_dir is None:
+            self.default_root_dir = os.getcwd()
+
         # Init callbacks
         self.callbacks = callbacks or []
         self.on_init_start()
@@ -329,18 +342,14 @@ def __init__(
         # configure checkpoint callback
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
+        self.weights_save_path = weights_save_path
         checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
-                                                                 default_root_dir,
-                                                                 logger,
-                                                                 weights_save_path)
+                                                                 self.default_root_dir,
+                                                                 self.logger,
+                                                                 self.weights_save_path)
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
-        # TODO clean this up and follow same pattern as early_stop_callback
-        # configure checkpoint callback
-        self.checkpoint_callback = checkpoint_callback
-        self.weights_save_path = weights_save_path
-
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -435,16 +444,6 @@ def __init__(
             log.info('Running in fast_dev_run mode: will run a full train,'
                      ' val and test loop using a single batch')
 
-        # set default save path if user didn't provide one
-        self.default_root_dir = default_root_dir
-
-        # Backward compatibility, TODO: remove in v0.8.0
-        if default_save_path is not None:
-            self.default_root_dir = default_save_path
-
-        if self.default_root_dir is None:
-            self.default_root_dir = os.getcwd()
-
         # training bookeeping
         self.total_batch_idx = 0
         self.running_loss = TensorRunningAccum(window_length=20)
@@ -470,9 +469,6 @@ def __init__(
         self.interrupted = False
         self.should_stop = True
 
-        # configure logger
-        self.configure_logger(logger)
-
         # configure profiler
         if profiler is True:
             profiler = SimpleProfiler()

From 535314bfcc2705dd15ab02747d4d55a5efb84f20 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 14:37:07 -0400
Subject: [PATCH 006/136] clean up setting of default_root_dir attr

---
 pytorch_lightning/trainer/trainer.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 87b716c3a29a8..3451cbe528a99 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -320,15 +320,11 @@ def __init__(
         self.configure_logger(logger)
 
         # set default save path if user didn't provide one
+        if default_root_dir is None:
+            # Backward compatibility, TODO: remove default_save_path in v0.8.0
+            default_root_dir = default_save_path or os.getcwd()
         self.default_root_dir = default_root_dir
 
-        # Backward compatibility, TODO: remove in v0.8.0
-        if default_save_path is not None:
-            self.default_root_dir = default_save_path
-
-        if self.default_root_dir is None:
-            self.default_root_dir = os.getcwd()
-
         # Init callbacks
         self.callbacks = callbacks or []
         self.on_init_start()

From ed980ea961d91904910ed105c156be71822802f3 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 15:11:02 -0400
Subject: [PATCH 007/136] logger needs default root dir set first

---
 pytorch_lightning/trainer/trainer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 3451cbe528a99..978b757bec11b 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -316,15 +316,15 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
-        # configure logger
-        self.configure_logger(logger)
-
         # set default save path if user didn't provide one
         if default_root_dir is None:
             # Backward compatibility, TODO: remove default_save_path in v0.8.0
             default_root_dir = default_save_path or os.getcwd()
         self.default_root_dir = default_root_dir
 
+        # configure logger
+        self.configure_logger(logger)
+
         # Init callbacks
         self.callbacks = callbacks or []
         self.on_init_start()

From 98e240bf98d394d340fd49e9ad7415a390c50108 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 15:20:42 -0400
Subject: [PATCH 008/136] reorg trainer init

---
 pytorch_lightning/trainer/trainer.py | 55 ++++++++++++++--------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 978b757bec11b..60ad04f5c898a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -316,6 +316,31 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
+        # training bookeeping
+        self.total_batch_idx = 0
+        self.running_loss = TensorRunningAccum(window_length=20)
+        self.batch_idx = 0
+        self.progress_bar_metrics = {}
+        self.callback_metrics = {}
+        self.num_val_batches = 0
+        self.num_training_batches = 0
+        self.num_test_batches = 0
+        self.train_dataloader = None
+        self.test_dataloaders = None
+        self.val_dataloaders = None
+
+        # training state
+        self.model = None
+        self.testing = False
+        self.disable_validation = False
+        self.lr_schedulers = []
+        self.optimizers = None
+        self.optimizer_frequencies = []
+        self.global_step = 0
+        self.current_epoch = 0
+        self.interrupted = False
+        self.should_stop = True
+
         # set default save path if user didn't provide one
         if default_root_dir is None:
             # Backward compatibility, TODO: remove default_save_path in v0.8.0
@@ -325,9 +350,8 @@ def __init__(
         # configure logger
         self.configure_logger(logger)
 
-        # Init callbacks
+        # initialize callbacks
         self.callbacks = callbacks or []
-        self.on_init_start()
 
         # configure early stop callback
         # creates a default one if none passed in
@@ -346,6 +370,8 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
+        self.on_init_start()
+
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -440,31 +466,6 @@ def __init__(
             log.info('Running in fast_dev_run mode: will run a full train,'
                      ' val and test loop using a single batch')
 
-        # training bookeeping
-        self.total_batch_idx = 0
-        self.running_loss = TensorRunningAccum(window_length=20)
-        self.batch_idx = 0
-        self.progress_bar_metrics = {}
-        self.callback_metrics = {}
-        self.num_val_batches = 0
-        self.num_training_batches = 0
-        self.num_test_batches = 0
-        self.train_dataloader = None
-        self.test_dataloaders = None
-        self.val_dataloaders = None
-
-        # training state
-        self.model = None
-        self.testing = False
-        self.disable_validation = False
-        self.lr_schedulers = []
-        self.optimizers = None
-        self.optimizer_frequencies = []
-        self.global_step = 0
-        self.current_epoch = 0
-        self.interrupted = False
-        self.should_stop = True
-
         # configure profiler
         if profiler is True:
             profiler = SimpleProfiler()

From a316e027f3d59480800d1c33824d2af6feab7a86 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 20:52:33 -0400
Subject: [PATCH 009/136] remove direct references to checkpoint callback

---
 .../trainer/distrib_data_parallel.py          |  1 -
 pytorch_lightning/trainer/lr_finder.py        |  6 ++++
 pytorch_lightning/trainer/trainer.py          |  6 ++--
 pytorch_lightning/trainer/training_io.py      | 33 +++++++++++++------
 pytorch_lightning/trainer/training_loop.py    |  1 -
 5 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index 21bce3436e28f..8690c080ab0b9 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -147,7 +147,6 @@ class TrainerDDPMixin(ABC):
     on_gpu: bool
     num_gpu_nodes: int
     logger: Union[LightningLoggerBase, bool]
-    checkpoint_callback: Union[ModelCheckpoint, bool]
     data_parallel_device_ids: ...
     distributed_backend: str
     amp_level: str
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index f3679dddb9f4b..6c254ca977923 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -150,6 +150,9 @@ def lr_find(self,
         self.early_stop_callback = None
         self.enable_early_stop = False
 
+        # Accumulation of gradients
+        self.accumulate_grad_batches = num_accumulation_steps
+
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
         self.model = model
@@ -201,6 +204,9 @@ def __lr_finder_dump_params(self, model):
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
             'enable_early_stop': self.enable_early_stop,
+            'progress_bar_refresh_rate': self.progress_bar_refresh_rate,
+            'accumulate_grad_batches': self.accumulate_grad_batches,
+            'progress_bar_callback': self.progress_bar_callback,
             'configure_optimizers': model.configure_optimizers,
         }
 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 60ad04f5c898a..a4c7cb387bb7a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -370,6 +370,9 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
+        # TODO refactor codebase (tests) to not directly reach into this callback
+        self.checkpoint_callback = checkpoint_callback
+
         self.on_init_start()
 
         # benchmarking
@@ -1013,9 +1016,6 @@ def run_pretrain_routine(self, model: LightningModule):
         # if cluster resets state, the model will update with the saved weights
         self.model = model
 
-        # set up checkpoint callback
-        self.configure_checkpoint_callback()
-
         # restore training and model before hpc call
         self.restore_weights(model)
 
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 398bda20b8f38..7a498a1cef5fa 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -96,6 +96,7 @@
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.core.lightning import LightningModule, CHECKPOINT_KEY_MODULE_ARGS
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.overrides.data_parallel import (
     LightningDistributedDataParallel,
@@ -137,7 +138,6 @@ class TrainerIOMixin(ABC):
     use_ddp: bool
     use_ddp2: bool
     use_horovod: bool
-    checkpoint_callback: ...
     proc_rank: int
     weights_save_path: str
     logger: Union[LightningLoggerBase, bool]
@@ -329,12 +329,21 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         }
 
         if not weights_only:
-            if self.checkpoint_callback:
+
+            # TODO support more generic way for callbacks to persist a state_dict in a checkpoint
+            checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+            early_stopping_callbacks = [c for c in self.callbacks if isinstance(c, EarlyStopping)]
+
+            if checkpoint_callbacks:
+                # we add the official checkpoint callback to the end of the list
+                # extra user provided callbacks will not be persisted yet
                 checkpoint['checkpoint_callback_best_model_score'] = self.checkpoint_callback.best_model_score
                 checkpoint['checkpoint_callback_best_model_path'] = self.checkpoint_callback.best_model_path
 
-            if self.early_stop_callback:
-                checkpoint['early_stop_callback_state_dict'] = self.early_stop_callback.state_dict()
+            if early_stopping_callbacks and checkpoint_callbacks:
+                # we add the official early stopping callback to the end of the list
+                # extra user provided callbacks will not be persisted yet
+                checkpoint['early_stop_callback_state_dict'] = early_stopping_callbacks[-1].state_dict()
 
             # save optimizers
             optimizer_states = []
@@ -399,21 +408,25 @@ def restore_training_state(self, checkpoint):
                 ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
             )
 
-        if self.checkpoint_callback:
+        # TODO support more generic way for callbacks to load callback state_dicts
+        checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+        early_stopping_callbacks = [c for c in self.callbacks if isinstance(c, EarlyStopping)]
+
+        if checkpoint_callbacks:
             if 'checkpoint_callback_best_model_score' in checkpoint:
-                self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best_model_score']
+                checkpoint_callbacks[-1].best_model_score = checkpoint['checkpoint_callback_best_model_score']
             else:
                 # Old naming until version 0.7.6
                 rank_zero_warn(
                     'Loading a checkpoint created with an old version of Lightning; '
                     'this will not be supported in the future.'
                 )
-                self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best']
-            self.checkpoint_callback.best_model_path = checkpoint['checkpoint_callback_best_model_path']
+                checkpoint_callbacks[-1].best_model_score = checkpoint['checkpoint_callback_best']
+            checkpoint_callbacks[-1].best_model_path = checkpoint['checkpoint_callback_best_model_path']
 
-        if self.early_stop_callback is not None and self.early_stop_callback is not False:
+        if early_stopping_callbacks:
             state = checkpoint['early_stop_callback_state_dict']
-            self.early_stop_callback.load_state_dict(state)
+            early_stopping_callbacks[-1].load_state_dict(state)
 
         self.global_step = checkpoint['global_step']
         self.current_epoch = checkpoint['epoch']
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index ca3addb1f847a..8d8230fc80f00 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -233,7 +233,6 @@ class TrainerTrainLoopMixin(ABC):
     max_steps: int
     min_steps: int
     total_batch_idx: int
-    checkpoint_callback: ...
     terminate_on_nan: bool
     tpu_id: int
 

From 048a5f3202d30bd3410af37ba146b2a5443d3353 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 22:57:04 -0400
Subject: [PATCH 010/136] more fixes

---
 pytorch_lightning/callbacks/early_stopping.py | 21 +++++--------------
 pytorch_lightning/trainer/callback_config.py  |  2 +-
 pytorch_lightning/trainer/trainer.py          |  4 ----
 pytorch_lightning/trainer/training_loop.py    |  3 ---
 4 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 0d64f52092de7..2e1da9023bc2e 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -81,21 +81,6 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.min_delta *= 1 if self.monitor_op == np.greater else -1
         self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
-    def state_dict(self):
-        return {
-            'wait': self.wait,
-            'stopped_epoch': self.stopped_epoch,
-            'best': self.best,
-            'patience': self.patience
-        }
-
-    def load_state_dict(self, state_dict):
-        state_dict = deepcopy(state_dict)
-        self.wait = state_dict['wait']
-        self.stopped_epoch = state_dict['stopped_epoch']
-        self.best = state_dict['best']
-        self.patience = state_dict['patience']
-
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
@@ -137,9 +122,13 @@ def load_state_dict(self, state_dict):
         self.best = state_dict['best']
         self.patience = state_dict['patience']
 
+    def on_sanity_check_end(self, trainer, pl_module):
+        logs = trainer.callback_metrics
+        self._validate_condition_metric(logs)
+
     def on_train_start(self, trainer, pl_module):
         if not (
-            trainer.is_overriden("validation_step") 
+            trainer.is_overriden("validation_step")
             and trainer.is_overriden("validation_epoch_end")
         ):
             error_msg = (f'''
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 159fe58250b8a..cde6fcacb2f00 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -88,7 +88,7 @@ def configure_checkpoint_callback(self,
             checkpoint_callback.save_function = self.save_checkpoint
 
             # if checkpoint callback used, then override the weights path
-            self.weights_save_path = self.checkpoint_callback.dirpath
+            self.weights_save_path = checkpoint_callback.dirpath
 
         # if weights_save_path is still none here, set to current working dir
         if self.weights_save_path is None:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index a4c7cb387bb7a..92f4d16043f3a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1047,10 +1047,6 @@ def run_pretrain_routine(self, model: LightningModule):
 
             self.on_sanity_check_end()
 
-            # verify that early stop has conditioned on a metric that exists
-            if self.enable_early_stop:
-                self.early_stop_callback._validate_condition_metric(callback_metrics)
-
         # clear cache before training
         if self.on_gpu:
             torch.cuda.empty_cache()
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 8d8230fc80f00..dca9460aa0e8c 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -325,9 +325,6 @@ def _signal_kill_handler(*args):
         with self.profiler.profile('on_train_start'):
             # callbacks
             self.on_train_start()
-            # initialize early stop callback
-            if self.early_stop_callback is not None:
-                self.early_stop_callback.on_train_start(self, self.get_model())
             # model hooks
             model.on_train_start()
 

From 980ac2b8bb6f7089f6eddce9d461069d2c90a801 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 23:52:56 -0400
Subject: [PATCH 011/136] more bugfixes

---
 pytorch_lightning/callbacks/early_stopping.py | 3 ++-
 pytorch_lightning/trainer/logging.py          | 2 +-
 pytorch_lightning/trainer/trainer.py          | 2 +-
 pytorch_lightning/trainer/training_loop.py    | 9 +++++++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 2e1da9023bc2e..1e49c5347278c 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -54,7 +54,7 @@ class EarlyStopping(Callback):
     }
 
     def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3,
-                 verbose: bool = False, mode: str = 'auto', strict: bool = True):
+                 verbose: bool = False, mode: str = 'auto', strict: bool = False):
         super().__init__()
         self.monitor = monitor
         self.patience = patience
@@ -150,6 +150,7 @@ def _run_early_stopping_check(self, trainer, pl_module):
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
 
+        stop_training = False
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index 978ac5df78d81..854f7ee0951f7 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -29,7 +29,7 @@ def configure_logger(self, logger):
         if logger is True:
             # default logger
             self.logger = TensorBoardLogger(
-                save_dir=self.default_root_dir,
+                save_dir=str(self.default_root_dir),
                 version=self.slurm_job_id,
                 name='lightning_logs'
             )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 92f4d16043f3a..e08a1a2ea7799 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -339,7 +339,7 @@ def __init__(
         self.global_step = 0
         self.current_epoch = 0
         self.interrupted = False
-        self.should_stop = True
+        self.should_stop = False
 
         # set default save path if user didn't provide one
         if default_root_dir is None:
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index dca9460aa0e8c..a1fd8b5bf7943 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -153,6 +153,7 @@ def training_step(self, batch, batch_idx):
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
@@ -369,8 +370,7 @@ def _signal_kill_handler(*args):
                 met_min_steps = self.global_step >= self.min_steps if self.min_steps else True
 
                 if self.should_stop:
-                    # Question: didn't understand the check about self.fast_dev_run
-                    if met_min_epochs and met_min_steps:
+                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                         self.run_training_teardown()
                         return
                     else:
@@ -515,6 +515,11 @@ def run_training_epoch(self):
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
             self.call_checkpoint_callback()
 
+        # when no val loop is present or fast-dev-run still need to call checkpoints
+        if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
+            checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
+
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):
             # callbacks

From c80c0e7c5a9276eb8e14305bde2df5c558f6adff Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 27 Apr 2020 23:57:29 -0400
Subject: [PATCH 012/136] run callbacks at epoch end

---
 pytorch_lightning/callbacks/early_stopping.py   | 5 +----
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 pytorch_lightning/trainer/trainer.py            | 1 +
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 1e49c5347278c..039b8739fe0f2 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -142,10 +142,7 @@ def on_train_start(self, trainer, pl_module):
             if self.verbose > 0:
                 rank_zero_warn(error_msg, RuntimeWarning)
 
-    def on_validation_end(self, trainer, pl_module):
-        self._run_early_stopping_check(trainer, pl_module)
-
-    def _run_early_stopping_check(self, trainer, pl_module):
+    def on_epoch_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 9336fe309889a..155139c1ec6c1 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -226,7 +226,7 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     @rank_zero_only
-    def on_validation_end(self, trainer, pl_module):
+    def on_epoch_end(self, trainer, pl_module):
         # only run on main process
         if trainer.proc_rank != 0:
             return
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index e08a1a2ea7799..707cd3e5fd59d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1044,6 +1044,7 @@ def run_pretrain_routine(self, model: LightningModule):
                                           self.num_sanity_val_steps,
                                           False)
             _, _, _, callback_metrics, _ = self.process_output(eval_results)
+            self.callback_metrics = callback_metrics
 
             self.on_sanity_check_end()
 

From 84d1d54ec8fa65ccabdd7d09bdcf1cb93adfe0f8 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 28 Apr 2020 22:23:20 -0400
Subject: [PATCH 013/136] update tests to use on epoch end

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index de8039fe17413..14c9c246c66bc 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -261,7 +261,7 @@ def mock_save_function(filepath, *args):
     for i, loss in enumerate(losses):
         trainer.current_epoch = i
         trainer.callback_metrics = {'val_loss': loss}
-        checkpoint_callback.on_validation_end(trainer, trainer.get_model())
+        checkpoint_callback.on_epoch_end(trainer, trainer.get_model())
 
     file_lists = set(os.listdir(tmpdir))
 

From 35f193c856c0e4a39624d6725f54a1f7fc9d8058 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 28 Apr 2020 22:41:54 -0400
Subject: [PATCH 014/136] PR cleanup

---
 pytorch_lightning/callbacks/early_stopping.py | 22 ++-----------------
 pytorch_lightning/trainer/training_loop.py    |  2 +-
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 039b8739fe0f2..f3e14c9befe12 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,10 +26,8 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of passes through the validation set
-            with no improvement after which training will be stopped.
-            This will usually correspond with epochs but may vary depending
-            on how often you have configured to check validation. Default: ``0``.
+        patience: number of epochs with no improvement
+            after which training will be stopped. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
             training will stop when the quantity
@@ -126,22 +124,6 @@ def on_sanity_check_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         self._validate_condition_metric(logs)
 
-    def on_train_start(self, trainer, pl_module):
-        if not (
-            trainer.is_overriden("validation_step")
-            and trainer.is_overriden("validation_epoch_end")
-        ):
-            error_msg = (f'''
-                Early stopping is expecting metrics to be returned from
-                validation but the Lightning model does not have a validation loop
-                defined with logging. Please ensure that your LightningModule has
-                both `validation_step` and `validation_epoch_end` defined.
-            ''')
-            if self.strict:
-                raise RuntimeError(error_msg)
-            if self.verbose > 0:
-                rank_zero_warn(error_msg, RuntimeWarning)
-
     def on_epoch_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index a1fd8b5bf7943..3b5c8c904ff62 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -518,7 +518,7 @@ def run_training_epoch(self):
         # when no val loop is present or fast-dev-run still need to call checkpoints
         if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
-            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
+            [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
 
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):

From f32d22b43e6375d16088373f174fe35b4433ef39 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 29 Apr 2020 21:46:09 -0400
Subject: [PATCH 015/136] address failing tests

---
 pytorch_lightning/loggers/wandb.py       | 7 ++-----
 pytorch_lightning/trainer/training_io.py | 2 +-
 tests/base/utils.py                      | 2 +-
 tests/loggers/test_wandb.py              | 3 +++
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index c884180501bc6..dead54084a695 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -132,11 +132,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
 
     @property
     def name(self) -> str:
-        # don't create an experiment if we don't have one
-        name = self._experiment.project_name() if self._experiment else None
-        return name
+        return self.experiment.project_name()
 
     @property
     def version(self) -> str:
-        # don't create an experiment if we don't have one
-        return self._experiment.id if self._experiment else None
+        return self.experiment.id
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 7a498a1cef5fa..4a57601618f4b 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -325,7 +325,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         """
         checkpoint = {
             'epoch': self.current_epoch + 1,
-            'global_step': self.global_step + 1,
+            'global_step': self.global_step,
         }
 
         if not weights_only:
diff --git a/tests/base/utils.py b/tests/base/utils.py
index dbf2666694386..1b57bc390ef7e 100644
--- a/tests/base/utils.py
+++ b/tests/base/utils.py
@@ -98,7 +98,7 @@ def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=T
 
 def get_default_logger(save_dir, version=None):
     # set up logger object without actually saving logs
-    logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version)
+    logger = TensorBoardLogger(str(save_dir), name='lightning_logs', version=version)
     return logger
 
 
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 4cd0eff431adc..195b530ff23c1 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -38,6 +38,9 @@ def test_wandb_pickle(wandb):
     class Experiment:
         id = 'the_id'
 
+        def project_name(self):
+            return 'the_project_name'
+
     wandb.init.return_value = Experiment()
 
     logger = WandbLogger(id='the_id', offline=True)

From 78d25459f6e006487369cace0a7e693046f4a433 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 30 Apr 2020 23:07:15 -0400
Subject: [PATCH 016/136] refactor for homogeneity

---
 pytorch_lightning/trainer/callback_config.py | 22 ++++++++------------
 pytorch_lightning/trainer/trainer.py         |  5 +----
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index cde6fcacb2f00..61f60d42aabfd 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -28,28 +28,24 @@ def slurm_job_id(self) -> int:
     def save_checkpoint(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
-    def configure_checkpoint_callback(self,
-                                      checkpoint_callback,
-                                      default_root_dir,
-                                      logger,
-                                      weights_save_path):
+    def configure_checkpoint_callback(self, checkpoint_callback):
         """
         Weight path set in this priority:
         Checkpoint_callback's path (if passed in).
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = default_root_dir
+        ckpt_path = self.default_root_dir
         if checkpoint_callback:
             # init a default one
-            if logger is not None:
-                save_dir = (getattr(logger, 'save_dir', None)
-                            or getattr(logger, '_save_dir', None)
-                            or default_root_dir)
+            if self.logger is not None:
+                save_dir = (getattr(self.logger, 'save_dir', None)
+                            or getattr(self.logger, '_save_dir', None)
+                            or self.default_root_dir)
 
                 # weights_save_path overrides anything
-                if weights_save_path is not None:
-                    save_dir = weights_save_path
+                if self.weights_save_path is not None:
+                    save_dir = self.weights_save_path
 
                 version = self.logger.version if isinstance(
                     self.logger.version, str) else f'version_{self.logger.version}'
@@ -60,7 +56,7 @@ def configure_checkpoint_callback(self,
                     "checkpoints"
                 )
             else:
-                ckpt_path = os.path.join(default_root_dir, "checkpoints")
+                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
 
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 707cd3e5fd59d..727bad5694b42 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -363,10 +363,7 @@ def __init__(
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
         self.weights_save_path = weights_save_path
-        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
-                                                                 self.default_root_dir,
-                                                                 self.logger,
-                                                                 self.weights_save_path)
+        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback)
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 

From f236d3ec069a6d879778c99b890ebf51b4deba11 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 4 May 2020 21:56:19 -0400
Subject: [PATCH 017/136] fix merge conflict

---
 pytorch_lightning/trainer/lr_finder.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index 6c254ca977923..e9fc8927090b0 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -210,16 +210,18 @@ def __lr_finder_dump_params(self, model):
             'configure_optimizers': model.configure_optimizers,
         }
 
-    def __lr_finder_restore_params(self, model):
-        self.auto_lr_find = self.__dumped_params['auto_lr_find']
-        self.logger = self.__dumped_params['logger']
-        self.callbacks = self.__dumped_params['callbacks']
-        self.max_steps = self.__dumped_params['max_steps']
-        self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
-        self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
-        model.configure_optimizers = self.__dumped_params['configure_optimizers']
-        del self.__dumped_params
+    def _lr_finder_restore_params(self, model):
+        self.auto_lr_find = self._params['auto_lr_find']
+        self.logger = self._params['logger']
+        self.callbacks = self._params['callbacks']
+        self.max_steps = self._params['max_steps']
+        self.progress_bar_refresh_rate = self._params['progress_bar_refresh_rate']
+        self.accumulate_grad_batches = self._params['accumulate_grad_batches']
+        self.checkpoint_callback = self._params['checkpoint_callback']
+        self.early_stop_callback = self._params['early_stop_callback']
+        self.enable_early_stop = self._params['enable_early_stop']
+        self.progress_bar_callback = self._params['progress_bar_callback']
+        model.configure_optimizers = self._params['configure_optimizers']
 
 
 class _LRFinder(object):

From 9972af9f00f409c6b347f003b263f8c684203d07 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 21 May 2020 22:37:53 -0400
Subject: [PATCH 018/136] separate tests

---
 tests/callbacks/test_callbacks.py            | 83 --------------------
 tests/callbacks/test_early_stopping.py       | 44 +++++++++++
 tests/callbacks/test_learning_rate_logger.py | 78 ++++++++++++++++++
 tests/callbacks/test_model_checkpoint.py     | 58 ++++++++++++++
 4 files changed, 180 insertions(+), 83 deletions(-)
 create mode 100644 tests/callbacks/test_early_stopping.py
 create mode 100644 tests/callbacks/test_learning_rate_logger.py
 create mode 100644 tests/callbacks/test_model_checkpoint.py

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index c962e6cdb0a55..990cd89e0a5cd 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -5,7 +5,6 @@
 import tests.base.utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
 
@@ -200,85 +199,3 @@ def on_test_end(self, trainer, pl_module):
     assert not test_callback.on_validation_end_called
     assert not test_callback.on_validation_batch_end_called
     assert not test_callback.on_validation_batch_start_called
-
-
-def test_early_stopping_no_val_step(tmpdir):
-    """Test that early stopping callback falls back to training metrics when no validation defined."""
-
-    class CurrentModel(EvalModelTemplate):
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            output.update({'my_train_metric': output['loss']})  # could be anything else
-            return output
-
-    model = CurrentModel()
-    model.validation_step = None
-    model.val_dataloader = None
-
-    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        early_stop_callback=stopping,
-        overfit_pct=0.20,
-        max_epochs=5,
-    )
-    result = trainer.fit(model)
-
-    assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch < trainer.max_epochs
-
-
-def test_pickling(tmpdir):
-    import pickle
-    early_stopping = EarlyStopping()
-    ckpt = ModelCheckpoint(tmpdir)
-
-    early_stopping_pickled = pickle.dumps(early_stopping)
-    ckpt_pickled = pickle.dumps(ckpt)
-
-    early_stopping_loaded = pickle.loads(early_stopping_pickled)
-    ckpt_loaded = pickle.loads(ckpt_pickled)
-
-    assert vars(early_stopping) == vars(early_stopping_loaded)
-    assert vars(ckpt) == vars(ckpt_loaded)
-
-
-@pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
-def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
-    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-
-    checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
-
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_pct=0.20,
-                      max_epochs=5
-                      )
-    trainer.fit(model)
-
-    # These should be different if the dirpath has be overridden
-    assert trainer.ckpt_path != trainer.default_root_dir
-
-
-@pytest.mark.parametrize(
-    'logger_version,expected',
-    [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
-)
-def test_model_checkpoint_path(tmpdir, logger_version, expected):
-    """Test that "version_" prefix is only added when logger's version is an integer"""
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-    logger = TensorBoardLogger(str(tmpdir), version=logger_version)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        overfit_pct=0.2,
-        max_epochs=5,
-        logger=logger
-    )
-    trainer.fit(model)
-
-    ckpt_version = Path(trainer.ckpt_path).parent.name
-    assert ckpt_version == expected
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
new file mode 100644
index 0000000000000..4560791f8e0a3
--- /dev/null
+++ b/tests/callbacks/test_early_stopping.py
@@ -0,0 +1,44 @@
+import pytest
+
+import tests.base.utils as tutils
+from pytorch_lightning import Callback
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
+from tests.base import EvalModelTemplate
+from pathlib import Path
+
+
+# TODO remove this test
+def test_early_stopping_no_val_step(tmpdir):
+    """Test that early stopping callback falls back to training metrics when no validation defined."""
+
+    class CurrentModel(EvalModelTemplate):
+        def training_step(self, *args, **kwargs):
+            output = super().training_step(*args, **kwargs)
+            output.update({'my_train_metric': output['loss']})  # could be anything else
+            return output
+
+    model = CurrentModel()
+    model.validation_step = None
+    model.val_dataloader = None
+
+    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        early_stop_callback=stopping,
+        overfit_pct=0.20,
+        max_epochs=5,
+    )
+    result = trainer.fit(model)
+
+    assert result == 1, 'training failed to complete'
+    assert trainer.current_epoch < trainer.max_epochs
+
+
+def test_pickling(tmpdir):
+    import pickle
+    early_stopping = EarlyStopping()
+    early_stopping_pickled = pickle.dumps(early_stopping)
+    early_stopping_loaded = pickle.loads(early_stopping_pickled)
+    assert vars(early_stopping) == vars(early_stopping_loaded)
\ No newline at end of file
diff --git a/tests/callbacks/test_learning_rate_logger.py b/tests/callbacks/test_learning_rate_logger.py
new file mode 100644
index 0000000000000..466d030f9a8c3
--- /dev/null
+++ b/tests/callbacks/test_learning_rate_logger.py
@@ -0,0 +1,78 @@
+import tests.base.utils as tutils
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import LearningRateLogger
+from tests.base import EvalModelTemplate
+
+
+def test_lr_logger_single_lr(tmpdir):
+    """ Test that learning rates are extracted and logged for single lr scheduler"""
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__single_scheduler
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=5,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert results == 1
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of lr schedulers'
+    assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
+
+
+def test_lr_logger_multi_lrs(tmpdir):
+    """ Test that learning rates are extracted and logged for multi lr schedulers """
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__multiple_schedulers
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert results == 1
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of lr schedulers'
+    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
+
+
+def test_lr_logger_param_groups(tmpdir):
+    """ Test that learning rates are extracted and logged for single lr scheduler"""
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__param_groups
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=5,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of param groups'
+    assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
new file mode 100644
index 0000000000000..5e758a1f51ee2
--- /dev/null
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -0,0 +1,58 @@
+import pytest
+
+import tests.base.utils as tutils
+from pytorch_lightning import Callback
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
+from tests.base import EvalModelTemplate
+from pathlib import Path
+
+
+@pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
+def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
+    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
+    tutils.reset_seed()
+    model = EvalModelTemplate()
+
+    checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
+
+    trainer = Trainer(default_root_dir=tmpdir,
+                      checkpoint_callback=checkpoint,
+                      overfit_pct=0.20,
+                      max_epochs=5
+                      )
+    trainer.fit(model)
+
+    # These should be different if the dirpath has be overridden
+    assert trainer.ckpt_path != trainer.default_root_dir
+
+
+@pytest.mark.parametrize(
+    'logger_version,expected',
+    [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
+)
+def test_model_checkpoint_path(tmpdir, logger_version, expected):
+    """Test that "version_" prefix is only added when logger's version is an integer"""
+    tutils.reset_seed()
+    model = EvalModelTemplate()
+    logger = TensorBoardLogger(str(tmpdir), version=logger_version)
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        overfit_pct=0.2,
+        max_epochs=5,
+        logger=logger
+    )
+    trainer.fit(model)
+
+    ckpt_version = Path(trainer.ckpt_path).parent.name
+    assert ckpt_version == expected
+
+
+def test_pickling(tmpdir):
+    import pickle
+    ckpt = ModelCheckpoint(tmpdir)
+    ckpt_pickled = pickle.dumps(ckpt)
+    ckpt_loaded = pickle.loads(ckpt_pickled)
+    assert vars(ckpt) == vars(ckpt_loaded)

From fc7c6e8565bf69e03a276b004967b0311aa9e895 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 12:39:12 -0400
Subject: [PATCH 019/136] tests for early stopping bug regressions

---
 tests/callbacks/test_early_stopping.py | 70 ++++++++++++++++++--------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 4560791f8e0a3..0709e226a8807 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -1,5 +1,6 @@
 import pytest
 
+import torch
 import tests.base.utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
@@ -9,31 +10,58 @@
 from pathlib import Path
 
 
-# TODO remove this test
-def test_early_stopping_no_val_step(tmpdir):
-    """Test that early stopping callback falls back to training metrics when no validation defined."""
+def test_resume_early_stopping_from_checkpoint(tmpdir):
+    """
+    Prevent regressions to bugs:
+    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
+    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
+    """
+    class EarlyStoppingTestRestore(EarlyStopping):
+        def __init__(self, expected_state):
+            super().__init__()
+            self.expected_state = expected_state
 
-    class CurrentModel(EvalModelTemplate):
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            output.update({'my_train_metric': output['loss']})  # could be anything else
-            return output
+        def on_train_start(self, trainer, pl_module):
+            assert self.state_dict() == self.expected_state
 
-    model = CurrentModel()
-    model.validation_step = None
-    model.val_dataloader = None
+    model = EvalModelTemplate()
+    checkpoint_callback = ModelCheckpoint(save_top_k=1)
+    early_stop_callback = EarlyStopping()
+    trainer = Trainer(checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, max_epochs=4)
+    trainer.fit(model)
+    early_stop_callback_state = early_stop_callback.state_dict()
 
-    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        early_stop_callback=stopping,
-        overfit_pct=0.20,
-        max_epochs=5,
-    )
-    result = trainer.fit(model)
+    checkpoint_filepath = checkpoint_callback.kth_best_model
+    # ensure state is persisted properly
+    checkpoint = torch.load(checkpoint_filepath)
+    assert checkpoint['early_stop_callback_state_dict'] == early_stop_callback_state
+    # ensure state is reloaded properly (assertion in the callback)
+    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
+    new_trainer = Trainer(max_epochs=2,
+                          resume_from_checkpoint=checkpoint_filepath,
+                          early_stop_callback=early_stop_callback)
+    new_trainer.fit(model)
 
-    assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch < trainer.max_epochs
+
+def test_early_stopping_no_extraneous_invocations():
+    """Test to ensure that callback methods aren't being invoked outside of the callback handler."""
+    class EarlyStoppingTestInvocations(EarlyStopping):
+        def __init__(self, expected_count):
+            super().__init__()
+            self.count = 0
+            self.expected_count = expected_count
+
+        def on_validation_end(self, trainer, pl_module):
+            self.count += 1
+
+        def on_train_end(self, trainer, pl_module):
+            assert self.count == self.expected_count
+
+    model = EvalModelTemplate()
+    expected_count = 4
+    early_stop_callback = EarlyStoppingTestInvocations(expected_count)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, max_epochs=expected_count)
+    trainer.fit(model)
 
 
 def test_pickling(tmpdir):

From 016da663f08a4c5ec630c4bd57a1747cd245be65 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 12:48:32 -0400
Subject: [PATCH 020/136] small fixes

---
 pytorch_lightning/callbacks/early_stopping.py | 2 +-
 pytorch_lightning/trainer/trainer.py          | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index f3e14c9befe12..45c12a9dfa141 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -52,7 +52,7 @@ class EarlyStopping(Callback):
     }
 
     def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3,
-                 verbose: bool = False, mode: str = 'auto', strict: bool = False):
+                 verbose: bool = False, mode: str = 'auto', strict: bool = True):
         super().__init__()
         self.monitor = monitor
         self.patience = patience
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 727bad5694b42..e5bd7f051cec9 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -367,8 +367,9 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
-        # TODO refactor codebase (tests) to not directly reach into this callback
+        # TODO refactor codebase (tests) to not directly reach into these callbacks
         self.checkpoint_callback = checkpoint_callback
+        self.early_stop_callback = early_stop_callback
 
         self.on_init_start()
 

From 5a0028ca964b73d166195cc5d293e0a0babb463d Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 13:00:22 -0400
Subject: [PATCH 021/136] revert model checkpoint change

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 tests/callbacks/test_model_checkpoint.py        | 3 +--
 tests/trainer/test_trainer.py                   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 155139c1ec6c1..9336fe309889a 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -226,7 +226,7 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     @rank_zero_only
-    def on_epoch_end(self, trainer, pl_module):
+    def on_validation_end(self, trainer, pl_module):
         # only run on main process
         if trainer.proc_rank != 0:
             return
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 5e758a1f51ee2..86744e3cef099 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,9 +1,8 @@
 import pytest
 
 import tests.base.utils as tutils
-from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
 from pathlib import Path
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 14c9c246c66bc..de8039fe17413 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -261,7 +261,7 @@ def mock_save_function(filepath, *args):
     for i, loss in enumerate(losses):
         trainer.current_epoch = i
         trainer.callback_metrics = {'val_loss': loss}
-        checkpoint_callback.on_epoch_end(trainer, trainer.get_model())
+        checkpoint_callback.on_validation_end(trainer, trainer.get_model())
 
     file_lists = set(os.listdir(tmpdir))
 

From d6087af961c18a1894b4b12526a0888be66df8c5 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 13:00:31 -0400
Subject: [PATCH 022/136] typo fix

---
 pytorch_lightning/trainer/training_loop.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 3b5c8c904ff62..cf4f9942aac7e 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -513,10 +513,6 @@ def run_training_epoch(self):
 
         # when no val loop is present or fast-dev-run still need to call checkpoints
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
-            self.call_checkpoint_callback()
-
-        # when no val loop is present or fast-dev-run still need to call checkpoints
-        if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
             [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
 

From 1386365c478ba79036ee577311c52c61f5e3e489 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 23:47:48 -0400
Subject: [PATCH 023/136] fix tests

---
 docs/source/callbacks.rst                    | 8 ++++----
 pytorch_lightning/trainer/callback_config.py | 2 +-
 pytorch_lightning/trainer/logging.py         | 2 +-
 pytorch_lightning/trainer/trainer.py         | 2 +-
 pytorch_lightning/trainer/training_io.py     | 2 +-
 tests/base/utils.py                          | 2 +-
 tests/loggers/test_all.py                    | 2 +-
 tests/trainer/test_trainer_cli.py            | 6 +++---
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
index 3a6b1e84acdd6..2dcf81277e1b9 100644
--- a/docs/source/callbacks.rst
+++ b/docs/source/callbacks.rst
@@ -50,11 +50,11 @@ We successfully extended functionality without polluting our super clean
 Best Practices
 ==============
 
-1. Callbacks should be isolated in their functionality. Your callback should not rely on 
- the presence of other callbacks in order to work properly.
+1. Callbacks should be isolated in their functionality. Your callback should not rely on the 
+behavior of other callbacks in order to work properly.
 2. Do not manually call methods from the callback. The callbacks are designed to be
- invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
- is strongly discouraged.
+invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
+is strongly discouraged.
 3. Whenever possible, your callbacks should not depend on the order in which they are executed.
 
 
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 61f60d42aabfd..f3ca28f45478d 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -44,7 +44,7 @@ def configure_checkpoint_callback(self, checkpoint_callback):
                             or self.default_root_dir)
 
                 # weights_save_path overrides anything
-                if self.weights_save_path is not None:
+                if self.weights_save_path is not None and self.weights_save_path is not True:
                     save_dir = self.weights_save_path
 
                 version = self.logger.version if isinstance(
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index 854f7ee0951f7..978ac5df78d81 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -29,7 +29,7 @@ def configure_logger(self, logger):
         if logger is True:
             # default logger
             self.logger = TensorBoardLogger(
-                save_dir=str(self.default_root_dir),
+                save_dir=self.default_root_dir,
                 version=self.slurm_job_id,
                 name='lightning_logs'
             )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index e5bd7f051cec9..48166d19ef90d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -342,7 +342,7 @@ def __init__(
         self.should_stop = False
 
         # set default save path if user didn't provide one
-        if default_root_dir is None:
+        if default_root_dir is None or default_root_dir is True:
             # Backward compatibility, TODO: remove default_save_path in v0.8.0
             default_root_dir = default_save_path or os.getcwd()
         self.default_root_dir = default_root_dir
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 4a57601618f4b..7a498a1cef5fa 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -325,7 +325,7 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         """
         checkpoint = {
             'epoch': self.current_epoch + 1,
-            'global_step': self.global_step,
+            'global_step': self.global_step + 1,
         }
 
         if not weights_only:
diff --git a/tests/base/utils.py b/tests/base/utils.py
index 1b57bc390ef7e..dbf2666694386 100644
--- a/tests/base/utils.py
+++ b/tests/base/utils.py
@@ -98,7 +98,7 @@ def run_model_test(trainer_options, model, on_gpu=True, version=None, with_hpc=T
 
 def get_default_logger(save_dir, version=None):
     # set up logger object without actually saving logs
-    logger = TensorBoardLogger(str(save_dir), name='lightning_logs', version=version)
+    logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version)
     return logger
 
 
diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index c001d4acb3c4f..a6ac0fac1b40f 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -68,10 +68,10 @@ def log_metrics(self, metrics, step):
 
 @pytest.mark.parametrize("logger_class", [
     TensorBoardLogger,
-    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
     TestTubeLogger,
+    # CometLogger,  # TODO: add this one
     # TrainsLogger,  # TODO: add this one
     # WandbLogger,  # TODO: add this one
 ])
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index c66d614903c3d..31433446e01fd 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -10,10 +10,10 @@
 from pytorch_lightning import Trainer
 
 
-@mock.patch('argparse.ArgumentParser.parse_args',
-            return_value=Namespace(**Trainer.default_attributes()))
-def test_default_args(tmpdir):
+@mock.patch('argparse.ArgumentParser.parse_args')
+def test_default_args(mock_argparse, tmpdir):
     """Tests default argument parser for Trainer"""
+    mock_argparse.return_value = Namespace(**Trainer.default_attributes())
 
     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)

From 2200c818cc7311ef07bd011b68856c9797189273 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 25 May 2020 14:23:41 -0400
Subject: [PATCH 024/136] update train loop

---
 pytorch_lightning/trainer/training_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index cf4f9942aac7e..fd880c16bbf76 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -512,9 +512,10 @@ def run_training_epoch(self):
             self.add_progress_bar_metrics(_processed_outputs[1])
 
         # when no val loop is present or fast-dev-run still need to call checkpoints
+        # TODO bake this logic into the checkpoint callback
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
-            [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
+            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
 
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):

From be80c8e6a8e1518785c3a06b73730546e95aa966 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 25 May 2020 14:36:09 -0400
Subject: [PATCH 025/136] cannot pass an int as default_save_path

---
 tests/test_deprecated.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py
index 79634578b75cb..6a7415d728b50 100644
--- a/tests/test_deprecated.py
+++ b/tests/test_deprecated.py
@@ -73,6 +73,7 @@ def test_tbd_remove_in_v0_8_0_trainer():
     }
     # skip 0 since it may be interested as False
     kwargs = {k: (i + 1) for i, k in enumerate(mapping_old_new)}
+    kwargs['default_save_path'] = 'lightning_logs'
 
     trainer = Trainer(**kwargs)
 

From 994e25b16cfbeb1a5cfb8a25e311110941ab24c4 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 26 May 2020 19:17:01 -0400
Subject: [PATCH 026/136] refactor log message

---
 pytorch_lightning/trainer/training_loop.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index fd880c16bbf76..6894d3b3fbfe3 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -374,9 +374,9 @@ def _signal_kill_handler(*args):
                         self.run_training_teardown()
                         return
                     else:
-                        log.info(f'''Trainer was signaled to stop but required minimum epochs
-                                 ({self.min_epochs}) or minimum steps ({self.min_steps}) has
-                                 not been met. Training will continue...''')
+                        log.info(f'Trainer was signaled to stop but required minimum epochs '
+                                 f'({self.min_epochs}) or minimum steps ({self.min_steps}) has '
+                                 f'not been met. Training will continue...')
 
             self.run_training_teardown()
 

From fa488eb0299d882e1ebd23af3809af1da4d685bf Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 26 May 2020 22:11:11 -0400
Subject: [PATCH 027/136] fix test case

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index de8039fe17413..e9c29e02ac73c 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -445,7 +445,7 @@ def test_trainer_min_steps_and_epochs(tmpdir):
         early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0),
         val_check_interval=2,
         min_epochs=1,
-        max_epochs=5
+        max_epochs=10
     )
 
     # define less min steps than 1 epoch

From 1a56edd07ea0871088796ddc9fca46911a6265f2 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 26 May 2020 22:30:44 -0400
Subject: [PATCH 028/136] appease the linter

---
 pytorch_lightning/trainer/callback_config.py | 6 +++---
 tests/callbacks/test_early_stopping.py       | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index f3ca28f45478d..0c3b6006759e9 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -39,9 +39,9 @@ def configure_checkpoint_callback(self, checkpoint_callback):
         if checkpoint_callback:
             # init a default one
             if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None)
-                            or getattr(self.logger, '_save_dir', None)
-                            or self.default_root_dir)
+                save_dir = (getattr(self.logger, 'save_dir', None) or
+                            getattr(self.logger, '_save_dir', None) or
+                            self.default_root_dir)
 
                 # weights_save_path overrides anything
                 if self.weights_save_path is not None and self.weights_save_path is not True:
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 0709e226a8807..165cda8ea2f37 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -16,6 +16,7 @@ def test_resume_early_stopping_from_checkpoint(tmpdir):
     https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
     https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
     """
+
     class EarlyStoppingTestRestore(EarlyStopping):
         def __init__(self, expected_state):
             super().__init__()
@@ -69,4 +70,4 @@ def test_pickling(tmpdir):
     early_stopping = EarlyStopping()
     early_stopping_pickled = pickle.dumps(early_stopping)
     early_stopping_loaded = pickle.loads(early_stopping_pickled)
-    assert vars(early_stopping) == vars(early_stopping_loaded)
\ No newline at end of file
+    assert vars(early_stopping) == vars(early_stopping_loaded)

From 9c831288d6b5d42db7b99b633805582aea58545f Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 28 May 2020 22:05:57 -0400
Subject: [PATCH 029/136] fix some doctests

---
 docs/source/experiment_logging.rst | 3 ++-
 docs/source/weights_loading.rst    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/experiment_logging.rst b/docs/source/experiment_logging.rst
index 772efcfc13bc5..422929337db47 100644
--- a/docs/source/experiment_logging.rst
+++ b/docs/source/experiment_logging.rst
@@ -88,6 +88,7 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
 .. testcode::
 
     from pytorch_lightning.loggers import NeptuneLogger
+
     neptune_logger = NeptuneLogger(
         api_key='ANONYMOUS',  # replace with your own
         project_name='shared/pytorch-lightning-integration',
@@ -225,7 +226,7 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
 .. testcode::
 
     from pytorch_lightning.loggers import WandbLogger
-    wandb_logger = WandbLogger()
+    wandb_logger = WandbLogger(offline=True)
     trainer = Trainer(logger=wandb_logger)
 
 The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your
diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index 11844678397a9..a9c660d7195e3 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -31,7 +31,7 @@ To change the checkpoint path pass in:
 
 .. testcode::
 
-    trainer = Trainer(default_save_path='/your/path/to/save/checkpoints')
+    trainer = Trainer(default_save_path='lightning_checkpoints')
 
 To modify the behavior of checkpointing pass in your own callback.
 

From 42b39c57f150239a08542c6cbc0cd96735dabb71 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 30 May 2020 12:42:25 -0400
Subject: [PATCH 030/136] move config to callback

---
 .../callbacks/model_checkpoint.py             | 30 ++++++++++++
 pytorch_lightning/trainer/callback_config.py  | 47 ++-----------------
 2 files changed, 35 insertions(+), 42 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 9336fe309889a..9195b0869c237 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -225,6 +225,36 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         filepath = os.path.join(self.dirpath, self.prefix + filename + str_ver + '.ckpt')
         return filepath
 
+    def on_train_start(self, trainer, pl_module):
+        if self.dirpath is None:
+            self.filename = '{epoch}'
+
+            ckpt_path = trainer.default_root_dir
+            if trainer.logger is not None:
+                save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                            getattr(trainer.logger, '_save_dir', None) or
+                            trainer.default_root_dir)
+
+                # weights_save_path overrides anything
+                if trainer.weights_save_path is not None:
+                    save_dir = trainer.weights_save_path
+
+                version = trainer.logger.version if isinstance(
+                    trainer.logger.version, str) else f'version_{trainer.logger.version}'
+                ckpt_path = os.path.join(
+                    save_dir,
+                    trainer.logger.name,
+                    version,
+                    "checkpoints"
+                )
+            else:
+                ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
+
+            self.dirpath = ckpt_path
+            os.makedirs(self.dirpath, exist_ok=True)
+            trainer.ckpt_path = ckpt_path
+            trainer.weights_save_path = self.dirpath
+
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):
         # only run on main process
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 0c3b6006759e9..c2b15c50b6f1c 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -35,57 +35,20 @@ def configure_checkpoint_callback(self, checkpoint_callback):
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = self.default_root_dir
-        if checkpoint_callback:
-            # init a default one
-            if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None) or
-                            getattr(self.logger, '_save_dir', None) or
-                            self.default_root_dir)
-
-                # weights_save_path overrides anything
-                if self.weights_save_path is not None and self.weights_save_path is not True:
-                    save_dir = self.weights_save_path
-
-                version = self.logger.version if isinstance(
-                    self.logger.version, str) else f'version_{self.logger.version}'
-                ckpt_path = os.path.join(
-                    save_dir,
-                    self.logger.name,
-                    version,
-                    "checkpoints"
-                )
-            else:
-                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
-
+        if checkpoint_callback is True:
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
             monitor_key = 'loss' if train_step_only else 'val_loss'
-
-            if checkpoint_callback is True:
-                os.makedirs(ckpt_path, exist_ok=True)
-                checkpoint_callback = ModelCheckpoint(
-                    filepath=ckpt_path,
-                    monitor=monitor_key
-                )
-            # If user specified None in filepath, override with runtime default
-            elif isinstance(checkpoint_callback, ModelCheckpoint) \
-                    and checkpoint_callback.dirpath is None:
-                checkpoint_callback.dirpath = ckpt_path
-                checkpoint_callback.filename = '{epoch}'
-                os.makedirs(checkpoint_callback.dirpath, exist_ok=True)
+            checkpoint_callback = ModelCheckpoint(
+                filepath=None,
+                monitor=monitor_key
+            )
         elif checkpoint_callback is False:
             checkpoint_callback = None
 
-        self.ckpt_path = ckpt_path
-
         if checkpoint_callback:
-            # set the path for the callbacks
             checkpoint_callback.save_function = self.save_checkpoint
 
-            # if checkpoint callback used, then override the weights path
-            self.weights_save_path = checkpoint_callback.dirpath
-
         # if weights_save_path is still none here, set to current working dir
         if self.weights_save_path is None:
             self.weights_save_path = self.default_root_dir

From 4e414d69cb108cab6b55c15cb936aa71bceb5af8 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 1 Jun 2020 00:21:26 -0400
Subject: [PATCH 031/136] fixes from rebase

---
 pytorch_lightning/callbacks/early_stopping.py |  4 +-
 .../callbacks/model_checkpoint.py             |  8 +-
 pytorch_lightning/trainer/lr_finder.py        | 28 +++----
 pytorch_lightning/trainer/trainer.py          |  3 +
 pytorch_lightning/trainer/training_loop.py    |  8 --
 tests/callbacks/test_learning_rate_logger.py  | 78 -------------------
 .../{test_lr.py => test_lr_logger.py}         |  0
 7 files changed, 21 insertions(+), 108 deletions(-)
 delete mode 100644 tests/callbacks/test_learning_rate_logger.py
 rename tests/callbacks/{test_lr.py => test_lr_logger.py} (100%)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 45c12a9dfa141..fc940fc1390ee 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,7 +26,7 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of epochs with no improvement
+        patience: number of validation epochs with no improvement
             after which training will be stopped. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
@@ -124,7 +124,7 @@ def on_sanity_check_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         self._validate_condition_metric(logs)
 
-    def on_epoch_end(self, trainer, pl_module):
+    def on_validation_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 9195b0869c237..77f393e8fd774 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -226,13 +226,17 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     def on_train_start(self, trainer, pl_module):
+        """
+        Determine model checkpoint save directory at runtime. References attributes from the
+        Trainer's logger to determine where to save checkpoints.
+        """
         if self.dirpath is None:
             self.filename = '{epoch}'
 
             ckpt_path = trainer.default_root_dir
             if trainer.logger is not None:
-                save_dir = (getattr(trainer.logger, 'save_dir', None) or
-                            getattr(trainer.logger, '_save_dir', None) or
+                save_dir = (getattr(trainer.logger, 'save_dir') or
+                            getattr(trainer.logger, '_save_dir') or
                             trainer.default_root_dir)
 
                 # weights_save_path overrides anything
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index e9fc8927090b0..f3679dddb9f4b 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -150,9 +150,6 @@ def lr_find(self,
         self.early_stop_callback = None
         self.enable_early_stop = False
 
-        # Accumulation of gradients
-        self.accumulate_grad_batches = num_accumulation_steps
-
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
         self.model = model
@@ -204,24 +201,19 @@ def __lr_finder_dump_params(self, model):
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
             'enable_early_stop': self.enable_early_stop,
-            'progress_bar_refresh_rate': self.progress_bar_refresh_rate,
-            'accumulate_grad_batches': self.accumulate_grad_batches,
-            'progress_bar_callback': self.progress_bar_callback,
             'configure_optimizers': model.configure_optimizers,
         }
 
-    def _lr_finder_restore_params(self, model):
-        self.auto_lr_find = self._params['auto_lr_find']
-        self.logger = self._params['logger']
-        self.callbacks = self._params['callbacks']
-        self.max_steps = self._params['max_steps']
-        self.progress_bar_refresh_rate = self._params['progress_bar_refresh_rate']
-        self.accumulate_grad_batches = self._params['accumulate_grad_batches']
-        self.checkpoint_callback = self._params['checkpoint_callback']
-        self.early_stop_callback = self._params['early_stop_callback']
-        self.enable_early_stop = self._params['enable_early_stop']
-        self.progress_bar_callback = self._params['progress_bar_callback']
-        model.configure_optimizers = self._params['configure_optimizers']
+    def __lr_finder_restore_params(self, model):
+        self.auto_lr_find = self.__dumped_params['auto_lr_find']
+        self.logger = self.__dumped_params['logger']
+        self.callbacks = self.__dumped_params['callbacks']
+        self.max_steps = self.__dumped_params['max_steps']
+        self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
+        self.early_stop_callback = self.__dumped_params['early_stop_callback']
+        self.enable_early_stop = self.__dumped_params['enable_early_stop']
+        model.configure_optimizers = self.__dumped_params['configure_optimizers']
+        del self.__dumped_params
 
 
 class _LRFinder(object):
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 48166d19ef90d..0660483fd7c7c 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -344,6 +344,9 @@ def __init__(
         # set default save path if user didn't provide one
         if default_root_dir is None or default_root_dir is True:
             # Backward compatibility, TODO: remove default_save_path in v0.8.0
+            if default_save_path:
+                rank_zero_warn("Argument `default_save_path` has been replaced by `default_root_dir`"
+                               " and will be removed in v0.8.0", DeprecationWarning)
             default_root_dir = default_save_path or os.getcwd()
         self.default_root_dir = default_root_dir
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 6894d3b3fbfe3..60d17dc742b95 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -809,14 +809,6 @@ def update_learning_rates(self, interval: str):
                 else:
                     lr_scheduler['scheduler'].step()
 
-    def call_checkpoint_callback(self):
-        if self.checkpoint_callback is not None:
-            self.checkpoint_callback.on_validation_end(self, self.get_model())
-
-    def call_early_stop_callback(self):
-        if self.early_stop_callback:
-            self.early_stop_callback.on_epoch_end(self, self.get_model())
-
 
 def _with_is_last(iterable):
     """Pass through values from the given iterable with an added boolean indicating if this is the last item.
diff --git a/tests/callbacks/test_learning_rate_logger.py b/tests/callbacks/test_learning_rate_logger.py
deleted file mode 100644
index 466d030f9a8c3..0000000000000
--- a/tests/callbacks/test_learning_rate_logger.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import tests.base.utils as tutils
-from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import LearningRateLogger
-from tests.base import EvalModelTemplate
-
-
-def test_lr_logger_single_lr(tmpdir):
-    """ Test that learning rates are extracted and logged for single lr scheduler"""
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__single_scheduler
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=5,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert results == 1
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of lr schedulers'
-    assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
-
-
-def test_lr_logger_multi_lrs(tmpdir):
-    """ Test that learning rates are extracted and logged for multi lr schedulers """
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__multiple_schedulers
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert results == 1
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of lr schedulers'
-    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
-
-
-def test_lr_logger_param_groups(tmpdir):
-    """ Test that learning rates are extracted and logged for single lr scheduler"""
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__param_groups
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=5,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of param groups'
-    assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
diff --git a/tests/callbacks/test_lr.py b/tests/callbacks/test_lr_logger.py
similarity index 100%
rename from tests/callbacks/test_lr.py
rename to tests/callbacks/test_lr_logger.py

From 52d60e89f3a0011353053a5b68d0694f1137cf51 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 1 Jun 2020 00:31:02 -0400
Subject: [PATCH 032/136] fixes from rebase

---
 pytorch_lightning/callbacks/early_stopping.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index fc940fc1390ee..6599cf634d0a7 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -76,8 +76,8 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.min_delta *= 1 if self.monitor_op == np.greater else -1
-        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
+        self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
 
     def _validate_condition_metric(self, logs):
         """

From f0843711ef800f56431dcc413053a3ae3d65582e Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:42:42 +0200
Subject: [PATCH 033/136] chlog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e74f9787ed0ab..c1fc1cc2fba3c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,6 +66,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed `save_weights_only` in ModelCheckpoint ([#1780](https://github.com/PyTorchLightning/pytorch-lightning/pull/1780))
 
+- Fixed for early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504))
+
 ## [0.7.6] - 2020-05-16
 
 ### Added

From 24f4dfe8270bb1a436783f8d080b1059df802d0d Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:47:36 +0200
Subject: [PATCH 034/136] docs

---
 pytorch_lightning/callbacks/early_stopping.py | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 6599cf634d0a7..6c804d71d0f32 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -59,7 +59,7 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.verbose = verbose
         self.strict = strict
         self.min_delta = min_delta
-        self.wait = 0
+        self.wait_count = 0
         self.stopped_epoch = 0
         self.mode = mode
 
@@ -77,13 +77,17 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
         self.min_delta *= 1 if self.monitor_op == torch.gt else -1
-        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
+        self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf
 
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
-        :param logs: callback metrics from validation output
-        :return: True if specified metric is available
+
+        Args:
+            logs: callback metrics from validation output
+
+        Return:
+             True if specified metric is available
         """
         monitor_val = logs.get(self.monitor)
         error_msg = (f'Early stopping conditioned on metric `{self.monitor}`'
@@ -107,17 +111,17 @@ def monitor_op(self):
 
     def state_dict(self):
         return {
-            'wait': self.wait,
+            'wait_count': self.wait_count,
             'stopped_epoch': self.stopped_epoch,
-            'best': self.best,
+            'best_score': self.best_score,
             'patience': self.patience
         }
 
     def load_state_dict(self, state_dict):
         state_dict = deepcopy(state_dict)
-        self.wait = state_dict['wait']
+        self.wait_count = state_dict['wait_count']
         self.stopped_epoch = state_dict['stopped_epoch']
-        self.best = state_dict['best']
+        self.best_score = state_dict['best_score']
         self.patience = state_dict['patience']
 
     def on_sanity_check_end(self, trainer, pl_module):
@@ -134,12 +138,12 @@ def on_validation_end(self, trainer, pl_module):
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
 
-        if self.monitor_op(current - self.min_delta, self.best):
-            self.best = current
-            self.wait = 0
+        if self.monitor_op(current - self.min_delta, self.best_score):
+            self.best_score = current
+            self.wait_count = 0
         else:
-            self.wait += 1
-            if self.wait >= self.patience:
+            self.wait_count += 1
+            if self.wait_count >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
 

From 2a0a9c26d5485af007d4464540e8b42a89ba90ea Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:52:34 +0200
Subject: [PATCH 035/136] reformat

---
 .../callbacks/model_checkpoint.py             | 55 ++++++++++---------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 77f393e8fd774..c31dc7a8dc2a7 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -230,34 +230,35 @@ def on_train_start(self, trainer, pl_module):
         Determine model checkpoint save directory at runtime. References attributes from the
         Trainer's logger to determine where to save checkpoints.
         """
-        if self.dirpath is None:
-            self.filename = '{epoch}'
-
-            ckpt_path = trainer.default_root_dir
-            if trainer.logger is not None:
-                save_dir = (getattr(trainer.logger, 'save_dir') or
-                            getattr(trainer.logger, '_save_dir') or
-                            trainer.default_root_dir)
-
-                # weights_save_path overrides anything
-                if trainer.weights_save_path is not None:
-                    save_dir = trainer.weights_save_path
-
-                version = trainer.logger.version if isinstance(
-                    trainer.logger.version, str) else f'version_{trainer.logger.version}'
-                ckpt_path = os.path.join(
-                    save_dir,
-                    trainer.logger.name,
-                    version,
-                    "checkpoints"
-                )
-            else:
-                ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
+        if self.dirpath is not None:
+            return # short circuit
+
+        self.filename = '{epoch}'
+
+        if trainer.logger is not None:
+            save_dir = (getattr(trainer.logger, 'save_dir') or
+                        getattr(trainer.logger, '_save_dir') or
+                        trainer.default_root_dir)
+
+            # weights_save_path overrides anything
+            if trainer.weights_save_path is not None:
+                save_dir = trainer.weights_save_path
+
+            version = trainer.logger.version if isinstance(
+                trainer.logger.version, str) else f'version_{trainer.logger.version}'
+            ckpt_path = os.path.join(
+                save_dir,
+                trainer.logger.name,
+                version,
+                "checkpoints"
+            )
+        else:
+            ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
 
-            self.dirpath = ckpt_path
-            os.makedirs(self.dirpath, exist_ok=True)
-            trainer.ckpt_path = ckpt_path
-            trainer.weights_save_path = self.dirpath
+        self.dirpath = ckpt_path
+        os.makedirs(self.dirpath, exist_ok=True)
+        trainer.ckpt_path = ckpt_path
+        trainer.weights_save_path = self.dirpath
 
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):

From 949d3e651145b7f9aea713359b8ddfa8608478e0 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 14:00:20 +0200
Subject: [PATCH 036/136] formatting

---
 pytorch_lightning/trainer/training_loop.py |  6 +++---
 tests/callbacks/test_model_checkpoint.py   | 18 ++++++++++--------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 60d17dc742b95..2d9d7b8d474f3 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -374,9 +374,9 @@ def _signal_kill_handler(*args):
                         self.run_training_teardown()
                         return
                     else:
-                        log.info(f'Trainer was signaled to stop but required minimum epochs '
-                                 f'({self.min_epochs}) or minimum steps ({self.min_steps}) has '
-                                 f'not been met. Training will continue...')
+                        log.info('Trainer was signaled to stop but required minimum epochs'
+                                 f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
+                                 'not been met. Training will continue...')
 
             self.run_training_teardown()
 
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 86744e3cef099..c8742dc14e103 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,11 +1,13 @@
+import pickle
+from pathlib import Path
+
 import pytest
 
 import tests.base.utils as tutils
-from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
-from pathlib import Path
 
 
 @pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
@@ -16,11 +18,12 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
 
     checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
 
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_pct=0.20,
-                      max_epochs=5
-                      )
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        checkpoint_callback=checkpoint,
+        overfit_pct=0.20,
+        max_epochs=5
+    )
     trainer.fit(model)
 
     # These should be different if the dirpath has be overridden
@@ -50,7 +53,6 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
 
 
 def test_pickling(tmpdir):
-    import pickle
     ckpt = ModelCheckpoint(tmpdir)
     ckpt_pickled = pickle.dumps(ckpt)
     ckpt_loaded = pickle.loads(ckpt_pickled)

From 668b2caec1f9b328e98d4221f5a5a1b9e11eb4f6 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 14:54:44 +0200
Subject: [PATCH 037/136] fix

---
 pytorch_lightning/trainer/training_loop.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 2d9d7b8d474f3..a48bc37a993c6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -256,7 +256,7 @@ def is_function_implemented(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
-    def run_evaluation(self, *args):
+    def run_evaluation(self, *args, **kwargs):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
@@ -464,7 +464,6 @@ def run_training_epoch(self):
             # fast_dev_run always forces val checking after train batch
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=self.testing)
-                self.call_checkpoint_callback()
 
             # ---------------
             # CHECKPOINTING, EARLY STOPPING

From 02914cfc23bb4cd6cb4c2c7bc2ec51c9ee8bdfb6 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 15:08:07 +0200
Subject: [PATCH 038/136] fix

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index c31dc7a8dc2a7..379d518eb45cd 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -236,8 +236,8 @@ def on_train_start(self, trainer, pl_module):
         self.filename = '{epoch}'
 
         if trainer.logger is not None:
-            save_dir = (getattr(trainer.logger, 'save_dir') or
-                        getattr(trainer.logger, '_save_dir') or
+            save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                        getattr(trainer.logger, '_save_dir', None) or
                         trainer.default_root_dir)
 
             # weights_save_path overrides anything

From 4837abeee3dc57ffa2af31a3e978d9b6544ce670 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 15 Jun 2020 20:10:41 -0400
Subject: [PATCH 039/136] fixes from rebase

---
 pytorch_lightning/trainer/training_io.py   | 2 +-
 pytorch_lightning/trainer/training_loop.py | 1 -
 tests/trainer/test_trainer.py              | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index fa99efc62366b..0ec92825c1636 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -94,7 +94,7 @@
 
 import pytorch_lightning
 from pytorch_lightning import _logger as log
-from pytorch_lightning.core.lightning import LightningModule, CHECKPOINT_KEY_MODULE_ARGS
+from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.overrides.data_parallel import (
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 8c9cf771d0b14..38c1aecdbb3a7 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -470,7 +470,6 @@ def run_training_epoch(self):
             # fast_dev_run always forces val checking after train batch
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=self.testing)
-                self.call_checkpoint_callback()
 
             # ---------------
             # CHECKPOINTING, EARLY STOPPING
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index e397f6f132d52..c05e0ae8f30f6 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -464,7 +464,7 @@ def test_trainer_min_steps_and_epochs(tmpdir):
         early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0),
         val_check_interval=2,
         min_epochs=1,
-        max_epochs=2
+        max_epochs=7
     )
 
     # define less min steps than 1 epoch

From a8a39d5312f7e37b2aec21453859a9dff4eb736e Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 15 Jun 2020 21:00:42 -0400
Subject: [PATCH 040/136] add new test for patience

---
 pytorch_lightning/callbacks/early_stopping.py |  8 ++-----
 tests/callbacks/test_early_stopping.py        | 24 +++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 06251236384de..788ac3e5d1294 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -132,7 +132,6 @@ def _run_early_stopping_check(self, trainer, pl_module):
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
 
-        stop_training = False
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
@@ -144,13 +143,10 @@ def _run_early_stopping_check(self, trainer, pl_module):
             self.wait += 1
             if self.wait >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
-                stop_training = True
-
-        if stop_training:
-            trainer.should_stop = True
+                trainer.should_stop = True
 
     def on_train_end(self, trainer, pl_module):
         if self.stopped_epoch > 0 and self.verbose > 0:
             rank_zero_warn('Displayed epoch numbers by `EarlyStopping` start from "1" until v0.6.x,'
                            ' but will start from "0" in v0.8.0.', DeprecationWarning)
-            log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping')
+            log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping triggered.')
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 165cda8ea2f37..8512ddeae995c 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -65,6 +65,30 @@ def on_train_end(self, trainer, pl_module):
     trainer.fit(model)
 
 
+@pytest.mark.parametrize('loss_values, patience, expected_stop_epoch', [
+    ([6, 5, 5, 5, 5, 5], 3, 5),
+    ([6, 5, 4, 4, 3, 3], 1, 4),
+    ([6, 5, 6, 5, 5, 5], 3, 5),
+])
+def test_early_stopping_patience(loss_values, patience, expected_stop_epoch):
+    """Test to ensure that early stopping is not triggered before patience is exhausted."""
+
+    class ModelOverrideValidationReturn(EvalModelTemplate):
+        validation_return_values = torch.Tensor(loss_values)
+        count = 0
+
+        def validation_epoch_end(self, outputs):
+            loss = self.validation_return_values[self.count]
+            self.count += 1
+            return {"test_val_loss": loss}
+
+    model = ModelOverrideValidationReturn()
+    early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0)
+    trainer.fit(model)
+    assert trainer.current_epoch + 1 == expected_stop_epoch
+
+
 def test_pickling(tmpdir):
     import pickle
     early_stopping = EarlyStopping()

From 053ce18e553c2c3015b31daa446084a711902aa9 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:02:03 -0400
Subject: [PATCH 041/136] Update
 pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/callbacks/model_checkpoint.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index ed4b0f8767e5d..c3fec967a6740 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -237,13 +237,13 @@ def on_train_start(self, trainer, pl_module):
         self.filename = '{epoch}'
 
         if trainer.logger is not None:
-            save_dir = (getattr(trainer.logger, 'save_dir', None) or
-                        getattr(trainer.logger, '_save_dir', None) or
-                        trainer.default_root_dir)
-
             # weights_save_path overrides anything
-            if trainer.weights_save_path is not None:
+            if getattr(trainer, 'weights_save_path', None) is not None:
                 save_dir = trainer.weights_save_path
+            else:
+                save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                            getattr(trainer.logger, '_save_dir', None) or
+                            trainer.default_root_dir)
 
             version = trainer.logger.version if isinstance(
                 trainer.logger.version, str) else f'version_{trainer.logger.version}'

From 5902d8263556368e220d13073bc1c58c9259d5d3 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:02:15 -0400
Subject: [PATCH 042/136] Update
 pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index c3fec967a6740..f22d606aebb0f 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -259,7 +259,7 @@ def on_train_start(self, trainer, pl_module):
         self.dirpath = ckpt_path
         os.makedirs(self.dirpath, exist_ok=True)
         trainer.ckpt_path = ckpt_path
-        trainer.weights_save_path = self.dirpath
+        trainer.weights_save_path = ckpt_path
 
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):

From 83a754df5f0ff9c9abf7a29a9faae10122984d08 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:03:25 -0400
Subject: [PATCH 043/136] Update tests/callbacks/test_early_stopping.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 tests/callbacks/test_early_stopping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 8512ddeae995c..1f0b203ccdea7 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -84,7 +84,7 @@ def validation_epoch_end(self, outputs):
 
     model = ModelOverrideValidationReturn()
     early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
-    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0, max_epochs=10)
     trainer.fit(model)
     assert trainer.current_epoch + 1 == expected_stop_epoch
 

From fa669ef7db97c69428ee42e30dc5c5be7d09f581 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 16 Jun 2020 21:53:53 -0400
Subject: [PATCH 044/136] fix formatting

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 3632cec0b8ef6..b3fde36a74373 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -232,7 +232,7 @@ def on_train_start(self, trainer, pl_module):
         Trainer's logger to determine where to save checkpoints.
         """
         if self.dirpath is not None:
-            return # short circuit
+            return  # short circuit
 
         self.filename = '{epoch}'
 

From 33f6e2d534107a507d6d27517970e89fda8f1050 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 16 Jun 2020 22:34:49 -0400
Subject: [PATCH 045/136] remove enable_early_stop attribute

---
 pytorch_lightning/trainer/callback_config.py | 4 ----
 pytorch_lightning/trainer/lr_finder.py       | 3 ---
 pytorch_lightning/trainer/training_loop.py   | 1 -
 pytorch_lightning/trainer/training_tricks.py | 3 ---
 tests/trainer/test_lr_finder.py              | 2 +-
 tests/trainer/test_trainer_tricks.py         | 1 -
 6 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 9806c3eb72e57..b65dc37ef8b1b 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -68,14 +68,10 @@ def configure_early_stopping(self, early_stop_callback):
                 verbose=True,
                 mode='min'
             )
-            # TODO remove this attribute
-            self.enable_early_stop = True
         elif not early_stop_callback:
             early_stop_callback = None
-            self.enable_early_stop = False
         else:
             early_stop_callback = early_stop_callback
-            self.enable_early_stop = True
         return early_stop_callback
 
     def configure_progress_bar(self, refresh_rate=1, process_position=0):
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index 96f38c86cb939..72228c81394ba 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -163,7 +163,6 @@ def lr_find(self,
         # Disable standard checkpoint & early stopping
         self.checkpoint_callback = False
         self.early_stop_callback = None
-        self.enable_early_stop = False
 
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
@@ -215,7 +214,6 @@ def __lr_finder_dump_params(self, model):
             'max_steps': self.max_steps,
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
-            'enable_early_stop': self.enable_early_stop,
             'configure_optimizers': model.configure_optimizers,
         }
 
@@ -226,7 +224,6 @@ def __lr_finder_restore_params(self, model):
         self.max_steps = self.__dumped_params['max_steps']
         self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
         self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
         model.configure_optimizers = self.__dumped_params['configure_optimizers']
         del self.__dumped_params
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 0497b107cc440..7f4b5de1325d6 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -208,7 +208,6 @@ class TrainerTrainLoopMixin(ABC):
     fast_dev_run: ...
     accumulation_scheduler: ...
     lr_schedulers: ...
-    enable_early_stop: ...
     early_stop_callback: ...
     callback_metrics: ...
     logger: Union[LightningLoggerBase, bool]
diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index 817215202992f..977bd51694ab9 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -188,7 +188,6 @@ def __scale_batch_dump_params(self):
             'callbacks': self.callbacks,
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
-            'enable_early_stop': self.enable_early_stop,
             'auto_scale_batch_size': self.auto_scale_batch_size,
             'train_percent_check': self.train_percent_check,
             'model': self.model,
@@ -202,7 +201,6 @@ def __scale_batch_reset_params(self, model, steps_per_trial):
         self.callbacks = []  # not needed before full run
         self.checkpoint_callback = False  # required for saving
         self.early_stop_callback = None
-        self.enable_early_stop = False
         self.train_percent_check = 1.0
         self.optimizers, self.schedulers = [], []  # required for saving
         self.model = model  # required for saving
@@ -215,7 +213,6 @@ def __scale_batch_restore_params(self):
         self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
         self.auto_scale_batch_size = self.__dumped_params['auto_scale_batch_size']
         self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
         self.train_percent_check = self.__dumped_params['train_percent_check']
         self.model = self.__dumped_params['model']
         del self.__dumped_params
diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py
index d0becff0918c6..66b4e1d2972de 100755
--- a/tests/trainer/test_lr_finder.py
+++ b/tests/trainer/test_lr_finder.py
@@ -57,7 +57,7 @@ def test_trainer_reset_correctly(tmpdir):
 
     changed_attributes = ['callbacks', 'logger', 'max_steps', 'auto_lr_find',
                           'early_stop_callback', 'accumulate_grad_batches',
-                          'enable_early_stop', 'checkpoint_callback']
+                          'checkpoint_callback']
     attributes_before = {}
     for ca in changed_attributes:
         attributes_before[ca] = getattr(trainer, ca)
diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py
index 973ed32e7cd92..99605443a67e8 100755
--- a/tests/trainer/test_trainer_tricks.py
+++ b/tests/trainer/test_trainer_tricks.py
@@ -48,7 +48,6 @@ def test_trainer_reset_correctly(tmpdir):
                           'callbacks',
                           'checkpoint_callback',
                           'early_stop_callback',
-                          'enable_early_stop',
                           'train_percent_check']
 
     attributes_before = {}

From 4e313354615503a724f3dcea0ae0664811ce887d Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 15 Apr 2020 23:44:54 -0400
Subject: [PATCH 046/136] add state_dict for early stopping

---
 pytorch_lightning/callbacks/early_stopping.py | 22 ++++++++++++++-----
 pytorch_lightning/trainer/training_io.py      | 11 ++++------
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 99a9bb073d787..ac2aca03614e8 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -5,6 +5,7 @@
 Monitor a validation metric and stop training when it stops improving.
 
 """
+from copy import deepcopy
 
 import numpy as np
 import torch
@@ -61,6 +62,7 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.wait = 0
         self.stopped_epoch = 0
         self.mode = mode
+        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
         if mode not in self.mode_dict:
             if self.verbose > 0:
@@ -103,11 +105,20 @@ def _validate_condition_metric(self, logs):
     def monitor_op(self):
         return self.mode_dict[self.mode]
 
-    def on_train_start(self, trainer, pl_module):
-        # Allow instances to be re-used
-        self.wait = 0
-        self.stopped_epoch = 0
-        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
+    def state_dict(self):
+        return {
+            'wait': self.wait,
+            'stopped_epoch': self.stopped_epoch,
+            'best': self.best,
+            'patience': self.patience
+        }
+
+    def load_state_dict(self, state_dict):
+        state_dict = deepcopy(state_dict)
+        self.wait = state_dict['wait']
+        self.stopped_epoch = state_dict['stopped_epoch']
+        self.best = state_dict['best']
+        self.patience = state_dict['patience']
 
     def on_validation_end(self, trainer, pl_module):
         return self._run_early_stopping_check(trainer, pl_module)
@@ -130,7 +141,6 @@ def _run_early_stopping_check(self, trainer, pl_module):
             if self.wait >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
-                self.on_train_end(trainer, pl_module)
 
         return stop_training
 
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index 6350ded8fc67f..e5f93a16b3192 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -334,21 +334,18 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
                 checkpoint['checkpoint_callback_best_model_path'] = self.checkpoint_callback.best_model_path
 
             if self.early_stop_callback:
-                checkpoint['early_stop_callback_wait'] = self.early_stop_callback.wait
-                checkpoint['early_stop_callback_patience'] = self.early_stop_callback.patience
+                checkpoint['early_stop_callback_state_dict'] = self.early_stop_callback.state_dict()
 
             # save optimizers
             optimizer_states = []
             for i, optimizer in enumerate(self.optimizers):
                 optimizer_states.append(optimizer.state_dict())
-
             checkpoint['optimizer_states'] = optimizer_states
 
             # save lr schedulers
             lr_schedulers = []
             for scheduler in self.lr_schedulers:
                 lr_schedulers.append(scheduler['scheduler'].state_dict())
-
             checkpoint['lr_schedulers'] = lr_schedulers
 
             # save native amp scaling
@@ -418,9 +415,9 @@ def restore_training_state(self, checkpoint):
                 self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best']
             self.checkpoint_callback.best_model_path = checkpoint['checkpoint_callback_best_model_path']
 
-        if self.early_stop_callback:
-            self.early_stop_callback.wait = checkpoint['early_stop_callback_wait']
-            self.early_stop_callback.patience = checkpoint['early_stop_callback_patience']
+        if self.early_stop_callback is not None and self.early_stop_callback is not False:
+            state = checkpoint['early_stop_callback_state_dict']
+            self.early_stop_callback.load_state_dict(state)
 
         self.global_step = checkpoint['global_step']
         self.current_epoch = checkpoint['epoch']

From 976335193c708f7ac56a9e8123cd506b28a69525 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 15 Apr 2020 23:57:59 -0400
Subject: [PATCH 047/136] move best attr after monitor_op defined

---
 pytorch_lightning/callbacks/early_stopping.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index ac2aca03614e8..4133c3822efc2 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -62,7 +62,6 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.wait = 0
         self.stopped_epoch = 0
         self.mode = mode
-        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
         if mode not in self.mode_dict:
             if self.verbose > 0:
@@ -77,7 +76,9 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+        self.monitor_op = self.mode_dict[mode]
+        self.min_delta *= 1 if self.monitor_op == np.greater else -1
+        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
     def _validate_condition_metric(self, logs):
         """

From 0f4fc5f800833256541f3bd174c49163e0ca21ab Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 12:21:58 -0400
Subject: [PATCH 048/136] improve early stopping and model checkpoint callbacks

---
 docs/source/callbacks.rst                     | 12 ++++
 pytorch_lightning/callbacks/early_stopping.py | 48 +++++++++++++---
 pytorch_lightning/trainer/callback_config.py  | 56 +++++++++++--------
 pytorch_lightning/trainer/trainer.py          | 30 +++++++---
 pytorch_lightning/trainer/training_loop.py    | 45 ++++++++-------
 5 files changed, 132 insertions(+), 59 deletions(-)

diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
index 744c1f0c5edd6..3a6b1e84acdd6 100644
--- a/docs/source/callbacks.rst
+++ b/docs/source/callbacks.rst
@@ -46,6 +46,18 @@ Example:
 We successfully extended functionality without polluting our super clean
 :class:`~pytorch_lightning.core.LightningModule` research code.
 
+
+Best Practices
+==============
+
+1. Callbacks should be isolated in their functionality. Your callback should not rely on 
+ the presence of other callbacks in order to work properly.
+2. Do not manually call methods from the callback. The callbacks are designed to be
+ invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
+ is strongly discouraged.
+3. Whenever possible, your callbacks should not depend on the order in which they are executed.
+
+
 ---------
 
 .. automodule:: pytorch_lightning.callbacks.base
diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 4133c3822efc2..9fcb6bc614502 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,8 +26,10 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of validation epochs with no improvement
-            after which training will be stopped. Default: ``0``.
+        patience: number of passes through the validation set
+            with no improvement after which training will be stopped. 
+            This will usually correspond with epochs but may vary depending
+            on how often you have configured to check validation. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
             training will stop when the quantity
@@ -76,15 +78,29 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.monitor_op = self.mode_dict[mode]
         self.min_delta *= 1 if self.monitor_op == np.greater else -1
         self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
+    def state_dict(self):
+        return {
+            'wait': self.wait,
+            'stopped_epoch': self.stopped_epoch,
+            'best': self.best,
+            'patience': self.patience
+        }
+
+    def load_state_dict(self, state_dict):
+        state_dict = deepcopy(state_dict)
+        self.wait = state_dict['wait']
+        self.stopped_epoch = state_dict['stopped_epoch']
+        self.best = state_dict['best']
+        self.patience = state_dict['patience']
+
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
-        :param logs:
-        :return:
+        :param logs: callback metrics from validation output
+        :return: True if specified metric is available
         """
         monitor_val = logs.get(self.monitor)
         error_msg = (f'Early stopping conditioned on metric `{self.monitor}`'
@@ -121,14 +137,29 @@ def load_state_dict(self, state_dict):
         self.best = state_dict['best']
         self.patience = state_dict['patience']
 
+    def on_train_start(self, trainer, pl_module):
+        if not (
+            trainer.is_overriden("validation_step") and
+            trainer.is_overriden("validation_epoch_end")
+        ):
+            error_msg = (f'''
+                Early stopping is expecting metrics to be returned from
+                validation but the Lightning model does not have a validation loop
+                defined with logging. Please ensure that your LightningModule has
+                both `validation_step` and `validation_epoch_end` defined.
+            ''')
+            if self.strict:
+                raise RuntimeError(error_msg)
+            if self.verbose > 0:
+                rank_zero_warn(error_msg, RuntimeWarning)
+
     def on_validation_end(self, trainer, pl_module):
         return self._run_early_stopping_check(trainer, pl_module)
 
     def _run_early_stopping_check(self, trainer, pl_module):
         logs = trainer.callback_metrics
-        stop_training = False
         if not self._validate_condition_metric(logs):
-            return stop_training
+            return  # short circuit if metric not present
 
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
@@ -143,7 +174,8 @@ def _run_early_stopping_check(self, trainer, pl_module):
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
 
-        return stop_training
+        if stop_training:
+            trainer.should_stop = True
 
     def on_train_end(self, trainer, pl_module):
         if self.stopped_epoch > 0 and self.verbose > 0:
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 5e490a106826b..77c6ead2a0a10 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -32,24 +32,28 @@ def save_checkpoint(self, *args):
     def is_overridden(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
-    def configure_checkpoint_callback(self):
+    def configure_checkpoint_callback(self,
+                                      checkpoint_callback,
+                                      default_root_dir,
+                                      logger,
+                                      weights_save_path):
         """
         Weight path set in this priority:
         Checkpoint_callback's path (if passed in).
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = self.default_root_dir
-        if self.checkpoint_callback:
+        ckpt_path = default_root_dir
+        if checkpoint_callback:
             # init a default one
-            if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None) or
-                            getattr(self.logger, '_save_dir', None) or
-                            self.default_root_dir)
+            if logger is not None:
+                save_dir = (getattr(logger, 'save_dir', None) or
+                            getattr(logger, '_save_dir', None) or
+                            default_root_dir)
 
                 # weights_save_path overrides anything
-                if self.weights_save_path is not None:
-                    save_dir = self.weights_save_path
+                if weights_save_path is not None:
+                    save_dir = weights_save_path
 
                 version = self.logger.version if isinstance(
                     self.logger.version, str) else f'version_{self.logger.version}'
@@ -60,32 +64,32 @@ def configure_checkpoint_callback(self):
                     "checkpoints"
                 )
             else:
-                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
+                ckpt_path = os.path.join(default_root_dir, "checkpoints")
 
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
             monitor_key = 'loss' if train_step_only else 'val_loss'
 
-            if self.checkpoint_callback is True:
+            if checkpoint_callback is True:
                 os.makedirs(ckpt_path, exist_ok=True)
-                self.checkpoint_callback = ModelCheckpoint(
+                checkpoint_callback = ModelCheckpoint(
                     filepath=ckpt_path,
                     monitor=monitor_key
                 )
             # If user specified None in filepath, override with runtime default
-            elif isinstance(self.checkpoint_callback, ModelCheckpoint) \
-                    and self.checkpoint_callback.dirpath is None:
-                self.checkpoint_callback.dirpath = ckpt_path
-                self.checkpoint_callback.filename = '{epoch}'
-                os.makedirs(self.checkpoint_callback.dirpath, exist_ok=True)
-        elif self.checkpoint_callback is False:
-            self.checkpoint_callback = None
+            elif isinstance(checkpoint_callback, ModelCheckpoint) \
+                    and checkpoint_callback.dirpath is None:
+                checkpoint_callback.dirpath = ckpt_path
+                checkpoint_callback.filename = '{epoch}'
+                os.makedirs(checkpoint_callback.dirpath, exist_ok=True)
+        elif checkpoint_callback is False:
+            checkpoint_callback = None
 
         self.ckpt_path = ckpt_path
 
-        if self.checkpoint_callback:
+        if checkpoint_callback:
             # set the path for the callbacks
-            self.checkpoint_callback.save_function = self.save_checkpoint
+            checkpoint_callback.save_function = self.save_checkpoint
 
             # if checkpoint callback used, then override the weights path
             self.weights_save_path = self.checkpoint_callback.dirpath
@@ -94,22 +98,26 @@ def configure_checkpoint_callback(self):
         if self.weights_save_path is None:
             self.weights_save_path = self.default_root_dir
 
+        return checkpoint_callback
+
     def configure_early_stopping(self, early_stop_callback):
         if early_stop_callback is True or None:
-            self.early_stop_callback = EarlyStopping(
+            early_stop_callback = EarlyStopping(
                 monitor='val_loss',
                 patience=3,
                 strict=True,
                 verbose=True,
                 mode='min'
             )
+            # TODO remove this attribute
             self.enable_early_stop = True
         elif not early_stop_callback:
-            self.early_stop_callback = None
+            early_stop_callback = None
             self.enable_early_stop = False
         else:
-            self.early_stop_callback = early_stop_callback
+            early_stop_callback = early_stop_callback
             self.enable_early_stop = True
+        return early_stop_callback
 
     def configure_progress_bar(self, refresh_rate=1, process_position=0):
         progress_bars = [c for c in self.callbacks if isinstance(c, ProgressBarBase)]
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 3112cc305460e..d2607eeddcac5 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -302,6 +302,27 @@ def __init__(
         self.callbacks = callbacks or []
         self.on_init_start()
 
+        # configure early stop callback
+        # creates a default one if none passed in
+        early_stop_callback = self.configure_early_stopping(early_stop_callback)
+        if early_stop_callback:
+            self.callbacks.append(early_stop_callback)
+
+        # configure checkpoint callback
+        # it is important that this is the last callback to run
+        # pass through the required args to figure out defaults
+        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
+                                                                 default_root_dir,
+                                                                 logger,
+                                                                 weights_save_path)
+        if checkpoint_callback:
+            self.callbacks.append(checkpoint_callback)
+
+        # TODO clean this up and follow same pattern as early_stop_callback
+        # configure checkpoint callback
+        self.checkpoint_callback = checkpoint_callback
+        self.weights_save_path = weights_save_path
+
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -400,6 +421,7 @@ def __init__(
         self.global_step = 0
         self.current_epoch = 0
         self.interrupted = False
+        self.should_stop = True
 
         # configure logger
         self.configure_logger(logger)
@@ -409,14 +431,6 @@ def __init__(
             profiler = SimpleProfiler()
         self.profiler = profiler or PassThroughProfiler()
 
-        # configure early stop callback
-        # creates a default one if none passed in
-        self.configure_early_stopping(early_stop_callback)
-
-        # configure checkpoint callback
-        self.checkpoint_callback = checkpoint_callback
-        self.weights_save_path = weights_save_path
-
         # accumulated grads
         self.accumulate_grad_batches = accumulate_grad_batches
         self.configure_accumulated_gradients(accumulate_grad_batches)
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index ed98192eaa260..fcf7cfffea1de 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -149,6 +149,7 @@ def training_step(self, batch, batch_idx):
 
 import numpy as np
 import torch
+import subprocess
 from torch.utils.data import DataLoader
 
 from pytorch_lightning import _logger as log
@@ -158,7 +159,7 @@ def training_step(self, batch, batch_idx):
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
 from pytorch_lightning.utilities import rank_zero_warn
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-import subprocess
+
 
 try:
     from apex import amp
@@ -380,14 +381,15 @@ def train(self):
                 # early stopping as a (new Callback) class doesn't yet work because we have to know these
                 # trainer flags including the current epoch stuff
                 # all of this needs to go into the early stopping to clean up better
-                if self.enable_early_stop:
-                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
-                        should_stop = self.early_stop_callback.on_validation_end(self, self.get_model())
-                        # stop training
-                        stop = should_stop and met_min_epochs
-                        if stop:
-                            self.run_training_teardown()
-                            return
+                if self.should_stop:
+                    # Question: didn't understand the check about self.fast_dev_run
+                    if met_min_epochs and met_min_steps:
+                        self.run_training_teardown()
+                        return
+                    else:
+                        log.info(f'Trainer was signaled to stop but required minimum epochs'
+                                 f' ({self.min_epochs}) or minimum steps ({self.min_steps}) has'
+                                 ' not been met. Training will continue...')
 
             self.run_training_teardown()
 
@@ -445,8 +447,7 @@ def run_training_epoch(self):
             # ---------------
             # RUN TRAIN STEP
             # ---------------
-            _outputs = self.run_training_batch(batch, batch_idx)
-            batch_result, grad_norm_dic, batch_step_metrics, batch_output = _outputs
+            batch_result, grad_norm_dic, batch_step_metrics, batch_output = self.run_training_batch(batch, batch_idx)
 
             # only track outputs when user implements training_epoch_end
             # otherwise we will build up unnecessary memory
@@ -454,7 +455,8 @@ def run_training_epoch(self):
                 outputs.append(batch_output)
 
             # when returning -1 from train_step, we end epoch early
-            early_stop_epoch = batch_result == -1
+            if batch_result == -1:
+                self.should_stop = True
 
             # TODO: consolidate all actions that need to take place only after
             # self.accumulate_grad_batches steps (optimizer step, lr update, global step increment)
@@ -468,26 +470,27 @@ def run_training_epoch(self):
             is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
             can_check_epoch = self.current_epoch % self.check_val_every_n_epoch == 0
             can_check_val = not self.disable_validation and can_check_epoch
-            should_check_val = is_val_check_batch or early_stop_epoch
+            should_check_val = is_val_check_batch or self.should_stop
             should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
             should_check_val = can_check_val and should_check_val
 
-            # ---------------
-            # CHECKPOINTING, EARLY STOPPING
-            # ---------------
             # fast_dev_run always forces val checking after train batch
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=self.testing)
                 self.call_checkpoint_callback()
 
+            # ---------------
+            # CHECKPOINTING, EARLY STOPPING
+            # ---------------
+
             # when logs should be saved
-            should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or early_stop_epoch
+            should_save_log = (batch_idx + 1) % self.log_save_interval == 0 or self.should_stop
             if should_save_log or self.fast_dev_run:
                 if self.is_global_zero and self.logger is not None:
                     self.logger.save()
 
             # when metrics should be logged
-            should_log_metrics = batch_idx % self.row_log_interval == 0 or early_stop_epoch
+            should_log_metrics = batch_idx % self.row_log_interval == 0 or self.should_stop
             if should_log_metrics or self.fast_dev_run:
                 # logs user requested information to logger
                 self.log_metrics(batch_step_metrics, grad_norm_dic)
@@ -504,7 +507,7 @@ def run_training_epoch(self):
             # end epoch early
             # stop when the flag is changed or we've gone past the amount
             # requested in the batches
-            if early_stop_epoch or self.fast_dev_run:
+            if self.fast_dev_run or self.should_stop:
                 break
 
         if self.use_horovod:
@@ -821,6 +824,10 @@ def call_checkpoint_callback(self):
         if self.checkpoint_callback is not None:
             self.checkpoint_callback.on_validation_end(self, self.get_model())
 
+    def call_early_stop_callback(self):
+        if self.early_stop_callback:
+            self.early_stop_callback.on_epoch_end(self, self.get_model())
+
 
 def _with_is_last(iterable):
     """Pass through values from the given iterable with an added boolean indicating if this is the last item.

From 1a24c81b26799d0e7a2f8d4d5fa261718870de7b Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 12:25:30 -0400
Subject: [PATCH 049/136] fix formatting

---
 pytorch_lightning/callbacks/early_stopping.py | 6 +++---
 pytorch_lightning/trainer/callback_config.py  | 6 +++---
 pytorch_lightning/trainer/training_loop.py    | 5 ++++-
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 9fcb6bc614502..47282c6bec8be 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -27,7 +27,7 @@ class EarlyStopping(Callback):
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
         patience: number of passes through the validation set
-            with no improvement after which training will be stopped. 
+            with no improvement after which training will be stopped.
             This will usually correspond with epochs but may vary depending
             on how often you have configured to check validation. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
@@ -139,8 +139,8 @@ def load_state_dict(self, state_dict):
 
     def on_train_start(self, trainer, pl_module):
         if not (
-            trainer.is_overriden("validation_step") and
-            trainer.is_overriden("validation_epoch_end")
+            trainer.is_overriden("validation_step") 
+            and trainer.is_overriden("validation_epoch_end")
         ):
             error_msg = (f'''
                 Early stopping is expecting metrics to be returned from
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 77c6ead2a0a10..c83290f7290ad 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -47,9 +47,9 @@ def configure_checkpoint_callback(self,
         if checkpoint_callback:
             # init a default one
             if logger is not None:
-                save_dir = (getattr(logger, 'save_dir', None) or
-                            getattr(logger, '_save_dir', None) or
-                            default_root_dir)
+                save_dir = (getattr(logger, 'save_dir', None)
+                            or getattr(logger, '_save_dir', None)
+                            or default_root_dir)
 
                 # weights_save_path overrides anything
                 if weights_save_path is not None:
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index fcf7cfffea1de..c1a6eb11d9090 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -447,7 +447,10 @@ def run_training_epoch(self):
             # ---------------
             # RUN TRAIN STEP
             # ---------------
-            batch_result, grad_norm_dic, batch_step_metrics, batch_output = self.run_training_batch(batch, batch_idx)
+            (batch_result,
+             grad_norm_dic,
+             batch_step_metrics,
+             batch_output) = self.run_training_batch(batch, batch_idx)
 
             # only track outputs when user implements training_epoch_end
             # otherwise we will build up unnecessary memory

From bd0d23aa5320a85f6a55eee13c8d3d2966ebae0e Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 13:20:44 -0400
Subject: [PATCH 050/136] fix attr init order

---
 pytorch_lightning/trainer/trainer.py | 30 ++++++++++++----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index d2607eeddcac5..4440c26e3b618 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -297,6 +297,15 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
+        # configure logger
+        self.configure_logger(logger)
+
+        # set default save path if user didn't provide one
+        self.default_root_dir = default_root_dir
+
+        if self.default_root_dir is None:
+            self.default_root_dir = os.getcwd()
+
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []
@@ -311,18 +320,14 @@ def __init__(
         # configure checkpoint callback
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
+        self.weights_save_path = weights_save_path
         checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
-                                                                 default_root_dir,
-                                                                 logger,
-                                                                 weights_save_path)
+                                                                 self.default_root_dir,
+                                                                 self.logger,
+                                                                 self.weights_save_path)
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
-        # TODO clean this up and follow same pattern as early_stop_callback
-        # configure checkpoint callback
-        self.checkpoint_callback = checkpoint_callback
-        self.weights_save_path = weights_save_path
-
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -392,12 +397,6 @@ def __init__(
             rank_zero_info('Running in fast_dev_run mode: will run a full train,'
                            ' val and test loop using a single batch')
 
-        # set default save path if user didn't provide one
-        self.default_root_dir = default_root_dir
-
-        if self.default_root_dir is None:
-            self.default_root_dir = os.getcwd()
-
         # training bookeeping
         self.total_batch_idx = 0
         self.running_loss = TensorRunningAccum(window_length=20)
@@ -423,9 +422,6 @@ def __init__(
         self.interrupted = False
         self.should_stop = True
 
-        # configure logger
-        self.configure_logger(logger)
-
         # configure profiler
         if profiler is True:
             profiler = SimpleProfiler()

From fb8c858823a206d3a52416b4ff2267b183949780 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 14:37:07 -0400
Subject: [PATCH 051/136] clean up setting of default_root_dir attr

---
 pytorch_lightning/trainer/trainer.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 4440c26e3b618..f4aeed8f6a2bd 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -301,11 +301,10 @@ def __init__(
         self.configure_logger(logger)
 
         # set default save path if user didn't provide one
+        if default_root_dir is None:
+            default_root_dir = os.getcwd()
         self.default_root_dir = default_root_dir
 
-        if self.default_root_dir is None:
-            self.default_root_dir = os.getcwd()
-
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []

From 892bff3fbf1a26fe933bfbc55ec851f4a3dd8ddb Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 15:11:02 -0400
Subject: [PATCH 052/136] logger needs default root dir set first

---
 pytorch_lightning/trainer/trainer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index f4aeed8f6a2bd..b0cdc5f8b0f68 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -297,14 +297,14 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
-        # configure logger
-        self.configure_logger(logger)
-
         # set default save path if user didn't provide one
         if default_root_dir is None:
             default_root_dir = os.getcwd()
         self.default_root_dir = default_root_dir
 
+        # configure logger
+        self.configure_logger(logger)
+
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []

From 368787d68ef9fc14ff846d6a7fd5a3cd49929a6e Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 15:20:42 -0400
Subject: [PATCH 053/136] reorg trainer init

---
 pytorch_lightning/trainer/trainer.py | 53 ++++++++++++++--------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b0cdc5f8b0f68..7f599aac1437a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -297,6 +297,31 @@ def __init__(
             # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
             os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
 
+        # training bookeeping
+        self.total_batch_idx = 0
+        self.running_loss = TensorRunningAccum(window_length=20)
+        self.batch_idx = 0
+        self.progress_bar_metrics = {}
+        self.callback_metrics = {}
+        self.num_val_batches = 0
+        self.num_training_batches = 0
+        self.num_test_batches = 0
+        self.train_dataloader = None
+        self.test_dataloaders = None
+        self.val_dataloaders = None
+
+        # training state
+        self.model = None
+        self.testing = False
+        self.disable_validation = False
+        self.lr_schedulers = []
+        self.optimizers = None
+        self.optimizer_frequencies = []
+        self.global_step = 0
+        self.current_epoch = 0
+        self.interrupted = False
+        self.should_stop = True
+
         # set default save path if user didn't provide one
         if default_root_dir is None:
             default_root_dir = os.getcwd()
@@ -308,7 +333,6 @@ def __init__(
         # Init callbacks
         self.prepare_data_per_node = prepare_data_per_node
         self.callbacks = callbacks or []
-        self.on_init_start()
 
         # configure early stop callback
         # creates a default one if none passed in
@@ -327,6 +351,8 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
+        self.on_init_start()
+
         # benchmarking
         self.benchmark = benchmark
         torch.backends.cudnn.benchmark = self.benchmark
@@ -396,31 +422,6 @@ def __init__(
             rank_zero_info('Running in fast_dev_run mode: will run a full train,'
                            ' val and test loop using a single batch')
 
-        # training bookeeping
-        self.total_batch_idx = 0
-        self.running_loss = TensorRunningAccum(window_length=20)
-        self.batch_idx = 0
-        self.progress_bar_metrics = {}
-        self.callback_metrics = {}
-        self.num_val_batches = 0
-        self.num_training_batches = 0
-        self.num_test_batches = 0
-        self.train_dataloader = None
-        self.test_dataloaders = None
-        self.val_dataloaders = None
-
-        # training state
-        self.model = None
-        self.testing = False
-        self.disable_validation = False
-        self.lr_schedulers = []
-        self.optimizers = None
-        self.optimizer_frequencies = []
-        self.global_step = 0
-        self.current_epoch = 0
-        self.interrupted = False
-        self.should_stop = True
-
         # configure profiler
         if profiler is True:
             profiler = SimpleProfiler()

From 191f3e8612165853f4b53e44b5559e2beca62774 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 20:52:33 -0400
Subject: [PATCH 054/136] remove direct references to checkpoint callback

---
 .../trainer/distrib_data_parallel.py          |  1 -
 pytorch_lightning/trainer/lr_finder.py        |  6 ++++
 pytorch_lightning/trainer/trainer.py          |  6 ++--
 pytorch_lightning/trainer/training_io.py      | 32 +++++++++++++------
 pytorch_lightning/trainer/training_loop.py    |  1 -
 5 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/pytorch_lightning/trainer/distrib_data_parallel.py b/pytorch_lightning/trainer/distrib_data_parallel.py
index d2b190ca06279..687c88c160619 100644
--- a/pytorch_lightning/trainer/distrib_data_parallel.py
+++ b/pytorch_lightning/trainer/distrib_data_parallel.py
@@ -153,7 +153,6 @@ class TrainerDDPMixin(ABC):
     num_gpu_nodes: int
     gpus: List[int]
     logger: Union[LightningLoggerBase, bool]
-    checkpoint_callback: Union[ModelCheckpoint, bool]
     data_parallel_device_ids: ...
     distributed_backend: Optional[str]
     amp_level: str
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index 96f38c86cb939..d332aba2e2b9a 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -165,6 +165,9 @@ def lr_find(self,
         self.early_stop_callback = None
         self.enable_early_stop = False
 
+        # Accumulation of gradients
+        self.accumulate_grad_batches = num_accumulation_steps
+
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
         self.model = model
@@ -216,6 +219,9 @@ def __lr_finder_dump_params(self, model):
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
             'enable_early_stop': self.enable_early_stop,
+            'progress_bar_refresh_rate': self.progress_bar_refresh_rate,
+            'accumulate_grad_batches': self.accumulate_grad_batches,
+            'progress_bar_callback': self.progress_bar_callback,
             'configure_optimizers': model.configure_optimizers,
         }
 
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 7f599aac1437a..e1b6f0b09ac79 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -351,6 +351,9 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
+        # TODO refactor codebase (tests) to not directly reach into this callback
+        self.checkpoint_callback = checkpoint_callback
+
         self.on_init_start()
 
         # benchmarking
@@ -968,9 +971,6 @@ def run_pretrain_routine(self, model: LightningModule):
         # if cluster resets state, the model will update with the saved weights
         self.model = model
 
-        # set up checkpoint callback
-        self.configure_checkpoint_callback()
-
         # restore training and model before hpc call
         self.restore_weights(model)
 
diff --git a/pytorch_lightning/trainer/training_io.py b/pytorch_lightning/trainer/training_io.py
index e5f93a16b3192..22c08b1c87e62 100644
--- a/pytorch_lightning/trainer/training_io.py
+++ b/pytorch_lightning/trainer/training_io.py
@@ -95,6 +95,7 @@
 import pytorch_lightning
 from pytorch_lightning import _logger as log
 from pytorch_lightning.core.lightning import LightningModule
+from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.overrides.data_parallel import (
     LightningDistributedDataParallel,
@@ -329,12 +330,21 @@ def dump_checkpoint(self, weights_only: bool = False) -> dict:
         }
 
         if not weights_only:
-            if self.checkpoint_callback:
+
+            # TODO support more generic way for callbacks to persist a state_dict in a checkpoint
+            checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+            early_stopping_callbacks = [c for c in self.callbacks if isinstance(c, EarlyStopping)]
+
+            if checkpoint_callbacks:
+                # we add the official checkpoint callback to the end of the list
+                # extra user provided callbacks will not be persisted yet
                 checkpoint['checkpoint_callback_best_model_score'] = self.checkpoint_callback.best_model_score
                 checkpoint['checkpoint_callback_best_model_path'] = self.checkpoint_callback.best_model_path
 
-            if self.early_stop_callback:
-                checkpoint['early_stop_callback_state_dict'] = self.early_stop_callback.state_dict()
+            if early_stopping_callbacks and checkpoint_callbacks:
+                # we add the official early stopping callback to the end of the list
+                # extra user provided callbacks will not be persisted yet
+                checkpoint['early_stop_callback_state_dict'] = early_stopping_callbacks[-1].state_dict()
 
             # save optimizers
             optimizer_states = []
@@ -403,21 +413,25 @@ def restore_training_state(self, checkpoint):
                 ' This is probably due to `ModelCheckpoint.save_weights_only` being set to `True`.'
             )
 
-        if self.checkpoint_callback:
+        # TODO support more generic way for callbacks to load callback state_dicts
+        checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+        early_stopping_callbacks = [c for c in self.callbacks if isinstance(c, EarlyStopping)]
+
+        if checkpoint_callbacks:
             if 'checkpoint_callback_best_model_score' in checkpoint:
-                self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best_model_score']
+                checkpoint_callbacks[-1].best_model_score = checkpoint['checkpoint_callback_best_model_score']
             else:
                 # Old naming until version 0.7.6
                 rank_zero_warn(
                     'Loading a checkpoint created with an old version of Lightning; '
                     'this will not be supported in the future.'
                 )
-                self.checkpoint_callback.best_model_score = checkpoint['checkpoint_callback_best']
-            self.checkpoint_callback.best_model_path = checkpoint['checkpoint_callback_best_model_path']
+                checkpoint_callbacks[-1].best_model_score = checkpoint['checkpoint_callback_best']
+            checkpoint_callbacks[-1].best_model_path = checkpoint['checkpoint_callback_best_model_path']
 
-        if self.early_stop_callback is not None and self.early_stop_callback is not False:
+        if early_stopping_callbacks:
             state = checkpoint['early_stop_callback_state_dict']
-            self.early_stop_callback.load_state_dict(state)
+            early_stopping_callbacks[-1].load_state_dict(state)
 
         self.global_step = checkpoint['global_step']
         self.current_epoch = checkpoint['epoch']
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index c1a6eb11d9090..aec1f103eb346 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -235,7 +235,6 @@ class TrainerTrainLoopMixin(ABC):
     max_steps: int
     min_steps: int
     total_batch_idx: int
-    checkpoint_callback: ...
     terminate_on_nan: bool
     tpu_id: int
     interactive_ddp_procs: ...

From b0a0b226e06fbeb345ad5066d8e3fdf5842b725c Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 22:57:04 -0400
Subject: [PATCH 055/136] more fixes

---
 pytorch_lightning/callbacks/early_stopping.py | 21 +++++--------------
 pytorch_lightning/trainer/callback_config.py  |  2 +-
 pytorch_lightning/trainer/trainer.py          |  4 ----
 pytorch_lightning/trainer/training_loop.py    |  3 ---
 4 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 47282c6bec8be..70c7a87c990f2 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -81,21 +81,6 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.min_delta *= 1 if self.monitor_op == np.greater else -1
         self.best = np.Inf if self.monitor_op == np.less else -np.Inf
 
-    def state_dict(self):
-        return {
-            'wait': self.wait,
-            'stopped_epoch': self.stopped_epoch,
-            'best': self.best,
-            'patience': self.patience
-        }
-
-    def load_state_dict(self, state_dict):
-        state_dict = deepcopy(state_dict)
-        self.wait = state_dict['wait']
-        self.stopped_epoch = state_dict['stopped_epoch']
-        self.best = state_dict['best']
-        self.patience = state_dict['patience']
-
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
@@ -137,9 +122,13 @@ def load_state_dict(self, state_dict):
         self.best = state_dict['best']
         self.patience = state_dict['patience']
 
+    def on_sanity_check_end(self, trainer, pl_module):
+        logs = trainer.callback_metrics
+        self._validate_condition_metric(logs)
+
     def on_train_start(self, trainer, pl_module):
         if not (
-            trainer.is_overriden("validation_step") 
+            trainer.is_overriden("validation_step")
             and trainer.is_overriden("validation_epoch_end")
         ):
             error_msg = (f'''
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index c83290f7290ad..f1aaf253b99e4 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -92,7 +92,7 @@ def configure_checkpoint_callback(self,
             checkpoint_callback.save_function = self.save_checkpoint
 
             # if checkpoint callback used, then override the weights path
-            self.weights_save_path = self.checkpoint_callback.dirpath
+            self.weights_save_path = checkpoint_callback.dirpath
 
         # if weights_save_path is still none here, set to current working dir
         if self.weights_save_path is None:
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index e1b6f0b09ac79..a68ec064a9da8 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1002,10 +1002,6 @@ def run_pretrain_routine(self, model: LightningModule):
 
             self.on_sanity_check_end()
 
-            # verify that early stop has conditioned on a metric that exists
-            if self.enable_early_stop:
-                self.early_stop_callback._validate_condition_metric(callback_metrics)
-
         # clear cache before training
         if self.on_gpu and self.root_gpu is not None:
             # use context because of:
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index aec1f103eb346..e522e28a7a9d4 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -329,9 +329,6 @@ def train(self):
         with self.profiler.profile('on_train_start'):
             # callbacks
             self.on_train_start()
-            # initialize early stop callback
-            if self.early_stop_callback is not None:
-                self.early_stop_callback.on_train_start(self, self.get_model())
             # model hooks
             model.on_train_start()
 

From 3fe257e2cef385c54b3d577be036a444012c768e Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 25 Apr 2020 23:52:56 -0400
Subject: [PATCH 056/136] more bugfixes

---
 pytorch_lightning/callbacks/early_stopping.py | 3 ++-
 pytorch_lightning/trainer/logging.py          | 2 +-
 pytorch_lightning/trainer/trainer.py          | 2 +-
 pytorch_lightning/trainer/training_loop.py    | 9 +++++++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 70c7a87c990f2..085b5bd250d30 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -54,7 +54,7 @@ class EarlyStopping(Callback):
     }
 
     def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3,
-                 verbose: bool = False, mode: str = 'auto', strict: bool = True):
+                 verbose: bool = False, mode: str = 'auto', strict: bool = False):
         super().__init__()
         self.monitor = monitor
         self.patience = patience
@@ -150,6 +150,7 @@ def _run_early_stopping_check(self, trainer, pl_module):
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
 
+        stop_training = False
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index 5349849e09b89..d257db50996d0 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -29,7 +29,7 @@ def configure_logger(self, logger):
         if logger is True:
             # default logger
             self.logger = TensorBoardLogger(
-                save_dir=self.default_root_dir,
+                save_dir=str(self.default_root_dir),
                 version=self.slurm_job_id,
                 name='lightning_logs'
             )
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index a68ec064a9da8..b45e33447c9c5 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -320,7 +320,7 @@ def __init__(
         self.global_step = 0
         self.current_epoch = 0
         self.interrupted = False
-        self.should_stop = True
+        self.should_stop = False
 
         # set default save path if user didn't provide one
         if default_root_dir is None:
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index e522e28a7a9d4..a9491812e7d0a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -154,6 +154,7 @@ def training_step(self, batch, batch_idx):
 
 from pytorch_lightning import _logger as log
 from pytorch_lightning.callbacks.base import Callback
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.trainer.supporters import TensorRunningAccum
@@ -378,8 +379,7 @@ def train(self):
                 # trainer flags including the current epoch stuff
                 # all of this needs to go into the early stopping to clean up better
                 if self.should_stop:
-                    # Question: didn't understand the check about self.fast_dev_run
-                    if met_min_epochs and met_min_steps:
+                    if (met_min_epochs and met_min_steps) or self.fast_dev_run:
                         self.run_training_teardown()
                         return
                     else:
@@ -527,6 +527,11 @@ def run_training_epoch(self):
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
             self.call_checkpoint_callback()
 
+        # when no val loop is present or fast-dev-run still need to call checkpoints
+        if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
+            checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
+            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
+
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):
             # callbacks

From 1649e9cc72e5c61bdad6584da6c2d4592ba26b6f Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 27 Apr 2020 23:57:29 -0400
Subject: [PATCH 057/136] run callbacks at epoch end

---
 pytorch_lightning/callbacks/early_stopping.py   | 5 +----
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 pytorch_lightning/trainer/trainer.py            | 1 +
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 085b5bd250d30..039b8739fe0f2 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -142,10 +142,7 @@ def on_train_start(self, trainer, pl_module):
             if self.verbose > 0:
                 rank_zero_warn(error_msg, RuntimeWarning)
 
-    def on_validation_end(self, trainer, pl_module):
-        return self._run_early_stopping_check(trainer, pl_module)
-
-    def _run_early_stopping_check(self, trainer, pl_module):
+    def on_epoch_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 6a47e6f58c88a..5f8434bf21222 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -227,7 +227,7 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     @rank_zero_only
-    def on_validation_end(self, trainer, pl_module):
+    def on_epoch_end(self, trainer, pl_module):
         # only run on main process
         if trainer.global_rank != 0:
             return
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index b45e33447c9c5..ee0f05243d78a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -999,6 +999,7 @@ def run_pretrain_routine(self, model: LightningModule):
                                           self.num_sanity_val_steps,
                                           False)
             _, _, _, callback_metrics, _ = self.process_output(eval_results)
+            self.callback_metrics = callback_metrics
 
             self.on_sanity_check_end()
 

From 4a3146a64ada74ec7d7781eefe16c5166391a377 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 28 Apr 2020 22:23:20 -0400
Subject: [PATCH 058/136] update tests to use on epoch end

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 424e4c7b947ff..2e6faf0a7c31e 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -273,7 +273,7 @@ def mock_save_function(filepath, *args):
     for i, loss in enumerate(losses):
         trainer.current_epoch = i
         trainer.callback_metrics = {'val_loss': torch.tensor(loss)}
-        checkpoint_callback.on_validation_end(trainer, trainer.get_model())
+        checkpoint_callback.on_epoch_end(trainer, trainer.get_model())
 
     file_lists = set(os.listdir(tmpdir))
 

From 460fcefd21ba957cb176265f8fb830b206b0d435 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 28 Apr 2020 22:41:54 -0400
Subject: [PATCH 059/136] PR cleanup

---
 pytorch_lightning/callbacks/early_stopping.py | 22 ++-----------------
 pytorch_lightning/trainer/training_loop.py    |  2 +-
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 039b8739fe0f2..f3e14c9befe12 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,10 +26,8 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of passes through the validation set
-            with no improvement after which training will be stopped.
-            This will usually correspond with epochs but may vary depending
-            on how often you have configured to check validation. Default: ``0``.
+        patience: number of epochs with no improvement
+            after which training will be stopped. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
             training will stop when the quantity
@@ -126,22 +124,6 @@ def on_sanity_check_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         self._validate_condition_metric(logs)
 
-    def on_train_start(self, trainer, pl_module):
-        if not (
-            trainer.is_overriden("validation_step")
-            and trainer.is_overriden("validation_epoch_end")
-        ):
-            error_msg = (f'''
-                Early stopping is expecting metrics to be returned from
-                validation but the Lightning model does not have a validation loop
-                defined with logging. Please ensure that your LightningModule has
-                both `validation_step` and `validation_epoch_end` defined.
-            ''')
-            if self.strict:
-                raise RuntimeError(error_msg)
-            if self.verbose > 0:
-                rank_zero_warn(error_msg, RuntimeWarning)
-
     def on_epoch_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index a9491812e7d0a..b9c9c2bda69d0 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -530,7 +530,7 @@ def run_training_epoch(self):
         # when no val loop is present or fast-dev-run still need to call checkpoints
         if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
-            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
+            [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
 
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):

From 9f51575e3a0bb634d866d67f65feb49b3e7c3d8d Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 29 Apr 2020 21:46:09 -0400
Subject: [PATCH 060/136] address failing tests

---
 pytorch_lightning/loggers/wandb.py | 7 ++-----
 tests/base/utils.py                | 2 +-
 tests/loggers/test_wandb.py        | 3 +++
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index 1b91afaec1e0b..c4cccbdbf019b 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -132,11 +132,8 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
 
     @property
     def name(self) -> str:
-        # don't create an experiment if we don't have one
-        name = self._experiment.project_name() if self._experiment else None
-        return name
+        return self.experiment.project_name()
 
     @property
     def version(self) -> str:
-        # don't create an experiment if we don't have one
-        return self._experiment.id if self._experiment else None
+        return self.experiment.id
diff --git a/tests/base/utils.py b/tests/base/utils.py
index 6690b18b804df..b4901843d9b58 100644
--- a/tests/base/utils.py
+++ b/tests/base/utils.py
@@ -102,7 +102,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
 
 def get_default_logger(save_dir, version=None):
     # set up logger object without actually saving logs
-    logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version)
+    logger = TensorBoardLogger(str(save_dir), name='lightning_logs', version=version)
     return logger
 
 
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 0eb22331f690c..8b608191dde0b 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -38,6 +38,9 @@ def test_wandb_pickle(wandb):
     class Experiment:
         id = 'the_id'
 
+        def project_name(self):
+            return 'the_project_name'
+
     wandb.init.return_value = Experiment()
 
     logger = WandbLogger(id='the_id', offline=True)

From e47d2519a44ff337ce79834bd2ad03a3fca05ad5 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 30 Apr 2020 23:07:15 -0400
Subject: [PATCH 061/136] refactor for homogeneity

---
 pytorch_lightning/trainer/callback_config.py | 22 ++++++++------------
 pytorch_lightning/trainer/trainer.py         |  5 +----
 2 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index f1aaf253b99e4..cd8dfc08bbf0d 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -32,28 +32,24 @@ def save_checkpoint(self, *args):
     def is_overridden(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
-    def configure_checkpoint_callback(self,
-                                      checkpoint_callback,
-                                      default_root_dir,
-                                      logger,
-                                      weights_save_path):
+    def configure_checkpoint_callback(self, checkpoint_callback):
         """
         Weight path set in this priority:
         Checkpoint_callback's path (if passed in).
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = default_root_dir
+        ckpt_path = self.default_root_dir
         if checkpoint_callback:
             # init a default one
-            if logger is not None:
-                save_dir = (getattr(logger, 'save_dir', None)
-                            or getattr(logger, '_save_dir', None)
-                            or default_root_dir)
+            if self.logger is not None:
+                save_dir = (getattr(self.logger, 'save_dir', None)
+                            or getattr(self.logger, '_save_dir', None)
+                            or self.default_root_dir)
 
                 # weights_save_path overrides anything
-                if weights_save_path is not None:
-                    save_dir = weights_save_path
+                if self.weights_save_path is not None:
+                    save_dir = self.weights_save_path
 
                 version = self.logger.version if isinstance(
                     self.logger.version, str) else f'version_{self.logger.version}'
@@ -64,7 +60,7 @@ def configure_checkpoint_callback(self,
                     "checkpoints"
                 )
             else:
-                ckpt_path = os.path.join(default_root_dir, "checkpoints")
+                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
 
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ee0f05243d78a..38264496a6f9a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -344,10 +344,7 @@ def __init__(
         # it is important that this is the last callback to run
         # pass through the required args to figure out defaults
         self.weights_save_path = weights_save_path
-        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback,
-                                                                 self.default_root_dir,
-                                                                 self.logger,
-                                                                 self.weights_save_path)
+        checkpoint_callback = self.configure_checkpoint_callback(checkpoint_callback)
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 

From 78f7efbcf75c3b2a75769e8f347623b4644a1241 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 4 May 2020 21:56:19 -0400
Subject: [PATCH 062/136] fix merge conflict

---
 pytorch_lightning/trainer/lr_finder.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index d332aba2e2b9a..91f8cf0ad0021 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -225,16 +225,18 @@ def __lr_finder_dump_params(self, model):
             'configure_optimizers': model.configure_optimizers,
         }
 
-    def __lr_finder_restore_params(self, model):
-        self.auto_lr_find = self.__dumped_params['auto_lr_find']
-        self.logger = self.__dumped_params['logger']
-        self.callbacks = self.__dumped_params['callbacks']
-        self.max_steps = self.__dumped_params['max_steps']
-        self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
-        self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
-        model.configure_optimizers = self.__dumped_params['configure_optimizers']
-        del self.__dumped_params
+    def _lr_finder_restore_params(self, model):
+        self.auto_lr_find = self._params['auto_lr_find']
+        self.logger = self._params['logger']
+        self.callbacks = self._params['callbacks']
+        self.max_steps = self._params['max_steps']
+        self.progress_bar_refresh_rate = self._params['progress_bar_refresh_rate']
+        self.accumulate_grad_batches = self._params['accumulate_grad_batches']
+        self.checkpoint_callback = self._params['checkpoint_callback']
+        self.early_stop_callback = self._params['early_stop_callback']
+        self.enable_early_stop = self._params['enable_early_stop']
+        self.progress_bar_callback = self._params['progress_bar_callback']
+        model.configure_optimizers = self._params['configure_optimizers']
 
 
 class _LRFinder(object):

From a4c72ccd9ceb0a32aef842a0ad584d24a496f7ff Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 21 May 2020 22:37:53 -0400
Subject: [PATCH 063/136] separate tests

---
 tests/callbacks/test_callbacks.py            | 83 --------------------
 tests/callbacks/test_early_stopping.py       | 44 +++++++++++
 tests/callbacks/test_learning_rate_logger.py | 78 ++++++++++++++++++
 tests/callbacks/test_model_checkpoint.py     | 58 ++++++++++++++
 4 files changed, 180 insertions(+), 83 deletions(-)
 create mode 100644 tests/callbacks/test_early_stopping.py
 create mode 100644 tests/callbacks/test_learning_rate_logger.py
 create mode 100644 tests/callbacks/test_model_checkpoint.py

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index 399f6ba3cb06d..27f3b13b7c8b2 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -6,7 +6,6 @@
 import tests.base.utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
 
@@ -223,85 +222,3 @@ def on_test_end(self, trainer, pl_module):
     assert not test_callback.on_validation_end_called
     assert not test_callback.on_validation_batch_end_called
     assert not test_callback.on_validation_batch_start_called
-
-
-def test_early_stopping_no_val_step(tmpdir):
-    """Test that early stopping callback falls back to training metrics when no validation defined."""
-
-    class CurrentModel(EvalModelTemplate):
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            output.update({'my_train_metric': output['loss']})  # could be anything else
-            return output
-
-    model = CurrentModel()
-    model.validation_step = None
-    model.val_dataloader = None
-
-    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        early_stop_callback=stopping,
-        overfit_pct=0.20,
-        max_epochs=2,
-    )
-    result = trainer.fit(model)
-
-    assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch <= trainer.max_epochs
-
-
-def test_pickling(tmpdir):
-    import pickle
-    early_stopping = EarlyStopping()
-    ckpt = ModelCheckpoint(tmpdir)
-
-    early_stopping_pickled = pickle.dumps(early_stopping)
-    ckpt_pickled = pickle.dumps(ckpt)
-
-    early_stopping_loaded = pickle.loads(early_stopping_pickled)
-    ckpt_loaded = pickle.loads(ckpt_pickled)
-
-    assert vars(early_stopping) == vars(early_stopping_loaded)
-    assert vars(ckpt) == vars(ckpt_loaded)
-
-
-@pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
-def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
-    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-
-    checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
-
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_pct=0.20,
-                      max_epochs=2
-                      )
-    trainer.fit(model)
-
-    # These should be different if the dirpath has be overridden
-    assert trainer.ckpt_path != trainer.default_root_dir
-
-
-@pytest.mark.parametrize(
-    'logger_version,expected',
-    [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
-)
-def test_model_checkpoint_path(tmpdir, logger_version, expected):
-    """Test that "version_" prefix is only added when logger's version is an integer"""
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-    logger = TensorBoardLogger(str(tmpdir), version=logger_version)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        overfit_pct=0.2,
-        max_epochs=2,
-        logger=logger
-    )
-    trainer.fit(model)
-
-    ckpt_version = Path(trainer.ckpt_path).parent.name
-    assert ckpt_version == expected
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
new file mode 100644
index 0000000000000..4560791f8e0a3
--- /dev/null
+++ b/tests/callbacks/test_early_stopping.py
@@ -0,0 +1,44 @@
+import pytest
+
+import tests.base.utils as tutils
+from pytorch_lightning import Callback
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
+from tests.base import EvalModelTemplate
+from pathlib import Path
+
+
+# TODO remove this test
+def test_early_stopping_no_val_step(tmpdir):
+    """Test that early stopping callback falls back to training metrics when no validation defined."""
+
+    class CurrentModel(EvalModelTemplate):
+        def training_step(self, *args, **kwargs):
+            output = super().training_step(*args, **kwargs)
+            output.update({'my_train_metric': output['loss']})  # could be anything else
+            return output
+
+    model = CurrentModel()
+    model.validation_step = None
+    model.val_dataloader = None
+
+    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        early_stop_callback=stopping,
+        overfit_pct=0.20,
+        max_epochs=5,
+    )
+    result = trainer.fit(model)
+
+    assert result == 1, 'training failed to complete'
+    assert trainer.current_epoch < trainer.max_epochs
+
+
+def test_pickling(tmpdir):
+    import pickle
+    early_stopping = EarlyStopping()
+    early_stopping_pickled = pickle.dumps(early_stopping)
+    early_stopping_loaded = pickle.loads(early_stopping_pickled)
+    assert vars(early_stopping) == vars(early_stopping_loaded)
\ No newline at end of file
diff --git a/tests/callbacks/test_learning_rate_logger.py b/tests/callbacks/test_learning_rate_logger.py
new file mode 100644
index 0000000000000..466d030f9a8c3
--- /dev/null
+++ b/tests/callbacks/test_learning_rate_logger.py
@@ -0,0 +1,78 @@
+import tests.base.utils as tutils
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import LearningRateLogger
+from tests.base import EvalModelTemplate
+
+
+def test_lr_logger_single_lr(tmpdir):
+    """ Test that learning rates are extracted and logged for single lr scheduler"""
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__single_scheduler
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=5,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert results == 1
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of lr schedulers'
+    assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
+
+
+def test_lr_logger_multi_lrs(tmpdir):
+    """ Test that learning rates are extracted and logged for multi lr schedulers """
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__multiple_schedulers
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert results == 1
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of lr schedulers'
+    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
+
+
+def test_lr_logger_param_groups(tmpdir):
+    """ Test that learning rates are extracted and logged for single lr scheduler"""
+    tutils.reset_seed()
+
+    model = EvalModelTemplate()
+    model.configure_optimizers = model.configure_optimizers__param_groups
+
+    lr_logger = LearningRateLogger()
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=5,
+        val_percent_check=0.1,
+        train_percent_check=0.5,
+        callbacks=[lr_logger]
+    )
+    results = trainer.fit(model)
+
+    assert lr_logger.lrs, 'No learning rates logged'
+    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
+        'Number of learning rates logged does not match number of param groups'
+    assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()]), \
+        'Names of learning rates not set correctly'
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
new file mode 100644
index 0000000000000..5e758a1f51ee2
--- /dev/null
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -0,0 +1,58 @@
+import pytest
+
+import tests.base.utils as tutils
+from pytorch_lightning import Callback
+from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.loggers import TensorBoardLogger
+from tests.base import EvalModelTemplate
+from pathlib import Path
+
+
+@pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
+def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
+    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
+    tutils.reset_seed()
+    model = EvalModelTemplate()
+
+    checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
+
+    trainer = Trainer(default_root_dir=tmpdir,
+                      checkpoint_callback=checkpoint,
+                      overfit_pct=0.20,
+                      max_epochs=5
+                      )
+    trainer.fit(model)
+
+    # These should be different if the dirpath has be overridden
+    assert trainer.ckpt_path != trainer.default_root_dir
+
+
+@pytest.mark.parametrize(
+    'logger_version,expected',
+    [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
+)
+def test_model_checkpoint_path(tmpdir, logger_version, expected):
+    """Test that "version_" prefix is only added when logger's version is an integer"""
+    tutils.reset_seed()
+    model = EvalModelTemplate()
+    logger = TensorBoardLogger(str(tmpdir), version=logger_version)
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        overfit_pct=0.2,
+        max_epochs=5,
+        logger=logger
+    )
+    trainer.fit(model)
+
+    ckpt_version = Path(trainer.ckpt_path).parent.name
+    assert ckpt_version == expected
+
+
+def test_pickling(tmpdir):
+    import pickle
+    ckpt = ModelCheckpoint(tmpdir)
+    ckpt_pickled = pickle.dumps(ckpt)
+    ckpt_loaded = pickle.loads(ckpt_pickled)
+    assert vars(ckpt) == vars(ckpt_loaded)

From d81c90c658912da9d82f9c37b78023e933afe8bf Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 12:39:12 -0400
Subject: [PATCH 064/136] tests for early stopping bug regressions

---
 tests/callbacks/test_early_stopping.py | 70 ++++++++++++++++++--------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 4560791f8e0a3..0709e226a8807 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -1,5 +1,6 @@
 import pytest
 
+import torch
 import tests.base.utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
@@ -9,31 +10,58 @@
 from pathlib import Path
 
 
-# TODO remove this test
-def test_early_stopping_no_val_step(tmpdir):
-    """Test that early stopping callback falls back to training metrics when no validation defined."""
+def test_resume_early_stopping_from_checkpoint(tmpdir):
+    """
+    Prevent regressions to bugs:
+    https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
+    https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
+    """
+    class EarlyStoppingTestRestore(EarlyStopping):
+        def __init__(self, expected_state):
+            super().__init__()
+            self.expected_state = expected_state
 
-    class CurrentModel(EvalModelTemplate):
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            output.update({'my_train_metric': output['loss']})  # could be anything else
-            return output
+        def on_train_start(self, trainer, pl_module):
+            assert self.state_dict() == self.expected_state
 
-    model = CurrentModel()
-    model.validation_step = None
-    model.val_dataloader = None
+    model = EvalModelTemplate()
+    checkpoint_callback = ModelCheckpoint(save_top_k=1)
+    early_stop_callback = EarlyStopping()
+    trainer = Trainer(checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, max_epochs=4)
+    trainer.fit(model)
+    early_stop_callback_state = early_stop_callback.state_dict()
 
-    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        early_stop_callback=stopping,
-        overfit_pct=0.20,
-        max_epochs=5,
-    )
-    result = trainer.fit(model)
+    checkpoint_filepath = checkpoint_callback.kth_best_model
+    # ensure state is persisted properly
+    checkpoint = torch.load(checkpoint_filepath)
+    assert checkpoint['early_stop_callback_state_dict'] == early_stop_callback_state
+    # ensure state is reloaded properly (assertion in the callback)
+    early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
+    new_trainer = Trainer(max_epochs=2,
+                          resume_from_checkpoint=checkpoint_filepath,
+                          early_stop_callback=early_stop_callback)
+    new_trainer.fit(model)
 
-    assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch < trainer.max_epochs
+
+def test_early_stopping_no_extraneous_invocations():
+    """Test to ensure that callback methods aren't being invoked outside of the callback handler."""
+    class EarlyStoppingTestInvocations(EarlyStopping):
+        def __init__(self, expected_count):
+            super().__init__()
+            self.count = 0
+            self.expected_count = expected_count
+
+        def on_validation_end(self, trainer, pl_module):
+            self.count += 1
+
+        def on_train_end(self, trainer, pl_module):
+            assert self.count == self.expected_count
+
+    model = EvalModelTemplate()
+    expected_count = 4
+    early_stop_callback = EarlyStoppingTestInvocations(expected_count)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, max_epochs=expected_count)
+    trainer.fit(model)
 
 
 def test_pickling(tmpdir):

From fc616f24068019cf8ea2861a8598366a14255711 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 12:48:32 -0400
Subject: [PATCH 065/136] small fixes

---
 pytorch_lightning/callbacks/early_stopping.py | 2 +-
 pytorch_lightning/trainer/trainer.py          | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index f3e14c9befe12..45c12a9dfa141 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -52,7 +52,7 @@ class EarlyStopping(Callback):
     }
 
     def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience: int = 3,
-                 verbose: bool = False, mode: str = 'auto', strict: bool = False):
+                 verbose: bool = False, mode: str = 'auto', strict: bool = True):
         super().__init__()
         self.monitor = monitor
         self.patience = patience
diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 38264496a6f9a..da60a11d97f1d 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -348,8 +348,9 @@ def __init__(
         if checkpoint_callback:
             self.callbacks.append(checkpoint_callback)
 
-        # TODO refactor codebase (tests) to not directly reach into this callback
+        # TODO refactor codebase (tests) to not directly reach into these callbacks
         self.checkpoint_callback = checkpoint_callback
+        self.early_stop_callback = early_stop_callback
 
         self.on_init_start()
 

From 78a092d86861313b396f2a6fd1e8d47f9101aab7 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 13:00:22 -0400
Subject: [PATCH 066/136] revert model checkpoint change

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 tests/callbacks/test_model_checkpoint.py        | 3 +--
 tests/trainer/test_trainer.py                   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 5f8434bf21222..6a47e6f58c88a 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -227,7 +227,7 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     @rank_zero_only
-    def on_epoch_end(self, trainer, pl_module):
+    def on_validation_end(self, trainer, pl_module):
         # only run on main process
         if trainer.global_rank != 0:
             return
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 5e758a1f51ee2..86744e3cef099 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,9 +1,8 @@
 import pytest
 
 import tests.base.utils as tutils
-from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
+from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
 from pathlib import Path
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 2e6faf0a7c31e..424e4c7b947ff 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -273,7 +273,7 @@ def mock_save_function(filepath, *args):
     for i, loss in enumerate(losses):
         trainer.current_epoch = i
         trainer.callback_metrics = {'val_loss': torch.tensor(loss)}
-        checkpoint_callback.on_epoch_end(trainer, trainer.get_model())
+        checkpoint_callback.on_validation_end(trainer, trainer.get_model())
 
     file_lists = set(os.listdir(tmpdir))
 

From 8da8f64b05c4fe1d25cd855176696dfca9c85b0e Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 13:00:31 -0400
Subject: [PATCH 067/136] typo fix

---
 pytorch_lightning/trainer/training_loop.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index b9c9c2bda69d0..55662990d8f0d 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -525,10 +525,6 @@ def run_training_epoch(self):
 
         # when no val loop is present or fast-dev-run still need to call checkpoints
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
-            self.call_checkpoint_callback()
-
-        # when no val loop is present or fast-dev-run still need to call checkpoints
-        if not self.is_overriden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
             [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
 

From 02694cf542b26f5eff838af626948d1a65bac958 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 23 May 2020 23:47:48 -0400
Subject: [PATCH 068/136] fix tests

---
 docs/source/callbacks.rst                    | 8 ++++----
 pytorch_lightning/trainer/callback_config.py | 2 +-
 pytorch_lightning/trainer/logging.py         | 2 +-
 tests/base/utils.py                          | 2 +-
 tests/loggers/test_all.py                    | 2 +-
 tests/trainer/test_trainer_cli.py            | 6 +++---
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/source/callbacks.rst b/docs/source/callbacks.rst
index 3a6b1e84acdd6..2dcf81277e1b9 100644
--- a/docs/source/callbacks.rst
+++ b/docs/source/callbacks.rst
@@ -50,11 +50,11 @@ We successfully extended functionality without polluting our super clean
 Best Practices
 ==============
 
-1. Callbacks should be isolated in their functionality. Your callback should not rely on 
- the presence of other callbacks in order to work properly.
+1. Callbacks should be isolated in their functionality. Your callback should not rely on the 
+behavior of other callbacks in order to work properly.
 2. Do not manually call methods from the callback. The callbacks are designed to be
- invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
- is strongly discouraged.
+invoked at specific times during training. Directly calling methods (eg. `on_validation_end`)
+is strongly discouraged.
 3. Whenever possible, your callbacks should not depend on the order in which they are executed.
 
 
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index cd8dfc08bbf0d..6008730ac4c87 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -48,7 +48,7 @@ def configure_checkpoint_callback(self, checkpoint_callback):
                             or self.default_root_dir)
 
                 # weights_save_path overrides anything
-                if self.weights_save_path is not None:
+                if self.weights_save_path is not None and self.weights_save_path is not True:
                     save_dir = self.weights_save_path
 
                 version = self.logger.version if isinstance(
diff --git a/pytorch_lightning/trainer/logging.py b/pytorch_lightning/trainer/logging.py
index d257db50996d0..5349849e09b89 100644
--- a/pytorch_lightning/trainer/logging.py
+++ b/pytorch_lightning/trainer/logging.py
@@ -29,7 +29,7 @@ def configure_logger(self, logger):
         if logger is True:
             # default logger
             self.logger = TensorBoardLogger(
-                save_dir=str(self.default_root_dir),
+                save_dir=self.default_root_dir,
                 version=self.slurm_job_id,
                 name='lightning_logs'
             )
diff --git a/tests/base/utils.py b/tests/base/utils.py
index b4901843d9b58..6690b18b804df 100644
--- a/tests/base/utils.py
+++ b/tests/base/utils.py
@@ -102,7 +102,7 @@ def run_model_test(trainer_options, model, on_gpu: bool = True, version=None, wi
 
 def get_default_logger(save_dir, version=None):
     # set up logger object without actually saving logs
-    logger = TensorBoardLogger(str(save_dir), name='lightning_logs', version=version)
+    logger = TensorBoardLogger(save_dir, name='lightning_logs', version=version)
     return logger
 
 
diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index f8a8fead41f58..291ec34e1e764 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -68,10 +68,10 @@ def log_metrics(self, metrics, step):
 
 @pytest.mark.parametrize("logger_class", [
     TensorBoardLogger,
-    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
     TestTubeLogger,
+    # CometLogger,  # TODO: add this one
     # TrainsLogger,  # TODO: add this one
     # WandbLogger,  # TODO: add this one
 ])
diff --git a/tests/trainer/test_trainer_cli.py b/tests/trainer/test_trainer_cli.py
index c66d614903c3d..31433446e01fd 100644
--- a/tests/trainer/test_trainer_cli.py
+++ b/tests/trainer/test_trainer_cli.py
@@ -10,10 +10,10 @@
 from pytorch_lightning import Trainer
 
 
-@mock.patch('argparse.ArgumentParser.parse_args',
-            return_value=Namespace(**Trainer.default_attributes()))
-def test_default_args(tmpdir):
+@mock.patch('argparse.ArgumentParser.parse_args')
+def test_default_args(mock_argparse, tmpdir):
     """Tests default argument parser for Trainer"""
+    mock_argparse.return_value = Namespace(**Trainer.default_attributes())
 
     # logger file to get meta
     logger = tutils.get_default_logger(tmpdir)

From 5692f5f108b6666bceb17405bf2122eb068effa3 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 25 May 2020 14:23:41 -0400
Subject: [PATCH 069/136] update train loop

---
 pytorch_lightning/trainer/training_loop.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 55662990d8f0d..41df5d3d9c69c 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -524,9 +524,10 @@ def run_training_epoch(self):
             self.add_progress_bar_metrics(_processed_outputs[1])
 
         # when no val loop is present or fast-dev-run still need to call checkpoints
+        # TODO bake this logic into the checkpoint callback
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
-            [c.on_epoch_end(self, self.get_model()) for c in checkpoint_callbacks]
+            [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
 
         # Epoch end events
         with self.profiler.profile('on_epoch_end'):

From 47c2c74875b01ee8ec828ecc9da90e8afeb58e2f Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 26 May 2020 22:11:11 -0400
Subject: [PATCH 070/136] fix test case

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 424e4c7b947ff..bc5864f1d2566 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -464,7 +464,7 @@ def test_trainer_min_steps_and_epochs(tmpdir):
         early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0),
         val_check_interval=2,
         min_epochs=1,
-        max_epochs=2
+        max_epochs=10
     )
 
     # define less min steps than 1 epoch

From 6aee10903fca4914b77ea0083f69121c94968451 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 26 May 2020 22:30:44 -0400
Subject: [PATCH 071/136] appease the linter

---
 pytorch_lightning/trainer/callback_config.py | 6 +++---
 tests/callbacks/test_early_stopping.py       | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 6008730ac4c87..18fbe2c092685 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -43,9 +43,9 @@ def configure_checkpoint_callback(self, checkpoint_callback):
         if checkpoint_callback:
             # init a default one
             if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None)
-                            or getattr(self.logger, '_save_dir', None)
-                            or self.default_root_dir)
+                save_dir = (getattr(self.logger, 'save_dir', None) or
+                            getattr(self.logger, '_save_dir', None) or
+                            self.default_root_dir)
 
                 # weights_save_path overrides anything
                 if self.weights_save_path is not None and self.weights_save_path is not True:
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 0709e226a8807..165cda8ea2f37 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -16,6 +16,7 @@ def test_resume_early_stopping_from_checkpoint(tmpdir):
     https://github.com/PyTorchLightning/pytorch-lightning/issues/1464
     https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
     """
+
     class EarlyStoppingTestRestore(EarlyStopping):
         def __init__(self, expected_state):
             super().__init__()
@@ -69,4 +70,4 @@ def test_pickling(tmpdir):
     early_stopping = EarlyStopping()
     early_stopping_pickled = pickle.dumps(early_stopping)
     early_stopping_loaded = pickle.loads(early_stopping_pickled)
-    assert vars(early_stopping) == vars(early_stopping_loaded)
\ No newline at end of file
+    assert vars(early_stopping) == vars(early_stopping_loaded)

From 84a2da72a1d89fd15701c8a1deee694b02aa0da8 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Thu, 28 May 2020 22:05:57 -0400
Subject: [PATCH 072/136] fix some doctests

---
 docs/source/experiment_logging.rst | 3 ++-
 docs/source/weights_loading.rst    | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/experiment_logging.rst b/docs/source/experiment_logging.rst
index 772efcfc13bc5..422929337db47 100644
--- a/docs/source/experiment_logging.rst
+++ b/docs/source/experiment_logging.rst
@@ -88,6 +88,7 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
 .. testcode::
 
     from pytorch_lightning.loggers import NeptuneLogger
+
     neptune_logger = NeptuneLogger(
         api_key='ANONYMOUS',  # replace with your own
         project_name='shared/pytorch-lightning-integration',
@@ -225,7 +226,7 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
 .. testcode::
 
     from pytorch_lightning.loggers import WandbLogger
-    wandb_logger = WandbLogger()
+    wandb_logger = WandbLogger(offline=True)
     trainer = Trainer(logger=wandb_logger)
 
 The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your
diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index 88a04edfcfd87..d68f9c1ddeaa8 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -31,7 +31,7 @@ To change the checkpoint path pass in:
 
 .. testcode::
 
-    trainer = Trainer(default_root_dir='/your/path/to/save/checkpoints')
+    trainer = Trainer(default_root_dir='lightning_checkpoints')
 
 To modify the behavior of checkpointing pass in your own callback.
 

From 6bc50df45b37a8399b44f02be6689a315e55b1e8 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Sat, 30 May 2020 12:42:25 -0400
Subject: [PATCH 073/136] move config to callback

---
 .../callbacks/model_checkpoint.py             | 30 ++++++++++++
 pytorch_lightning/trainer/callback_config.py  | 47 ++-----------------
 2 files changed, 35 insertions(+), 42 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 6a47e6f58c88a..9ab74990efc07 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -226,6 +226,36 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         filepath = os.path.join(self.dirpath, self.prefix + filename + str_ver + '.ckpt')
         return filepath
 
+    def on_train_start(self, trainer, pl_module):
+        if self.dirpath is None:
+            self.filename = '{epoch}'
+
+            ckpt_path = trainer.default_root_dir
+            if trainer.logger is not None:
+                save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                            getattr(trainer.logger, '_save_dir', None) or
+                            trainer.default_root_dir)
+
+                # weights_save_path overrides anything
+                if trainer.weights_save_path is not None:
+                    save_dir = trainer.weights_save_path
+
+                version = trainer.logger.version if isinstance(
+                    trainer.logger.version, str) else f'version_{trainer.logger.version}'
+                ckpt_path = os.path.join(
+                    save_dir,
+                    trainer.logger.name,
+                    version,
+                    "checkpoints"
+                )
+            else:
+                ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
+
+            self.dirpath = ckpt_path
+            os.makedirs(self.dirpath, exist_ok=True)
+            trainer.ckpt_path = ckpt_path
+            trainer.weights_save_path = self.dirpath
+
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):
         # only run on main process
diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 18fbe2c092685..9806c3eb72e57 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -39,57 +39,20 @@ def configure_checkpoint_callback(self, checkpoint_callback):
         User provided weights_saved_path
         Otherwise use os.getcwd()
         """
-        ckpt_path = self.default_root_dir
-        if checkpoint_callback:
-            # init a default one
-            if self.logger is not None:
-                save_dir = (getattr(self.logger, 'save_dir', None) or
-                            getattr(self.logger, '_save_dir', None) or
-                            self.default_root_dir)
-
-                # weights_save_path overrides anything
-                if self.weights_save_path is not None and self.weights_save_path is not True:
-                    save_dir = self.weights_save_path
-
-                version = self.logger.version if isinstance(
-                    self.logger.version, str) else f'version_{self.logger.version}'
-                ckpt_path = os.path.join(
-                    save_dir,
-                    self.logger.name,
-                    version,
-                    "checkpoints"
-                )
-            else:
-                ckpt_path = os.path.join(self.default_root_dir, "checkpoints")
-
+        if checkpoint_callback is True:
             # when no val step is defined, use 'loss' otherwise 'val_loss'
             train_step_only = not self.is_overridden('validation_step')
             monitor_key = 'loss' if train_step_only else 'val_loss'
-
-            if checkpoint_callback is True:
-                os.makedirs(ckpt_path, exist_ok=True)
-                checkpoint_callback = ModelCheckpoint(
-                    filepath=ckpt_path,
-                    monitor=monitor_key
-                )
-            # If user specified None in filepath, override with runtime default
-            elif isinstance(checkpoint_callback, ModelCheckpoint) \
-                    and checkpoint_callback.dirpath is None:
-                checkpoint_callback.dirpath = ckpt_path
-                checkpoint_callback.filename = '{epoch}'
-                os.makedirs(checkpoint_callback.dirpath, exist_ok=True)
+            checkpoint_callback = ModelCheckpoint(
+                filepath=None,
+                monitor=monitor_key
+            )
         elif checkpoint_callback is False:
             checkpoint_callback = None
 
-        self.ckpt_path = ckpt_path
-
         if checkpoint_callback:
-            # set the path for the callbacks
             checkpoint_callback.save_function = self.save_checkpoint
 
-            # if checkpoint callback used, then override the weights path
-            self.weights_save_path = checkpoint_callback.dirpath
-
         # if weights_save_path is still none here, set to current working dir
         if self.weights_save_path is None:
             self.weights_save_path = self.default_root_dir

From 9b44672a5c8011824be79ec915e60c313ac54ab4 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 1 Jun 2020 00:21:26 -0400
Subject: [PATCH 074/136] fixes from rebase

---
 pytorch_lightning/callbacks/early_stopping.py |  4 +-
 .../callbacks/model_checkpoint.py             |  8 +-
 pytorch_lightning/trainer/lr_finder.py        | 28 +++----
 pytorch_lightning/trainer/training_loop.py    |  8 --
 tests/callbacks/test_learning_rate_logger.py  | 78 -------------------
 .../{test_lr.py => test_lr_logger.py}         |  0
 6 files changed, 18 insertions(+), 108 deletions(-)
 delete mode 100644 tests/callbacks/test_learning_rate_logger.py
 rename tests/callbacks/{test_lr.py => test_lr_logger.py} (100%)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 45c12a9dfa141..fc940fc1390ee 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -26,7 +26,7 @@ class EarlyStopping(Callback):
             to qualify as an improvement, i.e. an absolute
             change of less than `min_delta`, will count as no
             improvement. Default: ``0``.
-        patience: number of epochs with no improvement
+        patience: number of validation epochs with no improvement
             after which training will be stopped. Default: ``0``.
         verbose: verbosity mode. Default: ``False``.
         mode: one of {auto, min, max}. In `min` mode,
@@ -124,7 +124,7 @@ def on_sanity_check_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         self._validate_condition_metric(logs)
 
-    def on_epoch_end(self, trainer, pl_module):
+    def on_validation_end(self, trainer, pl_module):
         logs = trainer.callback_metrics
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 9ab74990efc07..6bfad90d510b1 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -227,13 +227,17 @@ def format_checkpoint_name(self, epoch, metrics, ver=None):
         return filepath
 
     def on_train_start(self, trainer, pl_module):
+        """
+        Determine model checkpoint save directory at runtime. References attributes from the
+        Trainer's logger to determine where to save checkpoints.
+        """
         if self.dirpath is None:
             self.filename = '{epoch}'
 
             ckpt_path = trainer.default_root_dir
             if trainer.logger is not None:
-                save_dir = (getattr(trainer.logger, 'save_dir', None) or
-                            getattr(trainer.logger, '_save_dir', None) or
+                save_dir = (getattr(trainer.logger, 'save_dir') or
+                            getattr(trainer.logger, '_save_dir') or
                             trainer.default_root_dir)
 
                 # weights_save_path overrides anything
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index 91f8cf0ad0021..96f38c86cb939 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -165,9 +165,6 @@ def lr_find(self,
         self.early_stop_callback = None
         self.enable_early_stop = False
 
-        # Accumulation of gradients
-        self.accumulate_grad_batches = num_accumulation_steps
-
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
         self.model = model
@@ -219,24 +216,19 @@ def __lr_finder_dump_params(self, model):
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
             'enable_early_stop': self.enable_early_stop,
-            'progress_bar_refresh_rate': self.progress_bar_refresh_rate,
-            'accumulate_grad_batches': self.accumulate_grad_batches,
-            'progress_bar_callback': self.progress_bar_callback,
             'configure_optimizers': model.configure_optimizers,
         }
 
-    def _lr_finder_restore_params(self, model):
-        self.auto_lr_find = self._params['auto_lr_find']
-        self.logger = self._params['logger']
-        self.callbacks = self._params['callbacks']
-        self.max_steps = self._params['max_steps']
-        self.progress_bar_refresh_rate = self._params['progress_bar_refresh_rate']
-        self.accumulate_grad_batches = self._params['accumulate_grad_batches']
-        self.checkpoint_callback = self._params['checkpoint_callback']
-        self.early_stop_callback = self._params['early_stop_callback']
-        self.enable_early_stop = self._params['enable_early_stop']
-        self.progress_bar_callback = self._params['progress_bar_callback']
-        model.configure_optimizers = self._params['configure_optimizers']
+    def __lr_finder_restore_params(self, model):
+        self.auto_lr_find = self.__dumped_params['auto_lr_find']
+        self.logger = self.__dumped_params['logger']
+        self.callbacks = self.__dumped_params['callbacks']
+        self.max_steps = self.__dumped_params['max_steps']
+        self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
+        self.early_stop_callback = self.__dumped_params['early_stop_callback']
+        self.enable_early_stop = self.__dumped_params['enable_early_stop']
+        model.configure_optimizers = self.__dumped_params['configure_optimizers']
+        del self.__dumped_params
 
 
 class _LRFinder(object):
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 41df5d3d9c69c..937daf89d662a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -821,14 +821,6 @@ def update_learning_rates(self, interval: str):
                 else:
                     lr_scheduler['scheduler'].step()
 
-    def call_checkpoint_callback(self):
-        if self.checkpoint_callback is not None:
-            self.checkpoint_callback.on_validation_end(self, self.get_model())
-
-    def call_early_stop_callback(self):
-        if self.early_stop_callback:
-            self.early_stop_callback.on_epoch_end(self, self.get_model())
-
 
 def _with_is_last(iterable):
     """Pass through values from the given iterable with an added boolean indicating if this is the last item.
diff --git a/tests/callbacks/test_learning_rate_logger.py b/tests/callbacks/test_learning_rate_logger.py
deleted file mode 100644
index 466d030f9a8c3..0000000000000
--- a/tests/callbacks/test_learning_rate_logger.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import tests.base.utils as tutils
-from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import LearningRateLogger
-from tests.base import EvalModelTemplate
-
-
-def test_lr_logger_single_lr(tmpdir):
-    """ Test that learning rates are extracted and logged for single lr scheduler"""
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__single_scheduler
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=5,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert results == 1
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of lr schedulers'
-    assert all([k in ['lr-Adam'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
-
-
-def test_lr_logger_multi_lrs(tmpdir):
-    """ Test that learning rates are extracted and logged for multi lr schedulers """
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__multiple_schedulers
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert results == 1
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of lr schedulers'
-    assert all([k in ['lr-Adam', 'lr-Adam-1'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
-
-
-def test_lr_logger_param_groups(tmpdir):
-    """ Test that learning rates are extracted and logged for single lr scheduler"""
-    tutils.reset_seed()
-
-    model = EvalModelTemplate()
-    model.configure_optimizers = model.configure_optimizers__param_groups
-
-    lr_logger = LearningRateLogger()
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=5,
-        val_percent_check=0.1,
-        train_percent_check=0.5,
-        callbacks=[lr_logger]
-    )
-    results = trainer.fit(model)
-
-    assert lr_logger.lrs, 'No learning rates logged'
-    assert len(lr_logger.lrs) == 2 * len(trainer.lr_schedulers), \
-        'Number of learning rates logged does not match number of param groups'
-    assert all([k in ['lr-Adam/pg1', 'lr-Adam/pg2'] for k in lr_logger.lrs.keys()]), \
-        'Names of learning rates not set correctly'
diff --git a/tests/callbacks/test_lr.py b/tests/callbacks/test_lr_logger.py
similarity index 100%
rename from tests/callbacks/test_lr.py
rename to tests/callbacks/test_lr_logger.py

From e8d7c37a8217a4a8cdf300d586934604d647e24f Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 1 Jun 2020 00:31:02 -0400
Subject: [PATCH 075/136] fixes from rebase

---
 pytorch_lightning/callbacks/early_stopping.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index fc940fc1390ee..6599cf634d0a7 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -76,8 +76,8 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
             if self.verbose > 0:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
-        self.min_delta *= 1 if self.monitor_op == np.greater else -1
-        self.best = np.Inf if self.monitor_op == np.less else -np.Inf
+        self.min_delta *= 1 if self.monitor_op == torch.gt else -1
+        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
 
     def _validate_condition_metric(self, logs):
         """

From d1d7aa2ba41cb9778c811a1cf3e9e28eee91380d Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:42:42 +0200
Subject: [PATCH 076/136] chlog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11978430c6b86..fbb0f3a157ae4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -84,6 +84,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed an issue with the model summary and `example_input_array` depending on a specific ordering of the submodules in a LightningModule ([#1773](https://github.com/PyTorchLightning/pytorch-lightning/pull/1773))
 
+- Fixed for early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504))
+
 ## [0.7.6] - 2020-05-16
 
 ### Added

From 6aee4d2f56b6952d39567e1c6268f7174ca7d85a Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:47:36 +0200
Subject: [PATCH 077/136] docs

---
 pytorch_lightning/callbacks/early_stopping.py | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 6599cf634d0a7..6c804d71d0f32 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -59,7 +59,7 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
         self.verbose = verbose
         self.strict = strict
         self.min_delta = min_delta
-        self.wait = 0
+        self.wait_count = 0
         self.stopped_epoch = 0
         self.mode = mode
 
@@ -77,13 +77,17 @@ def __init__(self, monitor: str = 'val_loss', min_delta: float = 0.0, patience:
                 log.info(f'EarlyStopping mode set to {self.mode} for monitoring {self.monitor}.')
 
         self.min_delta *= 1 if self.monitor_op == torch.gt else -1
-        self.best = torch_inf if self.monitor_op == torch.lt else -torch_inf
+        self.best_score = torch_inf if self.monitor_op == torch.lt else -torch_inf
 
     def _validate_condition_metric(self, logs):
         """
         Checks that the condition metric for early stopping is good
-        :param logs: callback metrics from validation output
-        :return: True if specified metric is available
+
+        Args:
+            logs: callback metrics from validation output
+
+        Return:
+             True if specified metric is available
         """
         monitor_val = logs.get(self.monitor)
         error_msg = (f'Early stopping conditioned on metric `{self.monitor}`'
@@ -107,17 +111,17 @@ def monitor_op(self):
 
     def state_dict(self):
         return {
-            'wait': self.wait,
+            'wait_count': self.wait_count,
             'stopped_epoch': self.stopped_epoch,
-            'best': self.best,
+            'best_score': self.best_score,
             'patience': self.patience
         }
 
     def load_state_dict(self, state_dict):
         state_dict = deepcopy(state_dict)
-        self.wait = state_dict['wait']
+        self.wait_count = state_dict['wait_count']
         self.stopped_epoch = state_dict['stopped_epoch']
-        self.best = state_dict['best']
+        self.best_score = state_dict['best_score']
         self.patience = state_dict['patience']
 
     def on_sanity_check_end(self, trainer, pl_module):
@@ -134,12 +138,12 @@ def on_validation_end(self, trainer, pl_module):
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
 
-        if self.monitor_op(current - self.min_delta, self.best):
-            self.best = current
-            self.wait = 0
+        if self.monitor_op(current - self.min_delta, self.best_score):
+            self.best_score = current
+            self.wait_count = 0
         else:
-            self.wait += 1
-            if self.wait >= self.patience:
+            self.wait_count += 1
+            if self.wait_count >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
                 stop_training = True
 

From 8daadc1a5b807877ec10cdaa4f45e1298d34843c Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 13:52:34 +0200
Subject: [PATCH 078/136] reformat

---
 .../callbacks/model_checkpoint.py             | 55 ++++++++++---------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 6bfad90d510b1..83c73bbb14ace 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -231,34 +231,35 @@ def on_train_start(self, trainer, pl_module):
         Determine model checkpoint save directory at runtime. References attributes from the
         Trainer's logger to determine where to save checkpoints.
         """
-        if self.dirpath is None:
-            self.filename = '{epoch}'
-
-            ckpt_path = trainer.default_root_dir
-            if trainer.logger is not None:
-                save_dir = (getattr(trainer.logger, 'save_dir') or
-                            getattr(trainer.logger, '_save_dir') or
-                            trainer.default_root_dir)
-
-                # weights_save_path overrides anything
-                if trainer.weights_save_path is not None:
-                    save_dir = trainer.weights_save_path
-
-                version = trainer.logger.version if isinstance(
-                    trainer.logger.version, str) else f'version_{trainer.logger.version}'
-                ckpt_path = os.path.join(
-                    save_dir,
-                    trainer.logger.name,
-                    version,
-                    "checkpoints"
-                )
-            else:
-                ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
+        if self.dirpath is not None:
+            return # short circuit
+
+        self.filename = '{epoch}'
+
+        if trainer.logger is not None:
+            save_dir = (getattr(trainer.logger, 'save_dir') or
+                        getattr(trainer.logger, '_save_dir') or
+                        trainer.default_root_dir)
+
+            # weights_save_path overrides anything
+            if trainer.weights_save_path is not None:
+                save_dir = trainer.weights_save_path
+
+            version = trainer.logger.version if isinstance(
+                trainer.logger.version, str) else f'version_{trainer.logger.version}'
+            ckpt_path = os.path.join(
+                save_dir,
+                trainer.logger.name,
+                version,
+                "checkpoints"
+            )
+        else:
+            ckpt_path = os.path.join(trainer.default_root_dir, "checkpoints")
 
-            self.dirpath = ckpt_path
-            os.makedirs(self.dirpath, exist_ok=True)
-            trainer.ckpt_path = ckpt_path
-            trainer.weights_save_path = self.dirpath
+        self.dirpath = ckpt_path
+        os.makedirs(self.dirpath, exist_ok=True)
+        trainer.ckpt_path = ckpt_path
+        trainer.weights_save_path = self.dirpath
 
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):

From 8a623f0e2810930666b884ba2191432aeb28799b Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 14:00:20 +0200
Subject: [PATCH 079/136] formatting

---
 tests/callbacks/test_model_checkpoint.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 86744e3cef099..c8742dc14e103 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,11 +1,13 @@
+import pickle
+from pathlib import Path
+
 import pytest
 
 import tests.base.utils as tutils
-from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
-from pathlib import Path
 
 
 @pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
@@ -16,11 +18,12 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
 
     checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
 
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_pct=0.20,
-                      max_epochs=5
-                      )
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        checkpoint_callback=checkpoint,
+        overfit_pct=0.20,
+        max_epochs=5
+    )
     trainer.fit(model)
 
     # These should be different if the dirpath has be overridden
@@ -50,7 +53,6 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
 
 
 def test_pickling(tmpdir):
-    import pickle
     ckpt = ModelCheckpoint(tmpdir)
     ckpt_pickled = pickle.dumps(ckpt)
     ckpt_loaded = pickle.loads(ckpt_pickled)

From b20b1c1ffc06a81c2980e9669d16f2094279e400 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 14:54:44 +0200
Subject: [PATCH 080/136] fix

---
 pytorch_lightning/trainer/training_loop.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 937daf89d662a..803337813f150 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -260,7 +260,7 @@ def is_function_implemented(self, *args):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
-    def run_evaluation(self, *args):
+    def run_evaluation(self, *args, **kwargs):
         """Warning: this is just empty shell for code implemented in other class."""
 
     @abstractmethod
@@ -476,7 +476,6 @@ def run_training_epoch(self):
             # fast_dev_run always forces val checking after train batch
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=self.testing)
-                self.call_checkpoint_callback()
 
             # ---------------
             # CHECKPOINTING, EARLY STOPPING

From a52983f4a471381b01b95ddbb0e457c4f0e2f231 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Mon, 1 Jun 2020 15:08:07 +0200
Subject: [PATCH 081/136] fix

---
 pytorch_lightning/callbacks/model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 83c73bbb14ace..933bbbcdab4bb 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -237,8 +237,8 @@ def on_train_start(self, trainer, pl_module):
         self.filename = '{epoch}'
 
         if trainer.logger is not None:
-            save_dir = (getattr(trainer.logger, 'save_dir') or
-                        getattr(trainer.logger, '_save_dir') or
+            save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                        getattr(trainer.logger, '_save_dir', None) or
                         trainer.default_root_dir)
 
             # weights_save_path overrides anything

From 5e4e710049666dcc4201e347a80359e505081226 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 15 Jun 2020 20:10:41 -0400
Subject: [PATCH 082/136] fixes from rebase

---
 tests/trainer/test_trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index bc5864f1d2566..d1bd7634fa5d1 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -464,7 +464,7 @@ def test_trainer_min_steps_and_epochs(tmpdir):
         early_stop_callback=EarlyStopping(monitor='val_loss', min_delta=1.0),
         val_check_interval=2,
         min_epochs=1,
-        max_epochs=10
+        max_epochs=7
     )
 
     # define less min steps than 1 epoch

From 4ea2d99fe2ab36796c05bc1c8d08712eff9e1c6b Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 15 Jun 2020 21:00:42 -0400
Subject: [PATCH 083/136] add new test for patience

---
 pytorch_lightning/callbacks/early_stopping.py |  8 ++-----
 tests/callbacks/test_early_stopping.py        | 24 +++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/pytorch_lightning/callbacks/early_stopping.py b/pytorch_lightning/callbacks/early_stopping.py
index 6c804d71d0f32..b8cf524341b29 100644
--- a/pytorch_lightning/callbacks/early_stopping.py
+++ b/pytorch_lightning/callbacks/early_stopping.py
@@ -133,7 +133,6 @@ def on_validation_end(self, trainer, pl_module):
         if not self._validate_condition_metric(logs):
             return  # short circuit if metric not present
 
-        stop_training = False
         current = logs.get(self.monitor)
         if not isinstance(current, torch.Tensor):
             current = torch.tensor(current)
@@ -145,13 +144,10 @@ def on_validation_end(self, trainer, pl_module):
             self.wait_count += 1
             if self.wait_count >= self.patience:
                 self.stopped_epoch = trainer.current_epoch
-                stop_training = True
-
-        if stop_training:
-            trainer.should_stop = True
+                trainer.should_stop = True
 
     def on_train_end(self, trainer, pl_module):
         if self.stopped_epoch > 0 and self.verbose > 0:
             rank_zero_warn('Displayed epoch numbers by `EarlyStopping` start from "1" until v0.6.x,'
                            ' but will start from "0" in v0.8.0.', DeprecationWarning)
-            log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping')
+            log.info(f'Epoch {self.stopped_epoch + 1:05d}: early stopping triggered.')
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 165cda8ea2f37..8512ddeae995c 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -65,6 +65,30 @@ def on_train_end(self, trainer, pl_module):
     trainer.fit(model)
 
 
+@pytest.mark.parametrize('loss_values, patience, expected_stop_epoch', [
+    ([6, 5, 5, 5, 5, 5], 3, 5),
+    ([6, 5, 4, 4, 3, 3], 1, 4),
+    ([6, 5, 6, 5, 5, 5], 3, 5),
+])
+def test_early_stopping_patience(loss_values, patience, expected_stop_epoch):
+    """Test to ensure that early stopping is not triggered before patience is exhausted."""
+
+    class ModelOverrideValidationReturn(EvalModelTemplate):
+        validation_return_values = torch.Tensor(loss_values)
+        count = 0
+
+        def validation_epoch_end(self, outputs):
+            loss = self.validation_return_values[self.count]
+            self.count += 1
+            return {"test_val_loss": loss}
+
+    model = ModelOverrideValidationReturn()
+    early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0)
+    trainer.fit(model)
+    assert trainer.current_epoch + 1 == expected_stop_epoch
+
+
 def test_pickling(tmpdir):
     import pickle
     early_stopping = EarlyStopping()

From 3c4d31e54082640bffff9adb284f555fc83d405a Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:02:03 -0400
Subject: [PATCH 084/136] Update
 pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/callbacks/model_checkpoint.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 933bbbcdab4bb..160fc1910005b 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -237,13 +237,13 @@ def on_train_start(self, trainer, pl_module):
         self.filename = '{epoch}'
 
         if trainer.logger is not None:
-            save_dir = (getattr(trainer.logger, 'save_dir', None) or
-                        getattr(trainer.logger, '_save_dir', None) or
-                        trainer.default_root_dir)
-
             # weights_save_path overrides anything
-            if trainer.weights_save_path is not None:
+            if getattr(trainer, 'weights_save_path', None) is not None:
                 save_dir = trainer.weights_save_path
+            else:
+                save_dir = (getattr(trainer.logger, 'save_dir', None) or
+                            getattr(trainer.logger, '_save_dir', None) or
+                            trainer.default_root_dir)
 
             version = trainer.logger.version if isinstance(
                 trainer.logger.version, str) else f'version_{trainer.logger.version}'

From d893b1850025e2a3595ff40618acb4a04acccb88 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:02:15 -0400
Subject: [PATCH 085/136] Update
 pytorch_lightning/callbacks/model_checkpoint.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 160fc1910005b..3632cec0b8ef6 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -259,7 +259,7 @@ def on_train_start(self, trainer, pl_module):
         self.dirpath = ckpt_path
         os.makedirs(self.dirpath, exist_ok=True)
         trainer.ckpt_path = ckpt_path
-        trainer.weights_save_path = self.dirpath
+        trainer.weights_save_path = ckpt_path
 
     @rank_zero_only
     def on_validation_end(self, trainer, pl_module):

From d650b7420c72ed895a8eedb9741f22fe2d0f63e1 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <13970565+jeremyjordan@users.noreply.github.com>
Date: Tue, 16 Jun 2020 09:03:25 -0400
Subject: [PATCH 086/136] Update tests/callbacks/test_early_stopping.py

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 tests/callbacks/test_early_stopping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 8512ddeae995c..1f0b203ccdea7 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -84,7 +84,7 @@ def validation_epoch_end(self, outputs):
 
     model = ModelOverrideValidationReturn()
     early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
-    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0)
+    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0, max_epochs=10)
     trainer.fit(model)
     assert trainer.current_epoch + 1 == expected_stop_epoch
 

From 4eb89051c5ff7cf8a6ea603e2a6b4fef60b1192f Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 16 Jun 2020 21:53:53 -0400
Subject: [PATCH 087/136] fix formatting

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 3632cec0b8ef6..b3fde36a74373 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -232,7 +232,7 @@ def on_train_start(self, trainer, pl_module):
         Trainer's logger to determine where to save checkpoints.
         """
         if self.dirpath is not None:
-            return # short circuit
+            return  # short circuit
 
         self.filename = '{epoch}'
 

From 9f345849592bb6fe2dcfb55bdae8af895d1100e9 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 16 Jun 2020 22:34:49 -0400
Subject: [PATCH 088/136] remove enable_early_stop attribute

---
 pytorch_lightning/trainer/callback_config.py | 4 ----
 pytorch_lightning/trainer/lr_finder.py       | 3 ---
 pytorch_lightning/trainer/training_loop.py   | 1 -
 pytorch_lightning/trainer/training_tricks.py | 3 ---
 tests/trainer/test_lr_finder.py              | 2 +-
 tests/trainer/test_trainer_tricks.py         | 1 -
 6 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/pytorch_lightning/trainer/callback_config.py b/pytorch_lightning/trainer/callback_config.py
index 9806c3eb72e57..b65dc37ef8b1b 100644
--- a/pytorch_lightning/trainer/callback_config.py
+++ b/pytorch_lightning/trainer/callback_config.py
@@ -68,14 +68,10 @@ def configure_early_stopping(self, early_stop_callback):
                 verbose=True,
                 mode='min'
             )
-            # TODO remove this attribute
-            self.enable_early_stop = True
         elif not early_stop_callback:
             early_stop_callback = None
-            self.enable_early_stop = False
         else:
             early_stop_callback = early_stop_callback
-            self.enable_early_stop = True
         return early_stop_callback
 
     def configure_progress_bar(self, refresh_rate=1, process_position=0):
diff --git a/pytorch_lightning/trainer/lr_finder.py b/pytorch_lightning/trainer/lr_finder.py
index 96f38c86cb939..72228c81394ba 100755
--- a/pytorch_lightning/trainer/lr_finder.py
+++ b/pytorch_lightning/trainer/lr_finder.py
@@ -163,7 +163,6 @@ def lr_find(self,
         # Disable standard checkpoint & early stopping
         self.checkpoint_callback = False
         self.early_stop_callback = None
-        self.enable_early_stop = False
 
         # Required for saving the model
         self.optimizers, self.schedulers = [], [],
@@ -215,7 +214,6 @@ def __lr_finder_dump_params(self, model):
             'max_steps': self.max_steps,
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
-            'enable_early_stop': self.enable_early_stop,
             'configure_optimizers': model.configure_optimizers,
         }
 
@@ -226,7 +224,6 @@ def __lr_finder_restore_params(self, model):
         self.max_steps = self.__dumped_params['max_steps']
         self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
         self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
         model.configure_optimizers = self.__dumped_params['configure_optimizers']
         del self.__dumped_params
 
diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 803337813f150..b6c762caf6105 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -209,7 +209,6 @@ class TrainerTrainLoopMixin(ABC):
     fast_dev_run: ...
     accumulation_scheduler: ...
     lr_schedulers: ...
-    enable_early_stop: ...
     early_stop_callback: ...
     callback_metrics: ...
     logger: Union[LightningLoggerBase, bool]
diff --git a/pytorch_lightning/trainer/training_tricks.py b/pytorch_lightning/trainer/training_tricks.py
index 817215202992f..977bd51694ab9 100644
--- a/pytorch_lightning/trainer/training_tricks.py
+++ b/pytorch_lightning/trainer/training_tricks.py
@@ -188,7 +188,6 @@ def __scale_batch_dump_params(self):
             'callbacks': self.callbacks,
             'checkpoint_callback': self.checkpoint_callback,
             'early_stop_callback': self.early_stop_callback,
-            'enable_early_stop': self.enable_early_stop,
             'auto_scale_batch_size': self.auto_scale_batch_size,
             'train_percent_check': self.train_percent_check,
             'model': self.model,
@@ -202,7 +201,6 @@ def __scale_batch_reset_params(self, model, steps_per_trial):
         self.callbacks = []  # not needed before full run
         self.checkpoint_callback = False  # required for saving
         self.early_stop_callback = None
-        self.enable_early_stop = False
         self.train_percent_check = 1.0
         self.optimizers, self.schedulers = [], []  # required for saving
         self.model = model  # required for saving
@@ -215,7 +213,6 @@ def __scale_batch_restore_params(self):
         self.checkpoint_callback = self.__dumped_params['checkpoint_callback']
         self.auto_scale_batch_size = self.__dumped_params['auto_scale_batch_size']
         self.early_stop_callback = self.__dumped_params['early_stop_callback']
-        self.enable_early_stop = self.__dumped_params['enable_early_stop']
         self.train_percent_check = self.__dumped_params['train_percent_check']
         self.model = self.__dumped_params['model']
         del self.__dumped_params
diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py
index d0becff0918c6..66b4e1d2972de 100755
--- a/tests/trainer/test_lr_finder.py
+++ b/tests/trainer/test_lr_finder.py
@@ -57,7 +57,7 @@ def test_trainer_reset_correctly(tmpdir):
 
     changed_attributes = ['callbacks', 'logger', 'max_steps', 'auto_lr_find',
                           'early_stop_callback', 'accumulate_grad_batches',
-                          'enable_early_stop', 'checkpoint_callback']
+                          'checkpoint_callback']
     attributes_before = {}
     for ca in changed_attributes:
         attributes_before[ca] = getattr(trainer, ca)
diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py
index 973ed32e7cd92..99605443a67e8 100755
--- a/tests/trainer/test_trainer_tricks.py
+++ b/tests/trainer/test_trainer_tricks.py
@@ -48,7 +48,6 @@ def test_trainer_reset_correctly(tmpdir):
                           'callbacks',
                           'checkpoint_callback',
                           'early_stop_callback',
-                          'enable_early_stop',
                           'train_percent_check']
 
     attributes_before = {}

From 5beb38fda453316cb24fdd2e0e3f0af7c992390a Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Wed, 17 Jun 2020 08:45:45 -0400
Subject: [PATCH 089/136] fix test with new epoch indexing

---
 tests/callbacks/test_early_stopping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 1f0b203ccdea7..c66b8083b90df 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -86,7 +86,7 @@ def validation_epoch_end(self, outputs):
     early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
     trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0, max_epochs=10)
     trainer.fit(model)
-    assert trainer.current_epoch + 1 == expected_stop_epoch
+    assert trainer.current_epoch == expected_stop_epoch
 
 
 def test_pickling(tmpdir):

From 1a39e1dfda050faf47d5813692a95679e352d92a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Mon, 22 Jun 2020 05:26:46 +0200
Subject: [PATCH 090/136] fix progress bar totals

---
 pytorch_lightning/trainer/trainer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index ba4cf2d3e33e4..28efba725eb2a 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -336,9 +336,9 @@ def __init__(
         self.batch_idx = 0
         self.progress_bar_metrics = {}
         self.callback_metrics = {}
-        self.num_val_batches = 0
         self.num_training_batches = 0
-        self.num_test_batches = 0
+        self.num_val_batches = []
+        self.num_test_batches = []
         self.train_dataloader = None
         self.test_dataloaders = None
         self.val_dataloaders = None

From c5330edc74dfc01f697f607e5bae0572efc8f829 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Mon, 22 Jun 2020 05:35:09 +0200
Subject: [PATCH 091/136] fix off by one error (see #2289) epoch starts at 0
 now

---
 tests/callbacks/test_early_stopping.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index c66b8083b90df..87dae713b2557 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -66,9 +66,9 @@ def on_train_end(self, trainer, pl_module):
 
 
 @pytest.mark.parametrize('loss_values, patience, expected_stop_epoch', [
-    ([6, 5, 5, 5, 5, 5], 3, 5),
-    ([6, 5, 4, 4, 3, 3], 1, 4),
-    ([6, 5, 6, 5, 5, 5], 3, 5),
+    ([6, 5, 5, 5, 5, 5], 3, 4),
+    ([6, 5, 4, 4, 3, 3], 1, 3),
+    ([6, 5, 6, 5, 5, 5], 3, 4),
 ])
 def test_early_stopping_patience(loss_values, patience, expected_stop_epoch):
     """Test to ensure that early stopping is not triggered before patience is exhausted."""

From c86d08c596d54646513c389d83482dfb9269f62c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Mon, 22 Jun 2020 13:47:41 +0200
Subject: [PATCH 092/136] added missing imports

---
 tests/callbacks/test_callbacks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index f3890b105ca6b..d2bafe6d7c991 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -6,6 +6,7 @@
 import tests.base.utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
 from tests.base import EvalModelTemplate
 

From 776bc6403b8be3033595556f16bfc2180dfff981 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 22 Jun 2020 23:03:23 -0400
Subject: [PATCH 093/136] fix hpc_save folderpath

---
 tests/models/test_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index 31eea37f4350a..d45e1e60fc9d8 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -53,7 +53,7 @@ def test_cpu_slurm_save_load(tmpdir):
 
     # test HPC saving
     # simulate snapshot on slurm
-    saved_filepath = trainer.hpc_save(tmpdir, logger)
+    saved_filepath = trainer.hpc_save(trainer.weights_save_path, logger)
     assert os.path.exists(saved_filepath)
 
     # new logger file to get meta

From 3b9dbde1f2c4ca2f28ec882b62559a63f5027f5b Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Mon, 22 Jun 2020 23:14:46 -0400
Subject: [PATCH 094/136] fix formatting

---
 pytorch_lightning/trainer/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 28efba725eb2a..8509d86eac844 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -329,7 +329,7 @@ def __init__(
             rank_zero_only.rank = os.environ['LOCAL_RANK']
         if 'SLURM_JOB_ID' in os.environ:
             rank_zero_only.rank = os.environ['SLURM_JOB_ID']
-        
+
         # training bookeeping
         self.total_batch_idx = 0
         self.running_loss = TensorRunningAccum(window_length=20)

From 47a02a14f7ee709a746190e9af6eeafcc2f91f90 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 23 Jun 2020 22:40:24 -0400
Subject: [PATCH 095/136] fix tests

---
 pytorch_lightning/trainer/training_loop.py       |  2 +-
 tests/models/data/horovod/train_default_model.py | 10 +++++++++-
 tests/models/test_hooks.py                       |  1 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 8fe147a3b81f7..e89e16fcd6ef9 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -371,7 +371,7 @@ def train(self):
                 # -----------------
                 self.run_training_epoch()
 
-                if self.max_steps and self.max_steps == self.global_step:
+                if self.max_steps and self.max_steps <= self.global_step:
                     self.run_training_teardown()
                     return
 
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 6bf0e3aa9c4b2..5bfaeb7fa01ab 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -20,8 +20,15 @@
 import json
 import os
 import sys
+import pytest
+
+try:
+    import horovod.torch as hvd
+except ImportError:
+    HOROVOD_AVAILABLE = False
+else:
+    HOROVOD_AVAILABLE = True
 
-import horovod.torch as hvd
 
 PATH_HERE = os.path.abspath(os.path.dirname(__file__))
 PATH_ROOT = os.path.join(PATH_HERE, '..', '..', '..', '..')
@@ -38,6 +45,7 @@
 parser.add_argument('--on-gpu', action='store_true', default=False)
 
 
+@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod not installed")
 def run_test_from_config(trainer_options):
     """Trains the default model with the given config."""
     set_random_master_port()
diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 78efbd35ff4da..362c4103eace3 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -20,6 +20,7 @@ def on_before_zero_grad(self, optimizer):
 
     trainer = Trainer(
         max_steps=max_steps,
+        max_epochs=2,
         num_sanity_val_steps=5,
     )
     assert 0 == model.on_before_zero_grad_called

From 780b0f2b3bee36ed8187bc57708dc32314a5e796 Mon Sep 17 00:00:00 2001
From: Jeremy Jordan <jtjordan@ncsu.edu>
Date: Tue, 23 Jun 2020 23:16:22 -0400
Subject: [PATCH 096/136] small fixes from a rebase

---
 pytorch_lightning/trainer/training_loop.py |  1 -
 tests/callbacks/test_callbacks.py          | 82 ----------------------
 2 files changed, 83 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 696286c86404e..ff4cb9374b266 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -153,7 +153,6 @@ def training_step(self, batch, batch_idx):
 
 import numpy as np
 import torch
-import subprocess
 from torch.utils.data import DataLoader
 import torch.distributed as torch_distrib
 
diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index d2bafe6d7c991..5a1e2540543a7 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -261,85 +261,3 @@ def on_test_end(self, trainer, pl_module):
     assert not test_callback.on_validation_end_called
     assert not test_callback.on_validation_batch_end_called
     assert not test_callback.on_validation_batch_start_called
-
-
-def test_early_stopping_no_val_step(tmpdir):
-    """Test that early stopping callback falls back to training metrics when no validation defined."""
-
-    class CurrentModel(EvalModelTemplate):
-        def training_step(self, *args, **kwargs):
-            output = super().training_step(*args, **kwargs)
-            output.update({'my_train_metric': output['loss']})  # could be anything else
-            return output
-
-    model = CurrentModel()
-    model.validation_step = None
-    model.val_dataloader = None
-
-    stopping = EarlyStopping(monitor='my_train_metric', min_delta=0.1)
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        early_stop_callback=stopping,
-        overfit_batches=0.20,
-        max_epochs=2,
-    )
-    result = trainer.fit(model)
-
-    assert result == 1, 'training failed to complete'
-    assert trainer.current_epoch <= trainer.max_epochs
-
-
-def test_pickling(tmpdir):
-    import pickle
-    early_stopping = EarlyStopping()
-    ckpt = ModelCheckpoint(tmpdir)
-
-    early_stopping_pickled = pickle.dumps(early_stopping)
-    ckpt_pickled = pickle.dumps(ckpt)
-
-    early_stopping_loaded = pickle.loads(early_stopping_pickled)
-    ckpt_loaded = pickle.loads(ckpt_pickled)
-
-    assert vars(early_stopping) == vars(early_stopping_loaded)
-    assert vars(ckpt) == vars(ckpt_loaded)
-
-
-@pytest.mark.parametrize('save_top_k', [-1, 0, 1, 2])
-def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
-    """ Test that None in checkpoint callback is valid and that chkp_path is set correctly """
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-
-    checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
-
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_batches=0.20,
-                      max_epochs=2
-                      )
-    trainer.fit(model)
-
-    # These should be different if the dirpath has be overridden
-    assert trainer.ckpt_path != trainer.default_root_dir
-
-
-@pytest.mark.parametrize(
-    'logger_version,expected',
-    [(None, 'version_0'), (1, 'version_1'), ('awesome', 'awesome')],
-)
-def test_model_checkpoint_path(tmpdir, logger_version, expected):
-    """Test that "version_" prefix is only added when logger's version is an integer"""
-    tutils.reset_seed()
-    model = EvalModelTemplate()
-    logger = TensorBoardLogger(str(tmpdir), version=logger_version)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        overfit_batches=0.2,
-        max_epochs=2,
-        logger=logger
-    )
-    trainer.fit(model)
-
-    ckpt_version = Path(trainer.ckpt_path).parent.name
-    assert ckpt_version == expected

From a46ab9a77de07f0834e8fa57baf89c9a2897c82b Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Wed, 24 Jun 2020 16:29:10 +0200
Subject: [PATCH 097/136] fix

---
 tests/models/test_hooks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_hooks.py b/tests/models/test_hooks.py
index 362c4103eace3..7d5a8849948d6 100644
--- a/tests/models/test_hooks.py
+++ b/tests/models/test_hooks.py
@@ -8,7 +8,7 @@
 
 
 @pytest.mark.parametrize('max_steps', [1, 2, 3])
-def test_on_before_zero_grad_called(max_steps):
+def test_on_before_zero_grad_called(tmpdir, max_steps):
 
     class CurrentTestModel(EvalModelTemplate):
         on_before_zero_grad_called = 0
@@ -19,6 +19,7 @@ def on_before_zero_grad(self, optimizer):
     model = CurrentTestModel()
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_steps=max_steps,
         max_epochs=2,
         num_sanity_val_steps=5,

From 50174ae8412b122ba4f1ae0c8c247a4c841022d4 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Wed, 24 Jun 2020 16:37:47 +0200
Subject: [PATCH 098/136] tmpdir

---
 tests/trainer/test_trainer.py       |  4 +++-
 tests/trainer/test_trainer_steps.py | 20 ++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 179d1e4cb86a4..9714152ea83fc 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -297,6 +297,7 @@ def test_model_checkpoint_only_weights(tmpdir):
     model = EvalModelTemplate()
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         checkpoint_callback=ModelCheckpoint(tmpdir, save_weights_only=True)
     )
@@ -592,7 +593,7 @@ def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
             assert loaded_checkpoint_path == ckpt_path
 
 
-def test_disabled_validation():
+def test_disabled_validation(tmpdir):
     """Verify that `limit_val_batches=0` disables the validation loop unless `fast_dev_run=True`."""
 
     class CurrentModel(EvalModelTemplate):
@@ -612,6 +613,7 @@ def validation_epoch_end(self, *args, **kwargs):
     model = CurrentModel(**hparams)
 
     trainer_options = dict(
+        default_root_dir=tmpdir,
         progress_bar_refresh_rate=0,
         max_epochs=2,
         limit_train_batches=0.4,
diff --git a/tests/trainer/test_trainer_steps.py b/tests/trainer/test_trainer_steps.py
index 7e23324eed192..0bda7531ccd5c 100644
--- a/tests/trainer/test_trainer_steps.py
+++ b/tests/trainer/test_trainer_steps.py
@@ -2,7 +2,7 @@
 from tests.base.deterministic_model import DeterministicModel
 
 
-def test_trainingstep_dict(tmpdir):
+def test_training_step_dict(tmpdir):
     """
     Tests that only training_step can be used
     """
@@ -10,7 +10,11 @@ def test_trainingstep_dict(tmpdir):
     model.training_step = model.training_step_dict_return
     model.val_dataloader = None
 
-    trainer = Trainer(fast_dev_run=True, weights_summary=None)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        fast_dev_run=True,
+        weights_summary=None,
+    )
     trainer.fit(model)
 
     # make sure correct steps were called
@@ -75,7 +79,11 @@ def test_full_training_loop_dict(tmpdir):
     model.training_epoch_end = model.training_epoch_end_dict
     model.val_dataloader = None
 
-    trainer = Trainer(max_epochs=1, weights_summary=None)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        weights_summary=None,
+    )
     trainer.fit(model)
 
     # make sure correct steps were called
@@ -112,7 +120,11 @@ def test_train_step_epoch_end(tmpdir):
     model.training_epoch_end = model.training_epoch_end_dict
     model.val_dataloader = None
 
-    trainer = Trainer(max_epochs=1, weights_summary=None)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        weights_summary=None,
+    )
     trainer.fit(model)
 
     # make sure correct steps were called

From 5180ce08e79145dc16d5d5f24de31e3f97a6dab8 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Wed, 24 Jun 2020 17:05:38 +0200
Subject: [PATCH 099/136] tmpdir

---
 tests/callbacks/test_early_stopping.py | 34 ++++++++++++++++++++------
 tests/callbacks/test_progress_bar.py   |  9 ++++---
 tests/loggers/test_all.py              |  1 +
 tests/loggers/test_base.py             | 10 +++++---
 tests/loggers/test_neptune.py          |  2 +-
 tests/loggers/test_trains.py           |  4 +--
 tests/loggers/test_wandb.py            |  8 ++++--
 tests/models/test_amp.py               |  3 ++-
 tests/models/test_cpu.py               |  9 ++++---
 tests/models/test_grad_norm.py         |  1 +
 tests/trainer/test_dataloaders.py      | 17 +++++++------
 tests/trainer/test_lr_finder.py        | 12 ++++-----
 tests/trainer/test_optimizers.py       | 10 ++++----
 tests/trainer/test_trainer.py          | 16 ++++++------
 tests/trainer/test_trainer_tricks.py   |  6 ++---
 15 files changed, 90 insertions(+), 52 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 87dae713b2557..4767c5f319775 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -28,7 +28,12 @@ def on_train_start(self, trainer, pl_module):
     model = EvalModelTemplate()
     checkpoint_callback = ModelCheckpoint(save_top_k=1)
     early_stop_callback = EarlyStopping()
-    trainer = Trainer(checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop_callback, max_epochs=4)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        checkpoint_callback=checkpoint_callback,
+        early_stop_callback=early_stop_callback,
+        max_epochs=4
+    )
     trainer.fit(model)
     early_stop_callback_state = early_stop_callback.state_dict()
 
@@ -38,13 +43,15 @@ def on_train_start(self, trainer, pl_module):
     assert checkpoint['early_stop_callback_state_dict'] == early_stop_callback_state
     # ensure state is reloaded properly (assertion in the callback)
     early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
-    new_trainer = Trainer(max_epochs=2,
-                          resume_from_checkpoint=checkpoint_filepath,
-                          early_stop_callback=early_stop_callback)
+    new_trainer = Trainer(
+        max_epochs=2,
+        resume_from_checkpoint=checkpoint_filepath,
+        early_stop_callback=early_stop_callback,
+    )
     new_trainer.fit(model)
 
 
-def test_early_stopping_no_extraneous_invocations():
+def test_early_stopping_no_extraneous_invocations(tmpdir):
     """Test to ensure that callback methods aren't being invoked outside of the callback handler."""
     class EarlyStoppingTestInvocations(EarlyStopping):
         def __init__(self, expected_count):
@@ -61,7 +68,12 @@ def on_train_end(self, trainer, pl_module):
     model = EvalModelTemplate()
     expected_count = 4
     early_stop_callback = EarlyStoppingTestInvocations(expected_count)
-    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, max_epochs=expected_count)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        early_stop_callback=early_stop_callback,
+        val_check_interval=1.0,
+        max_epochs=expected_count,
+    )
     trainer.fit(model)
 
 
@@ -70,7 +82,7 @@ def on_train_end(self, trainer, pl_module):
     ([6, 5, 4, 4, 3, 3], 1, 3),
     ([6, 5, 6, 5, 5, 5], 3, 4),
 ])
-def test_early_stopping_patience(loss_values, patience, expected_stop_epoch):
+def test_early_stopping_patience(tmpdir, loss_values, patience, expected_stop_epoch):
     """Test to ensure that early stopping is not triggered before patience is exhausted."""
 
     class ModelOverrideValidationReturn(EvalModelTemplate):
@@ -84,7 +96,13 @@ def validation_epoch_end(self, outputs):
 
     model = ModelOverrideValidationReturn()
     early_stop_callback = EarlyStopping(monitor="test_val_loss", patience=patience, verbose=True)
-    trainer = Trainer(early_stop_callback=early_stop_callback, val_check_interval=1.0, num_sanity_val_steps=0, max_epochs=10)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        early_stop_callback=early_stop_callback,
+        val_check_interval=1.0,
+        num_sanity_val_steps=0,
+        max_epochs=10,
+    )
     trainer.fit(model)
     assert trainer.current_epoch == expected_stop_epoch
 
diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py
index a63fc62585c45..b5ff69d860695 100644
--- a/tests/callbacks/test_progress_bar.py
+++ b/tests/callbacks/test_progress_bar.py
@@ -13,10 +13,11 @@
     ([ProgressBar(refresh_rate=2)], 0),
     ([ProgressBar(refresh_rate=2)], 1),
 ])
-def test_progress_bar_on(callbacks, refresh_rate):
+def test_progress_bar_on(tmpdir, callbacks, refresh_rate):
     """Test different ways the progress bar can be turned on."""
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         callbacks=callbacks,
         progress_bar_refresh_rate=refresh_rate,
         max_epochs=1,
@@ -54,12 +55,13 @@ def test_progress_bar_misconfiguration():
         Trainer(callbacks=callbacks)
 
 
-def test_progress_bar_totals():
+def test_progress_bar_totals(tmpdir):
     """Test that the progress finishes with the correct total steps processed."""
 
     model = EvalModelTemplate()
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         progress_bar_refresh_rate=1,
         limit_val_batches=1.0,
         max_epochs=1,
@@ -136,7 +138,7 @@ def test_progress_bar_fast_dev_run():
 
 
 @pytest.mark.parametrize('refresh_rate', [0, 1, 50])
-def test_progress_bar_progress_refresh(refresh_rate):
+def test_progress_bar_progress_refresh(tmpdir, refresh_rate):
     """Test that the three progress bars get correctly updated when using different refresh rates."""
 
     model = EvalModelTemplate()
@@ -172,6 +174,7 @@ def on_test_batch_end(self, trainer, pl_module):
 
     progress_bar = CurrentProgressBar(refresh_rate=refresh_rate)
     trainer = Trainer(
+        default_root_dir=tmpdir,
         callbacks=[progress_bar],
         progress_bar_refresh_rate=101,  # should not matter if custom callback provided
         limit_train_batches=1.0,
diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index 6f77e3e52285f..d88c4f4a1b39a 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -50,6 +50,7 @@ def log_metrics(self, metrics, step):
     logger = StoreHistoryLogger(**logger_args)
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
         limit_train_batches=0.2,
diff --git a/tests/loggers/test_base.py b/tests/loggers/test_base.py
index 083a43af2c68f..dfe9ffc6437fe 100644
--- a/tests/loggers/test_base.py
+++ b/tests/loggers/test_base.py
@@ -68,7 +68,7 @@ def test_custom_logger(tmpdir):
         max_epochs=1,
         limit_train_batches=0.05,
         logger=logger,
-        default_root_dir=tmpdir
+        default_root_dir=tmpdir,
     )
     result = trainer.fit(model)
     assert result == 1, "Training failed"
@@ -88,7 +88,7 @@ def test_multiple_loggers(tmpdir):
         max_epochs=1,
         limit_train_batches=0.05,
         logger=[logger1, logger2],
-        default_root_dir=tmpdir
+        default_root_dir=tmpdir,
     )
     result = trainer.fit(model)
     assert result == 1, "Training failed"
@@ -108,7 +108,11 @@ def test_multiple_loggers_pickle(tmpdir):
     logger1 = CustomLogger()
     logger2 = CustomLogger()
 
-    trainer = Trainer(max_epochs=1, logger=[logger1, logger2])
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        logger=[logger1, logger2],
+    )
     pkl_bytes = pickle.dumps(trainer)
     trainer2 = pickle.loads(pkl_bytes)
     trainer2.logger.log_metrics({"acc": 1.0}, 0)
diff --git a/tests/loggers/test_neptune.py b/tests/loggers/test_neptune.py
index 33a17b63819a7..3e8bb8c6bfaf2 100644
--- a/tests/loggers/test_neptune.py
+++ b/tests/loggers/test_neptune.py
@@ -83,7 +83,7 @@ def _run_training(logger):
             default_root_dir=tmpdir,
             max_epochs=1,
             limit_train_batches=0.05,
-            logger=logger
+            logger=logger,
         )
         trainer.fit(model)
         return logger
diff --git a/tests/loggers/test_trains.py b/tests/loggers/test_trains.py
index c2076ad759278..59bb9cace97a3 100644
--- a/tests/loggers/test_trains.py
+++ b/tests/loggers/test_trains.py
@@ -18,7 +18,7 @@ def test_trains_logger(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_train_batches=0.05,
-        logger=logger
+        logger=logger,
     )
     result = trainer.fit(model)
 
@@ -40,7 +40,7 @@ def test_trains_pickle(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        logger=logger
+        logger=logger,
     )
     pkl_bytes = pickle.dumps(trainer)
     trainer2 = pickle.loads(pkl_bytes)
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 8b608191dde0b..77297457ed301 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -30,7 +30,7 @@ def test_wandb_logger(wandb):
 
 
 @patch('pytorch_lightning.loggers.wandb.wandb')
-def test_wandb_pickle(wandb):
+def test_wandb_pickle(tmpdir, wandb):
     """Verify that pickling trainer with wandb logger works.
 
     Wandb doesn't work well with pytest so we have to mock it out here.
@@ -45,7 +45,11 @@ def project_name(self):
 
     logger = WandbLogger(id='the_id', offline=True)
 
-    trainer = Trainer(max_epochs=1, logger=logger)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        logger=logger,
+    )
     # Access the experiment to ensure it's created
     assert trainer.logger.experiment, 'missing experiment'
     pkl_bytes = pickle.dumps(trainer)
diff --git a/tests/models/test_amp.py b/tests/models/test_amp.py
index 63cdeac45fc43..6cd242249650f 100644
--- a/tests/models/test_amp.py
+++ b/tests/models/test_amp.py
@@ -20,7 +20,7 @@ def test_amp_single_gpu(tmpdir, backend):
         max_epochs=1,
         gpus=1,
         distributed_backend=backend,
-        precision=16
+        precision=16,
     )
 
     model = EvalModelTemplate()
@@ -99,6 +99,7 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir):
 
     # fit model
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         gpus=[0],
         distributed_backend='ddp',
diff --git a/tests/models/test_cpu.py b/tests/models/test_cpu.py
index d45e1e60fc9d8..5a2206e96aed2 100644
--- a/tests/models/test_cpu.py
+++ b/tests/models/test_cpu.py
@@ -23,6 +23,7 @@ def test_cpu_slurm_save_load(tmpdir):
 
     # fit model
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
         limit_train_batches=0.2,
@@ -60,6 +61,7 @@ def test_cpu_slurm_save_load(tmpdir):
     logger = tutils.get_default_logger(tmpdir, version=version)
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
         checkpoint_callback=ModelCheckpoint(tmpdir),
@@ -187,7 +189,7 @@ def test_running_test_after_fitting(tmpdir):
         limit_val_batches=0.2,
         limit_test_batches=0.2,
         checkpoint_callback=checkpoint,
-        logger=logger
+        logger=logger,
     )
     result = trainer.fit(model)
 
@@ -211,6 +213,7 @@ def test_running_test_no_val(tmpdir):
 
     # fit model
     trainer = Trainer(
+        default_root_dir=tmpdir,
         progress_bar_refresh_rate=0,
         max_epochs=1,
         limit_train_batches=0.4,
@@ -218,7 +221,7 @@ def test_running_test_no_val(tmpdir):
         limit_test_batches=0.2,
         checkpoint_callback=checkpoint,
         logger=logger,
-        early_stop_callback=False
+        early_stop_callback=False,
     )
     result = trainer.fit(model)
 
@@ -344,7 +347,7 @@ def train_dataloader(self):
         truncated_bptt_steps=truncated_bptt_steps,
         limit_val_batches=0,
         weights_summary=None,
-        early_stop_callback=False
+        early_stop_callback=False,
     )
     result = trainer.fit(model)
 
diff --git a/tests/models/test_grad_norm.py b/tests/models/test_grad_norm.py
index 7a6659ecfa1d5..c9b72f74d97db 100644
--- a/tests/models/test_grad_norm.py
+++ b/tests/models/test_grad_norm.py
@@ -84,6 +84,7 @@ def test_grad_tracking(tmpdir, norm_type, rtol=5e-3):
     logger = OnlyMetricsListLogger()
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=3,
         logger=logger,
         track_grad_norm=norm_type,
diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index be7e08f36039f..a5b8f837d5a3b 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -132,7 +132,7 @@ def test_step(self, batch, batch_idx, *args, **kwargs):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     trainer.fit(model)
     if ckpt_path == 'specific':
@@ -160,7 +160,7 @@ def test_train_dataloader_passed_to_fit(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     fit_options = dict(train_dataloader=model.dataloader(train=True))
     result = trainer.fit(model, **fit_options)
@@ -177,7 +177,7 @@ def test_train_val_dataloaders_passed_to_fit(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     fit_options = dict(train_dataloader=model.dataloader(train=True),
                        val_dataloaders=model.dataloader(train=False))
@@ -199,7 +199,7 @@ def test_all_dataloaders_passed_to_fit(tmpdir, ckpt_path):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     fit_options = dict(train_dataloader=model.dataloader(train=True),
                        val_dataloaders=model.dataloader(train=False))
@@ -441,7 +441,7 @@ def test_inf_train_dataloader(tmpdir, check_interval):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        val_check_interval=check_interval
+        val_check_interval=check_interval,
     )
     result = trainer.fit(model)
     # verify training completed
@@ -459,7 +459,7 @@ def test_not_implemented_error_train_dataloader(tmpdir, check_interval):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        val_check_interval=check_interval
+        val_check_interval=check_interval,
     )
     result = trainer.fit(model)
     # verify training completed
@@ -519,7 +519,7 @@ def test_error_on_zero_len_dataloader(tmpdir):
             max_epochs=1,
             limit_train_batches=0.1,
             limit_val_batches=0.1,
-            limit_test_batches=0.1
+            limit_test_batches=0.1,
         )
         trainer.fit(model)
 
@@ -613,7 +613,7 @@ class CustomSampler(torch.utils.data.Sampler):
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 3, reason='Test requires multiple GPUs')
-def test_batch_size_smaller_than_num_gpus():
+def test_batch_size_smaller_than_num_gpus(tmpdir):
     # we need at least 3 gpus for this test
     num_gpus = 3
     batch_size = 3
@@ -651,6 +651,7 @@ def train_dataloader(self):
     model = CurrentTestModel(**hparams)
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         limit_train_batches=0.1,
         limit_val_batches=0,
diff --git a/tests/trainer/test_lr_finder.py b/tests/trainer/test_lr_finder.py
index 66b4e1d2972de..2eecd5ee5405f 100755
--- a/tests/trainer/test_lr_finder.py
+++ b/tests/trainer/test_lr_finder.py
@@ -15,7 +15,7 @@ def test_error_on_more_than_1_optimizer(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_epochs=1,
     )
 
     with pytest.raises(MisconfigurationException):
@@ -30,7 +30,7 @@ def test_model_reset_correctly(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_epochs=1,
     )
 
     before_state_dict = model.state_dict()
@@ -52,7 +52,7 @@ def test_trainer_reset_correctly(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_epochs=1,
     )
 
     changed_attributes = ['callbacks', 'logger', 'max_steps', 'auto_lr_find',
@@ -83,7 +83,7 @@ def test_trainer_arg_bool(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
-        auto_lr_find=True
+        auto_lr_find=True,
     )
 
     trainer.fit(model)
@@ -102,7 +102,7 @@ def test_trainer_arg_str(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=2,
-        auto_lr_find='my_fancy_lr'
+        auto_lr_find='my_fancy_lr',
     )
 
     trainer.fit(model)
@@ -188,7 +188,7 @@ def test_suggestion_with_non_finite_values(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=3
+        max_epochs=3,
     )
 
     lrfinder = trainer.lr_find(model)
diff --git a/tests/trainer/test_optimizers.py b/tests/trainer/test_optimizers.py
index 222805b2e432d..f07806da14633 100644
--- a/tests/trainer/test_optimizers.py
+++ b/tests/trainer/test_optimizers.py
@@ -17,7 +17,7 @@ def test_optimizer_with_scheduling(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     results = trainer.fit(model)
     assert results == 1
@@ -48,7 +48,7 @@ def test_multi_optimizer_with_scheduling(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     results = trainer.fit(model)
     assert results == 1
@@ -83,7 +83,7 @@ def test_multi_optimizer_with_scheduling_stepping(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     results = trainer.fit(model)
     assert results == 1
@@ -122,7 +122,7 @@ def test_reduce_lr_on_plateau_scheduling(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     results = trainer.fit(model)
     assert results == 1
@@ -212,7 +212,7 @@ def test_none_optimizer(tmpdir):
         default_root_dir=tmpdir,
         max_epochs=1,
         limit_val_batches=0.1,
-        limit_train_batches=0.2
+        limit_train_batches=0.2,
     )
     result = trainer.fit(model)
 
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 9714152ea83fc..92358979bbbdd 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -36,9 +36,10 @@ def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
     logger = tutils.get_default_logger(tmpdir)
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
-        checkpoint_callback=ModelCheckpoint(tmpdir)
+        checkpoint_callback=ModelCheckpoint(tmpdir),
     )
     # fit model
     result = trainer.fit(model)
@@ -77,9 +78,10 @@ def test_no_val_end_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt):
 
     # fit model
     trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
-        checkpoint_callback=ModelCheckpoint(tmpdir)
+        checkpoint_callback=ModelCheckpoint(tmpdir),
     )
     result = trainer.fit(model)
 
@@ -299,7 +301,7 @@ def test_model_checkpoint_only_weights(tmpdir):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_epochs=1,
-        checkpoint_callback=ModelCheckpoint(tmpdir, save_weights_only=True)
+        checkpoint_callback=ModelCheckpoint(tmpdir, save_weights_only=True),
     )
     # fit model
     result = trainer.fit(model)
@@ -666,7 +668,7 @@ def training_step(self, batch, batch_idx, optimizer_idx=None):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_steps=(model.test_batch_inf_loss + 1),
-        terminate_on_nan=True
+        terminate_on_nan=True,
     )
 
     with pytest.raises(ValueError, match=r'.*The loss returned in `training_step` is nan or inf.*'):
@@ -691,7 +693,7 @@ def on_after_backward(self):
     trainer = Trainer(
         default_root_dir=tmpdir,
         max_steps=(model.test_batch_nan + 1),
-        terminate_on_nan=True
+        terminate_on_nan=True,
     )
 
     with pytest.raises(ValueError, match=r'.*Detected nan and/or inf values in `c_d1.bias`.*'):
@@ -759,7 +761,7 @@ def _optimizer_step(*args, **kwargs):
         max_steps=1,
         max_epochs=1,
         gradient_clip_val=1.0,
-        default_root_dir=tmpdir
+        default_root_dir=tmpdir,
     )
 
     # for the test
@@ -924,7 +926,7 @@ def test_trainer_omegaconf(trainer_params):
 def test_trainer_pickle(tmpdir):
     trainer = Trainer(
         max_epochs=1,
-        default_root_dir=tmpdir
+        default_root_dir=tmpdir,
     )
     pickle.dumps(trainer)
     cloudpickle.dumps(trainer)
diff --git a/tests/trainer/test_trainer_tricks.py b/tests/trainer/test_trainer_tricks.py
index 4a95ad9eb8106..75e64c9b883c1 100755
--- a/tests/trainer/test_trainer_tricks.py
+++ b/tests/trainer/test_trainer_tricks.py
@@ -118,7 +118,7 @@ def test_model_reset_correctly(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_epochs=1,
     )
 
     before_state_dict = model.state_dict()
@@ -141,7 +141,7 @@ def test_trainer_reset_correctly(tmpdir):
     # logger file to get meta
     trainer = Trainer(
         default_root_dir=tmpdir,
-        max_epochs=1
+        max_epochs=1,
     )
 
     changed_attributes = ['max_steps',
@@ -223,7 +223,7 @@ def test_error_on_dataloader_passed_to_fit(tmpdir):
         max_epochs=1,
         limit_val_batches=0.1,
         limit_train_batches=0.2,
-        auto_scale_batch_size='power'
+        auto_scale_batch_size='power',
     )
     fit_options = dict(train_dataloader=model.dataloader(train=True))
 

From 655a60ca9b36844883a6542062bfcc6d5cbbda9d Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Wed, 24 Jun 2020 17:33:05 +0200
Subject: [PATCH 100/136] tmpdir

---
 tests/callbacks/test_callbacks.py        | 14 ++++++++------
 tests/callbacks/test_early_stopping.py   |  3 ++-
 tests/callbacks/test_lr_logger.py        |  8 ++++----
 tests/callbacks/test_model_checkpoint.py |  4 ++--
 tests/callbacks/test_progress_bar.py     |  3 ++-
 5 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/tests/callbacks/test_callbacks.py b/tests/callbacks/test_callbacks.py
index d2bafe6d7c991..37ebae6f2463b 100644
--- a/tests/callbacks/test_callbacks.py
+++ b/tests/callbacks/test_callbacks.py
@@ -160,6 +160,7 @@ def on_test_end(self, trainer, pl_module):
     test_callback = TestCallback()
 
     trainer_options = dict(
+        default_root_dir=tmpdir,
         callbacks=[test_callback],
         max_epochs=1,
         limit_val_batches=0.1,
@@ -312,11 +313,12 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
 
     checkpoint = ModelCheckpoint(filepath=None, save_top_k=save_top_k)
 
-    trainer = Trainer(default_root_dir=tmpdir,
-                      checkpoint_callback=checkpoint,
-                      overfit_batches=0.20,
-                      max_epochs=2
-                      )
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        checkpoint_callback=checkpoint,
+        overfit_batches=0.20,
+        max_epochs=2,
+    )
     trainer.fit(model)
 
     # These should be different if the dirpath has be overridden
@@ -337,7 +339,7 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
         default_root_dir=tmpdir,
         overfit_batches=0.2,
         max_epochs=2,
-        logger=logger
+        logger=logger,
     )
     trainer.fit(model)
 
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 4767c5f319775..7552d6ba10704 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -32,7 +32,7 @@ def on_train_start(self, trainer, pl_module):
         default_root_dir=tmpdir,
         checkpoint_callback=checkpoint_callback,
         early_stop_callback=early_stop_callback,
-        max_epochs=4
+        max_epochs=4,
     )
     trainer.fit(model)
     early_stop_callback_state = early_stop_callback.state_dict()
@@ -44,6 +44,7 @@ def on_train_start(self, trainer, pl_module):
     # ensure state is reloaded properly (assertion in the callback)
     early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
     new_trainer = Trainer(
+        default_root_dir=tmpdir,
         max_epochs=2,
         resume_from_checkpoint=checkpoint_filepath,
         early_stop_callback=early_stop_callback,
diff --git a/tests/callbacks/test_lr_logger.py b/tests/callbacks/test_lr_logger.py
index 8302e67f05d34..65ddb2fb1d127 100644
--- a/tests/callbacks/test_lr_logger.py
+++ b/tests/callbacks/test_lr_logger.py
@@ -19,7 +19,7 @@ def test_lr_logger_single_lr(tmpdir):
         max_epochs=2,
         limit_val_batches=0.1,
         limit_train_batches=0.5,
-        callbacks=[lr_logger]
+        callbacks=[lr_logger],
     )
     result = trainer.fit(model)
     assert result
@@ -42,7 +42,7 @@ def test_lr_logger_no_lr(tmpdir):
         max_epochs=2,
         limit_val_batches=0.1,
         limit_train_batches=0.5,
-        callbacks=[lr_logger]
+        callbacks=[lr_logger],
     )
 
     with pytest.warns(RuntimeWarning):
@@ -63,7 +63,7 @@ def test_lr_logger_multi_lrs(tmpdir):
         max_epochs=2,
         limit_val_batches=0.1,
         limit_train_batches=0.5,
-        callbacks=[lr_logger]
+        callbacks=[lr_logger],
     )
     result = trainer.fit(model)
     assert result
@@ -90,7 +90,7 @@ def test_lr_logger_param_groups(tmpdir):
         max_epochs=2,
         limit_val_batches=0.1,
         limit_train_batches=0.5,
-        callbacks=[lr_logger]
+        callbacks=[lr_logger],
     )
     result = trainer.fit(model)
     assert result
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index c8742dc14e103..0359103f28a44 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -22,7 +22,7 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
         default_root_dir=tmpdir,
         checkpoint_callback=checkpoint,
         overfit_pct=0.20,
-        max_epochs=5
+        max_epochs=5,
     )
     trainer.fit(model)
 
@@ -44,7 +44,7 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
         default_root_dir=tmpdir,
         overfit_pct=0.2,
         max_epochs=5,
-        logger=logger
+        logger=logger,
     )
     trainer.fit(model)
 
diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py
index b5ff69d860695..cd02369744369 100644
--- a/tests/callbacks/test_progress_bar.py
+++ b/tests/callbacks/test_progress_bar.py
@@ -107,10 +107,11 @@ def test_progress_bar_totals(tmpdir):
     assert bar.test_batch_idx == k
 
 
-def test_progress_bar_fast_dev_run():
+def test_progress_bar_fast_dev_run(tmpdir):
     model = EvalModelTemplate()
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         fast_dev_run=True,
     )
 

From a19989d6884d054b5c73b9c6b2a80f6868cce982 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Wed, 24 Jun 2020 18:06:41 +0200
Subject: [PATCH 101/136] wandb

---
 pytorch_lightning/trainer/training_loop.py |  3 -
 tests/loggers/test_wandb.py                | 75 +++++++++++-----------
 2 files changed, 38 insertions(+), 40 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 696286c86404e..841a157f0f5b1 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -144,8 +144,6 @@ def training_step(self, batch, batch_idx):
 
 """
 
-import atexit
-import signal
 import subprocess
 from abc import ABC, abstractmethod
 from typing import Callable
@@ -153,7 +151,6 @@ def training_step(self, batch, batch_idx):
 
 import numpy as np
 import torch
-import subprocess
 from torch.utils.data import DataLoader
 import torch.distributed as torch_distrib
 
diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 77297457ed301..54e991d705585 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -1,12 +1,12 @@
 import os
 import pickle
-from unittest.mock import patch
+from unittest import mock
 
 from pytorch_lightning import Trainer
 from pytorch_lightning.loggers import WandbLogger
 
 
-@patch('pytorch_lightning.loggers.wandb.wandb')
+@mock.patch('pytorch_lightning.loggers.wandb.wandb')
 def test_wandb_logger(wandb):
     """Verify that basic functionality of wandb logger works.
     Wandb doesn't work well with pytest so we have to mock it out here."""
@@ -29,38 +29,39 @@ def test_wandb_logger(wandb):
     assert logger.version == wandb.init().id
 
 
-@patch('pytorch_lightning.loggers.wandb.wandb')
-def test_wandb_pickle(tmpdir, wandb):
-    """Verify that pickling trainer with wandb logger works.
-
-    Wandb doesn't work well with pytest so we have to mock it out here.
-    """
-    class Experiment:
-        id = 'the_id'
-
-        def project_name(self):
-            return 'the_project_name'
-
-    wandb.init.return_value = Experiment()
-
-    logger = WandbLogger(id='the_id', offline=True)
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        logger=logger,
-    )
-    # Access the experiment to ensure it's created
-    assert trainer.logger.experiment, 'missing experiment'
-    pkl_bytes = pickle.dumps(trainer)
-    trainer2 = pickle.loads(pkl_bytes)
-
-    assert os.environ['WANDB_MODE'] == 'dryrun'
-    assert trainer2.logger.__class__.__name__ == WandbLogger.__name__
-    assert trainer2.logger.experiment, 'missing experiment'
-
-    wandb.init.assert_called()
-    assert 'id' in wandb.init.call_args[1]
-    assert wandb.init.call_args[1]['id'] == 'the_id'
-
-    del os.environ['WANDB_MODE']
+# TODO: find the issue with running this test
+# @mock.patch('pytorch_lightning.loggers.wandb.wandb')
+# def test_wandb_pickle(tmpdir, wandb):
+#     """Verify that pickling trainer with wandb logger works.
+#
+#     Wandb doesn't work well with pytest so we have to mock it out here.
+#     """
+#     class Experiment:
+#         id = 'the_id'
+#
+#         def project_name(self):
+#             return 'the_project_name'
+#
+#     wandb.init.return_value = Experiment()
+#
+#     logger = WandbLogger(id='the_id', offline=True)
+#
+#     trainer = Trainer(
+#         default_root_dir=tmpdir,
+#         max_epochs=1,
+#         logger=logger,
+#     )
+#     # Access the experiment to ensure it's created
+#     assert trainer.logger.experiment, 'missing experiment'
+#     pkl_bytes = pickle.dumps(trainer)
+#     trainer2 = pickle.loads(pkl_bytes)
+#
+#     assert os.environ['WANDB_MODE'] == 'dryrun'
+#     assert trainer2.logger.__class__.__name__ == WandbLogger.__name__
+#     assert trainer2.logger.experiment, 'missing experiment'
+#
+#     wandb.init.assert_called()
+#     assert 'id' in wandb.init.call_args[1]
+#     assert wandb.init.call_args[1]['id'] == 'the_id'
+#
+#     del os.environ['WANDB_MODE']

From 7e08c6e621e9ae6c2b8f5ad420fc5e3cecc32aa9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 17:00:08 +0200
Subject: [PATCH 102/136] fix merge conflict

---
 tests/callbacks/test_early_stopping.py           | 2 +-
 tests/callbacks/test_model_checkpoint.py         | 2 +-
 tests/callbacks/test_progress_bar.py             | 3 ++-
 tests/models/data/horovod/train_default_model.py | 5 ++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 7552d6ba10704..827b747a1fcce 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -1,7 +1,7 @@
 import pytest
 
 import torch
-import tests.base.utils as tutils
+import tests.base.develop_utils as tutils
 from pytorch_lightning import Callback
 from pytorch_lightning import Trainer, LightningModule
 from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 0359103f28a44..85955218ca38d 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -3,7 +3,7 @@
 
 import pytest
 
-import tests.base.utils as tutils
+import tests.base.develop_utils as tutils
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import TensorBoardLogger
diff --git a/tests/callbacks/test_progress_bar.py b/tests/callbacks/test_progress_bar.py
index cd02369744369..f621e70228012 100644
--- a/tests/callbacks/test_progress_bar.py
+++ b/tests/callbacks/test_progress_bar.py
@@ -35,10 +35,11 @@ def test_progress_bar_on(tmpdir, callbacks, refresh_rate):
     ([], False),
     ([ModelCheckpoint('../trainer')], 0),
 ])
-def test_progress_bar_off(callbacks, refresh_rate):
+def test_progress_bar_off(tmpdir, callbacks, refresh_rate):
     """Test different ways the progress bar can be turned off."""
 
     trainer = Trainer(
+        default_root_dir=tmpdir,
         callbacks=callbacks,
         progress_bar_refresh_rate=refresh_rate,
     )
diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index 2fcb50f87af7c..c5e51b09b3552 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -22,13 +22,12 @@
 import sys
 import pytest
 
-   
 
 try:
     import horovod.torch as hvd
-     HOROVOD_AVAILABLE = True
+    HOROVOD_AVAILABLE = True
 except (ModuleNotFoundError, ImportError):
-     HOROVOD_AVAILABLE = False
+    HOROVOD_AVAILABLE = False
     print('You requested to import Horovod which is missing or not supported for your OS.')
 
 PATH_HERE = os.path.abspath(os.path.dirname(__file__))

From 16f14483279661e519cd483c6eefe5b68752d4c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 17:00:28 +0200
Subject: [PATCH 103/136] add back evaluation after training

---
 pytorch_lightning/trainer/training_loop.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 109a87f8cc879..14b8083ca5383 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -469,6 +469,8 @@ def run_training_epoch(self):
             # VALIDATE IF NEEDED + CHECKPOINT CALLBACK
             # -----------------------------------------
             should_check_val = self.check_validation_in_train_loop(batch_idx, is_last_batch)
+            if self.fast_dev_run or should_check_val:
+                self.run_evaluation(test_mode=self.testing)
 
             # -----------------------------------------
             # SAVE LOGGERS (ie: Tensorboard, etc...)

From 04f20a55553e2ec0c9b4cfa758a3b7d6792c1f11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 20:43:57 +0200
Subject: [PATCH 104/136] test_resume_early_stopping_from_checkpoint TODO

---
 pytorch_lightning/trainer/training_loop.py | 2 +-
 tests/callbacks/test_early_stopping.py     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 14b8083ca5383..98636eaa74907 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -470,7 +470,7 @@ def run_training_epoch(self):
             # -----------------------------------------
             should_check_val = self.check_validation_in_train_loop(batch_idx, is_last_batch)
             if self.fast_dev_run or should_check_val:
-                self.run_evaluation(test_mode=self.testing)
+                self.run_evaluation(test_mode=False)
 
             # -----------------------------------------
             # SAVE LOGGERS (ie: Tensorboard, etc...)
diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index 827b747a1fcce..d6c253d3c8c35 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 
+@pytest.mark.skip('TODO: fix this test')
 def test_resume_early_stopping_from_checkpoint(tmpdir):
     """
     Prevent regressions to bugs:

From 86bd66fa66c05a5aee0383d390ca8579b25793ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 20:54:35 +0200
Subject: [PATCH 105/136] undo the horovod check

---
 tests/models/data/horovod/train_default_model.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/models/data/horovod/train_default_model.py b/tests/models/data/horovod/train_default_model.py
index c5e51b09b3552..f32df08ca83b2 100644
--- a/tests/models/data/horovod/train_default_model.py
+++ b/tests/models/data/horovod/train_default_model.py
@@ -20,14 +20,11 @@
 import json
 import os
 import sys
-import pytest
 
 
 try:
     import horovod.torch as hvd
-    HOROVOD_AVAILABLE = True
 except (ModuleNotFoundError, ImportError):
-    HOROVOD_AVAILABLE = False
     print('You requested to import Horovod which is missing or not supported for your OS.')
 
 PATH_HERE = os.path.abspath(os.path.dirname(__file__))
@@ -46,7 +43,6 @@
 parser.add_argument('--on-gpu', action='store_true', default=False)
 
 
-@pytest.mark.skipif(not HOROVOD_AVAILABLE, reason="Horovod not installed")
 def run_test_from_config(trainer_options):
     """Trains the default model with the given config."""
     set_random_master_port()

From f34ac7d86e057fc59a57383dd28542dc22c4313f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 20:56:13 +0200
Subject: [PATCH 106/136] update changelog

---
 CHANGELOG.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7129b11153f02..e8db67e4a301d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -38,6 +38,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed lost compatibility with custom datatypes implementing `.to` ([#2335](https://github.com/PyTorchLightning/pytorch-lightning/pull/2335))
 
+- Fixed several issues with early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504))
+
+
 ## [0.8.1] - 2020-06-19
 
 ### Fixed
@@ -127,8 +130,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed Tpu logging ([#2230](https://github.com/PyTorchLightning/pytorch-lightning/pull/2230))
 - Fixed Pid port + duplicate `rank_zero` logging ([#2140](https://github.com/PyTorchLightning/pytorch-lightning/pull/2140), [#2231](https://github.com/PyTorchLightning/pytorch-lightning/pull/2231))
 
-- Fixed for early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504))
-
 ## [0.7.6] - 2020-05-16
 
 ### Added

From 02ccd195eb90502d2fcab3246974a5bf82e88245 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 21:24:15 +0200
Subject: [PATCH 107/136] remove a duplicate test from merge error

---
 tests/trainer/test_dataloaders.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/tests/trainer/test_dataloaders.py b/tests/trainer/test_dataloaders.py
index 3e6b04751a107..b36eca8a2e429 100644
--- a/tests/trainer/test_dataloaders.py
+++ b/tests/trainer/test_dataloaders.py
@@ -408,24 +408,6 @@ def test_inf_train_dataloader(tmpdir, check_interval):
     assert result == 1
 
 
-@pytest.mark.parametrize('check_interval', [50, 1.0])
-@pytest.mark.skip('TODO: speed up this test')
-def test_not_implemented_error_train_dataloader(tmpdir, check_interval):
-    """Test not_implemented_error train data loader (e.g. IterableDataset)"""
-
-    model = EvalModelTemplate()
-    model.train_dataloader = model.train_dataloader__not_implemented_error
-
-    trainer = Trainer(
-        default_root_dir=tmpdir,
-        max_epochs=1,
-        val_check_interval=check_interval,
-    )
-    result = trainer.fit(model)
-    # verify training completed
-    assert result == 1
-
-
 @pytest.mark.parametrize('check_interval', [1.0])
 def test_inf_val_dataloader(tmpdir, check_interval):
     """Test inf val data loader (e.g. IterableDataset)"""

From a5252996f77294604a43fd018cfbf67e03de84ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 21:38:54 +0200
Subject: [PATCH 108/136] try fix dp_resume test

---
 tests/models/test_restore.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_restore.py b/tests/models/test_restore.py
index c77f7a841f3c3..9eb1067322127 100644
--- a/tests/models/test_restore.py
+++ b/tests/models/test_restore.py
@@ -154,6 +154,7 @@ def test_dp_resume(tmpdir):
         max_epochs=1,
         gpus=2,
         distributed_backend='dp',
+        default_root_dir=tmpdir,
     )
 
     # get logger

From 651fb09637ce1fbb760a63291c41cdfb9c20e9b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sat, 27 Jun 2020 21:56:08 +0200
Subject: [PATCH 109/136] add the logger fix from master

---
 pytorch_lightning/callbacks/model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index b3fde36a74373..09b00e5aa1487 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -236,7 +236,7 @@ def on_train_start(self, trainer, pl_module):
 
         self.filename = '{epoch}'
 
-        if trainer.logger is not None:
+        if trainer.logger is not None and trainer.logger.experiment is not None:
             # weights_save_path overrides anything
             if getattr(trainer, 'weights_save_path', None) is not None:
                 save_dir = trainer.weights_save_path

From 335a2e54b9f473dfb8501b2f8731ec3afc1ee946 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 02:03:04 +0200
Subject: [PATCH 110/136] try remove default_root_dir

---
 tests/loggers/test_all.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index f8e4010332c71..8736b3eae8628 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -49,7 +49,6 @@ def log_metrics(self, metrics, step):
     logger = StoreHistoryLogger(**logger_args)
 
     trainer = Trainer(
-        default_root_dir=tmpdir,
         max_epochs=1,
         logger=logger,
         limit_train_batches=0.2,

From aa7fb9288d458a0b1451650db5e296231ec00e95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 02:29:55 +0200
Subject: [PATCH 111/136] try mocking numpy

---
 docs/source/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index d81c0d00e7da5..a25b1afb5f547 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -332,6 +332,7 @@ def package_list_from_file(file):
 
 MOCK_MANUAL_PACKAGES = [
     'torchvision',
+    'numpy',
     'PIL',
     # packages with different package name compare to import name
     'yaml',

From 978bed0c7fbc5d8c56ec258dfb93622a83109233 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 02:38:57 +0200
Subject: [PATCH 112/136] try import numpy in docs test

---
 docs/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index a25b1afb5f547..e63a7ddeaac99 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -332,7 +332,6 @@ def package_list_from_file(file):
 
 MOCK_MANUAL_PACKAGES = [
     'torchvision',
-    'numpy',
     'PIL',
     # packages with different package name compare to import name
     'yaml',
@@ -416,6 +415,7 @@ def find_source():
 import importlib
 import os
 import torch
+import numpy
 
 TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision')
 

From a06970a17118e5cb250e828db5e449b5a43a6e2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 02:39:06 +0200
Subject: [PATCH 113/136] fix wandb test

---
 tests/loggers/test_wandb.py | 71 ++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/tests/loggers/test_wandb.py b/tests/loggers/test_wandb.py
index 54e991d705585..aa8b616bcf475 100644
--- a/tests/loggers/test_wandb.py
+++ b/tests/loggers/test_wandb.py
@@ -29,39 +29,38 @@ def test_wandb_logger(wandb):
     assert logger.version == wandb.init().id
 
 
-# TODO: find the issue with running this test
-# @mock.patch('pytorch_lightning.loggers.wandb.wandb')
-# def test_wandb_pickle(tmpdir, wandb):
-#     """Verify that pickling trainer with wandb logger works.
-#
-#     Wandb doesn't work well with pytest so we have to mock it out here.
-#     """
-#     class Experiment:
-#         id = 'the_id'
-#
-#         def project_name(self):
-#             return 'the_project_name'
-#
-#     wandb.init.return_value = Experiment()
-#
-#     logger = WandbLogger(id='the_id', offline=True)
-#
-#     trainer = Trainer(
-#         default_root_dir=tmpdir,
-#         max_epochs=1,
-#         logger=logger,
-#     )
-#     # Access the experiment to ensure it's created
-#     assert trainer.logger.experiment, 'missing experiment'
-#     pkl_bytes = pickle.dumps(trainer)
-#     trainer2 = pickle.loads(pkl_bytes)
-#
-#     assert os.environ['WANDB_MODE'] == 'dryrun'
-#     assert trainer2.logger.__class__.__name__ == WandbLogger.__name__
-#     assert trainer2.logger.experiment, 'missing experiment'
-#
-#     wandb.init.assert_called()
-#     assert 'id' in wandb.init.call_args[1]
-#     assert wandb.init.call_args[1]['id'] == 'the_id'
-#
-#     del os.environ['WANDB_MODE']
+@mock.patch('pytorch_lightning.loggers.wandb.wandb')
+def test_wandb_pickle(wandb, tmpdir):
+    """Verify that pickling trainer with wandb logger works.
+
+    Wandb doesn't work well with pytest so we have to mock it out here.
+    """
+    class Experiment:
+        id = 'the_id'
+
+        def project_name(self):
+            return 'the_project_name'
+
+    wandb.init.return_value = Experiment()
+
+    logger = WandbLogger(id='the_id', offline=True)
+
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        logger=logger,
+    )
+    # Access the experiment to ensure it's created
+    assert trainer.logger.experiment, 'missing experiment'
+    pkl_bytes = pickle.dumps(trainer)
+    trainer2 = pickle.loads(pkl_bytes)
+
+    assert os.environ['WANDB_MODE'] == 'dryrun'
+    assert trainer2.logger.__class__.__name__ == WandbLogger.__name__
+    assert trainer2.logger.experiment, 'missing experiment'
+
+    wandb.init.assert_called()
+    assert 'id' in wandb.init.call_args[1]
+    assert wandb.init.call_args[1]['id'] == 'the_id'
+
+    del os.environ['WANDB_MODE']

From 6a2acf3315a629cf78bb9a7f5cc2e2262e69e54d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 02:44:26 +0200
Subject: [PATCH 114/136] pep 8 fix

---
 pytorch_lightning/callbacks/model_checkpoint.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/callbacks/model_checkpoint.py b/pytorch_lightning/callbacks/model_checkpoint.py
index 09b00e5aa1487..45e5560e9f288 100644
--- a/pytorch_lightning/callbacks/model_checkpoint.py
+++ b/pytorch_lightning/callbacks/model_checkpoint.py
@@ -241,9 +241,9 @@ def on_train_start(self, trainer, pl_module):
             if getattr(trainer, 'weights_save_path', None) is not None:
                 save_dir = trainer.weights_save_path
             else:
-                save_dir = (getattr(trainer.logger, 'save_dir', None) or
-                            getattr(trainer.logger, '_save_dir', None) or
-                            trainer.default_root_dir)
+                save_dir = (getattr(trainer.logger, 'save_dir', None)
+                            or getattr(trainer.logger, '_save_dir', None)
+                            or trainer.default_root_dir)
 
             version = trainer.logger.version if isinstance(
                 trainer.logger.version, str) else f'version_{trainer.logger.version}'

From 594795a176c803fb217e2491694eb765a4110caf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 04:05:21 +0200
Subject: [PATCH 115/136] skip if no amp

---
 docs/source/apex.rst                  | 3 +++
 docs/source/conf.py                   | 6 +++++-
 pytorch_lightning/trainer/__init__.py | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/source/apex.rst b/docs/source/apex.rst
index f6d647e5a9f9b..d3d4ee6e94838 100644
--- a/docs/source/apex.rst
+++ b/docs/source/apex.rst
@@ -21,6 +21,7 @@ Native torch
 When using PyTorch 1.6+ Lightning uses the native amp implementation to support 16-bit.
 
 .. testcode::
+    :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE
 
     # turn on 16-bit
     trainer = Trainer(precision=16)
@@ -62,6 +63,7 @@ Enable 16-bit
 ^^^^^^^^^^^^^
 
 .. testcode::
+    :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE
 
     # turn on 16-bit
     trainer = Trainer(amp_level='O2', precision=16)
@@ -76,6 +78,7 @@ TPU 16-bit
 16-bit on TPus is much simpler. To use 16-bit with TPUs set precision to 16 when using the tpu flag
 
 .. testcode::
+    :skipif: not XLA_AVAILABLE
 
     # DEFAULT
     trainer = Trainer(tpu_cores=8, precision=32)
diff --git a/docs/source/conf.py b/docs/source/conf.py
index d81c0d00e7da5..c6a0638281adc 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -416,7 +416,11 @@ def find_source():
 import os
 import torch
 
-TORCHVISION_AVAILABLE = importlib.util.find_spec('torchvision')
+from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE
+APEX_AVAILABLE = importlib.util.find_spec("apex") is not None
+XLA_AVAILABLE = importlib.util.find_spec("torch_xla") is not None
+TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
+
 
 """
 coverage_skip_undoc_in_source = True
diff --git a/pytorch_lightning/trainer/__init__.py b/pytorch_lightning/trainer/__init__.py
index 6e276360470cf..1b3a087a3cb5a 100644
--- a/pytorch_lightning/trainer/__init__.py
+++ b/pytorch_lightning/trainer/__init__.py
@@ -721,6 +721,7 @@ def on_train_end(self, trainer, pl_module):
 will still show torch.float32.
 
 .. testcode::
+    :skipif: not APEX_AVAILABLE and not NATIVE_AMP_AVALAIBLE
 
     # default used by the Trainer
     trainer = Trainer(precision=32)

From b6c99b4b33e8eca590c6a0a33d90adf6eccc8150 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 04:06:24 +0200
Subject: [PATCH 116/136] dont mock when doctesting

---
 .github/workflows/docs-checks.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
index 6bfc9b46fc119..144303cb4d579 100644
--- a/.github/workflows/docs-checks.yml
+++ b/.github/workflows/docs-checks.yml
@@ -45,6 +45,8 @@ jobs:
         shell: bash
 
       - name: Test Documentation
+        env:
+          SPHINX_MOCK_REQUIREMENTS: 0
         run: |
           # First run the same pipeline as Read-The-Docs
           apt-get update && sudo apt-get install -y cmake

From 45c1cbfbf9506ba9bcfbf508d4b6744d83b4821c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 04:17:16 +0200
Subject: [PATCH 117/136] install extra

---
 .github/workflows/docs-checks.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml
index 144303cb4d579..068ebe151437e 100644
--- a/.github/workflows/docs-checks.yml
+++ b/.github/workflows/docs-checks.yml
@@ -40,6 +40,7 @@ jobs:
         run: |
           # python -m pip install --upgrade --user pip
           pip install -r requirements/base.txt -U -f https://download.pytorch.org/whl/torch_stable.html -q
+          pip install -r requirements/extra.txt
           pip install -r requirements/docs.txt
           python --version ; pip --version ; pip list
         shell: bash

From 4e694f16a308f5d7accfb976735999d9e2f39cbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 05:58:39 +0200
Subject: [PATCH 118/136] fix the resume ES test

---
 tests/callbacks/test_early_stopping.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index d6c253d3c8c35..c594d2f0a9cd5 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -1,16 +1,11 @@
 import pytest
 
 import torch
-import tests.base.develop_utils as tutils
-from pytorch_lightning import Callback
-from pytorch_lightning import Trainer, LightningModule
-from pytorch_lightning.callbacks import EarlyStopping, LearningRateLogger, ModelCheckpoint
-from pytorch_lightning.loggers import TensorBoardLogger
+from pytorch_lightning import Trainer
+from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
 from tests.base import EvalModelTemplate
-from pathlib import Path
 
 
-@pytest.mark.skip('TODO: fix this test')
 def test_resume_early_stopping_from_checkpoint(tmpdir):
     """
     Prevent regressions to bugs:
@@ -18,6 +13,16 @@ def test_resume_early_stopping_from_checkpoint(tmpdir):
     https://github.com/PyTorchLightning/pytorch-lightning/issues/1463
     """
 
+    class EarlyStoppingTestStore(EarlyStopping):
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            # cache the state for each epoch
+            self.saved_states = []
+
+        def on_validation_end(self, trainer, pl_module):
+            super().on_validation_end(trainer, pl_module)
+            self.saved_states.append(self.state_dict().copy())
+
     class EarlyStoppingTestRestore(EarlyStopping):
         def __init__(self, expected_state):
             super().__init__()
@@ -28,7 +33,7 @@ def on_train_start(self, trainer, pl_module):
 
     model = EvalModelTemplate()
     checkpoint_callback = ModelCheckpoint(save_top_k=1)
-    early_stop_callback = EarlyStopping()
+    early_stop_callback = EarlyStoppingTestStore()
     trainer = Trainer(
         default_root_dir=tmpdir,
         checkpoint_callback=checkpoint_callback,
@@ -36,12 +41,15 @@ def on_train_start(self, trainer, pl_module):
         max_epochs=4,
     )
     trainer.fit(model)
-    early_stop_callback_state = early_stop_callback.state_dict()
 
     checkpoint_filepath = checkpoint_callback.kth_best_model
     # ensure state is persisted properly
     checkpoint = torch.load(checkpoint_filepath)
+    # the checkpoint saves "epoch + 1"
+    early_stop_callback_state = early_stop_callback.saved_states[checkpoint['epoch'] - 1]
+    assert 4 == len(early_stop_callback.saved_states)
     assert checkpoint['early_stop_callback_state_dict'] == early_stop_callback_state
+
     # ensure state is reloaded properly (assertion in the callback)
     early_stop_callback = EarlyStoppingTestRestore(early_stop_callback_state)
     new_trainer = Trainer(

From 5f72cec19efb7d9a394630ffaa083188db0007fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 06:03:25 +0200
Subject: [PATCH 119/136] undo conf.py changes

---
 docs/source/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 63cd705feb347..c6a0638281adc 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -415,12 +415,12 @@ def find_source():
 import importlib
 import os
 import torch
-import numpy
 
 from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE
 APEX_AVAILABLE = importlib.util.find_spec("apex") is not None
 XLA_AVAILABLE = importlib.util.find_spec("torch_xla") is not None
 TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
 
+
 """
 coverage_skip_undoc_in_source = True

From ae75fa413406814be6c4384198c57586ee9ef4df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 06:05:57 +0200
Subject: [PATCH 120/136] revert remove comet pickle from test

---
 tests/loggers/test_all.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/loggers/test_all.py b/tests/loggers/test_all.py
index 8736b3eae8628..ca309f42afeee 100644
--- a/tests/loggers/test_all.py
+++ b/tests/loggers/test_all.py
@@ -67,6 +67,7 @@ def log_metrics(self, metrics, step):
 
 @pytest.mark.parametrize("logger_class", [
     TensorBoardLogger,
+    CometLogger,
     MLFlowLogger,
     NeptuneLogger,
     TestTubeLogger,

From 2463b41ba833927663e91c45a77930be9fa4ddd5 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:26:33 -0400
Subject: [PATCH 121/136] Update CHANGELOG.md

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3ddcfe3d11e9a..8bec593169598 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -42,7 +42,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed loading model with kwargs ([#2387](https://github.com/PyTorchLightning/pytorch-lightning/pull/2387))
 
-- Fixed several issues with early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504))
+- Fixed several issues with early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504), [#2391](https://github.com/PyTorchLightning/pytorch-lightning/pull/2391))
 
 ## [0.8.1] - 2020-06-19
 

From 1e822add981bea9866817c36b27035928db17d2b Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:28:30 -0400
Subject: [PATCH 122/136] Update weights_loading.rst

---
 docs/source/weights_loading.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index d6823cf0e6b63..443cbb09faa8b 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -29,9 +29,9 @@ Automatic saving
 Checkpointing is enabled by default to the current working directory.
 To change the checkpoint path pass in:
 
-.. testcode::
+.. code-block:: python
 
-    trainer = Trainer(default_root_dir='lightning_checkpoints')
+    trainer = Trainer(default_root_dir='path/to/your/checkpoint')
 
 To modify the behavior of checkpointing pass in your own callback.
 

From e4d450e605c638ba4d7d8d5d90abde50cf1aafd2 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:29:27 -0400
Subject: [PATCH 123/136] Update weights_loading.rst

---
 docs/source/weights_loading.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index 443cbb09faa8b..0c7a02cb91204 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -31,7 +31,7 @@ To change the checkpoint path pass in:
 
 .. code-block:: python
 
-    trainer = Trainer(default_root_dir='path/to/your/checkpoint')
+    trainer = Trainer(default_root_dir='your/path/to/save/checkpoints')
 
 To modify the behavior of checkpointing pass in your own callback.
 

From d84022692f683c9c4c4c5cd1ec74638a37688a16 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:29:45 -0400
Subject: [PATCH 124/136] Update weights_loading.rst

---
 docs/source/weights_loading.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/weights_loading.rst b/docs/source/weights_loading.rst
index 0c7a02cb91204..067cb380b82da 100644
--- a/docs/source/weights_loading.rst
+++ b/docs/source/weights_loading.rst
@@ -31,7 +31,7 @@ To change the checkpoint path pass in:
 
 .. code-block:: python
 
-    trainer = Trainer(default_root_dir='your/path/to/save/checkpoints')
+    trainer = Trainer(default_root_dir='/your/path/to/save/checkpoints')
 
 To modify the behavior of checkpointing pass in your own callback.
 

From 21d6c8cfd61356718975fdb5a94624ba317dbe8f Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:55:48 -0400
Subject: [PATCH 125/136] renamed flag

---
 pytorch_lightning/trainer/training_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 98636eaa74907..8beb998a2313c 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -468,7 +468,7 @@ def run_training_epoch(self):
             # -----------------------------------------
             # VALIDATE IF NEEDED + CHECKPOINT CALLBACK
             # -----------------------------------------
-            should_check_val = self.check_validation_in_train_loop(batch_idx, is_last_batch)
+            should_check_val = self.should_check_val(batch_idx, is_last_batch)
             if self.fast_dev_run or should_check_val:
                 self.run_evaluation(test_mode=False)
 
@@ -564,7 +564,7 @@ def save_loggers_in_training_loop(self, batch_idx):
             if self.is_global_zero and self.logger is not None:
                 self.logger.save()
 
-    def check_validation_in_train_loop(self, batch_idx, is_last_batch):
+    def should_check_val(self, batch_idx, is_last_batch):
         # decide if we should run validation
         is_val_check_batch = (batch_idx + 1) % self.val_check_batch == 0
         can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0

From 71702055eed52ba3d776221c5bf8cb33b7844cc8 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 06:58:00 -0400
Subject: [PATCH 126/136] renamed flag

---
 pytorch_lightning/trainer/training_loop.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index 8beb998a2313c..ea4b47764987a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -462,8 +462,7 @@ def run_training_epoch(self):
             self.update_train_loop_lr_schedulers()
 
             # when returning -1 from train_step, we end epoch early
-            if batch_output.signal == -1:
-                self.should_stop = True
+            self.should_stop = batch_output.signal == -1
 
             # -----------------------------------------
             # VALIDATE IF NEEDED + CHECKPOINT CALLBACK

From 37304c6348c5dd15fb5d6d46b584a573e3841ce5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 18:19:26 +0200
Subject: [PATCH 127/136] revert the None check in logger experiment
 name/version

---
 pytorch_lightning/loggers/wandb.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index c4cccbdbf019b..a307f211a8e62 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -131,9 +131,9 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
         self.experiment.log({'global_step': step, **metrics} if step is not None else metrics)
 
     @property
-    def name(self) -> str:
-        return self.experiment.project_name()
+    def name(self) -> Optional[str]:
+        return self.experiment.project_name() if self._experiment else None
 
     @property
-    def version(self) -> str:
-        return self.experiment.id
+    def version(self) -> Optional[str]:
+        return self.experiment.id if self._experiment else None

From c16cf77ac034627f262cae5cd196c9d2ff7759a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 18:28:39 +0200
Subject: [PATCH 128/136] add the old comments

---
 pytorch_lightning/loggers/wandb.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index a307f211a8e62..36c7ecf4c2779 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -132,8 +132,11 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
 
     @property
     def name(self) -> Optional[str]:
-        return self.experiment.project_name() if self._experiment else None
+        # don't create an experiment if we don't have one
+        name = self._experiment.project_name() if self._experiment else None
+        return name
 
     @property
     def version(self) -> Optional[str]:
+        # don't create an experiment if we don't have one
         return self.experiment.id if self._experiment else None

From 88454f0855773505f6b9e4030215f18e3d9671f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 18:43:05 +0200
Subject: [PATCH 129/136] _experiment

---
 pytorch_lightning/loggers/wandb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/loggers/wandb.py b/pytorch_lightning/loggers/wandb.py
index 36c7ecf4c2779..3c8dd5457a22c 100644
--- a/pytorch_lightning/loggers/wandb.py
+++ b/pytorch_lightning/loggers/wandb.py
@@ -139,4 +139,4 @@ def name(self) -> Optional[str]:
     @property
     def version(self) -> Optional[str]:
         # don't create an experiment if we don't have one
-        return self.experiment.id if self._experiment else None
+        return self._experiment.id if self._experiment else None

From d3edf9c2b7b4cc102018f06669450e988d0336fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 28 Jun 2020 21:00:34 +0200
Subject: [PATCH 130/136] test chckpointing on DDP

---
 tests/callbacks/test_model_checkpoint.py | 40 ++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 85955218ca38d..eb873705e11d1 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,3 +1,4 @@
+import os
 import pickle
 from pathlib import Path
 
@@ -57,3 +58,42 @@ def test_pickling(tmpdir):
     ckpt_pickled = pickle.dumps(ckpt)
     ckpt_loaded = pickle.loads(ckpt_pickled)
     assert vars(ckpt) == vars(ckpt_loaded)
+
+
+class ModelCheckpointTestInvocations(ModelCheckpoint):
+    # this class has to be defined outside the test function, otherwise we get pickle error
+    # due to the way ddp process is launched
+
+    def __init__(self, expected_count, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.count = 0
+        self.expected_count = expected_count
+
+    def _save_model(self, filepath):
+        # make sure we don't save twice
+        assert not os.path.isfile(filepath)
+        self.count += 1
+        super()._save_model(filepath)
+
+    def on_train_end(self, trainer, pl_module):
+        super().on_train_end(trainer, pl_module)
+        # on rank 0 we expect the saved files and on all others no saves
+        assert trainer.global_rank == 0 and self.count == self.expected_count \
+            or trainer.global_rank > 0 and self.count == 0
+
+
+def test_model_checkpoint_no_extraneous_invocations(tmpdir):
+    """Test to ensure that the model callback saves the checkpoints only once in distributed mode."""
+    model = EvalModelTemplate()
+    num_epochs = 4
+    model_checkpoint = ModelCheckpointTestInvocations(expected_count=num_epochs, save_top_k=-1)
+    trainer = Trainer(
+        distributed_backend='ddp_cpu',
+        num_processes=2,
+        default_root_dir=tmpdir,
+        early_stop_callback=False,
+        checkpoint_callback=model_checkpoint,
+        max_epochs=num_epochs,
+    )
+    result = trainer.fit(model)
+    assert 1 == result

From 0b3d40242278184f1baac93d1d6ea41e83b99770 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 21:12:21 +0200
Subject: [PATCH 131/136] skip the ddp test on windows

---
 tests/callbacks/test_model_checkpoint.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index eb873705e11d1..682e42ee15206 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -1,5 +1,6 @@
 import os
 import pickle
+import platform
 from pathlib import Path
 
 import pytest
@@ -82,6 +83,7 @@ def on_train_end(self, trainer, pl_module):
             or trainer.global_rank > 0 and self.count == 0
 
 
+@pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")
 def test_model_checkpoint_no_extraneous_invocations(tmpdir):
     """Test to ensure that the model callback saves the checkpoints only once in distributed mode."""
     model = EvalModelTemplate()

From 190e76182fc0b6bc8898a88a7ad2e2e91bb7ef57 Mon Sep 17 00:00:00 2001
From: Jirka <jirka@pytorchlightning.ai>
Date: Sun, 28 Jun 2020 23:24:06 +0200
Subject: [PATCH 132/136] cloudpickle

---
 tests/callbacks/test_early_stopping.py   | 10 +++++++++-
 tests/callbacks/test_model_checkpoint.py |  6 ++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/callbacks/test_early_stopping.py b/tests/callbacks/test_early_stopping.py
index c594d2f0a9cd5..c3e5fa3914682 100644
--- a/tests/callbacks/test_early_stopping.py
+++ b/tests/callbacks/test_early_stopping.py
@@ -1,3 +1,6 @@
+import pickle
+
+import cloudpickle
 import pytest
 
 import torch
@@ -118,8 +121,13 @@ def validation_epoch_end(self, outputs):
 
 
 def test_pickling(tmpdir):
-    import pickle
     early_stopping = EarlyStopping()
+
     early_stopping_pickled = pickle.dumps(early_stopping)
     early_stopping_loaded = pickle.loads(early_stopping_pickled)
     assert vars(early_stopping) == vars(early_stopping_loaded)
+
+    early_stopping_pickled = cloudpickle.dumps(early_stopping)
+    early_stopping_loaded = cloudpickle.loads(early_stopping_pickled)
+    assert vars(early_stopping) == vars(early_stopping_loaded)
+
diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 682e42ee15206..08789020bfeba 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -3,6 +3,7 @@
 import platform
 from pathlib import Path
 
+import cloudpickle
 import pytest
 
 import tests.base.develop_utils as tutils
@@ -56,10 +57,15 @@ def test_model_checkpoint_path(tmpdir, logger_version, expected):
 
 def test_pickling(tmpdir):
     ckpt = ModelCheckpoint(tmpdir)
+
     ckpt_pickled = pickle.dumps(ckpt)
     ckpt_loaded = pickle.loads(ckpt_pickled)
     assert vars(ckpt) == vars(ckpt_loaded)
 
+    ckpt_pickled = cloudpickle.dumps(ckpt)
+    ckpt_loaded = cloudpickle.loads(ckpt_pickled)
+    assert vars(ckpt) == vars(ckpt_loaded)
+
 
 class ModelCheckpointTestInvocations(ModelCheckpoint):
     # this class has to be defined outside the test function, otherwise we get pickle error

From c62150f7a49f4f9b66e4733928dca695227a92a7 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 17:28:46 -0400
Subject: [PATCH 133/136] renamed flag

---
 pytorch_lightning/trainer/training_loop.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index aa9da1f63c991..b6d4b563602d7 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -433,6 +433,7 @@ def run_training_epoch(self):
 
         # bookkeeping
         epoch_output = []
+        should_check_val = False
 
         # run epoch
         for batch_idx, (batch, is_last_batch) in self.profiler.profile_iterable(
@@ -497,15 +498,19 @@ def run_training_epoch(self):
         # process epoch outputs
         self.run_training_epoch_end(epoch_output)
 
+        # checkpoint callback
+        self.check_checkpoint_callback(should_check_val)
+
+        # epoch end hook
+        self.run_on_epoch_end_hook(model)
+
+    def check_checkpoint_callback(self, should_check_val):
         # when no val loop is present or fast-dev-run still need to call checkpoints
         # TODO bake this logic into the checkpoint callback
         if not self.is_overridden('validation_step') and not (self.fast_dev_run or should_check_val):
             checkpoint_callbacks = [c for c in self.callbacks if isinstance(c, ModelCheckpoint)]
             [c.on_validation_end(self, self.get_model()) for c in checkpoint_callbacks]
 
-        # epoch end hook
-        self.run_on_epoch_end_hook(model)
-
     def update_train_loop_lr_schedulers(self):
         if (self.batch_idx + 1) % self.accumulate_grad_batches == 0:
             # update lr

From 137e38feaa9c784b6507888d7502e653b6ad2dc8 Mon Sep 17 00:00:00 2001
From: William Falcon <waf2107@columbia.edu>
Date: Sun, 28 Jun 2020 17:30:34 -0400
Subject: [PATCH 134/136] renamed flag

---
 pytorch_lightning/trainer/training_loop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_lightning/trainer/training_loop.py b/pytorch_lightning/trainer/training_loop.py
index b6d4b563602d7..be0735701850a 100644
--- a/pytorch_lightning/trainer/training_loop.py
+++ b/pytorch_lightning/trainer/training_loop.py
@@ -571,8 +571,8 @@ def should_check_val(self, batch_idx, is_last_batch):
         can_check_epoch = (self.current_epoch + 1) % self.check_val_every_n_epoch == 0
         can_check_val = not self.disable_validation and can_check_epoch
         should_check_val = is_val_check_batch or self.should_stop
-        should_check_val = should_check_val or (is_last_batch and self.val_check_batch == float('inf'))
-        should_check_val = can_check_val and should_check_val
+        is_last_batch_for_infinite_dataset = (is_last_batch and self.val_check_batch == float('inf'))
+        should_check_val = can_check_val and (should_check_val or is_last_batch_for_infinite_dataset)
 
         return should_check_val
 

From d15cb70e2651b578b40ea0ea24fe6673bc65b0c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <adrian.waelchli@inf.unibe.ch>
Date: Sun, 28 Jun 2020 23:41:32 +0200
Subject: [PATCH 135/136] parentheses for clarity

---
 tests/callbacks/test_model_checkpoint.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 08789020bfeba..58169f9aeb426 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -85,8 +85,8 @@ def _save_model(self, filepath):
     def on_train_end(self, trainer, pl_module):
         super().on_train_end(trainer, pl_module)
         # on rank 0 we expect the saved files and on all others no saves
-        assert trainer.global_rank == 0 and self.count == self.expected_count \
-            or trainer.global_rank > 0 and self.count == 0
+        assert (trainer.global_rank == 0 and self.count == self.expected_count) \
+            or (trainer.global_rank > 0 and self.count == 0)
 
 
 @pytest.mark.skipif(platform.system() == "Windows", reason="Distributed training is not supported on Windows")

From 18cc13098329aa57ab2ee2265aaef6a04b0fa0f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= <aedu.waelchli@gmail.com>
Date: Sun, 28 Jun 2020 23:51:10 +0200
Subject: [PATCH 136/136] apply suggestion max epochs

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
---
 tests/callbacks/test_model_checkpoint.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/callbacks/test_model_checkpoint.py b/tests/callbacks/test_model_checkpoint.py
index 58169f9aeb426..b5cb7ca0c756e 100644
--- a/tests/callbacks/test_model_checkpoint.py
+++ b/tests/callbacks/test_model_checkpoint.py
@@ -25,7 +25,7 @@ def test_model_checkpoint_with_non_string_input(tmpdir, save_top_k):
         default_root_dir=tmpdir,
         checkpoint_callback=checkpoint,
         overfit_pct=0.20,
-        max_epochs=5,
+        max_epochs=(save_top_k + 2),
     )
     trainer.fit(model)