Add non-existing resume_from_checkpoint acceptance for auto-resubmit #4402

Merged

67 commits
43f989d
Add empty resume_from_checkpoint acceptance #4366
tarepan Oct 27, 2020
21b60b7
Fix general error catch with focused file check
tarepan Oct 29, 2020
1b29438
Add fsspec HTTP extras
tarepan Oct 29, 2020
697ebc3
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Oct 31, 2020
fb418b8
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 2, 2020
0d9d763
Fix potential too much logging in DDP
tarepan Nov 3, 2020
742ce48
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 3, 2020
a128112
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 5, 2020
238c3e4
Add PR changelog
tarepan Nov 5, 2020
80500c7
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 6, 2020
02be71c
Add well-written argument explanation
tarepan Nov 9, 2020
6db62b2
Fix DDP-compatible restore logging
tarepan Nov 9, 2020
c1186e4
Fix utility import pathes
tarepan Nov 9, 2020
b6a3cd1
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 9, 2020
4bfc6ee
Refactor load step commentaries
tarepan Nov 9, 2020
913ab97
Refactor hpc ckpt suffix acquisition
tarepan Nov 9, 2020
41d2e32
Refactor restore/hpc_load match
tarepan Nov 9, 2020
e0e17b8
Refactor hpc load trial
tarepan Nov 9, 2020
7fbba16
Refactor checkpoint dir check
tarepan Nov 9, 2020
6710e6a
Refactor unneeded function nest
tarepan Nov 9, 2020
882ec2e
Refactor nested If
tarepan Nov 9, 2020
ececdea
Refactor duplicated cache clear
tarepan Nov 9, 2020
5f47685
Refactor attempt flow with if/elif
tarepan Nov 9, 2020
676b4ab
Fix pip8
tarepan Nov 9, 2020
cd2481a
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 12, 2020
3a63c90
Merge branch 'master' into refactor/load
tchaton Nov 16, 2020
30f4f7d
Refactor hook commentary
tarepan Nov 16, 2020
9fb14ac
Fix pep8
tarepan Nov 16, 2020
afcf339
Refactor hpc load checkpoint path acquisition
tarepan Nov 16, 2020
585c761
Fix pip8
tarepan Nov 16, 2020
d76ab46
Fix typo
tarepan Nov 18, 2020
936a186
Fix typo
tarepan Nov 18, 2020
1d3cf0b
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 18, 2020
b994660
Merge branch 'master' into refactor/load
tarepan Nov 27, 2020
a633327
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 27, 2020
0cfba08
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Nov 30, 2020
b466cf6
Merge branch 'master' into refactor/load
tarepan Nov 30, 2020
deeee09
Merge branch 'master' into feature/4366_non_existing_checkpoint
Borda Nov 30, 2020
557f104
Merge remote-tracking branch 'upstream/master' into feature/4366_non_…
tarepan Dec 2, 2020
b26fc83
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
tarepan Dec 2, 2020
f7a65f1
Merge branch 'master' into feature/4366_non_existing_checkpoint
SeanNaren Dec 5, 2020
37f8392
Fix doc
tarepan Dec 8, 2020
b5f980e
Refactor None Union type with Optional
tarepan Dec 8, 2020
104017e
Merge branch 'master' into refactor/load
Borda Dec 8, 2020
e6fbc54
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
tarepan Dec 8, 2020
7134708
Merge branch 'master' into feature/4366_non_existing_checkpoint
SeanNaren Dec 9, 2020
faa6d96
Merge branch 'master' into feature/4366_non_existing_checkpoint
SeanNaren Dec 9, 2020
eb0a716
Merge branch 'master' into feature/4366_non_existing_checkpoint
SeanNaren Dec 9, 2020
8e01141
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Dec 10, 2020
1e692ed
Merge branch 'master' into feature/4366_non_existing_checkpoint
s-rog Dec 12, 2020
4887cd5
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
tarepan Dec 13, 2020
9fcb140
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Dec 13, 2020
79da4ff
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
tarepan Dec 17, 2020
1ac9f6a
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Dec 17, 2020
7a7ec4b
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Dec 20, 2020
5c031f4
Merge branch 'master' into feature/4366_non_existing_checkpoint
Borda Dec 23, 2020
292662f
Merge branch 'master' into feature/4366_non_existing_checkpoint
s-rog Dec 28, 2020
70ea89c
Merge branch 'master' into feature/4366_non_existing_checkpoint
tarepan Jan 2, 2021
77bf8c1
Fix build-doc CI failure debuged in #5329
tarepan Jan 2, 2021
383e40e
Fix fsspec import during build-doc #5329
tarepan Jan 2, 2021
f6eb95a
Fix test epoch
tarepan Jan 2, 2021
743fe31
Fix test with latest test models
tarepan Jan 2, 2021
1444238
.
Borda Jan 4, 2021
b7bdd64
Merge remote-tracking branch 'upstream/master' into feature/4366_non_…
tarepan Jan 4, 2021
9f43e53
Refactor argument doc of resume_from_checkpoint
tarepan Jan 4, 2021
6093989
Fix package extras strip for sphinx
tarepan Jan 4, 2021
ca6e21e
Fix unnesessary dependency for docs
tarepan Jan 4, 2021
Files changed
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

+## Unreleased
+
+### Added
+
+- Added `resume_from_checkpoint` to accept a non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402))
+

## [1.1.3rc] - 2020-12-29
2 changes: 1 addition & 1 deletion environment.yml
@@ -30,7 +30,7 @@ dependencies:
  - future>=0.17.1
  - PyYAML>=5.1
  - tqdm>=4.41.0
-  - fsspec>=0.8.0
+  - fsspec[http]>=0.8.1
  #- tensorboard>=2.2.0 # not needed, already included in pytorch

  # Optional
12 changes: 10 additions & 2 deletions pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -43,7 +43,7 @@ def __init__(self, trainer):
        # used to validate checkpointing logic
        self.has_trained = False

-    def restore_weights(self, model: LightningModule):
+    def restore_weights(self, model: LightningModule) -> None:
        """
        Attempt to restore a checkpoint (e.g. weights) in this priority:
        1. from HPC weights
@@ -73,11 +73,16 @@ def restore_weights(self, model: LightningModule):
        if self.trainer.on_gpu:
            torch.cuda.empty_cache()

-    def restore(self, checkpoint_path: str, on_gpu: bool):
+    def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
        """
        Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore.
        All restored states are listed in return value description of `dump_checkpoint`.
        """
+        # Try to read the checkpoint file at `checkpoint_path`. If it does not exist, do not restore.
+        fs = get_filesystem(checkpoint_path)
+        if not fs.exists(checkpoint_path):
+            rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch")
+            return False

        # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
        checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
@@ -94,6 +99,9 @@ def restore(self, checkpoint_path: str, on_gpu: bool):
        # restore training state
        self.restore_training_state(checkpoint)

+        rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}")
+        return True
+
    def restore_model_state(self, model: LightningModule, checkpoint) -> None:
        """
        Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object
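The heart of the change is the filesystem probe above: `restore` now checks that the checkpoint exists before reading it, and reports the outcome via its new `bool` return value instead of raising. A minimal standalone sketch of that check, assuming a recent fsspec that provides `url_to_fs` (Lightning's `get_filesystem` helper performs similar protocol resolution):

    import fsspec

    def checkpoint_exists(checkpoint_path: str) -> bool:
        # Resolve a filesystem implementation for the path
        # (local paths, s3://, http://, ... are handled uniformly).
        fs, _ = fsspec.core.url_to_fs(checkpoint_path)
        # `exists` is uniform across fsspec backends, so the same probe
        # covers local checkpoints and remote URLs.
        return fs.exists(checkpoint_path)

Probing first keeps the failure mode a `rank_zero_warn` rather than an exception raised from deep inside the checkpoint loader.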
2 changes: 1 addition & 1 deletion pytorch_lightning/trainer/trainer.py
@@ -251,7 +251,7 @@ def __init__(
                you can set ``replace_sampler_ddp=False`` and add your own distributed sampler.

            resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
-                This can be a URL.
+                This can be a URL. If there is no checkpoint file at the path, start from scratch.

            sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
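For callers the contract is now: pass the path you would like to resume from, and training simply starts from scratch if nothing is there yet. A hedged usage sketch, where `MyModel` is a placeholder for any `LightningModule`:

    from pytorch_lightning import Trainer

    # Resumes from checkpoints/last.ckpt when it exists; otherwise logs a
    # warning and trains from scratch instead of raising.
    trainer = Trainer(resume_from_checkpoint="checkpoints/last.ckpt", max_epochs=10)
    trainer.fit(MyModel())

This is the shape auto-resubmitted cluster jobs need: every submission passes the same path, and only the first run starts fresh.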
2 changes: 1 addition & 1 deletion requirements.txt
@@ -6,5 +6,5 @@ future>=0.17.1 # required for builtins in setup.py
# pyyaml>=3.13
PyYAML>=5.1 # OmegaConf requirement >=5.1
tqdm>=4.41.0
-fsspec>=0.8.0
+fsspec[http]>=0.8.1
tensorboard>=2.2.0
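The `[http]` extra matters because `resume_from_checkpoint` may be a URL: it installs fsspec's HTTP filesystem dependencies so the same `exists`/read calls work over `http(s)://` paths. A sketch under that assumption (the URL below is a placeholder):

    from pytorch_lightning import Trainer

    # Requires fsspec[http]; the checkpoint is fetched over HTTP(S).
    trainer = Trainer(resume_from_checkpoint="https://example.com/checkpoints/last.ckpt")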
1 change: 1 addition & 0 deletions requirements/docs.txt
@@ -12,3 +12,4 @@ sphinx-autodoc-typehints
sphinx-paramlinks<0.4.0
sphinx-togglebutton
sphinx-copybutton
+fsspec[http]
12 changes: 12 additions & 0 deletions tests/models/test_restore.py
@@ -73,6 +73,18 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
    trainer.fit(model)


+def test_try_resume_from_non_existing_checkpoint(tmpdir):
+    """Test that trying to resume from a non-existing `resume_from_checkpoint` fails without error."""
+    model = EvalModelTemplate()
+    checkpoint_callback = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
+    trainer = Trainer(default_root_dir=tmpdir, max_epochs=2, logger=False, checkpoint_callback=checkpoint_callback)
+    # Generate checkpoint `last.ckpt` with the template model
+    trainer.fit(model)
+    # `restore` returns `True` on a successful restore and `False` when the file is missing
+    assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu)
+    assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu)
+
+
class CaptureCallbacksBeforeTraining(Callback):
    callbacks = []