Allow disabling automatic stopping after max_steps or max_epochs #8877

Merged
37 commits (the diff below shows changes from 6 of them):
379dda2  Update docstring + logic to disable automatic stopping  (EricWiener, Aug 12, 2021)
d506881  Add a test to check passing negative max_epochs  (EricWiener, Aug 12, 2021)
8d9e707  Updated logic for disabling automatic stopping  (EricWiener, Aug 27, 2021)
3a7cc8a  Updated test cases for max_epochs/max_steps + max_time  (EricWiener, Aug 27, 2021)
a0b8c61  Change brackets to parentheses  (EricWiener, Aug 27, 2021)
a239358  Corrected max_epoch error checking restore_loops  (EricWiener, Aug 27, 2021)
d201464  Validating max_epochs and max_steps  (EricWiener, Aug 27, 2021)
91422f5  Added parameterized tests for max_epochs + max_steps  (EricWiener, Aug 27, 2021)
c6562b4  Shortened timer to 1 sec from 10 sec  (EricWiener, Aug 27, 2021)
f7e8176  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Aug 27, 2021)
78a1eb8  Fix type error comparing to None  (EricWiener, Aug 28, 2021)
5558917  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Aug 28, 2021)
6c5439c  Added FitLoop._is_max_limit_enabled  (EricWiener, Aug 28, 2021)
cd7732a  Removed mentioning max_epochs in max_steps docstring  (EricWiener, Aug 28, 2021)
7209976  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Aug 28, 2021)
794bf31  Remove type signature on `max_value`  (EricWiener, Aug 29, 2021)
945b731  Remove type signature on return value  (EricWiener, Aug 29, 2021)
1b70637  Now checking that max vals are int (vs. not float)  (EricWiener, Aug 29, 2021)
6e53053  Condensed test_timer test  (EricWiener, Aug 29, 2021)
fcada92  Moved details desc of max_epochs/steps to trainer.rst  (EricWiener, Aug 29, 2021)
769c7fc  Shortened max_* desc in trainer.rst  (EricWiener, Aug 29, 2021)
4e6bed4  Update pytorch_lightning/trainer/connectors/checkpoint_connector.py  (awaelchli, Sep 1, 2021)
fe11371  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Sep 1, 2021)
e8a3174  Change brackets to paranthesis  (EricWiener, Sep 2, 2021)
1e13018  Update docs/source/common/trainer.rst  (EricWiener, Sep 2, 2021)
db53b8c  Update pytorch_lightning/trainer/trainer.py  (EricWiener, Sep 2, 2021)
b805ff5  No longer checking if max_epochs/steps is an int  (EricWiener, Sep 2, 2021)
4c5f5d8  Fixed test_trainer_max_steps_and_epochs_fit_loop_done  (EricWiener, Sep 3, 2021)
05ed3a3  Fix test_timer.py::test_trainer_flag  (EricWiener, Sep 3, 2021)
d21b009  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Sep 3, 2021)
4594284  Fixed test_trainer_max_steps_and_epochs_validation  (EricWiener, Sep 3, 2021)
318fec8  Decrease global step in tests/trainer/test_trainer.py  (EricWiener, Sep 3, 2021)
0c322ea  Change EvalModelTemplate to BoringModel  (EricWiener, Sep 3, 2021)
145c27b  Moved max_* validation into constructors  (EricWiener, Sep 3, 2021)
3b3f29f  [pre-commit.ci] auto fixes from pre-commit.com hooks  (pre-commit-ci[bot], Sep 3, 2021)
38b4436  Fix pre-commit  (carmocca, Sep 4, 2021)
1a17f87  Keep TODO at the top  (carmocca, Sep 4, 2021)
Files changed:
4 changes: 2 additions & 2 deletions pytorch_lightning/loops/fit_loop.py
@@ -132,8 +132,8 @@ def done(self) -> bool:
             or if the maximum number of steps or epochs is reached.
         """
         # TODO(@awaelchli): Move track steps inside training loop and move part of these condition inside training loop
-        stop_steps = self.max_steps is not None and self.global_step >= self.max_steps
-        stop_epochs = self.max_epochs is not None and self.current_epoch >= self.max_epochs
+        stop_steps = self.max_steps not in (None, -1) and self.global_step >= self.max_steps
+        stop_epochs = self.max_epochs not in (None, -1) and self.current_epoch >= self.max_epochs

         should_stop = False
         if self.trainer.should_stop:
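To make the new semantics concrete: a max_* limit now participates in the ``done`` check only when it is neither ``None`` nor ``-1``, so ``-1`` becomes an explicit "train forever" sentinel. A minimal standalone sketch of that rule (illustrative names; per the commit list, the PR later factors the check into ``FitLoop._is_max_limit_enabled``):

def is_limit_enabled(max_value):
    # A limit of None (unset) or -1 (explicitly disabled) never triggers stopping.
    return max_value not in (None, -1)

def should_stop(global_step, current_epoch, max_steps=None, max_epochs=None):
    stop_steps = is_limit_enabled(max_steps) and global_step >= max_steps
    stop_epochs = is_limit_enabled(max_epochs) and current_epoch >= max_epochs
    return stop_steps or stop_epochs

assert should_stop(100, 0, max_steps=100)                          # step limit reached
assert not should_stop(10**9, 10**6, max_steps=-1, max_epochs=-1)  # runs "forever"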
pytorch_lightning/trainer/connectors/checkpoint_connector.py
@@ -190,7 +190,7 @@ def restore_loops(self) -> None:
         self.trainer.fit_loop.current_epoch = self._loaded_checkpoint["epoch"]

         # crash if max_epochs is lower than the current epoch from the checkpoint
-        if self.trainer.max_epochs is not None and self.trainer.current_epoch > self.trainer.max_epochs:
+        if self.trainer.max_epochs not in (None, -1) and self.trainer.current_epoch > self.trainer.max_epochs:
             raise MisconfigurationException(
                 f"You restored a checkpoint with current_epoch={self.trainer.current_epoch},"
                 f" but you have set Trainer(max_epochs={self.trainer.max_epochs})."
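Isolated from the Trainer, the restored-checkpoint guard above behaves like this sketch (illustrative helper; only the error message and the None/-1 rule are taken from the diff):

def validate_restored_epoch(current_epoch, max_epochs):
    # With max_epochs=-1 ("train forever"), any restored epoch is acceptable;
    # only a real, finite limit can already be exceeded by the checkpoint.
    if max_epochs not in (None, -1) and current_epoch > max_epochs:
        raise ValueError(  # Lightning raises MisconfigurationException here
            f"You restored a checkpoint with current_epoch={current_epoch},"
            f" but you have set Trainer(max_epochs={max_epochs})."
        )

validate_restored_epoch(current_epoch=5, max_epochs=-1)   # ok: no limit
validate_restored_epoch(current_epoch=5, max_epochs=10)   # ok: under the limit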
7 changes: 6 additions & 1 deletion pytorch_lightning/trainer/trainer.py
@@ -262,11 +262,15 @@ def __init__(

             max_epochs: Stop training once this number of epochs is reached. Disabled by default (None).
                 If both max_epochs and max_steps are not specified, defaults to ``max_epochs`` = 1000.
+                To disable automatic stopping, set ``max_epochs = -1`` and set ``max_steps`` to ``None``
+                or ``-1``. Note that if the ``max_time`` limit is specified, it will still be observed.

             min_epochs: Force training for at least these many epochs. Disabled by default (None).
                 If both min_epochs and min_steps are not specified, defaults to ``min_epochs`` = 1.

-            max_steps: Stop training after this number of steps. Disabled by default (None).
+            max_steps: Stop training after this number of steps. Disabled by default (None). If ``max_steps = None``
+                and ``max_epochs = None``, will default to ``max_epochs = 1000``. To override this
+                behavior, see ``max_epochs``.

             min_steps: Force training for at least these number of steps. Disabled by default (None).
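In user code, the documented combinations look roughly like this (a usage sketch based on the docstring above, not code from the PR itself):

from pytorch_lightning import Trainer

# Disable automatic stopping entirely: train until interrupted.
trainer = Trainer(max_epochs=-1, max_steps=None)

# No epoch/step limits, but keep a wall-clock budget; max_time is still observed.
trainer = Trainer(max_epochs=-1, max_time={"hours": 12})

# Neither limit specified: falls back to max_epochs=1000.
trainer = Trainer()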

@@ -374,6 +378,7 @@ def __init__(
         self.slurm_connector = SLURMConnector(self)
         self.tuner = Tuner(self)

+        # max_epochs won't default to 1000 if max_steps/max_time are specified (including being set to -1).
         fit_loop = FitLoop(
             min_epochs=(1 if (min_epochs is None and min_steps is None and max_time is None) else min_epochs),
             max_epochs=(1000 if (max_epochs is None and max_steps is None and max_time is None) else max_epochs),
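The defaulting rule encoded in those two ternaries can be read as the following sketch (illustrative helper, not part of the PR):

def resolve_limits(min_epochs, min_steps, max_epochs, max_steps, max_time):
    # min_epochs defaults to 1 only when no minimum and no time budget is given.
    if min_epochs is None and min_steps is None and max_time is None:
        min_epochs = 1
    # max_epochs defaults to 1000 only when no other stopping limit exists;
    # an explicit -1, or any max_steps/max_time, suppresses the default.
    if max_epochs is None and max_steps is None and max_time is None:
        max_epochs = 1000
    return min_epochs, max_epochs

assert resolve_limits(None, None, None, None, None) == (1, 1000)
assert resolve_limits(None, None, -1, None, None) == (1, -1)               # "forever" kept
assert resolve_limits(None, None, None, None, {"hours": 1}) == (None, None)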
7 changes: 7 additions & 0 deletions tests/callbacks/test_timer.py
@@ -47,6 +47,13 @@ def on_fit_start(self):
     assert trainer.max_epochs is None
     assert trainer.max_steps is None

+    # Make sure max_time is still honored even if max_epochs == -1
+    trainer = Trainer(max_time=dict(seconds=10), max_epochs=-1)
+    with pytest.raises(SystemExit):
+        trainer.fit(TestModel())
+    timer = [c for c in trainer.callbacks if isinstance(c, Timer)][0]
+    assert timer._duration == 10


 @pytest.mark.parametrize(
     "duration,expected",
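For reference, the ``max_time`` flag exercised here configures Lightning's ``Timer`` callback under the hood, so an equivalent explicit setup looks like this (a sketch; ``Timer`` accepts a ``timedelta``, dict, or string duration):

from datetime import timedelta

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import Timer

# Same behavior as Trainer(max_time=dict(seconds=10), max_epochs=-1):
# no epoch limit, but the Timer stops training after 10 seconds.
trainer = Trainer(max_epochs=-1, callbacks=[Timer(duration=timedelta(seconds=10))])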
34 changes: 34 additions & 0 deletions tests/trainer/test_trainer.py
@@ -491,6 +491,40 @@ def test_trainer_max_steps_and_epochs(tmpdir):
     assert trainer.global_step == num_train_samples * trainer.max_epochs
     assert trainer.current_epoch == trainer.max_epochs - 1, "Model did not stop at max_epochs"

+    # if max_steps is positive and max_epochs is negative, use max_steps
+    trainer_kwargs["max_epochs"] = -1
+    trainer_kwargs["max_steps"] = 3 * 2 * num_train_samples
+    trainer = Trainer(**trainer_kwargs)
+    trainer.fit(model)
+
+    assert trainer.state.finished, f"Training failed with {trainer.state}"
+    assert trainer.global_step == 3 * 2 * num_train_samples
+
+    # if max_steps is 0 and max_epochs is negative, use max_steps
+    trainer_kwargs["max_epochs"] = -1
+    trainer_kwargs["max_steps"] = 0
+    trainer = Trainer(**trainer_kwargs)
+
+    assert trainer.done is True
+
+    # allow specifying max_epochs < 0 and max_steps = None. This should immediately stop
+    trainer_kwargs["max_epochs"] = -100
+    trainer_kwargs["max_steps"] = None
+    trainer = Trainer(**trainer_kwargs)
+
+    assert trainer.done is True
+
+    # Make sure various combinations work to disable automatic stopping
+    for x, y in [(-1, None), (None, -1), (None, None)]:
+        trainer_kwargs["max_epochs"] = x
+        trainer_kwargs["max_steps"] = y
+        trainer = Trainer(**trainer_kwargs)
+
+        assert trainer.max_epochs == x
+        assert trainer.max_steps == y
+        assert trainer.max_time is None
+        assert trainer.done is False


 def test_trainer_min_steps_and_epochs(tmpdir):
     """Verify model trains according to specified min steps"""