diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py
index 55c9e0afe2..ffa0ff7225 100644
--- a/composer/trainer/trainer.py
+++ b/composer/trainer/trainer.py
@@ -2308,9 +2308,9 @@ def fit(
             ) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0'):
                 raise ValueError(
                     'PyTorch 2.4 breaks (distributed) checkpointing with SGD. '
-                    'Please use a different optimizer, e.g. composer.optim.DecoupledSGDW '
-                    'instead. See https://github.com/pytorch/pytorch/issues/133415 '
-                    'for further information.',
+                    'Please use a different optimizer, e.g. composer.optim.DecoupledSGDW, '
+                    'instead or downgrade to PyTorch <2.4. See '
+                    'https://github.com/pytorch/pytorch/issues/133415 for further information.',
                 )
 
         if self.state.max_duration is None:
diff --git a/examples/TPU_Training_in_composer.ipynb b/examples/TPU_Training_in_composer.ipynb
index ca68592d5a..94724e7410 100644
--- a/examples/TPU_Training_in_composer.ipynb
+++ b/examples/TPU_Training_in_composer.ipynb
@@ -58,7 +58,8 @@
     "# %pip install 'mosaicml @ git+https://github.com/mosaicml/composer.git'\"\n",
     "\n",
     "from composer import Trainer\n",
-    "from composer.models import ComposerClassifier"
+    "from composer.models import ComposerClassifier\n",
+    "from composer.optim import DecoupledSGDW"
    ]
   },
   {
@@ -166,10 +167,11 @@
     "\n",
     "model = model.to(xm.xla_device())\n",
     "\n",
-    "optimizer = torch.optim.SGD(\n",
+    "optimizer = DecoupledSGDW(\n",
     "    model.parameters(),\n",
     "    lr=0.02,\n",
-    "    momentum=0.9)\n"
+    "    momentum=0.9,\n",
+    ")"
    ]
   },
   {
diff --git a/examples/exporting_for_inference.ipynb b/examples/exporting_for_inference.ipynb
index ca4081c9f2..cf381dec5b 100644
--- a/examples/exporting_for_inference.ipynb
+++ b/examples/exporting_for_inference.ipynb
@@ -304,8 +304,9 @@
     "import torch\n",
     "from composer import Trainer\n",
     "from composer.algorithms import SqueezeExcite\n",
+    "from composer.optim import DecoupledSGDW\n",
     "\n",
-    "optimizer = torch.optim.SGD(model.parameters(), lr=0.01)\n",
+    "optimizer = DecoupledSGDW(model.parameters(), lr=0.01)\n",
     "scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)\n",
     "\n",
     "trainer = Trainer(\n",
@@ -318,7 +319,8 @@
     "    callbacks=[export_callback],\n",
     "    max_duration='2ep',\n",
     "    save_interval='1ep',\n",
-    "    save_overwrite=True)\n",
+    "    save_overwrite=True,\n",
+    ")\n",
     "trainer.fit()"
    ]
   },
diff --git a/examples/migrate_from_ptl.ipynb b/examples/migrate_from_ptl.ipynb
index a814f132db..efdf34d26e 100644
--- a/examples/migrate_from_ptl.ipynb
+++ b/examples/migrate_from_ptl.ipynb
@@ -301,9 +301,10 @@
    "source": [
     "from composer import Trainer\n",
     "from composer.algorithms import BlurPool\n",
+    "from composer.optim import DecoupledSGDW\n",
     "\n",
     "model = MosaicResnet()\n",
-    "optimizer = torch.optim.SGD(\n",
+    "optimizer = DecoupledSGDW(\n",
     "    model.parameters(),\n",
     "    lr=0.05,\n",
     "    momentum=0.9,\n",
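
For context, the swap these notebooks make is mechanical: DecoupledSGDW accepts the same constructor arguments as torch.optim.SGD, so it is a drop-in replacement that avoids the new trainer guard. A minimal self-contained sketch of the pattern (the tiny model, dataset, and their names are hypothetical placeholders, not part of this PR):

import torch
from torch.utils.data import DataLoader, TensorDataset

from composer import Trainer
from composer.models import ComposerClassifier
from composer.optim import DecoupledSGDW

# Tiny stand-in model and data so the example runs on its own.
net = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(16, 4))
model = ComposerClassifier(module=net, num_classes=4)

# Drop-in replacement: same arguments as torch.optim.SGD.
optimizer = DecoupledSGDW(model.parameters(), lr=0.02, momentum=0.9)

dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 4, (64,)))
trainer = Trainer(
    model=model,
    train_dataloader=DataLoader(dataset, batch_size=8),
    optimizers=optimizer,
    max_duration='1ep',
)
trainer.fit()  # no longer trips the PyTorch >= 2.4 SGD checkpointing guard

Note that the guard in trainer.py only fires when the optimizer's type is exactly torch.optim.SGD, so subclasses such as DecoupledSGDW pass through.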