Lightning-AI · carmocca · Jul 27, 2022 · Jul 26, 2022 · Jul 26, 2022 · Jul 26, 2022
@@ -73,6 +73,7 @@ jobs:
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install -e .[strategies]
+        pip install deepspeed==0.6.4  # TODO: remove when docker images are upgraded
         pip install --requirement requirements/pytorch/devel.txt
         pip list
       env:

@@ -1,5 +1,5 @@
 fairscale>=0.4.5, <=0.4.6
-deepspeed<0.6.0
+deepspeed>=0.6.0, <0.6.5
 # no need to install with [pytorch] as pytorch is already installed
 horovod>=0.21.2, !=0.24.0, <0.25.1
 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
@@ -407,15 +407,21 @@ def run(self):
             model = BoringModel()
             optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
             model, optimizer = self.setup(model, optimizer)
-            state_dict = deepcopy(model.state_dict())
 
-            for _ in range(2):
+            for i in range(2):
                 optimizer.zero_grad()
                 x = model(torch.randn(1, 32).to(self.device))
                 loss = x.sum()
+                if i == 0:
+                    # with stage 3, the weights are not initialized until backward has run once
+                    assert all(w.nelement() == 0 for w in model.state_dict().values())
                 self.backward(loss, model=model)
+                if i == 0:
+                    # save for later to check that the weights were updated
+                    state_dict = deepcopy(model.state_dict())
                 optimizer.step()
 
+            # check that the model trained: the weights saved after the first backward differ from the final weights
             for mw_b, mw_a in zip(state_dict.values(), model.state_dict().values()):
                 assert not torch.allclose(mw_b, mw_a)
 
@@ -433,6 +439,7 @@ def run(self):
             model_1, optimizer_1 = self.setup(model_1, optimizer_1)
             model_2, optimizer_2 = self.setup(model_2, optimizer_2)
 
+            # train model_1 first
             self.seed_everything(42)
             data_list = []
             for _ in range(2):
@@ -444,16 +451,19 @@ def run(self):
                 self.backward(loss, model=model_1)
                 optimizer_1.step()
 
-            for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()):
-                assert not torch.allclose(mw_1, mw_2)
+            # the weights do not match yet: model_1 is populated but model_2 is still empty
+            assert all(w.nelement() > 1 for w in model_1.state_dict().values())
+            assert all(w.nelement() == 0 for w in model_2.state_dict().values())
 
+            # now train model_2 with the same data
             for data in data_list:
                 optimizer_2.zero_grad()
                 x = model_2(data)
                 loss = x.sum()
                 self.backward(loss, model=model_2)
                 optimizer_2.step()
 
+            # the weights should match
             for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()):
                 assert torch.allclose(mw_1, mw_2)