facebookresearch · blefaudeux · Sep 15, 2020 · Sep 12, 2020 · Sep 14, 2020 · Sep 14, 2020
diff --git a/fairscale/optim/oss.py b/fairscale/optim/oss.py
@@ -103,8 +103,11 @@ def step(self, closure: Optional[Callable[[], float]] = None, **kwargs: Any) ->
         # Sync oss param_groups attributes in case they've been updated by a scheduler.
         self._sync_param_groups()
 
-        # Run the optimizer step on this shard only
-        loss = self.optim.step(closure=closure, **kwargs)  # type: ignore
+        # Run the optimizer step on this shard only; skip `closure` when None (not every step() accepts it):
+        if closure is not None:
+            loss = self.optim.step(closure=closure, **kwargs)  # type: ignore
+        else:
+            loss = self.optim.step(**kwargs)
 
         # Sync all the states. Broadcast requests are issued async, we check completeness before moving on
         requests = []

diff --git a/tests/optim/test_oss.py b/tests/optim/test_oss.py
@@ -103,13 +103,12 @@ def test_lr_scheduler():
         assert x == x2
 
 
-class SGDWithStepKWArg(torch.optim.SGD):
-    def step(self, closure=None, kwarg=[]):
-        super().step()
-        kwarg.append(5)
-
-
 def test_step_with_kwargs():
+    class SGDWithStepKWArg(torch.optim.SGD):  # SGD variant whose step() takes an extra keyword argument
+        def step(self, closure=None, kwarg=[]):  # NOTE(review): mutable default list is shared across calls
+            super().step()  # plain SGD update; `closure` is accepted but not forwarded
+            kwarg.append(5)  # appends to the list it was given (or to the shared default)
+
     kwarg = []
     x = torch.tensor([1.0], device=DEVICE, requires_grad=True)
     o = optim.OSS([x], SGDWithStepKWArg, lr=0.1)
@@ -119,6 +118,18 @@ def test_step_with_kwargs():
     assert x == torch.tensor([0.9], device=DEVICE)
 
 
+def test_step_without_closure():  # OSS.step() must work when the wrapped step() takes no closure
+    class SGDWithoutClosure(torch.optim.SGD):
+        def step(self):  # deliberately has no `closure` parameter
+            return super().step()
+
+    x = torch.tensor([1.0], device=DEVICE, requires_grad=True)
+    o = optim.OSS([x], SGDWithoutClosure, lr=0.1)
+    x.backward()  # populates x.grad (d x / d x = 1)
+    o.step()  # no closure given, so OSS must not forward one to SGDWithoutClosure.step()
+    assert x == torch.tensor([0.9], device=DEVICE)  # 1.0 - lr * grad = 1.0 - 0.1 * 1.0
+
+
 def test_local_state_dict():
     x = torch.tensor([1.0], device=DEVICE, requires_grad=True)
     o = optim.OSS([x], lr=0.1)