Fix extra gpu memory in flow.load and module.load_state_dict (#8301)
* hotfix

Signed-off-by: daquexian <daquexian566@gmail.com>

* fix bug and add checkpoint changes

Signed-off-by: daquexian <daquexian566@gmail.com>

* refine

Signed-off-by: daquexian <daquexian566@gmail.com>

* fix other_cpu_placement

Signed-off-by: daquexian <daquexian566@gmail.com>

* auto format by CI

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
3 people authored May 25, 2022
1 parent 351022c commit 3bf33ad
Showing 3 changed files with 87 additions and 17 deletions.
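
For context, the extra GPU memory showed up in the ordinary checkpoint-restore path. The snippet below is a minimal sketch of that path (the module and checkpoint directory are placeholder names, not part of this commit): with this change, the loaded variables are staged on CPU and load_state_dict moves them into each parameter's device via Tensor.copy_, instead of first materializing a second full copy on CUDA.

import oneflow as flow
import oneflow.nn as nn

# Hypothetical module and checkpoint directory, for illustration only.
model = nn.Linear(1024, 1024).to("cuda")

state_dict = flow.load("my_checkpoint_dir")  # loaded tensors are staged on CPU
model.load_state_dict(state_dict)            # copy_ writes them into the CUDA parameters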
python/oneflow/framework/check_point_v2.py: 8 changes (3 additions & 5 deletions)
@@ -130,13 +130,11 @@ def _LoadSingleVariable(
     if rank == global_src_rank:
         assert isinstance(path, str)
         file_backed_blob = FileBackendVariableBlob(path)
-        loaded = flow.tensor(
-            file_backed_blob.numpy(), dtype=file_backed_blob.dtype
-        ).to("cuda")
+        loaded = flow.tensor(file_backed_blob.numpy(), dtype=file_backed_blob.dtype)
     else:
-        loaded = flow.tensor([]).to("cuda")
+        loaded = flow.tensor([])
     loaded = loaded.to_global(
-        flow.placement("cuda", [global_src_rank]), flow.sbp.broadcast
+        flow.placement("cpu", [global_src_rank]), flow.sbp.broadcast
     )
     return loaded
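
Viewed on its own, the new loading pattern looks like the sketch below (a simplified stand-in for _LoadSingleVariable, with an invented function name and a numpy array in place of the file-backed blob): the data is materialized on CPU on the source rank only, broadcast with a cpu placement, and only later moved to its final device by the destination tensor's copy_.

import numpy as np
import oneflow as flow

def _load_on_cpu_then_broadcast(arr: np.ndarray, global_src_rank: int = 0):
    # Only the source rank holds real data; other ranks contribute an empty tensor.
    if flow.env.get_rank() == global_src_rank:
        loaded = flow.tensor(arr)
    else:
        loaded = flow.tensor([])
    # Broadcasting through a cpu placement avoids allocating a CUDA buffer here;
    # the eventual move to GPU happens later, inside Tensor.copy_.
    return loaded.to_global(
        flow.placement("cpu", [global_src_rank]), flow.sbp.broadcast
    )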

python/oneflow/framework/tensor.py: 46 changes (34 additions & 12 deletions)
@@ -862,22 +862,44 @@ def _init_by_initializer_conf(tensor, initializer_conf, random_seed=None):
 
 
 def _copy(self, other: Union[Tensor, np.ndarray]):
+    # Possibility 1: self and other are tensors on the same device/placement and have the same sbp.
+    if isinstance(other, Tensor):
+        if self.is_global:
+            assert (
+                other.is_global
+            ), "Only global tensor can be assigned to global tensor."
+            if self.placement == other.placement and self.sbp == other.sbp:
+                flow._C.assign_local_tensor(self.to_local(), other.to_local())
+                return
+        else:
+            assert (
+                not other.is_global
+            ), "Only local tensor can be assigned to local tensor."
+            if self.device == other.device:
+                flow._C.assign_local_tensor(self, other)
+                return
+
+    # Possibility 2: `other` is a numpy array, or `self` and `other` are tensors on different devices/placements.
+    # In this case, we run boxing through cpu to avoid extra gpu memory usage.
     if self.is_global:
-        if not isinstance(other, Tensor):
-            assert isinstance(other, np.ndarray)
-            other = flow.tensor(
-                other, dtype=self.dtype, placement=self.placement, sbp=self.sbp
+        self_cpu_placement = flow.placement("cpu", self.placement.ranks)
+        if isinstance(other, Tensor):
+            other_cpu_placement = flow.placement("cpu", other.placement.ranks)
+            other = other.to_global(placement=other_cpu_placement).to_global(
+                placement=self_cpu_placement, sbp=self.sbp
             )
         else:
-            assert other.is_global
-            other = other.to_global(placement=self.placement, sbp=self.sbp)
-        flow._C.assign_local_tensor(self.to_local(), other.to_local())
+            other = flow.tensor(
+                other, dtype=self.dtype, placement=self_cpu_placement, sbp=self.sbp
+            )
+        _copy_from_numpy_to_eager_local_tensor(
+            self.to_local(), other.to_local().numpy()
+        )
     else:
-        if not isinstance(other, (Tensor)):
-            assert isinstance(other, np.ndarray)
-            _copy_from_numpy_to_eager_local_tensor(self, other)
-        else:
-            flow._C.assign_local_tensor(self, other.to(device=self.device))
+        if isinstance(other, Tensor):
+            other = other.numpy()
+
+        _copy_from_numpy_to_eager_local_tensor(self, other)
 
 
 def _flip(self, dims):
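
At the call-site level, the rewritten _copy gives Tensor.copy_ three behaviours, sketched below (placements and shapes are arbitrary and a CUDA device is assumed): an identical placement and sbp is assigned directly, everything else is boxed through cpu placements, and numpy inputs are first wrapped as cpu global tensors.

import numpy as np
import oneflow as flow

placement = flow.placement("cuda", [0])
dst = flow.zeros(2, 3, placement=placement, sbp=flow.sbp.broadcast)

# Same placement and sbp: direct local assignment, no boxing at all.
dst.copy_(flow.ones(2, 3, placement=placement, sbp=flow.sbp.broadcast))

# Different placement: the source is rerouted through cpu placements before
# being written into dst, so no temporary CUDA copy of the source is made.
dst.copy_(flow.ones(2, 3, placement=flow.placement("cpu", [0]), sbp=flow.sbp.broadcast))

# Numpy input: wrapped as a cpu global tensor with dst's dtype and sbp, then copied in.
dst.copy_(np.full((2, 3), 7.0, dtype=np.float32))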
python/oneflow/test/tensor/test_consistent_tensor.py: 50 changes (50 additions & 0 deletions)
@@ -178,6 +178,56 @@ def test_global_tensor_2d_sbp_init(test_case):
         flow.nn.init.normal_(wte, std=0.02)
         flow.nn.init.normal_(wpe, std=0.02)
 
+    @flow.unittest.skip_unless_1n2d()
+    def test_copy(test_case):
+        x = flow.zeros(2, 3)
+        y = flow.ones(2, 3)
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cpu", [0]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.split(0)
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = np.ones((4, 6), dtype=np.float32)
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y))
+
 
 if __name__ == "__main__":
     unittest.main()
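
Note that the new test_copy case is guarded by skip_unless_1n2d(), so it only runs in a 1-node, 2-device setup; in OneFlow such tests are usually launched with something like "python3 -m oneflow.distributed.launch --nproc_per_node 2 test_consistent_tensor.py", though the exact invocation depends on the local setup and CI configuration.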
