Fix extra gpu memory in flow.load and module.load_state_dict (#8301)
* hotfix

Signed-off-by: daquexian <daquexian566@gmail.com>

* fix bug and add checkpoint changes

Signed-off-by: daquexian <daquexian566@gmail.com>

* refine

Signed-off-by: daquexian <daquexian566@gmail.com>

* fix other_cpu_placement

Signed-off-by: daquexian <daquexian566@gmail.com>

* auto format by CI

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
Co-authored-by: oneflow-ci-bot <ci-bot@oneflow.org>
3 people authored May 25, 2022
1 parent 351022c commit 3bf33ad
Showing 3 changed files with 87 additions and 17 deletions.
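
For context, the extra GPU memory showed up in the ordinary checkpoint-restore path. The snippet below is a minimal sketch of that path (the module and checkpoint directory are placeholder names, not part of this commit): with this change, the loaded variables are staged on CPU and load_state_dict moves them into each parameter's device via Tensor.copy_, instead of first materializing a second full copy on CUDA.

import oneflow as flow
import oneflow.nn as nn

# Hypothetical module and checkpoint directory, for illustration only.
model = nn.Linear(1024, 1024).to("cuda")

state_dict = flow.load("my_checkpoint_dir")  # loaded tensors are staged on CPU
model.load_state_dict(state_dict)            # copy_ writes them into the CUDA parameters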
python/oneflow/framework/check_point_v2.py: 8 changes (3 additions & 5 deletions)
@@ -130,13 +130,11 @@ def _LoadSingleVariable(
     if rank == global_src_rank:
         assert isinstance(path, str)
         file_backed_blob = FileBackendVariableBlob(path)
-        loaded = flow.tensor(
-            file_backed_blob.numpy(), dtype=file_backed_blob.dtype
-        ).to("cuda")
+        loaded = flow.tensor(file_backed_blob.numpy(), dtype=file_backed_blob.dtype)
     else:
-        loaded = flow.tensor([]).to("cuda")
+        loaded = flow.tensor([])
     loaded = loaded.to_global(
-        flow.placement("cuda", [global_src_rank]), flow.sbp.broadcast
+        flow.placement("cpu", [global_src_rank]), flow.sbp.broadcast
     )
     return loaded
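
Viewed on its own, the new loading pattern looks like the sketch below (a simplified stand-in for _LoadSingleVariable, with an invented function name and a numpy array in place of the file-backed blob): the data is materialized on CPU on the source rank only, broadcast with a cpu placement, and only later moved to its final device by the destination tensor's copy_.

import numpy as np
import oneflow as flow

def _load_on_cpu_then_broadcast(arr: np.ndarray, global_src_rank: int = 0):
    # Only the source rank holds real data; other ranks contribute an empty tensor.
    if flow.env.get_rank() == global_src_rank:
        loaded = flow.tensor(arr)
    else:
        loaded = flow.tensor([])
    # Broadcasting through a cpu placement avoids allocating a CUDA buffer here;
    # the eventual move to GPU happens later, inside Tensor.copy_.
    return loaded.to_global(
        flow.placement("cpu", [global_src_rank]), flow.sbp.broadcast
    )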

python/oneflow/framework/tensor.py: 46 changes (34 additions & 12 deletions)
@@ -862,22 +862,44 @@ def _init_by_initializer_conf(tensor, initializer_conf, random_seed=None):
 
 
 def _copy(self, other: Union[Tensor, np.ndarray]):
+    # Possibility 1: self and other are tensors on the same device/placement and have the same sbp.
+    if isinstance(other, Tensor):
+        if self.is_global:
+            assert (
+                other.is_global
+            ), "Only global tensor can be assigned to global tensor."
+            if self.placement == other.placement and self.sbp == other.sbp:
+                flow._C.assign_local_tensor(self.to_local(), other.to_local())
+                return
+        else:
+            assert (
+                not other.is_global
+            ), "Only local tensor can be assigned to local tensor."
+            if self.device == other.device:
+                flow._C.assign_local_tensor(self, other)
+                return
+
+    # Possibility 2: `other` is a numpy array, or `self` and `other` are tensors on different devices/placements.
+    # In this case, we run boxing through cpu to avoid extra gpu memory usage.
     if self.is_global:
-        if not isinstance(other, Tensor):
-            assert isinstance(other, np.ndarray)
-            other = flow.tensor(
-                other, dtype=self.dtype, placement=self.placement, sbp=self.sbp
+        self_cpu_placement = flow.placement("cpu", self.placement.ranks)
+        if isinstance(other, Tensor):
+            other_cpu_placement = flow.placement("cpu", other.placement.ranks)
+            other = other.to_global(placement=other_cpu_placement).to_global(
+                placement=self_cpu_placement, sbp=self.sbp
             )
         else:
-            assert other.is_global
-            other = other.to_global(placement=self.placement, sbp=self.sbp)
-        flow._C.assign_local_tensor(self.to_local(), other.to_local())
+            other = flow.tensor(
+                other, dtype=self.dtype, placement=self_cpu_placement, sbp=self.sbp
+            )
+        _copy_from_numpy_to_eager_local_tensor(
+            self.to_local(), other.to_local().numpy()
+        )
     else:
-        if not isinstance(other, (Tensor)):
-            assert isinstance(other, np.ndarray)
-            _copy_from_numpy_to_eager_local_tensor(self, other)
-        else:
-            flow._C.assign_local_tensor(self, other.to(device=self.device))
+        if isinstance(other, Tensor):
+            other = other.numpy()
+
+        _copy_from_numpy_to_eager_local_tensor(self, other)
 
 
 def _flip(self, dims):
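
At the call-site level, the rewritten _copy gives Tensor.copy_ three behaviours, sketched below (placements and shapes are arbitrary and a CUDA device is assumed): an identical placement and sbp is assigned directly, everything else is boxed through cpu placements, and numpy inputs are first wrapped as cpu global tensors.

import numpy as np
import oneflow as flow

placement = flow.placement("cuda", [0])
dst = flow.zeros(2, 3, placement=placement, sbp=flow.sbp.broadcast)

# Same placement and sbp: direct local assignment, no boxing at all.
dst.copy_(flow.ones(2, 3, placement=placement, sbp=flow.sbp.broadcast))

# Different placement: the source is rerouted through cpu placements before
# being written into dst, so no temporary CUDA copy of the source is made.
dst.copy_(flow.ones(2, 3, placement=flow.placement("cpu", [0]), sbp=flow.sbp.broadcast))

# Numpy input: wrapped as a cpu global tensor with dst's dtype and sbp, then copied in.
dst.copy_(np.full((2, 3), 7.0, dtype=np.float32))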
python/oneflow/test/tensor/test_consistent_tensor.py: 50 changes (50 additions & 0 deletions)
@@ -178,6 +178,56 @@ def test_global_tensor_2d_sbp_init(test_case):
         flow.nn.init.normal_(wte, std=0.02)
         flow.nn.init.normal_(wpe, std=0.02)
 
+    @flow.unittest.skip_unless_1n2d()
+    def test_copy(test_case):
+        x = flow.zeros(2, 3)
+        y = flow.ones(2, 3)
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cpu", [0]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.split(0)
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = flow.ones(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y.numpy()))
+
+        x = flow.zeros(
+            4, 6, placement=flow.placement("cuda", [0, 1]), sbp=flow.sbp.broadcast
+        )
+        y = np.ones((4, 6), dtype=np.float32)
+        x.copy_(y)
+        test_case.assertTrue(np.array_equal(x.numpy(), y))
+
 
 if __name__ == "__main__":
     unittest.main()
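
Note that the new test_copy case is guarded by skip_unless_1n2d(), so it only runs in a 1-node, 2-device setup; in OneFlow such tests are usually launched with something like "python3 -m oneflow.distributed.launch --nproc_per_node 2 test_consistent_tensor.py", though the exact invocation depends on the local setup and CI configuration.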
