[XPU], support unified ckpt function
cqulilujia committed Oct 24, 2024
1 parent 7551730 commit 2ec9a6d
Showing 2 changed files with 8 additions and 2 deletions.
4 changes: 2 additions & 2 deletions paddlenlp/trainer/plugins/unified_checkpoint.py
@@ -1307,7 +1307,7 @@ def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serializa
     else:
         local_resume = False
     local_resume = paddle.to_tensor([local_resume])
-    dist.all_reduce(local_resume, op=dist.ReduceOp.PROD)
+    dist.all_reduce(local_resume, op=dist.ReduceOp.MIN)
     local_resume = local_resume.item()
     return local_resume

@@ -1425,7 +1425,7 @@ def check_dynamic_load(args, weight_map, existed_files, is_master_weights=False,
     else:
         local_resume = False
     local_resume = paddle.to_tensor([local_resume])
-    dist.all_reduce(local_resume, op=dist.ReduceOp.PROD)
+    dist.all_reduce(local_resume, op=dist.ReduceOp.MIN)
     return local_resume.item()
 
     # check whether the optimizer checkpoint files are complete.
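Note on the ReduceOp.PROD to ReduceOp.MIN change in both hunks: the reduced value is a 0/1 "can this rank resume?" flag, and over {0, 1} a product and a minimum agree (both are 1 exactly when every rank reports 1), so the semantics are unchanged. A plausible motivation, given the XPU focus of this commit, is that MIN is supported by more device communication backends than PROD. A minimal, self-contained sketch of the same pattern (not the repository's code; the simulated failing rank is invented for illustration):

import paddle
import paddle.distributed as dist

# Launch with, e.g.: python -m paddle.distributed.launch --devices 0,1 demo.py
dist.init_parallel_env()

# Each rank reports whether its local checkpoint files are complete.
# Hypothetical failure: rank 1 pretends its files are missing.
local_resume = dist.get_rank() != 1
flag = paddle.to_tensor([int(local_resume)], dtype="int64")

# MIN over {0, 1} is 1 only if every rank contributed 1, i.e. all ranks
# can resume -- the same answer PROD gave for boolean flags.
dist.all_reduce(flag, op=dist.ReduceOp.MIN)
print(f"rank {dist.get_rank()}: global resume = {bool(flag.item())}")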
6 changes: 6 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -1793,6 +1793,12 @@ def _load_rng_state(self, checkpoint):
             for i in range(core.get_cuda_device_count()):
                 core.default_cuda_generator(i).set_state(checkpoint_rng_state["cuda"][i])
 
+        if core.is_compiled_with_xpu():
+            if not len(checkpoint_rng_state["cuda"]) == core.get_xpu_device_count():
+                raise ValueError("Length of xpu state list should be equal to the xpu device count")
+            for i in range(core.get_xpu_device_count()):
+                core.default_xpu_generator(i).set_state(checkpoint_rng_state["cuda"][i])
+
         if paddle.device.get_all_custom_device_type() is not None:
             custom_device_type = paddle.device.get_all_custom_device_type()
             for device in custom_device_type:
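The added XPU branch mirrors the existing CUDA path in _load_rng_state: it checks that the checkpoint holds one generator state per visible XPU device, then restores each device's default generator. The states are read from the checkpoint's "cuda" entry, which is the slot this diff shows the trainer using for per-device generator states. A minimal round-trip sketch, assuming the XPU generator exposes a get_state() symmetric to the set_state() used in the diff (only set_state appears there) and that core is importable from paddle.framework:

import paddle
from paddle.framework import core  # assumed import path for core

def save_xpu_rng_state():
    # Collect the default generator state of every visible XPU device.
    if not core.is_compiled_with_xpu():
        return None
    return [
        core.default_xpu_generator(i).get_state()  # assumed getter, symmetric to set_state
        for i in range(core.get_xpu_device_count())
    ]

def load_xpu_rng_state(states):
    # Restore states captured by save_xpu_rng_state on an identical device setup.
    if states is None or not core.is_compiled_with_xpu():
        return
    if len(states) != core.get_xpu_device_count():
        raise ValueError("Length of xpu state list should be equal to the xpu device count")
    for i, state in enumerate(states):
        core.default_xpu_generator(i).set_state(state)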