diff --git a/paddlenlp/trainer/plugins/unified_checkpoint.py b/paddlenlp/trainer/plugins/unified_checkpoint.py
index 4c5b54a20ddb..4bd1a1ffb08e 100644
--- a/paddlenlp/trainer/plugins/unified_checkpoint.py
+++ b/paddlenlp/trainer/plugins/unified_checkpoint.py
@@ -1307,7 +1307,7 @@ def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serializa
         else:
             local_resume = False
     local_resume = paddle.to_tensor([local_resume])
-    dist.all_reduce(local_resume, op=dist.ReduceOp.PROD)
+    dist.all_reduce(local_resume, op=dist.ReduceOp.MIN)
     local_resume = local_resume.item()
     return local_resume
 
@@ -1425,7 +1425,7 @@ def check_dynamic_load(args, weight_map, existed_files, is_master_weights=False,
         else:
             local_resume = False
     local_resume = paddle.to_tensor([local_resume])
-    dist.all_reduce(local_resume, op=dist.ReduceOp.PROD)
+    dist.all_reduce(local_resume, op=dist.ReduceOp.MIN)
     return local_resume.item()
 
 # check whether the optimizer checkpoint files are complete.
diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 042924412290..a1fb78de63ba 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -1793,6 +1793,12 @@ def _load_rng_state(self, checkpoint):
             for i in range(core.get_cuda_device_count()):
                 core.default_cuda_generator(i).set_state(checkpoint_rng_state["cuda"][i])
 
+        if core.is_compiled_with_xpu():
+            if not len(checkpoint_rng_state["cuda"]) == core.get_xpu_device_count():
+                raise ValueError("Length of xpu state list should be equal to the xpu device count")
+            for i in range(core.get_xpu_device_count()):
+                core.default_xpu_generator(i).set_state(checkpoint_rng_state["cuda"][i])
+
         if paddle.device.get_all_custom_device_type() is not None:
             custom_device_type = paddle.device.get_all_custom_device_type()
             for device in custom_device_type:
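
Note (reviewer sketch, not part of the patch): switching the all_reduce from ReduceOp.PROD to ReduceOp.MIN preserves the "resume only if every rank can resume" semantics over 0/1 flags. A minimal standalone illustration, assuming an initialized paddle.distributed process group; the helper name all_ranks_can_resume is made up for this sketch:

# Reviewer sketch only; assumes dist.init_parallel_env() has already been called.
import paddle
import paddle.distributed as dist

def all_ranks_can_resume(local_resume: bool) -> bool:  # hypothetical helper name
    flag = paddle.to_tensor([int(local_resume)])
    # Over 0/1 flags, MIN yields 1 only when every rank reported True,
    # matching the all-ranks-must-resume check in check_unified_checkpoint.
    dist.all_reduce(flag, op=dist.ReduceOp.MIN)
    return bool(flag.item())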