Skip to content

Commit

Permalink
feat(pt): support CPU parallel training with PT
Browse files: browse the repository at this point in the history
  • Loading branch information
iProzd committed Oct 16, 2024
1 parent 5050f61 commit 6acfd3b
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions deepmd/pt/entrypoints/main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,8 +105,16 @@ def get_trainer(
local_rank = os.environ.get("LOCAL_RANK")
if local_rank is not None:
local_rank = int(local_rank)
assert dist.is_nccl_available()
dist.init_process_group(backend="nccl")
nccl_available = dist.is_nccl_available()
gloo_available = dist.is_gloo_available()
# nccl first
if nccl_available:
backend = "nccl"
elif gloo_available:
backend = "gloo"
else:
raise RuntimeError("No suitable backend found. Neither NCCL nor Gloo is available.")
dist.init_process_group(backend=backend)

def prepare_trainer_input_single(
model_params_single, data_dict_single, rank=0, seed=None
Expand Down

0 comments on commit 6acfd3b

Please sign in to comment.