Fix error for PT backend when pytorch.distributed is not available #3652

Merged
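
The change repeated throughout this PR is to guard every dist.is_initialized() call with dist.is_available(). On PyTorch builds that ship without distributed support, torch.distributed.is_available() returns False and, in such builds, functions like is_initialized() and get_rank() may not be defined at all, so the unguarded check can itself fail. A minimal sketch of the resulting pattern; the helper names get_rank_safe and get_world_size_safe are illustrative and not part of the PR:

import torch.distributed as dist


def get_rank_safe() -> int:
    # Fall back to rank 0 when the distributed package is unavailable
    # or the default process group has not been initialized.
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0


def get_world_size_safe() -> int:
    # A single process behaves like a world of size 1.
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1

The short-circuit matters: dist.is_initialized() is only evaluated after dist.is_available() has confirmed that the distributed bindings exist.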

deepmd/pt/entrypoints/main.py (2 changes: 1 addition & 1 deletion)
@@ -159,7 +159,7 @@
         stat_file_path_single,
     )
 
-    rank = dist.get_rank() if dist.is_initialized() else 0
+    rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
     if not multi_task:
         (
             train_data,

deepmd/pt/optimizer/LKF.py (2 changes: 1 addition & 1 deletion)
@@ -47,7 +47,7 @@
     # the first param, because this helps with casting in load_state_dict
     self._state = self.state[self._params[0]]
     self._state.setdefault("kalman_lambda", kalman_lambda)
-    self.dist_init = dist.is_initialized()
+    self.dist_init = dist.is_available() and dist.is_initialized()
     self.rank = dist.get_rank() if self.dist_init else 0
     self.dindex = []
     self.remainder = 0
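
In the optimizer the guard is evaluated once in the constructor and cached as self.dist_init, so later steps reuse the flag instead of re-querying torch.distributed. A rough sketch of that idea; the class and method names below are illustrative, and only the two cached attributes mirror the diff:

import torch.distributed as dist


class LKFLikeOptimizer:
    def __init__(self):
        # Evaluate the availability guard once and cache it.
        self.dist_init = dist.is_available() and dist.is_initialized()
        self.rank = dist.get_rank() if self.dist_init else 0

    def broadcast_weights(self, tensor):
        # Synchronize only when a process group actually exists;
        # in the single-process case this is a no-op.
        if self.dist_init:
            dist.broadcast(tensor, src=0)
        return tensor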

deepmd/pt/train/training.py (50 changes: 39 additions & 11 deletions)
@@ -122,8 +122,14 @@
     self.model_keys = (
         list(model_params["model_dict"]) if self.multi_task else ["Default"]
     )
-    self.rank = dist.get_rank() if dist.is_initialized() else 0
-    self.world_size = dist.get_world_size() if dist.is_initialized() else 1
+    self.rank = (
+        dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
+    )
+    self.world_size = (
+        dist.get_world_size()
+        if dist.is_available() and dist.is_initialized()
+        else 1
+    )
     self.num_model = len(self.model_keys)
 
     # Iteration config
Expand Down Expand Up @@ -169,7 +175,9 @@
_data,
sampler=_sampler,
batch_size=None,
num_workers=NUM_WORKERS, # setting to 0 diverges the behavior of its iterator; should be >=1
num_workers=NUM_WORKERS
if dist.is_available()
else 0, # setting to 0 diverges the behavior of its iterator; should be >=1
drop_last=False,
pin_memory=True,
)
@@ -607,7 +615,7 @@
     if shared_links is not None:
         self.wrapper.share_params(shared_links, resume=resuming or self.rank != 0)
 
-    if dist.is_initialized():
+    if dist.is_available() and dist.is_initialized():
         torch.cuda.set_device(LOCAL_RANK)
         # DDP will guarantee the model parameters are identical across all processes
         self.wrapper = DDP(
@@ -673,7 +681,7 @@
     record_file = f"Sample_rank_{self.rank}.txt"
     fout1 = open(record_file, mode="w", buffering=1)
     log.info("Start to train %d steps.", self.num_steps)
-    if dist.is_initialized():
+    if dist.is_available() and dist.is_initialized():
         log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}")
     if self.enable_tensorboard:
         from torch.utils.tensorboard import (
@@ -734,7 +742,11 @@
     elif self.opt_type == "LKF":
         if isinstance(self.loss, EnergyStdLoss):
             KFOptWrapper = KFOptimizerWrapper(
-                self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+                self.wrapper,
+                self.optimizer,
+                24,
+                6,
+                dist.is_available() and dist.is_initialized(),
             )
             pref_e = self.opt_param["kf_start_pref_e"] * (
                 self.opt_param["kf_limit_pref_e"]
@@ -753,7 +765,9 @@
     # [coord, atype, natoms, mapping, shift, nlist, box]
     model_pred = {"energy": p_energy, "force": p_force}
     module = (
-        self.wrapper.module if dist.is_initialized() else self.wrapper
+        self.wrapper.module
+        if dist.is_available() and dist.is_initialized()
+        else self.wrapper
     )
 
     def fake_model():
@@ -768,10 +782,16 @@
     )
     elif isinstance(self.loss, DenoiseLoss):
         KFOptWrapper = KFOptimizerWrapper(
-            self.wrapper, self.optimizer, 24, 6, dist.is_initialized()
+            self.wrapper,
+            self.optimizer,
+            24,
+            6,
+            dist.is_available() and dist.is_initialized(),
         )
         module = (
-            self.wrapper.module if dist.is_initialized() else self.wrapper
+            self.wrapper.module
+            if dist.is_available() and dist.is_initialized()
+            else self.wrapper
         )
         model_pred = KFOptWrapper.update_denoise_coord(
             input_dict,
@@ -924,7 +944,11 @@
     # Handle the case if rank 0 aborted and re-assigned
     self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt")
 
-    module = self.wrapper.module if dist.is_initialized() else self.wrapper
+    module = (
+        self.wrapper.module
+        if dist.is_available() and dist.is_initialized()
+        else self.wrapper
+    )
     self.save_model(self.latest_model, lr=cur_lr, step=_step_id)
     log.info(f"Saved model to {self.latest_model}")
     symlink_prefix_files(self.latest_model.stem, self.save_ckpt)

Code scanning (CodeQL) notice on the added assignment: unused local variable (the variable module is not used).
@@ -990,7 +1014,11 @@
     prof.stop()
 
 def save_model(self, save_path, lr=0.0, step=0):
-    module = self.wrapper.module if dist.is_initialized() else self.wrapper
+    module = (
+        self.wrapper.module
+        if dist.is_available() and dist.is_initialized()
+        else self.wrapper
+    )
     module.train_infos["lr"] = lr
     module.train_infos["step"] = step
     torch.save(
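
Several of the training.py changes repeat the same unwrapping idiom: when DDP is active the underlying network lives at self.wrapper.module, otherwise self.wrapper already is the bare module, and the availability guard now decides which branch applies. A condensed sketch of that idiom outside the Trainer; wrap_model and unwrap_model are illustrative helpers, not functions from the PR:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def wrap_model(model: torch.nn.Module) -> torch.nn.Module:
    # Wrap in DDP only when a process group is available and initialized.
    if dist.is_available() and dist.is_initialized():
        return DDP(model)
    return model


def unwrap_model(wrapper: torch.nn.Module) -> torch.nn.Module:
    # DDP keeps the original network on .module; a bare module is returned as-is,
    # which is what the single-process (or distributed-unavailable) path relies on.
    if dist.is_available() and dist.is_initialized():
        return wrapper.module
    return wrapper

The PR applies the check inline at each call site; the helpers above only condense the repeated expression.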

deepmd/pt/utils/dataloader.py (11 changes: 8 additions & 3 deletions)
@@ -97,7 +97,11 @@
 
     with Pool(
         os.cpu_count()
-        // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_initialized() else 1)
+        // (
+            int(os.environ["LOCAL_WORLD_SIZE"])
+            if dist.is_available() and dist.is_initialized()
+            else 1
+        )
     ) as pool:
         self.systems = pool.map(construct_dataset, systems)
 
@@ -127,7 +131,7 @@
     self.batch_sizes = batch_size * np.ones(len(systems), dtype=int)
     assert len(self.systems) == len(self.batch_sizes)
     for system, batch_size in zip(self.systems, self.batch_sizes):
-        if dist.is_initialized():
+        if dist.is_available() and dist.is_initialized():
             system_sampler = DistributedSampler(system)
             self.sampler_list.append(system_sampler)
         else:
@@ -138,7 +142,8 @@
     num_workers=0, # Should be 0 to avoid too many threads forked
     sampler=system_sampler,
     collate_fn=collate_batch,
-    shuffle=(not dist.is_initialized()) and shuffle,
+    shuffle=(not (dist.is_available() and dist.is_initialized()))
+    and shuffle,
 )
 self.dataloaders.append(system_dataloader)
 self.index.append(len(system_dataloader))
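
The dataloader changes follow the same rule: a DistributedSampler is created only when a process group exists, and shuffle is passed to the DataLoader only in the non-distributed case, since torch.utils.data.DataLoader rejects shuffle=True together with an explicit sampler. A small sketch of that construction; the toy dataset and batch size are placeholders, and the user-level shuffle flag from the diff is omitted for brevity:

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(16, dtype=torch.float32))  # placeholder data

use_dist = dist.is_available() and dist.is_initialized()
sampler = DistributedSampler(dataset) if use_dist else None
loader = DataLoader(
    dataset,
    batch_size=4,
    sampler=sampler,  # shards the data across ranks when distributed
    shuffle=not use_dist,  # DataLoader forbids shuffle together with a sampler
)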