Skip to content

Commit

Permalink
chore: refactor training loop
Browse files (browse the repository at this point in the history)
  • Loading branch information
caic99 authored and iProzd committed Nov 27, 2024
1 parent ce5ff0c commit 8ea6612
Showing 1 changed file with 8 additions and 18 deletions.
26 changes: 8 additions & 18 deletions deepmd/pt/train/training.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -653,6 +653,12 @@ def run(self) -> None:
prof.start()

def step(_step_id, task_key="Default") -> None:
if self.multi_task:
model_index = dp_random.choice(
np.arange(self.num_model),
p=self.model_prob,
)
task_key = self.model_keys[model_index]
# PyTorch Profiler
if self.enable_profiler or self.profiling:
prof.step()
Expand Down Expand Up @@ -929,24 +935,8 @@ def log_loss_valid(_task_key="Default"):

self.t0 = time.time()
self.total_train_time = 0.0
for step_id in range(self.num_steps):
if step_id < self.start_step:
continue
if self.multi_task:
chosen_index_list = dp_random.choice(
np.arange(
self.num_model, dtype=np.int32
), # int32 should be enough for # models...
p=np.array(self.model_prob),
size=self.world_size,
replace=True,
)
assert chosen_index_list.size == self.world_size
model_index = chosen_index_list[self.rank]
model_key = self.model_keys[model_index]
else:
model_key = "Default"
step(step_id, model_key)
for step_id in range(self.start_step, self.num_steps):
step(step_id)
if JIT:
break

Expand Down

0 comments on commit 8ea6612

Please sign in to comment.