torch_utils.py
import os
import random
import numpy as np
import torch
from torch.distributed import init_process_group, destroy_process_group
from utils import print_master


def pytorch_setup(cfg):
    """Initialize DDP (when launched with torchrun) and return (local_rank, world_size, device, master_process)."""
    ddp = int(os.environ.get('RANK', -1)) != -1  # torchrun sets RANK; its absence means a single-process run
    if ddp:
        init_process_group(backend='nccl')
        rank = int(os.environ['RANK'])
        local_rank = int(os.environ['LOCAL_RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
        device = f'cuda:{local_rank}'
        torch.cuda.set_device(device)  # bind this process to its GPU; torch.cuda.device() alone is just a context manager
        master_process = (rank == 0)
        seed_offset = rank  # each rank gets a distinct seed
    else:
        master_process = True
        seed_offset = 0
        local_rank = None
        world_size = 1
        device = 'cpu'
        if torch.cuda.is_available():
            device = 'cuda'
        elif torch.backends.mps.is_available():
            device = 'mps'  # NOTE: macOS Metal support still to be tested

    # seed all RNGs, offset by rank so processes draw different random streams
    random.seed(cfg.seed + seed_offset)
    np.random.seed(cfg.seed + seed_offset)
    torch.manual_seed(cfg.seed + seed_offset)

    # allow TF32 matmuls / cuDNN kernels on Ampere+ GPUs if requested
    torch.backends.cuda.matmul.allow_tf32 = getattr(cfg, 'cuda_allow_tf32', False)
    torch.backends.cudnn.allow_tf32 = getattr(cfg, 'cudnn_allow_tf32', False)

    # limit per-process CUDA memory (assumes the run is on a CUDA device)
    if hasattr(cfg, 'set_memory_fraction'):
        tot_mem_gb = torch.cuda.get_device_properties(device).total_memory / 1e9
        red_mem_gb = tot_mem_gb * cfg.set_memory_fraction
        print_master(f"Limit GPU memory from {tot_mem_gb:.2f}GB to {red_mem_gb:.2f}GB")
        torch.cuda.set_per_process_memory_fraction(cfg.set_memory_fraction, device=device)
    # deterministic run
    if getattr(cfg, 'deterministic', False):
        torch.use_deterministic_algorithms(True)
        os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # required by cuBLAS for deterministic kernels
        torch.backends.cudnn.benchmark = False

    return local_rank, world_size, device, master_process
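
pytorch_setup only reads plain attributes off cfg, so any attribute-style object works. Below is a minimal, illustrative sketch of such a config using types.SimpleNamespace; the attribute names mirror the ones the function looks up, but the real project likely builds cfg elsewhere (e.g. from a YAML file or an argparse namespace).

from types import SimpleNamespace

# Illustrative only; values are placeholders, not the project's defaults.
cfg = SimpleNamespace(
    seed=42,                  # base RNG seed; each DDP rank adds its rank as an offset
    cuda_allow_tf32=True,     # enable TF32 matmuls on Ampere+ GPUs
    cudnn_allow_tf32=True,    # enable TF32 inside cuDNN kernels
    set_memory_fraction=0.8,  # cap each process at 80% of its GPU's memory (omit to disable)
    deterministic=False,      # set True to force deterministic algorithms
)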


def destroy_ddp():
    if torch.distributed.is_initialized():
        torch.distributed.barrier()  # wait for all ranks before tearing down the process group
        destroy_process_group()
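
For reference, a sketch of how these helpers would typically be wired into a training entry point; the script name, build_model and run_training are placeholders, not part of this repository. Under DDP the script is launched with torchrun, which sets the RANK, LOCAL_RANK and WORLD_SIZE environment variables that pytorch_setup reads.

# launch (single node, 4 GPUs):  torchrun --nproc_per_node=4 train.py
# single-process fallback:       python train.py

local_rank, world_size, device, master_process = pytorch_setup(cfg)
try:
    model = build_model(cfg).to(device)  # placeholder model factory
    if world_size > 1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    run_training(model, cfg, device)     # placeholder training loop
finally:
    destroy_ddp()  # no-op when DDP was never initialized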