-
Notifications
You must be signed in to change notification settings - Fork 3.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add cuda to multigpu (xpu) bench (#8386)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Damian Szwichtenberg <damian.szwichtenberg@intel.com>
- Loading branch information
1 parent
3af88bd
commit aff3a99
Showing
5 changed files
with
129 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,21 @@ | ||
# Training Benchmark | ||
|
||
## Environment setup | ||
## Running benchmark on CUDA GPU | ||
|
||
Optional, XPU only: | ||
Run benchmark, e.g. assuming you have `n` NVIDIA GPUs: | ||
``` | ||
python training_benchmark_cuda.py --dataset ogbn-products --model edge_cnn --num-epochs 3 --n-gpus <n> | ||
``` | ||
|
||
## Running benchmark on Intel GPU | ||
|
||
## Environment setup | ||
``` | ||
install intel_extension_for_pytorch | ||
install oneccl_bindings_for_pytorch | ||
``` | ||
|
||
## Running benchmark | ||
|
||
Run benchmark, e.g. assuming you have 2 GPUs: | ||
Run benchmark, e.g. assuming you have `n` XPUs: | ||
``` | ||
mpirun -np 2 python training_benchmark.py --dataset ogbn-products --model edge_cnn --num-epochs 3 | ||
mpirun -np <n> python training_benchmark_xpu.py --dataset ogbn-products --model edge_cnn --num-epochs 3 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import argparse | ||
import os | ||
from typing import Union | ||
|
||
import torch | ||
import torch.distributed as dist | ||
import torch.multiprocessing as mp | ||
|
||
from benchmark.multi_gpu.training.common import ( | ||
get_predefined_args, | ||
run, | ||
supported_sets, | ||
) | ||
from benchmark.utils import get_dataset | ||
from torch_geometric.data import Data, HeteroData | ||
|
||
|
||
def run_cuda(rank: int, world_size: int, args: argparse.Namespace,
             num_classes: int, data: Union[Data, HeteroData]):
    """Per-process entry point for the CUDA benchmark.

    Sets the rendezvous address/port in the environment, initializes the
    ``nccl`` process group for this process and then forwards all arguments
    to the shared ``run`` routine.

    Args:
        rank: Index of this process; passed as the first positional argument
            by ``mp.spawn``.
        world_size: Total number of processes taking part in the run.
        args: Parsed command-line arguments (the result of ``parse_args()``).
        num_classes: Number of classes, as returned by ``get_dataset``.
        data: Dataset object, as returned by ``get_dataset``.
    """
    # Every process must agree on the same rendezvous endpoint before the
    # process group can be initialized.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)
    run(rank, world_size, args, num_classes, data)
|
||
|
||
if __name__ == '__main__':
    # Script entry point: parse arguments, load the dataset, decide how many
    # GPUs to use and spawn one worker process per GPU.
    argparser = get_predefined_args()
    argparser.add_argument('--n-gpus', default=1, type=int)
    args = argparser.parse_args()
    args.device = 'cuda'

    # Raise explicitly instead of asserting: assertions are stripped when
    # Python runs with -O, which would let an unsupported dataset through.
    if args.dataset not in supported_sets:
        raise ValueError(f"Dataset {args.dataset} isn't supported.")
    data, num_classes = get_dataset(args.dataset, args.root)

    max_world_size = torch.cuda.device_count()
    chosen_world_size = args.n_gpus
    # A non-positive process count would be handed to mp.spawn otherwise.
    if chosen_world_size < 1:
        raise ValueError(
            f'--n-gpus must be a positive integer, got {chosen_world_size}')
    if max_world_size < 1:
        raise RuntimeError('No CUDA GPUs are available')

    if chosen_world_size > max_world_size:
        print(f'User selected {chosen_world_size} GPUs '
              f'but only {max_world_size} GPUs are available')
    # Never request more processes than there are visible devices.
    world_size = min(chosen_world_size, max_world_size)
    print(f'Let\'s use {world_size} GPUs!')

    # mp.spawn prepends the process index (rank) to `args` for each worker.
    mp.spawn(
        run_cuda,
        args=(world_size, args, num_classes, data),
        nprocs=world_size,
        join=True,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import os | ||
from typing import Any, Tuple | ||
|
||
import intel_extension_for_pytorch as ipex | ||
import oneccl_bindings_for_pytorch # noqa | ||
import torch.distributed as dist | ||
|
||
from benchmark.multi_gpu.training.common import ( | ||
get_predefined_args, | ||
run, | ||
supported_sets, | ||
) | ||
from benchmark.utils import get_dataset | ||
|
||
|
||
def get_dist_params() -> Tuple[int, int, str]:
    """Derive rank, world size and init method from the environment.

    ``MASTER_ADDR``/``MASTER_PORT`` are set to a fixed local endpoint. Rank
    and world size are taken from ``PMI_RANK``/``PMI_SIZE`` when ``PMI_SIZE``
    is a positive number; otherwise from ``RANK``/``WORLD_SIZE``, defaulting
    to a single process (rank 0, world size 1). The resolved values are
    written back to ``RANK`` and ``WORLD_SIZE``.

    Returns:
        A ``(rank, world_size, init_method)`` tuple where ``rank`` and
        ``world_size`` are integers and ``init_method`` is a ``tcp://`` URL
        built from the master address and port.

    Raises:
        ValueError: If one of the consulted environment variables is set to
            a value that cannot be parsed as an integer.
    """
    master_addr = "127.0.0.1"
    master_port = "29500"
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = master_port

    mpi_rank = int(os.environ.get("PMI_RANK", -1))
    mpi_world_size = int(os.environ.get("PMI_SIZE", -1))
    if mpi_world_size > 0:
        rank, world_size = mpi_rank, mpi_world_size
    else:
        # Environment values are strings; convert them so the function
        # really returns integers, as its signature promises.
        rank = int(os.environ.get("RANK", 0))
        world_size = int(os.environ.get("WORLD_SIZE", 1))

    os.environ["RANK"] = str(rank)
    os.environ["WORLD_SIZE"] = str(world_size)

    init_method = f"tcp://{master_addr}:{master_port}"

    return rank, world_size, init_method
|
||
|
||
def custom_optimizer(model: Any, optimizer: Any) -> Tuple[Any, Any]:
    """Pass ``model`` and ``optimizer`` through ``ipex.optimize``.

    Returns whatever ``ipex.optimize(model, optimizer=optimizer)`` returns.
    """
    optimized = ipex.optimize(model, optimizer=optimizer)
    return optimized
|
||
|
||
if __name__ == '__main__':
    # Script entry point: join the "ccl" process group using the parameters
    # resolved from the environment, then run the shared benchmark routine
    # on the 'xpu' device.
    rank, world_size, init_method = get_dist_params()
    dist.init_process_group(
        backend="ccl",
        init_method=init_method,
        world_size=world_size,
        rank=rank,
    )

    argparser = get_predefined_args()
    args = argparser.parse_args()
    args.device = 'xpu'

    assert args.dataset in supported_sets, \
        f"Dataset {args.dataset} isn't supported."
    data, num_classes = get_dataset(args.dataset, args.root)

    # custom_optimizer is handed to `run` as its last positional argument.
    run(rank, world_size, args, num_classes, data, custom_optimizer)