Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect CUDA/CPU profiling info into result sheets. #5921

Merged
merged 2 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 46 additions & 5 deletions benchmarks/experiment_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import torch
from tqdm import tqdm
from torch.profiler import profile, record_function, ProfilerActivity
from torch.autograd import DeviceType
from typing import List

try:
from .benchmark_model import ModelLoader
Expand Down Expand Up @@ -249,8 +251,41 @@ def dump_profile_info(self, prof, model_name):
with open(os.path.join(file_path, "kernel_dump.txt"), "a") as f:
f.write(kernel_dump)

def timed_run(self, benchmark_experiment, benchmark_model):
def collect_profile_to_metrics(self, prof, metrics):
assert prof is not None, 'Expecting profiler to be defined!'
if not self._args.profile_cuda_cpu_collect:
logger.warning(
'Profiling enabled, but collection of CPU/CUDA profiling info disabled.'
)
return

kernel_dump = prof.profiler.total_average()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious where is this total_average() defined?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

total_cuda_time = 0
total_cpu_time = kernel_dump.self_cpu_time_total

# Avoid double counting CUDA time for inductor. Copied here, since the interface is not really exposed via any interface.
# The alternative is regex matching resulting string dump for CUDA kernel time.
# Source: https://github.com/pytorch/pytorch/blob/2f3beb715c608a060934c237de402faa40ea211f/torch/autograd/profiler_util.py#L1025-L1037
for evt in prof.profiler.key_averages():
if evt.device_type == DeviceType.CPU:
# in legacy profiler, kernel info is stored in cpu events
if evt.is_legacy:
if not use_device:
total_cuda_time += evt.self_cuda_time_total
elif evt.device_type == DeviceType.CUDA:
# in kineto profiler, there're events with the correct device type (e.g. CUDA)
total_cuda_time += evt.self_cuda_time_total

total_cpu_time /= 1000000
total_cuda_time /= 1000000
metrics["total_cpu_time_s"] = total_cpu_time
metrics["total_cuda_time_s"] = total_cuda_time
metrics[
"per_iter_cpu_time_s"] = total_cpu_time / self._args.iterations_per_run
metrics[
"per_iter_cuda_time_s"] = total_cuda_time / self._args.iterations_per_run

def timed_run(self, benchmark_experiment, benchmark_model):
reset_rng_state(benchmark_experiment)

inputs_list = self.prepare_inputs(benchmark_model.example_inputs,
Expand Down Expand Up @@ -282,6 +317,7 @@ def loop(prof=None):

if prof:
prof.step()
self._synchronize(benchmark_experiment)
return output

if enable_prof:
Expand All @@ -291,11 +327,10 @@ def loop(prof=None):
else:
output = loop()

self._synchronize(benchmark_experiment)

t_end = time.perf_counter()
if enable_prof:
self.dump_profile_info(prof, benchmark_model.model_name)
self.collect_profile_to_metrics(prof, metrics)

metrics["total_time"] = t_end - t_start
metrics[
Expand All @@ -306,7 +341,7 @@ def loop(prof=None):
return metrics, output


def append_filter_by_tier(filter_list: list[str], filter_by_tier: list[int]):
def append_filter_by_tier(filter_list: List[str], filter_by_tier: List[int]):
_FILTER_BY_TIER = {
1:
r"^(BERT_pytorch|cm3leon_generate|DALLE2_pytorch|dlrm|hf_GPT2|hf_GPT2_large|GPT_3|hf_T5|hf_T5_base|hf_T5_generate|hf_T5_large|llama_v2_7b_16h|stable_diffusion_xl)$",
Expand Down Expand Up @@ -521,7 +556,13 @@ def parse_args(args=None):
"--profile-cuda-dump",
type=str,
default="./output/",
help="Directory specifying where to dump profiling information (summary, and trace)"
help="Directory specifying where to dump profiling information (summary, and trace)",
),

parser.add_argument(
"--profile-cuda-cpu-collect",
action="store_true",
help="Whether to collect CPU/GPU profiling information in the resulting file.",
),

parser.add_argument(
Expand Down
17 changes: 17 additions & 0 deletions benchmarks/result_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,23 @@ def get_calculated_metrics(self, d, dataline):
d["xla_median_trace_per_iter_time"] = -1
d["xla_compile_time"] = -1

if "total_cpu_time_s" in dataline["metrics"]:
total_cpu_time = np.asarray(
dataline["metrics"]["total_cpu_time_s"], dtype="float")
d["median_total_cpu_time_s"] = np.median(total_cpu_time)
if "per_iter_cpu_time_s" in dataline["metrics"]:
per_iter_cpu_time = np.asarray(
dataline["metrics"]["per_iter_cpu_time_s"], dtype="float")
d["median_per_iter_cpu_time_s"] = np.median(per_iter_cpu_time)
if "total_cuda_time_s" in dataline["metrics"]:
total_cuda_time = np.asarray(
dataline["metrics"]["total_cuda_time_s"], dtype="float")
d["median_total_cuda_time_s"] = np.median(total_cuda_time)
if "per_iter_cuda_time_s" in dataline["metrics"]:
per_iter_cuda_time = np.asarray(
dataline["metrics"]["per_iter_cuda_time_s"], dtype="float")
d["median_per_iter_cuda_time_s"] = np.median(per_iter_cuda_time)

if dataline["experiment"]["dynamo"]:
d["dynamo_compile_time"] = np.max(total_time) - np.median(total_time)
else:
Expand Down