Skip to content

Commit

Permalink
[1/2] Script to compare measured (trace) runtimes with estimated runtimes (pytorch#109076)
Browse files Browse the repository at this point in the history

Summary:
X-link: pytorch/benchmark#1891


Reviewed By: xmfan, xuzhao9, xw285cornell

Differential Revision: D48523883
  • Loading branch information
mjanderson09 authored and facebook-github-bot committed Oct 2, 2023
1 parent 7e6cf04 commit 5e01ca1
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
2 changes: 1 addition & 1 deletion torch/_inductor/codegen/triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -2543,7 +2543,7 @@ def codegen_node_schedule(self, node_schedule, numel, reduction_numel):
node.mark_run()

kernel_name = self.define_kernel(src_code, node_schedule)

log.debug("Generating kernel code with kernel_name: %s", kernel_name)
self.codegen_comment(node_schedule)
kernel.call_kernel(kernel_name)
V.graph.removed_buffers |= kernel.removed_buffers
Expand Down
25 changes: 22 additions & 3 deletions torch/_inductor/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,8 @@ def get_estimated_runtime(self) -> float:
dtype = None
if not self.node:
assert self.snodes
if not self.snodes[0].node:
return 0
layout = self.snodes[0].node.get_layout()
dtype = self.snodes[0].node.get_dtype()
else:
Expand Down Expand Up @@ -530,19 +532,23 @@ def get_estimated_runtime(self) -> float:
from .ir import ir_node_to_tensor

fake_inputs = [
ir_node_to_tensor(input) for input in self.node.inputs
ir_node_to_tensor(input, guard_shape=False)
for input in self.node.inputs
]
cls = self.node.__class__
cls.process_kernel(op, *fake_inputs, **self.node.kwargs)

# TODO(xmfan): find a better heuristic to model FLOPS/latency relationship
factor = 0.5
factor = 1.0
counted_flops = flop_counter_mode.get_total_flops()
return factor * counted_flops / gpu_flops

# Return estimated runtime in nanoseconds
return (factor * counted_flops / gpu_flops) * 1e9

elif isinstance(self, FusedSchedulerNode) or isinstance(
self.node, ComputedBuffer
):
# Return estimated runtime in nanoseconds (bytes / gbps)
return self.get_read_write_buffers_sizes() / gpu_memory_bandwidth

# TODO(xmfan): add support for CollectiveKernel
Expand Down Expand Up @@ -944,6 +950,7 @@ def __init__(
else:
self.scheduler = scheduler
self.snodes = nodes
self.node = None

self.node = None
self.users = None
Expand Down Expand Up @@ -1822,6 +1829,18 @@ def get_order(n):
@dynamo_timed
def codegen(self):
for node in self.nodes:
try:
log.debug(
"Generating code for node %s with estimated runtime %f",
node.get_name(),
node.get_estimated_runtime(),
)
except Exception:
log.error(
"Generating code for node %s with estimated runtime 0.0",
node.get_name(),
)

self.enter_context(node)

if not isinstance(node, NopKernelSchedulerNode):
Expand Down

0 comments on commit 5e01ca1

Please sign in to comment.