Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow benchmark_model to accept args and kwargs #586

Merged
merged 1 commit into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions benchmarks/benchmark_aq.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,14 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, kwargs=None):
# warmup
WARMUP = 5
RUNS = 100
input_tensor = example_inputs[0]
m = torch.compile(m, mode='max-autotune', fullgraph=True)

benchmark_model(m, WARMUP, input_tensor)
elapsed_time = benchmark_model(m, RUNS, input_tensor)
benchmark_model(m, WARMUP, example_inputs)
elapsed_time = benchmark_model(m, RUNS, example_inputs)

m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True)
benchmark_model(m_ref, WARMUP, input_tensor)
ref_elapsed_time = benchmark_model(m_ref, RUNS, input_tensor)
benchmark_model(m_ref, WARMUP, example_inputs)
ref_elapsed_time = benchmark_model(m_ref, RUNS, example_inputs)

print(f"elapsed time: {elapsed_time}, ref elapsed time: {ref_elapsed_time}")
assert elapsed_time < 1.05 * ref_elapsed_time
Expand Down
2 changes: 1 addition & 1 deletion test/integration/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -1532,7 +1532,7 @@ def run_benchmark_model(self, device):
example_inputs = m.example_inputs(dtype=dtype, device=device)
m_bf16 = torch.compile(m_bf16, mode='max-autotune')
num_runs = 1
return benchmark_model(m_bf16, num_runs, example_inputs[0])
return benchmark_model(m_bf16, num_runs, example_inputs)

@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
def test_benchmark_model_cuda(self):
Expand Down
4 changes: 2 additions & 2 deletions torchao/quantization/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,9 @@ from torchao.utils import benchmark_model

num_runs = 100
torch._dynamo.reset()
bf16_time = benchmark_model(m_bf16, num_runs, example_inputs[0])
bf16_time = benchmark_model(m_bf16, num_runs, example_inputs)
print(f"bf16 mean time: {bf16_time}")
int4_time = benchmark_model(m, num_runs, example_inputs[0])
int4_time = benchmark_model(m, num_runs, example_inputs)
print(f"int4 weight only quantized mean time: {int4_time}")
print(f"speedup: {bf16_time / int4_time}")

Expand Down
20 changes: 14 additions & 6 deletions torchao/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,16 @@ def _assert_and_get_unique_device(module: torch.nn.Module) -> Any:
return device


def benchmark_model(model, num_runs, input_tensor):
device_type = _assert_and_get_unique_device(model).type
def benchmark_model(model, num_runs, args=(), kwargs=None, device_type=None):
"""Benchmark model runs with `args` and `kwargs` both are optional
"""
if kwargs is None:
kwargs = {}

if device_type is None:
assert isinstance(model, torch.nn.Module), "Expecting `model` to be torch.nn.Module if device_type is not provided"
device_type = _assert_and_get_unique_device(model).type

if device_type == "cuda":
torch.cuda.synchronize()
start_event = torch.cuda.Event(enable_timing=True)
Expand All @@ -53,7 +61,7 @@ def benchmark_model(model, num_runs, input_tensor):
# benchmark
for _ in range(num_runs):
with torch.autograd.profiler.record_function("timed region"):
model(input_tensor)
model(*args, **kwargs)

end_event.record()
torch.cuda.synchronize()
Expand All @@ -68,7 +76,7 @@ def benchmark_model(model, num_runs, input_tensor):
# benchmark
for _ in range(num_runs):
with torch.autograd.profiler.record_function("timed region"):
model(input_tensor)
model(*args, **kwargs)

end_event.record()
torch.mps.synchronize()
Expand All @@ -81,7 +89,7 @@ def benchmark_model(model, num_runs, input_tensor):
# benchmark
for _ in range(num_runs):
with torch.autograd.profiler.record_function("timed region"):
model(input_tensor)
model(*args, **kwargs)

end_time = time.time()
torch.cpu.synchronize()
Expand Down Expand Up @@ -264,7 +272,7 @@ def unwrap_tensor_subclass(model, filter_fn=None):
parametrize.register_parametrization(child, "weight", UnwrapTensorSubclass())
unwrap_tensor_subclass(child)
return model

def is_fbcode():
return not hasattr(torch.version, "git_version")

Expand Down
8 changes: 4 additions & 4 deletions tutorials/quantize_vit/run_vit_b.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@
model.eval().cuda().to(torch.bfloat16)

# Input tensor (batch_size, channels, height, width)
input_tensor = torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device='cuda')
inputs = (torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device='cuda'),)

model = torch.compile(model, mode='max-autotune')

# Must run with no_grad when optimizing for inference
with torch.no_grad():
# warmup
benchmark_model(model, 5, input_tensor)
benchmark_model(model, 5, inputs)
# benchmark
print("elapsed_time: ", benchmark_model(model, 100, input_tensor), " milliseconds")
print("elapsed_time: ", benchmark_model(model, 100, inputs), " milliseconds")
# Create a trace
profiler_runner("bfloat16.json.gz", benchmark_model, model, 5, input_tensor)
profiler_runner("bfloat16.json.gz", benchmark_model, model, 5, inputs)
8 changes: 4 additions & 4 deletions tutorials/quantize_vit/run_vit_b_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
model.eval().cuda().to(torch.bfloat16)

# Input tensor (batch_size, channels, height, width)
input_tensor = torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device='cuda')
inputs = (torch.randn(1, 3, 224, 224, dtype=torch.bfloat16, device='cuda'),)

## Quantization code - start
# int8 dynamic quantization act, int8 weight, see ao/torchao/quantization/README.md
Expand All @@ -39,8 +39,8 @@
# Must run with no_grad when optimizing for inference
with torch.no_grad():
# warmup
benchmark_model(model, 20, input_tensor)
benchmark_model(model, 20, inputs)
# benchmark
print("elapsed_time: ", benchmark_model(model, 1000, input_tensor), " milliseconds")
print("elapsed_time: ", benchmark_model(model, 1000, inputs), " milliseconds")
# Create a trace
profiler_runner("quant.json.gz", benchmark_model, model, 5, input_tensor)
profiler_runner("quant.json.gz", benchmark_model, model, 5, inputs)
Loading