Merge branch 'main' into sminakov/tensor-constr-rem
sminakov-tt authored Jan 22, 2025
2 parents b64c70d + 1c5542a commit f001e79
Showing 54 changed files with 2,112 additions and 144 deletions.
19 changes: 17 additions & 2 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -145,6 +145,7 @@ on:
- eltwise.unary.reciprocal.reciprocal_sharded
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
- eltwise.unary.silu.silu_llama
- eltwise.unary.glu.glu
- eltwise.unary.geglu.geglu
- eltwise.unary.swiglu.swiglu
@@ -190,8 +191,10 @@ on:
- eltwise.unary.selu.selu
- eltwise.unary.selu.selu_sharded
- eltwise.unary.softshrink.softshrink_sharded
- eltwise.unary_backward.fill_zero_bw
- eltwise.unary_backward.log_sigmoid_bw
- eltwise.unary_backward.fill_zero_bw.fill_zero_bw
- eltwise.unary_backward.fill_zero_bw.fill_zero_bw_sharded
- eltwise.unary_backward.log_sigmoid_bw.log_sigmoid_bw
- eltwise.unary_backward.log_sigmoid_bw.log_sigmoid_bw_sharded
- eltwise.unary_backward.logit_bw
- eltwise.unary_backward.neg_bw.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw_sharded
@@ -200,19 +203,26 @@ on:
- eltwise.unary_backward.acos_bw.acos_bw
- eltwise.unary_backward.acos_bw.acos_bw_sharded
- eltwise.unary_backward.acosh_bw.acosh_bw
- eltwise.unary_backward.acosh_bw.acosh_bw_sharded
- eltwise.unary_backward.atan_bw.atan_bw
- eltwise.unary_backward.atan_bw.atan_bw_sharded
- eltwise.unary_backward.cos_bw.cos_bw
- eltwise.unary_backward.cos_bw.cos_bw_sharded
- eltwise.unary_backward.frac_bw.frac_bw
- eltwise.unary_backward.frac_bw.frac_bw_sharded
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.i0_bw.i0_bw_sharded
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw_sharded
- eltwise.unary_backward.relu_bw.relu_bw
- eltwise.unary_backward.relu_bw.relu_bw_sharded
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw_sharded
- eltwise.unary_backward.sigmoid_bw.sigmoid_bw
- eltwise.unary_backward.tan_bw.tan_bw
- eltwise.unary_backward.tan_bw.tan_bw_sharded
- eltwise.unary_backward.trunc_bw.trunc_bw
- eltwise.unary_backward.trunc_bw.trunc_bw_sharded
- eltwise.unary_backward.clamp_bw.clamp_bw
- eltwise.unary_backward.hardtanh_bw.hardtanh_bw
- eltwise.unary_backward.mul_bw.mul_bw
@@ -288,6 +298,7 @@ on:
- eltwise.binary.add.add_set2_pytorch2
- eltwise.binary.add.add_different_memory_configs
- eltwise.binary.add.add_forge
- eltwise.binary.add.add_llama
- eltwise.unary.gtz.gtz
- eltwise.unary.ltz.ltz
- eltwise.unary.gez.gez
@@ -314,6 +325,8 @@ on:
- eltwise.binary.multiply.mul_tensor_pytorch2
- eltwise.binary.multiply.multiply_scalar_pytorch2
- eltwise.binary.multiply.multiply_forge
- eltwise.binary.multiply.multiply_llama
- eltwise.binary.multiply.mul_no_act_llama
- eltwise.binary.div.div
- eltwise.binary.div.div_tensor_pytorch2
- eltwise.binary.div.div_forge
@@ -411,7 +424,9 @@ on:
- fused.layer_norm_traces
- reduction.backward.prod_bw.prod_bw
- reduction.topk.topk
- reduction.topk.topk_output
- reduction.argmax.argmax
- reduction.argmax.argmax_output
- reduction.prod
- reduction.sum
- reduction.var.var
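Each entry in the list above names a sweep module. Judging from the new files added later in this diff, a dotted sweep name appears to map onto a file path under tests/sweep_framework/sweeps/; the helper below is purely illustrative (it is not part of the commit) and assumes that convention holds for every entry.

# Illustrative only: name-to-path mapping inferred from the files added in this
# commit, e.g. eltwise.binary.add.add_llama ->
# tests/sweep_framework/sweeps/eltwise/binary/add/add_llama.py
def sweep_name_to_path(name: str) -> str:
    return "tests/sweep_framework/sweeps/" + name.replace(".", "/") + ".py"

print(sweep_name_to_path("eltwise.binary.add.add_llama"))
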
2 changes: 1 addition & 1 deletion build_metal.sh
@@ -250,7 +250,7 @@ if [ "$build_ttnn_tests" = "ON" ]; then
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
fi

if [ "$build_tt_umd_tests" = "ON" ]; then
if [ "$build_umd_tests" = "ON" ]; then
cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
fi

55 changes: 44 additions & 11 deletions tests/scripts/test_moreh_microbenchmark.py
@@ -78,6 +78,14 @@ def append_to_csv(file_path, header, data, write_header=True):
def profile_results():
setup = device_post_proc_config.default_setup()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"device_fw_duration": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-FW" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-FW" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
total_cycle = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["device_fw_duration"]["stats"][
@@ -89,6 +97,14 @@ def profile_results():
def profile_results_kernel_duration():
setup = device_post_proc_config.default_setup()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"device_kernel_duration": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
total_cycle = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["device_kernel_duration"]["stats"][
Expand All @@ -108,6 +124,14 @@ def get_device_freq():
def profile_noc_results():
setup = device_post_proc_config.test_noc()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"NoC For Loop": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
min = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["NoC For Loop"]["stats"]["Min"]
@@ -820,8 +844,10 @@ def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_for
data.append([throughput])
# check within range
dev_freq = get_device_freq()
bw_bound = 260.0 * dev_freq / 1000.0
assert bw_bound <= throughput
bw_lower_bound = 260.0 * dev_freq / 1000.0
bw_upper_bound = bw_lower_bound + 10.0
assert bw_lower_bound <= throughput
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
@@ -949,15 +975,19 @@ def test_dram_read_l1_write_core(
data.append([throughput])
# check within range
if arch == "grayskull":
bw_bound = 70.0 # Equals 85 GB/s with 1200 MHz
bw_lower_bound = 70.0 # Equals 85 GB/s with 1200 MHz
elif arch == "wormhole_b0":
bw_bound = 260.0
bw_lower_bound = 260.0
elif arch == "blackhole":
bw_bound = 340.0
bw_lower_bound = 340.0
if bw_target is not None:
bw_bound = bw_target
bw_bound = bw_bound * dev_freq / 1000.0 # Adjust for device frequency; target is based on max device frequency
assert bw_bound <= throughput
bw_lower_bound = bw_target
bw_lower_bound = (
bw_lower_bound * dev_freq / 1000.0
) # Adjust for device frequency; target is based on max device frequency
assert bw_lower_bound <= throughput
bw_upper_bound = bw_lower_bound + 10.0
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
@@ -1055,15 +1085,18 @@ def test_dram_read_remote_cb_sync(
logger.info("DRAM read throughput: " + str(throughput))
data.append([throughput])
# check within range
bw_lower_bound = 0.0
if test == None:
if arch == "wormhole_b0":
bw_bound = 21.5
bw_lower_bound = 21.5
elif test == "Matmul":
if arch == "wormhole_b0":
bw_bound = 18.0
bw_lower_bound = 18.0
bw_upper_bound = bw_lower_bound + 4.0
if use_sub_devices:
pytest.xfail("Tests using sub-devices are not correctly set up for BW measurements")
assert bw_bound <= throughput
assert bw_lower_bound <= throughput
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
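The recurring change in this file replaces the single lower-bound assertion on measured DRAM bandwidth with a lower bound scaled by the measured device frequency plus a fixed upper-bound margin. A minimal sketch of that pattern, with the helper name, reference frequency, and margin chosen here for illustration only:

# Sketch of the bandwidth check introduced above (illustrative helper, not part
# of the commit). The target is quoted at a reference/max device frequency, so
# the lower bound is rescaled by the measured frequency before comparing.
def check_bandwidth(throughput, target, dev_freq_mhz, ref_freq_mhz=1000.0, margin=10.0):
    bw_lower_bound = target * dev_freq_mhz / ref_freq_mhz
    bw_upper_bound = bw_lower_bound + margin
    assert bw_lower_bound <= throughput
    assert throughput <= bw_upper_bound

check_bandwidth(throughput=265.0, target=260.0, dev_freq_mhz=1000.0)
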
102 changes: 102 additions & 0 deletions tests/sweep_framework/sweeps/eltwise/binary/add/add_llama.py
@@ -0,0 +1,102 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import torch
import ttnn
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random


# Parameters provided to the test vector generator are defined here.
# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
# Developers can create their own generator functions and pass them to the parameters as inputs.


parameters = {
"nightly": {
"input_shape": [
{"self": [1, 1, 32, 1024], "other": [1, 1, 32, 1024], "input_dtype": "ttnn.bfloat16"},
{"self": [1, 1, 32, 4096], "other": [1, 1, 32, 4096], "input_dtype": "ttnn.bfloat16"},
],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_b_layout": [ttnn.TILE_LAYOUT],
},
}


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_b_layout"] == ttnn.ROW_MAJOR_LAYOUT:
return True, "Row Major layout is not supported"
return False, None


# These are the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
def run(
input_shape,
input_a_layout,
input_b_layout,
*,
device,
) -> list:
torch.manual_seed(0)
if input_shape["input_dtype"] == "ttnn.bfloat16":
input_dtype = ttnn.bfloat16
elif input_shape["input_dtype"] == "ttnn.float32":
input_dtype = ttnn.float32
elif input_shape["input_dtype"] == "ttnn.int32":
input_dtype = ttnn.int32

torch_input_tensor_a = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_shape["self"])
torch_input_tensor_b = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_shape["other"])

sharded_config = ttnn.create_sharded_memory_config_(
shape=input_shape["self"],
core_grid=ttnn.CoreGrid(y=4, x=8),
strategy=ttnn.ShardStrategy.WIDTH,
orientation=ttnn.ShardOrientation.ROW_MAJOR,
use_height_and_width_as_shard_shape=False,
halo=0,
tile_layout=True,
)
golden_function = ttnn.get_golden_function(ttnn.add)
torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)

input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
dtype=input_dtype,
layout=input_a_layout,
device=device,
memory_config=sharded_config,
)

input_tensor_b = ttnn.from_torch(
torch_input_tensor_b,
dtype=input_dtype,
layout=input_b_layout,
device=device,
memory_config=sharded_config,
)

start_time = start_measuring_time()
result = ttnn.add(input_tensor_a, input_tensor_b, memory_config=sharded_config)
output_tensor = ttnn.to_torch(result)
e2e_perf = stop_measuring_time(start_time)

return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
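The new sweep above is normally driven by the sweep-framework runner, which generates vectors from the parameters dict and calls run with each one. For a quick local sanity check one could call run directly; the sketch below is an assumption about local usage (the device handling and the single hand-written vector are not part of the commit).

# Hypothetical local smoke test for the new add_llama sweep; the sweep runner
# normally supplies the device and iterates over generated test vectors.
import ttnn
from tests.sweep_framework.sweeps.eltwise.binary.add.add_llama import run

device = ttnn.open_device(device_id=0)
try:
    pcc_result, e2e_perf = run(
        {"self": [1, 1, 32, 1024], "other": [1, 1, 32, 1024], "input_dtype": "ttnn.bfloat16"},
        ttnn.TILE_LAYOUT,
        ttnn.TILE_LAYOUT,
        device=device,
    )
    print(pcc_result, e2e_perf)  # (passed, pcc_message) and measured e2e time
finally:
    ttnn.close_device(device)
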
103 additions & 0 deletions — new file (path not shown in this view)
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import torch
import ttnn
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random


# Parameters provided to the test vector generator are defined here.
# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
# Developers can create their own generator functions and pass them to the parameters as inputs.


parameters = {
"nightly": {
"input_spec": [
{"self": [1, 1, 32, 3584], "other": [1, 1, 32, 3584], "input_dtype": "ttnn.bfloat16", "y": 2},
{"self": [1, 1, 32, 14336], "other": [1, 1, 32, 14336], "input_dtype": "ttnn.bfloat16", "y": 4},
],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_b_layout": [ttnn.TILE_LAYOUT],
},
}


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_b_layout"] == ttnn.ROW_MAJOR_LAYOUT:
return True, "Row Major layout is not supported"
return False, None


# These are the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
def run(
input_spec,
input_a_layout,
input_b_layout,
*,
device,
) -> list:
torch.manual_seed(0)
if input_spec["input_dtype"] == "ttnn.bfloat16":
input_dtype = ttnn.bfloat16
elif input_spec["input_dtype"] == "ttnn.float32":
input_dtype = ttnn.float32
elif input_spec["input_dtype"] == "ttnn.int32":
input_dtype = ttnn.int32

torch_input_tensor_a = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_spec["self"])
torch_input_tensor_b = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_spec["other"])

golden_function = ttnn.get_golden_function(ttnn.multiply)
torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)

sharded_config = ttnn.create_sharded_memory_config_(
shape=input_spec["self"],
core_grid=ttnn.CoreGrid(y=input_spec["y"], x=8),
strategy=ttnn.ShardStrategy.WIDTH,
orientation=ttnn.ShardOrientation.ROW_MAJOR,
use_height_and_width_as_shard_shape=False,
halo=0,
tile_layout=True,
)

input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
dtype=input_dtype,
layout=input_a_layout,
device=device,
memory_config=sharded_config,
)

input_tensor_b = ttnn.from_torch(
torch_input_tensor_b,
dtype=input_dtype,
layout=input_b_layout,
device=device,
memory_config=sharded_config,
)

start_time = start_measuring_time()
result = ttnn.multiply(input_tensor_a, input_tensor_b, memory_config=sharded_config)
output_tensor = ttnn.to_torch(result)
e2e_perf = stop_measuring_time(start_time)

return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
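Both new sweeps place their inputs in a width-sharded L1 memory config built with ttnn.create_sharded_memory_config_. As a back-of-envelope check (an inference from the parameters above, not something stated in the commit), the largest shape here shards its last dimension evenly across the 4 x 8 core grid:

# Rough shard-shape arithmetic for the width-sharded config above
# (inferred from the parameters; not stated in the commit).
shape = [1, 1, 32, 14336]                      # largest input_spec in this sweep
num_cores = 4 * 8                              # ttnn.CoreGrid(y=4, x=8)
shard_height = shape[0] * shape[1] * shape[2]  # 32 rows per shard
shard_width = shape[3] // num_cores            # 448 columns, i.e. 14 tiles of 32
print(shard_height, shard_width)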