Merge branch 'main' into sminakov/tensor-constr-rem
sminakov-tt authored Jan 22, 2025
2 parents b64c70d + 1c5542a commit f001e79
Showing 54 changed files with 2,112 additions and 144 deletions.
19 changes: 17 additions & 2 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -145,6 +145,7 @@ on:
- eltwise.unary.reciprocal.reciprocal_sharded
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
- eltwise.unary.silu.silu_llama
- eltwise.unary.glu.glu
- eltwise.unary.geglu.geglu
- eltwise.unary.swiglu.swiglu
@@ -190,8 +191,10 @@ on:
- eltwise.unary.selu.selu
- eltwise.unary.selu.selu_sharded
- eltwise.unary.softshrink.softshrink_sharded
- eltwise.unary_backward.fill_zero_bw
- eltwise.unary_backward.log_sigmoid_bw
- eltwise.unary_backward.fill_zero_bw.fill_zero_bw
- eltwise.unary_backward.fill_zero_bw.fill_zero_bw_sharded
- eltwise.unary_backward.log_sigmoid_bw.log_sigmoid_bw
- eltwise.unary_backward.log_sigmoid_bw.log_sigmoid_bw_sharded
- eltwise.unary_backward.logit_bw
- eltwise.unary_backward.neg_bw.neg_bw
- eltwise.unary_backward.neg_bw.neg_bw_sharded
@@ -200,19 +203,26 @@ on:
- eltwise.unary_backward.acos_bw.acos_bw
- eltwise.unary_backward.acos_bw.acos_bw_sharded
- eltwise.unary_backward.acosh_bw.acosh_bw
- eltwise.unary_backward.acosh_bw.acosh_bw_sharded
- eltwise.unary_backward.atan_bw.atan_bw
- eltwise.unary_backward.atan_bw.atan_bw_sharded
- eltwise.unary_backward.cos_bw.cos_bw
- eltwise.unary_backward.cos_bw.cos_bw_sharded
- eltwise.unary_backward.frac_bw.frac_bw
- eltwise.unary_backward.frac_bw.frac_bw_sharded
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.i0_bw.i0_bw_sharded
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw_sharded
- eltwise.unary_backward.relu_bw.relu_bw
- eltwise.unary_backward.relu_bw.relu_bw_sharded
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw
- eltwise.unary_backward.rsqrt_bw.rsqrt_bw_sharded
- eltwise.unary_backward.sigmoid_bw.sigmoid_bw
- eltwise.unary_backward.tan_bw.tan_bw
- eltwise.unary_backward.tan_bw.tan_bw_sharded
- eltwise.unary_backward.trunc_bw.trunc_bw
- eltwise.unary_backward.trunc_bw.trunc_bw_sharded
- eltwise.unary_backward.clamp_bw.clamp_bw
- eltwise.unary_backward.hardtanh_bw.hardtanh_bw
- eltwise.unary_backward.mul_bw.mul_bw
@@ -288,6 +298,7 @@ on:
- eltwise.binary.add.add_set2_pytorch2
- eltwise.binary.add.add_different_memory_configs
- eltwise.binary.add.add_forge
- eltwise.binary.add.add_llama
- eltwise.unary.gtz.gtz
- eltwise.unary.ltz.ltz
- eltwise.unary.gez.gez
@@ -314,6 +325,8 @@ on:
- eltwise.binary.multiply.mul_tensor_pytorch2
- eltwise.binary.multiply.multiply_scalar_pytorch2
- eltwise.binary.multiply.multiply_forge
- eltwise.binary.multiply.multiply_llama
- eltwise.binary.multiply.mul_no_act_llama
- eltwise.binary.div.div
- eltwise.binary.div.div_tensor_pytorch2
- eltwise.binary.div.div_forge
@@ -411,7 +424,9 @@ on:
- fused.layer_norm_traces
- reduction.backward.prod_bw.prod_bw
- reduction.topk.topk
- reduction.topk.topk_output
- reduction.argmax.argmax
- reduction.argmax.argmax_output
- reduction.prod
- reduction.sum
- reduction.var.var
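Each entry in the list above names a sweep module. Judging from the new files added later in this diff, a dotted sweep name appears to map onto a file path under tests/sweep_framework/sweeps/; the helper below is purely illustrative (it is not part of the commit) and assumes that convention holds for every entry.

# Illustrative only: name-to-path mapping inferred from the files added in this
# commit, e.g. eltwise.binary.add.add_llama ->
# tests/sweep_framework/sweeps/eltwise/binary/add/add_llama.py
def sweep_name_to_path(name: str) -> str:
    return "tests/sweep_framework/sweeps/" + name.replace(".", "/") + ".py"

print(sweep_name_to_path("eltwise.binary.add.add_llama"))
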
2 changes: 1 addition & 1 deletion build_metal.sh
@@ -250,7 +250,7 @@ if [ "$build_ttnn_tests" = "ON" ]; then
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
fi

if [ "$build_tt_umd_tests" = "ON" ]; then
if [ "$build_umd_tests" = "ON" ]; then
cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
fi

55 changes: 44 additions & 11 deletions tests/scripts/test_moreh_microbenchmark.py
@@ -78,6 +78,14 @@ def append_to_csv(file_path, header, data, write_header=True):
def profile_results():
setup = device_post_proc_config.default_setup()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"device_fw_duration": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-FW" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-FW" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
total_cycle = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["device_fw_duration"]["stats"][
@@ -89,6 +97,14 @@ def profile_results():
def profile_results_kernel_duration():
setup = device_post_proc_config.default_setup()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"device_kernel_duration": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
total_cycle = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["device_kernel_duration"]["stats"][
Expand All @@ -108,6 +124,14 @@ def get_device_freq():
def profile_noc_results():
setup = device_post_proc_config.test_noc()
setup.deviceInputLog = profiler_log_path
setup.timerAnalysis = {
"NoC For Loop": {
"across": "device",
"type": "session_first_last",
"start": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
"end": {"core": "ANY", "risc": "ANY", "zone_name": [f"{risc}-KERNEL" for risc in setup.riscTypes]},
},
}
devices_data = import_log_run_stats(setup)
deviceID = list(devices_data["devices"].keys())[0]
min = devices_data["devices"][deviceID]["cores"]["DEVICE"]["analysis"]["NoC For Loop"]["stats"]["Min"]
@@ -820,8 +844,10 @@ def test_dram_read_all_core(arch, freq, test_vector, num_tests, nblock, data_for
data.append([throughput])
# check within range
dev_freq = get_device_freq()
bw_bound = 260.0 * dev_freq / 1000.0
assert bw_bound <= throughput
bw_lower_bound = 260.0 * dev_freq / 1000.0
bw_upper_bound = bw_lower_bound + 10.0
assert bw_lower_bound <= throughput
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
@@ -949,15 +975,19 @@ def test_dram_read_l1_write_core(
data.append([throughput])
# check within range
if arch == "grayskull":
bw_bound = 70.0 # Equals 85 GB/s with 1200 MHz
bw_lower_bound = 70.0 # Equals 85 GB/s with 1200 MHz
elif arch == "wormhole_b0":
bw_bound = 260.0
bw_lower_bound = 260.0
elif arch == "blackhole":
bw_bound = 340.0
bw_lower_bound = 340.0
if bw_target is not None:
bw_bound = bw_target
bw_bound = bw_bound * dev_freq / 1000.0 # Adjust for device frequency; target is based on max device frequency
assert bw_bound <= throughput
bw_lower_bound = bw_target
bw_lower_bound = (
bw_lower_bound * dev_freq / 1000.0
) # Adjust for device frequency; target is based on max device frequency
assert bw_lower_bound <= throughput
bw_upper_bound = bw_lower_bound + 10.0
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
@@ -1055,15 +1085,18 @@ def test_dram_read_remote_cb_sync(
logger.info("DRAM read throughput: " + str(throughput))
data.append([throughput])
# check within range
bw_lower_bound = 0.0
if test == None:
if arch == "wormhole_b0":
bw_bound = 21.5
bw_lower_bound = 21.5
elif test == "Matmul":
if arch == "wormhole_b0":
bw_bound = 18.0
bw_lower_bound = 18.0
bw_upper_bound = bw_lower_bound + 4.0
if use_sub_devices:
pytest.xfail("Tests using sub-devices are not correctly set up for BW measurements")
assert bw_bound <= throughput
assert bw_lower_bound <= throughput
assert throughput <= bw_upper_bound


@pytest.mark.parametrize(
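The recurring change in this file replaces the single lower-bound assertion on measured DRAM bandwidth with a lower bound scaled by the measured device frequency plus a fixed upper-bound margin. A minimal sketch of that pattern, with the helper name, reference frequency, and margin chosen here for illustration only:

# Sketch of the bandwidth check introduced above (illustrative helper, not part
# of the commit). The target is quoted at a reference/max device frequency, so
# the lower bound is rescaled by the measured frequency before comparing.
def check_bandwidth(throughput, target, dev_freq_mhz, ref_freq_mhz=1000.0, margin=10.0):
    bw_lower_bound = target * dev_freq_mhz / ref_freq_mhz
    bw_upper_bound = bw_lower_bound + margin
    assert bw_lower_bound <= throughput
    assert throughput <= bw_upper_bound

check_bandwidth(throughput=265.0, target=260.0, dev_freq_mhz=1000.0)
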
102 changes: 102 additions & 0 deletions tests/sweep_framework/sweeps/eltwise/binary/add/add_llama.py
@@ -0,0 +1,102 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import torch
import ttnn
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random


# Parameters provided to the test vector generator are defined here.
# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
# Developers can create their own generator functions and pass them to the parameters as inputs.


parameters = {
"nightly": {
"input_shape": [
{"self": [1, 1, 32, 1024], "other": [1, 1, 32, 1024], "input_dtype": "ttnn.bfloat16"},
{"self": [1, 1, 32, 4096], "other": [1, 1, 32, 4096], "input_dtype": "ttnn.bfloat16"},
],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_b_layout": [ttnn.TILE_LAYOUT],
},
}


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_b_layout"] == ttnn.ROW_MAJOR_LAYOUT:
return True, "Row Major layout is not supported"
return False, None


# These are the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
def run(
input_shape,
input_a_layout,
input_b_layout,
*,
device,
) -> list:
torch.manual_seed(0)
if input_shape["input_dtype"] == "ttnn.bfloat16":
input_dtype = ttnn.bfloat16
elif input_shape["input_dtype"] == "ttnn.float32":
input_dtype = ttnn.float32
elif input_shape["input_dtype"] == "ttnn.int32":
input_dtype = ttnn.int32

torch_input_tensor_a = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_shape["self"])
torch_input_tensor_b = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_shape["other"])

sharded_config = ttnn.create_sharded_memory_config_(
shape=input_shape["self"],
core_grid=ttnn.CoreGrid(y=4, x=8),
strategy=ttnn.ShardStrategy.WIDTH,
orientation=ttnn.ShardOrientation.ROW_MAJOR,
use_height_and_width_as_shard_shape=False,
halo=0,
tile_layout=True,
)
golden_function = ttnn.get_golden_function(ttnn.add)
torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)

input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
dtype=input_dtype,
layout=input_a_layout,
device=device,
memory_config=sharded_config,
)

input_tensor_b = ttnn.from_torch(
torch_input_tensor_b,
dtype=input_dtype,
layout=input_b_layout,
device=device,
memory_config=sharded_config,
)

start_time = start_measuring_time()
result = ttnn.add(input_tensor_a, input_tensor_b, memory_config=sharded_config)
output_tensor = ttnn.to_torch(result)
e2e_perf = stop_measuring_time(start_time)

return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
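The new sweep above is normally driven by the sweep-framework runner, which generates vectors from the parameters dict and calls run with each one. For a quick local sanity check one could call run directly; the sketch below is an assumption about local usage (the device handling and the single hand-written vector are not part of the commit).

# Hypothetical local smoke test for the new add_llama sweep; the sweep runner
# normally supplies the device and iterates over generated test vectors.
import ttnn
from tests.sweep_framework.sweeps.eltwise.binary.add.add_llama import run

device = ttnn.open_device(device_id=0)
try:
    pcc_result, e2e_perf = run(
        {"self": [1, 1, 32, 1024], "other": [1, 1, 32, 1024], "input_dtype": "ttnn.bfloat16"},
        ttnn.TILE_LAYOUT,
        ttnn.TILE_LAYOUT,
        device=device,
    )
    print(pcc_result, e2e_perf)  # (passed, pcc_message) and measured e2e time
finally:
    ttnn.close_device(device)
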
103 additions & 0 deletions — new file (path not shown in this view)
@@ -0,0 +1,103 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import torch
import ttnn
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt

from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random


# Parameters provided to the test vector generator are defined here.
# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
# Developers can create their own generator functions and pass them to the parameters as inputs.


parameters = {
"nightly": {
"input_spec": [
{"self": [1, 1, 32, 3584], "other": [1, 1, 32, 3584], "input_dtype": "ttnn.bfloat16", "y": 2},
{"self": [1, 1, 32, 14336], "other": [1, 1, 32, 14336], "input_dtype": "ttnn.bfloat16", "y": 4},
],
"input_a_layout": [ttnn.TILE_LAYOUT],
"input_b_layout": [ttnn.TILE_LAYOUT],
},
}


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT or test_vector["input_b_layout"] == ttnn.ROW_MAJOR_LAYOUT:
return True, "Row Major layout is not supported"
return False, None


# These are the run instructions for the test, defined by the developer.
# The run function must take the above-defined parameters as inputs.
# The runner will call this run function with each test vector, and the returned results from this function will be stored.
# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
def run(
input_spec,
input_a_layout,
input_b_layout,
*,
device,
) -> list:
torch.manual_seed(0)
if input_spec["input_dtype"] == "ttnn.bfloat16":
input_dtype = ttnn.bfloat16
elif input_spec["input_dtype"] == "ttnn.float32":
input_dtype = ttnn.float32
elif input_spec["input_dtype"] == "ttnn.int32":
input_dtype = ttnn.int32

torch_input_tensor_a = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_spec["self"])
torch_input_tensor_b = gen_func_with_cast_tt(
partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
)(input_spec["other"])

golden_function = ttnn.get_golden_function(ttnn.multiply)
torch_output_tensor = golden_function(torch_input_tensor_a, torch_input_tensor_b)

sharded_config = ttnn.create_sharded_memory_config_(
shape=input_spec["self"],
core_grid=ttnn.CoreGrid(y=input_spec["y"], x=8),
strategy=ttnn.ShardStrategy.WIDTH,
orientation=ttnn.ShardOrientation.ROW_MAJOR,
use_height_and_width_as_shard_shape=False,
halo=0,
tile_layout=True,
)

input_tensor_a = ttnn.from_torch(
torch_input_tensor_a,
dtype=input_dtype,
layout=input_a_layout,
device=device,
memory_config=sharded_config,
)

input_tensor_b = ttnn.from_torch(
torch_input_tensor_b,
dtype=input_dtype,
layout=input_b_layout,
device=device,
memory_config=sharded_config,
)

start_time = start_measuring_time()
result = ttnn.multiply(input_tensor_a, input_tensor_b, memory_config=sharded_config)
output_tensor = ttnn.to_torch(result)
e2e_perf = stop_measuring_time(start_time)

return [check_with_pcc(torch_output_tensor, output_tensor, 0.999), e2e_perf]
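Both new sweeps place their inputs in a width-sharded L1 memory config built with ttnn.create_sharded_memory_config_. As a back-of-envelope check (an inference from the parameters above, not something stated in the commit), the largest shape here shards its last dimension evenly across the 4 x 8 core grid:

# Rough shard-shape arithmetic for the width-sharded config above
# (inferred from the parameters; not stated in the commit).
shape = [1, 1, 32, 14336]                      # largest input_spec in this sweep
num_cores = 4 * 8                              # ttnn.CoreGrid(y=4, x=8)
shard_height = shape[0] * shape[1] * shape[2]  # 32 rows per shard
shard_width = shape[3] // num_cores            # 448 columns, i.e. 14 tiles of 32
print(shard_height, shard_width)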