diff --git a/.clang-format-ignore b/.clang-format-ignore
index 4a9d3656fd9..dd0b7408c8b 100644
--- a/.clang-format-ignore
+++ b/.clang-format-ignore
@@ -67,6 +67,7 @@ tests/tt_metal/test_utils/env_vars.hpp
tests/tt_metal/tt_metal/api/allocator/test_free_list_opt_allocator.cpp
tests/tt_metal/tt_metal/api/test_global_semaphores.cpp
tests/tt_metal/tt_metal/dispatch/sub_device_test_utils.hpp
+tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/pgm_dispatch_golden.json
tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp
tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp
tests/tt_metal/tt_metal/perf_microbenchmark/routing/kernels/traffic_gen_rx.cpp
diff --git a/.github/actions/prepare-metal-run/action.yml b/.github/actions/prepare-metal-run/action.yml
index 874c0223d37..a6784eddc4f 100644
--- a/.github/actions/prepare-metal-run/action.yml
+++ b/.github/actions/prepare-metal-run/action.yml
@@ -2,9 +2,6 @@ name: Prepare Metal Run
description: "Installs Python Dependencies from cache or from PyPI if cache is not available."
inputs:
- arch:
- description: "The architecture to use"
- required: true
is_profiler:
description: "Whether to load with profiler"
required: false
@@ -24,14 +21,14 @@ runs:
- uses: actions/download-artifact@v4
if: ${{ inputs.is_profiler == 'false' }}
with:
- name: TTMetal_build_${{ inputs.arch }}
+ name: TTMetal_build_any
- uses: actions/download-artifact@v4
if: ${{ inputs.is_profiler == 'true' }}
with:
- name: TTMetal_build_${{ inputs.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
shell: bash
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
with:
python-version: ${{ inputs.python-version }}
diff --git a/.github/workflows/_build-wheels-impl.yaml b/.github/workflows/_build-wheels-impl.yaml
index 239729947f0..70e211af017 100644
--- a/.github/workflows/_build-wheels-impl.yaml
+++ b/.github/workflows/_build-wheels-impl.yaml
@@ -6,9 +6,6 @@ on:
os:
required: True
type: string
- arch:
- required: True
- type: string
from-precompiled:
required: True
default: True
@@ -17,8 +14,6 @@ on:
jobs:
build-wheel:
runs-on: ${{ inputs.os }}
- env:
- ARCH_NAME: ${{ inputs.arch }}
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
with:
@@ -57,7 +52,6 @@ jobs:
- uses: ./.github/actions/prepare-metal-run
if: ${{ inputs.from-precompiled }}
with:
- arch: ${{ inputs.arch }}
python-version: ${{ env.python-version }}
- name: Set precompiled dir for precompile builds
if: ${{ inputs.from-precompiled }}
@@ -69,5 +63,5 @@ jobs:
- name: Upload distribution as artifact
uses: actions/upload-artifact@v4
with:
- name: eager-dist-${{ inputs.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ inputs.os }}-any
path: dist/
diff --git a/.github/workflows/_test-wheels-impl.yaml b/.github/workflows/_test-wheels-impl.yaml
index 6049068510c..b61afa66161 100644
--- a/.github/workflows/_test-wheels-impl.yaml
+++ b/.github/workflows/_test-wheels-impl.yaml
@@ -37,7 +37,7 @@ jobs:
os: ${{ matrix.os }}
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.runner-hw-info.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set up end-to-end tests environment
run: ./tests/scripts/set_up_end_to_end_tests_env.sh
- name: Activate env and run release tests - host
@@ -61,7 +61,7 @@ jobs:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.runner-hw-info.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set up end-to-end tests environment
run: ./tests/scripts/set_up_end_to_end_tests_env.sh
- name: Activate env and run release tests - silicon
diff --git a/.github/workflows/all-post-commit-workflows.yaml b/.github/workflows/all-post-commit-workflows.yaml
index 09072086616..f42e4d21e9b 100644
--- a/.github/workflows/all-post-commit-workflows.yaml
+++ b/.github/workflows/all-post-commit-workflows.yaml
@@ -40,11 +40,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
test-wheels:
@@ -74,9 +72,9 @@ jobs:
needs: build-docker-image-2004
uses: ./.github/workflows/build-artifact.yaml
with:
- tracy: true
build-docker: false
build-type: ${{ inputs.build-type || 'Release' }}
+ tracy: true
secrets: inherit
# Slow Dispatch Unit Tests
sd-unit-tests:
diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml
index 2cdfc3b17b4..12bda76c1fc 100644
--- a/.github/workflows/bisect-dispatch.yaml
+++ b/.github/workflows/bisect-dispatch.yaml
@@ -31,8 +31,6 @@ run-name: ${{ inputs.description }}
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '[ "${{ inputs.arch }}" ]'
secrets: inherit
test-dispatch:
needs: build-artifact
@@ -50,9 +48,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ inputs.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/blackhole-post-commit.yaml b/.github/workflows/blackhole-post-commit.yaml
index 72ba467fa92..2a3158a42a8 100644
--- a/.github/workflows/blackhole-post-commit.yaml
+++ b/.github/workflows/blackhole-post-commit.yaml
@@ -32,20 +32,13 @@ jobs:
secrets: inherit
with:
os: "ubuntu-22.04-amd64"
- arch: '["blackhole"]'
build-docker: false
build-wheels:
needs: build-artifact
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: "ubuntu-22.04"
- arch: "blackhole"
from-precompiled: true
-# build-artifact-profiler:
-# uses: ./.github/workflows/build-artifact.yaml
-# with:
-# profiler-build: true
-# secrets: inherit
umd-unit-tests:
secrets: inherit
uses: ./.github/workflows/umd-unit-tests.yaml
diff --git a/.github/workflows/build-and-test-wheels.yaml b/.github/workflows/build-and-test-wheels.yaml
index 3f2385121a2..d21c08d1f76 100644
--- a/.github/workflows/build-and-test-wheels.yaml
+++ b/.github/workflows/build-and-test-wheels.yaml
@@ -23,11 +23,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: ${{ fromJson((github.event_name == 'schedule' || inputs.from-precompiled) && '["ubuntu-20.04"]' || '["ubuntu-20.04", "ubuntu-22.04"]') }}
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: ${{ inputs.from-precompiled }}
test-wheels:
needs: build-wheels
diff --git a/.github/workflows/build-and-unit-tests.yaml b/.github/workflows/build-and-unit-tests.yaml
index ef77466c208..489c4e75d8a 100644
--- a/.github/workflows/build-and-unit-tests.yaml
+++ b/.github/workflows/build-and-unit-tests.yaml
@@ -76,8 +76,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
diff --git a/.github/workflows/build-artifact.yaml b/.github/workflows/build-artifact.yaml
index 5add3efdde1..e1e436fadc4 100644
--- a/.github/workflows/build-artifact.yaml
+++ b/.github/workflows/build-artifact.yaml
@@ -3,10 +3,6 @@ name: "Build tt-metal artifacts"
on:
workflow_call:
inputs:
- arch:
- required: false
- type: string
- default: '["grayskull", "wormhole_b0"]'
build-type:
required: false
type: string
@@ -32,10 +28,6 @@ on:
type: boolean
default: true
description: "Build docker image"
- arch:
- required: false
- type: string
- default: '["grayskull", "wormhole_b0"]'
build-type:
required: false
type: string
@@ -63,12 +55,7 @@ jobs:
needs: build-docker-image
if: always()
timeout-minutes: 30
- strategy:
- matrix:
- arch: ${{ fromJson(inputs.arch || '["grayskull", "wormhole_b0", "blackhole"]') }}
env:
- TT_METAL_ENV: ${{ vars.TT_METAL_ENV }}
- ARCH_NAME: ${{ matrix.arch }}
SILENT: 0
VERBOSE: 1
runs-on:
@@ -123,7 +110,6 @@ jobs:
-v /etc/bashrc:/etc/bashrc:ro
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-v /mnt/MLPerf/ccache:/mnt/MLPerf/ccache
- -e ARCH_NAME=${{ matrix.arch }}
-e CARGO_HOME=${{ github.workspace }}/.cargo
-w ${{ github.workspace }}
run: |
@@ -151,9 +137,9 @@ jobs:
cat build/ccache.stats >> $GITHUB_STEP_SUMMARY
echo '```' >> $GITHUB_STEP_SUMMARY
- name: 'Tar files'
- run: tar -cvhf ttm_${{ matrix.arch }}.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime
+ run: tar -cvhf ttm_any.tar ttnn/ttnn/*.so build/lib ttnn/ttnn/*.so build/programming_examples build/test build/tools build/tt-train data runtime
- name: 'Upload Artifact'
uses: actions/upload-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}${{ (inputs.tracy && '_profiler') || '' }}
- path: ttm_${{ matrix.arch }}.tar
+ name: TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }}
+ path: ttm_any.tar
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index abd651be8ca..dbebda60b94 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -20,13 +20,11 @@ jobs:
#{type: RelWithDebInfo, cxx_compiler: g++-12, c_compiler: gcc-12, runs-on: ["build", "in-service"], os: ubuntu-22.04},
{type: Release, cxx_compiler: g++-12, c_compiler: gcc-12, runs-on: ["build", "in-service"], os: ubuntu-22.04},
]
- arch: [grayskull, wormhole_b0, blackhole]
env:
- ARCH_NAME: ${{ matrix.arch }}
# So we can get all the makefile output we want
VERBOSE: 1
runs-on: ${{ matrix.build.runs-on }}
- name: ${{ matrix.build.type }} ${{ matrix.build.cxx_compiler }} ${{ matrix.arch }} ${{ matrix.build.os }}
+ name: ${{ matrix.build.type }} ${{ matrix.build.cxx_compiler }} any ${{ matrix.build.os }}
steps:
- name: Verify ccache availability
shell: bash
@@ -51,7 +49,6 @@ jobs:
docker_username: ${{ github.actor }}
docker_password: ${{ secrets.GITHUB_TOKEN }}
docker_opts: |
- -e ARCH_NAME=${{ matrix.arch }}
--group-add 1457
-v /home/ubuntu/.ccache-ci:/home/ubuntu/.ccache
-e CCACHE_DIR=/home/ubuntu/.ccache
diff --git a/.github/workflows/cpp-post-commit.yaml b/.github/workflows/cpp-post-commit.yaml
index ff8bb335639..c716f23f796 100644
--- a/.github/workflows/cpp-post-commit.yaml
+++ b/.github/workflows/cpp-post-commit.yaml
@@ -81,8 +81,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
#GH Issue 16167
if: ${{ !(inputs.runner-label == 'BH' && matrix.test-group.name == 'tools') }}
diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml
index 7737395d5d5..2afe136086e 100644
--- a/.github/workflows/docs-latest-public.yaml
+++ b/.github/workflows/docs-latest-public.yaml
@@ -42,9 +42,9 @@ jobs:
os: ubuntu-20.04
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Build Docs
timeout-minutes: 15
diff --git a/.github/workflows/docs-release.yaml b/.github/workflows/docs-release.yaml
index b6c21291ca9..b68f85e61b2 100644
--- a/.github/workflows/docs-release.yaml
+++ b/.github/workflows/docs-release.yaml
@@ -51,9 +51,9 @@ jobs:
os: ubuntu-20.04
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Build Doxygen Docs
timeout-minutes: 15
diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
index cfbaf686cd5..c3e1c4f3879 100644
--- a/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
+++ b/.github/workflows/fast-dispatch-build-and-unit-tests-wrapper.yaml
@@ -11,9 +11,21 @@ jobs:
needs: build-docker-artifact
uses: ./.github/workflows/build-artifact.yaml
secrets: inherit
+ build-wheels:
+ needs: build-artifact
+ strategy:
+ matrix:
+ # Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
+ # The full 22.04 flow can be tested without precompiled
+ os: [ubuntu-20.04]
+ uses: ./.github/workflows/_build-wheels-impl.yaml
+ with:
+ os: ${{ matrix.os }}
+ from-precompiled: true
+ secrets: inherit
# FD Unit Tests
fast-dispatch-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -29,7 +41,7 @@ jobs:
runner-label: ${{ matrix.test-group.runner-label}}
# TTNN FD Unit tests
ttnn-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -46,7 +58,7 @@ jobs:
# FD Model Tests
models-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
@@ -63,7 +75,7 @@ jobs:
# FD C++ Unit Tests
cpp-unit-tests:
- needs: build-artifact
+ needs: build-wheels
secrets: inherit
strategy:
fail-fast: false
diff --git a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
index 2c55a940034..8042f7cd7ca 100644
--- a/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/fast-dispatch-build-and-unit-tests.yaml
@@ -73,7 +73,7 @@ jobs:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
@@ -81,6 +81,8 @@ jobs:
docker_os_arch: tt-metalium/${{ inputs.os }}-amd64
install_wheel: true
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
${{ matrix.test-group.cmd }}
- uses: ./.github/actions/slack-report
diff --git a/.github/workflows/fast-dispatch-frequent-tests-impl.yaml b/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
index 2dbc84d446b..e5dcf724344 100644
--- a/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
+++ b/.github/workflows/fast-dispatch-frequent-tests-impl.yaml
@@ -16,15 +16,13 @@ jobs:
fail-fast: false
matrix:
test-group:
- [
- {
- name: "WH N300 pgm dispatch nightly",
- arch: wormhole_b0,
- runs-on: ["cloud-virtual-machine", "N300", "in-service"],
- cmd: ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.sh,
- timeout: 10
- },
- ]
+ - name: "WH N300 pgm dispatch nightly"
+ arch: wormhole_b0
+ runs-on: ["cloud-virtual-machine", "N300", "in-service"]
+ run-args: |
+ ./build/test/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch_wormhole_b0 --benchmark_out_format=json --benchmark_out=bench.json
+ ./tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/compare_pgm_dispatch_perf_ci.py bench.json
+ timeout: 10
name: ${{ matrix.test-group.name }}
env:
LOGURU_LEVEL: INFO
@@ -32,8 +30,6 @@ jobs:
steps:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ matrix.test-group.timeout }}
uses: ./.github/actions/docker-run
@@ -44,8 +40,7 @@ jobs:
-e TT_METAL_HOME=${{ github.workspace }}
-e ARCH_NAME=${{ matrix.test-group.arch }}
-e LD_LIBRARY_PATH=${{ github.workspace }}/build/lib
- run_args: |
- ${{ matrix.test-group.cmd }}
+ run_args: ${{ matrix.test-group.run-args }}
- uses: ./.github/actions/slack-report
if: ${{ failure() }}
with:
@@ -57,3 +52,9 @@ jobs:
path: |
generated/test_reports/
prefix: "test_reports_"
+ - uses: ./.github/actions/upload-artifact-with-job-uuid
+ if: ${{ !cancelled() }}
+ with:
+ path: |
+ bench.json
+ prefix: "pgm_benchmarks_json_"
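
Note on the step above: the benchmark binary now writes standard Google Benchmark JSON, which the CI script checks against the checked-in golden file (the pgm_dispatch_golden.json entry added to .clang-format-ignore earlier in this diff). A minimal sketch of that kind of comparison follows; it only assumes the documented Google Benchmark JSON layout, and the 5% threshold, key choice, and two-argument CLI are illustrative assumptions rather than the actual contents of compare_pgm_dispatch_perf_ci.py.

```python
#!/usr/bin/env python3
"""Sketch of a golden-file check for Google Benchmark JSON output.

Illustrative only: the repository's compare_pgm_dispatch_perf_ci.py may use
different keys, tolerances, and reporting. Only the JSON layout produced by
--benchmark_out_format=json is assumed here.
"""
import json
import sys

TOLERANCE = 0.05  # assumed 5% regression threshold


def load_times(path):
    with open(path) as f:
        data = json.load(f)
    # Google Benchmark stores one entry per benchmark under "benchmarks".
    return {b["name"]: b["real_time"] for b in data["benchmarks"]}


def main(result_path, golden_path):
    results, golden = load_times(result_path), load_times(golden_path)
    failures = []
    for name, golden_time in golden.items():
        measured = results.get(name)
        if measured is None:
            failures.append(f"{name}: missing from results")
        elif measured > golden_time * (1 + TOLERANCE):
            failures.append(f"{name}: {measured:.1f} exceeds golden {golden_time:.1f} by more than {TOLERANCE:.0%}")
    for line in failures:
        print(line)
    return 1 if failures else 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1], sys.argv[2]))
```
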
diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
index 8b71190eb2b..196bfe013f7 100644
--- a/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
+++ b/.github/workflows/fast-dispatch-full-regressions-and-models-impl.yaml
@@ -66,8 +66,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: ${{ matrix.test-group.timeout }}
@@ -112,8 +110,6 @@ jobs:
run: |
echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 30
@@ -179,8 +175,6 @@ jobs:
run: |
echo "WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 60
diff --git a/.github/workflows/full-regressions-and-models.yaml b/.github/workflows/full-regressions-and-models.yaml
index b5c4cb3a483..0c424f5e4f5 100644
--- a/.github/workflows/full-regressions-and-models.yaml
+++ b/.github/workflows/full-regressions-and-models.yaml
@@ -34,9 +34,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent reg tests scripts
timeout-minutes: 210
diff --git a/.github/workflows/models-post-commit-wrapper.yaml b/.github/workflows/models-post-commit-wrapper.yaml
index 86533af4570..ccdccc25a4a 100644
--- a/.github/workflows/models-post-commit-wrapper.yaml
+++ b/.github/workflows/models-post-commit-wrapper.yaml
@@ -22,11 +22,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
models-unit-tests:
diff --git a/.github/workflows/models-post-commit.yaml b/.github/workflows/models-post-commit.yaml
index 8e14f413db4..6784790f115 100644
--- a/.github/workflows/models-post-commit.yaml
+++ b/.github/workflows/models-post-commit.yaml
@@ -63,13 +63,15 @@ jobs:
command: ./.github/scripts/cloud_utils/mount_weka.sh
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
uses: ./.github/actions/docker-run
with:
install_wheel: true
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
source tests/scripts/run_python_model_tests.sh && run_python_model_tests_${{ inputs.arch }}
- uses: ./.github/actions/slack-report
diff --git a/.github/workflows/package-and-release.yaml b/.github/workflows/package-and-release.yaml
index d695c8245e9..b8834c1a6f5 100644
--- a/.github/workflows/package-and-release.yaml
+++ b/.github/workflows/package-and-release.yaml
@@ -123,11 +123,9 @@ jobs:
strategy:
matrix:
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: false
# Candidate for breaking up
create-and-upload-draft-release:
@@ -139,7 +137,6 @@ jobs:
strategy:
matrix:
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
# May accidentally create two releases without restricting to 1 job
concurrency: create_upload_draft_release
runs-on: ubuntu-latest
@@ -149,7 +146,7 @@ jobs:
- name: Download eager Python packages
uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Create VERSION
run: echo ${{ needs.create-tag.outputs.version }} > VERSION
- name : Download release notes
diff --git a/.github/workflows/perf-device-models-impl.yaml b/.github/workflows/perf-device-models-impl.yaml
index a95f650a568..43610aa2cfd 100644
--- a/.github/workflows/perf-device-models-impl.yaml
+++ b/.github/workflows/perf-device-models-impl.yaml
@@ -34,7 +34,6 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
with:
- arch: ${{ matrix.test-info.arch }}
is_profiler: 'true'
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ matrix.test-info.timeout }}
diff --git a/.github/workflows/perf-models-impl.yaml b/.github/workflows/perf-models-impl.yaml
index 13159b61f16..153e303001e 100644
--- a/.github/workflows/perf-models-impl.yaml
+++ b/.github/workflows/perf-models-impl.yaml
@@ -34,9 +34,9 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-info.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
diff --git a/.github/workflows/publish-release-image-wrapper.yaml b/.github/workflows/publish-release-image-wrapper.yaml
index cc22e9f23c8..45ff119d4d4 100644
--- a/.github/workflows/publish-release-image-wrapper.yaml
+++ b/.github/workflows/publish-release-image-wrapper.yaml
@@ -13,11 +13,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
publish-release-image:
needs: build-wheels
diff --git a/.github/workflows/publish-release-image.yaml b/.github/workflows/publish-release-image.yaml
index 64f8a2f3d29..586cb2c79a3 100644
--- a/.github/workflows/publish-release-image.yaml
+++ b/.github/workflows/publish-release-image.yaml
@@ -37,7 +37,7 @@ jobs:
- name: Download wheels
uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ matrix.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Get the name of the wheel and set up env variables
id: generate-tag-name
run: |
diff --git a/.github/workflows/run-profiler-regression.yaml b/.github/workflows/run-profiler-regression.yaml
index 07290e2fc8a..adbef02dea0 100644
--- a/.github/workflows/run-profiler-regression.yaml
+++ b/.github/workflows/run-profiler-regression.yaml
@@ -31,9 +31,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.runner-info.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run profiler regression tests
timeout-minutes: 30
diff --git a/.github/workflows/single-card-demo-tests-impl.yaml b/.github/workflows/single-card-demo-tests-impl.yaml
index 43780149629..3368012b3b9 100644
--- a/.github/workflows/single-card-demo-tests-impl.yaml
+++ b/.github/workflows/single-card-demo-tests-impl.yaml
@@ -46,8 +46,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: 70
diff --git a/.github/workflows/single-card-demo-tests.yaml b/.github/workflows/single-card-demo-tests.yaml
index ef7c101d8fb..0e98fd9052a 100644
--- a/.github/workflows/single-card-demo-tests.yaml
+++ b/.github/workflows/single-card-demo-tests.yaml
@@ -10,8 +10,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
single-card-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
index c45b33ccccf..205e86cceb9 100644
--- a/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/stress-fast-dispatch-build-and-unit-tests.yaml
@@ -41,9 +41,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.runner-info.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.runner-info.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
index 1976249eba1..ce01df49a5c 100644
--- a/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
+++ b/.github/workflows/stress-slow-dispatch-build-and-unit-tests.yaml
@@ -40,9 +40,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/t3000-demo-tests-impl.yaml b/.github/workflows/t3000-demo-tests-impl.yaml
index 9ad4ab1b818..744f6475d44 100644
--- a/.github/workflows/t3000-demo-tests-impl.yaml
+++ b/.github/workflows/t3000-demo-tests-impl.yaml
@@ -44,9 +44,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml
index 5ed80a3861d..9d1a5ad7e57 100644
--- a/.github/workflows/t3000-demo-tests.yaml
+++ b/.github/workflows/t3000-demo-tests.yaml
@@ -9,8 +9,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-frequent-tests-impl.yaml b/.github/workflows/t3000-frequent-tests-impl.yaml
index 11a2df7b146..f538f9ba3cf 100644
--- a/.github/workflows/t3000-frequent-tests-impl.yaml
+++ b/.github/workflows/t3000-frequent-tests-impl.yaml
@@ -46,9 +46,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml
index dd56ffe0aa1..8ab4ed51dbe 100644
--- a/.github/workflows/t3000-frequent-tests.yaml
+++ b/.github/workflows/t3000-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-model-perf-tests-impl.yaml b/.github/workflows/t3000-model-perf-tests-impl.yaml
index 387a18d15a2..d63b96dd421 100644
--- a/.github/workflows/t3000-model-perf-tests-impl.yaml
+++ b/.github/workflows/t3000-model-perf-tests-impl.yaml
@@ -49,17 +49,17 @@ jobs:
if: ${{ matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
continue-on-error: true
- name: Download build artifact
id: download-artifact
if: ${{ !matrix.test-group.tracy }}
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
if: ${{ matrix.test-group.tracy && steps.download-profiler-artifact.outcome == 'success' || !matrix.test-group.tracy }}
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 15d96746889..aa31d66e875 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -8,13 +8,10 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-model-perf-tests:
diff --git a/.github/workflows/t3000-nightly-tests-impl.yaml b/.github/workflows/t3000-nightly-tests-impl.yaml
index 7b445b3204b..d2bc182e92f 100644
--- a/.github/workflows/t3000-nightly-tests-impl.yaml
+++ b/.github/workflows/t3000-nightly-tests-impl.yaml
@@ -35,9 +35,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-nightly-tests.yaml b/.github/workflows/t3000-nightly-tests.yaml
index 58944fa282f..a62267b3b12 100644
--- a/.github/workflows/t3000-nightly-tests.yaml
+++ b/.github/workflows/t3000-nightly-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-nightly-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-perplexity-tests-impl.yaml b/.github/workflows/t3000-perplexity-tests-impl.yaml
index 6779624d550..9b6384bb491 100644
--- a/.github/workflows/t3000-perplexity-tests-impl.yaml
+++ b/.github/workflows/t3000-perplexity-tests-impl.yaml
@@ -34,9 +34,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run perplexity tests
shell: bash {0}
diff --git a/.github/workflows/t3000-perplexity-tests.yaml b/.github/workflows/t3000-perplexity-tests.yaml
index c7d8f2d16ea..680a564d646 100644
--- a/.github/workflows/t3000-perplexity-tests.yaml
+++ b/.github/workflows/t3000-perplexity-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-model-accuracy-perplexity-tests:
needs: build-artifact
diff --git a/.github/workflows/t3000-profiler-tests-impl.yaml b/.github/workflows/t3000-profiler-tests-impl.yaml
index c1d5cf01247..d9847249087 100644
--- a/.github/workflows/t3000-profiler-tests-impl.yaml
+++ b/.github/workflows/t3000-profiler-tests-impl.yaml
@@ -35,9 +35,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run profiler regression tests
timeout-minutes: 30
diff --git a/.github/workflows/t3000-profiler-tests.yaml b/.github/workflows/t3000-profiler-tests.yaml
index ccc9dda2876..08d5f6ea0dd 100644
--- a/.github/workflows/t3000-profiler-tests.yaml
+++ b/.github/workflows/t3000-profiler-tests.yaml
@@ -10,7 +10,6 @@ jobs:
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
t3000-profiler-tests:
diff --git a/.github/workflows/t3000-unit-tests-impl.yaml b/.github/workflows/t3000-unit-tests-impl.yaml
index f983a14b43a..ea077571775 100644
--- a/.github/workflows/t3000-unit-tests-impl.yaml
+++ b/.github/workflows/t3000-unit-tests-impl.yaml
@@ -47,9 +47,9 @@ jobs:
- uses: ./.github/actions/ensure-active-weka-mount
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
shell: bash {0}
diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml
index c753e82c4ac..9950b40a295 100644
--- a/.github/workflows/t3000-unit-tests.yaml
+++ b/.github/workflows/t3000-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
t3000-unit-tests:
needs: build-artifact
diff --git a/.github/workflows/test-dispatch.yaml b/.github/workflows/test-dispatch.yaml
index c0b7ef92c91..d14ec14f6df 100644
--- a/.github/workflows/test-dispatch.yaml
+++ b/.github/workflows/test-dispatch.yaml
@@ -52,7 +52,6 @@ jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '[ "${{ inputs.arch }}" ]'
build-type: ${{ inputs.build-type }}
tracy: ${{ inputs.tracy }}
secrets: inherit
@@ -75,9 +74,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ inputs.arch }}${{ (inputs.tracy && '_profiler') || '' }}
+ name: TTMetal_build_any${{ (inputs.tracy && '_profiler') || '' }}
- name: Extract files
- run: tar -xvf ttm_${{ inputs.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run pre/post regression tests in a loop
run: |
diff --git a/.github/workflows/tg-demo-tests-impl.yaml b/.github/workflows/tg-demo-tests-impl.yaml
index f4956749a5e..b5547d2abd6 100644
--- a/.github/workflows/tg-demo-tests-impl.yaml
+++ b/.github/workflows/tg-demo-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-demo-tests.yaml b/.github/workflows/tg-demo-tests.yaml
index b8e31e4c49d..343b047db67 100644
--- a/.github/workflows/tg-demo-tests.yaml
+++ b/.github/workflows/tg-demo-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-demo-tests:
needs: build-artifact
diff --git a/.github/workflows/tg-frequent-tests-impl.yaml b/.github/workflows/tg-frequent-tests-impl.yaml
index fbc89ab24d0..a1577350e10 100644
--- a/.github/workflows/tg-frequent-tests-impl.yaml
+++ b/.github/workflows/tg-frequent-tests-impl.yaml
@@ -32,9 +32,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-frequent-tests.yaml b/.github/workflows/tg-frequent-tests.yaml
index 85fb5e16342..285d65e5e27 100644
--- a/.github/workflows/tg-frequent-tests.yaml
+++ b/.github/workflows/tg-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/tg-model-perf-tests-impl.yaml b/.github/workflows/tg-model-perf-tests-impl.yaml
index 2ba22cfc0c8..5ce68339f04 100644
--- a/.github/workflows/tg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tg-model-perf-tests-impl.yaml
@@ -51,9 +51,9 @@ jobs:
- name: Download profiler build artifact
uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}_profiler
+ name: TTMetal_build_any_profiler
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tg-model-perf-tests.yaml b/.github/workflows/tg-model-perf-tests.yaml
index 4202cc46ad3..a8bb64dff46 100644
--- a/.github/workflows/tg-model-perf-tests.yaml
+++ b/.github/workflows/tg-model-perf-tests.yaml
@@ -9,7 +9,6 @@ jobs:
build-artifact-profiler:
uses: ./.github/workflows/build-artifact.yaml
with:
- arch: '["wormhole_b0"]'
tracy: true
secrets: inherit
tg-model-perf-tests:
diff --git a/.github/workflows/tg-nightly-tests.yaml b/.github/workflows/tg-nightly-tests.yaml
index bee91a86e0b..ce8f9897ffb 100644
--- a/.github/workflows/tg-nightly-tests.yaml
+++ b/.github/workflows/tg-nightly-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tg-nightly-tests:
needs: build-artifact
@@ -39,9 +37,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
diff --git a/.github/workflows/tg-unit-tests-impl.yaml b/.github/workflows/tg-unit-tests-impl.yaml
index 500717f87d0..a3d3b109d53 100644
--- a/.github/workflows/tg-unit-tests-impl.yaml
+++ b/.github/workflows/tg-unit-tests-impl.yaml
@@ -65,9 +65,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
timeout-minutes: ${{ matrix.test-group.timeout }}
diff --git a/.github/workflows/tg-unit-tests.yaml b/.github/workflows/tg-unit-tests.yaml
index a28497faedb..dfa3483896b 100644
--- a/.github/workflows/tg-unit-tests.yaml
+++ b/.github/workflows/tg-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
TG-Unit-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-demo-tests.yaml b/.github/workflows/tgg-demo-tests.yaml
index 13f9fc3b8c5..0cab3fdd13d 100644
--- a/.github/workflows/tgg-demo-tests.yaml
+++ b/.github/workflows/tgg-demo-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-demo-tests:
needs: build-artifact
@@ -38,9 +36,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
timeout-minutes: 180
diff --git a/.github/workflows/tgg-frequent-tests-impl.yaml b/.github/workflows/tgg-frequent-tests-impl.yaml
index af54e8e89be..b042635fece 100644
--- a/.github/workflows/tgg-frequent-tests-impl.yaml
+++ b/.github/workflows/tgg-frequent-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
timeout-minutes: 90
diff --git a/.github/workflows/tgg-frequent-tests.yaml b/.github/workflows/tgg-frequent-tests.yaml
index 36355e3a27b..4c15f1c7209 100644
--- a/.github/workflows/tgg-frequent-tests.yaml
+++ b/.github/workflows/tgg-frequent-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-frequent-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-model-perf-tests-impl.yaml b/.github/workflows/tgg-model-perf-tests-impl.yaml
index c79b84b8e01..c487d43d7e3 100644
--- a/.github/workflows/tgg-model-perf-tests-impl.yaml
+++ b/.github/workflows/tgg-model-perf-tests-impl.yaml
@@ -43,9 +43,9 @@ jobs:
echo "PYTHONPATH=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tgg-model-perf-tests.yaml b/.github/workflows/tgg-model-perf-tests.yaml
index c65fc7408d6..6b76f5ab177 100644
--- a/.github/workflows/tgg-model-perf-tests.yaml
+++ b/.github/workflows/tgg-model-perf-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
tgg-model-perf-tests:
needs: build-artifact
diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml
index 12d03f7686b..5313e0610c4 100644
--- a/.github/workflows/tgg-unit-tests-impl.yaml
+++ b/.github/workflows/tgg-unit-tests-impl.yaml
@@ -30,9 +30,9 @@ jobs:
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: actions/download-artifact@v4
with:
- name: TTMetal_build_${{ matrix.test-group.arch }}
+ name: TTMetal_build_any
- name: Extract files
- run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
+ run: tar -xvf ttm_any.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
timeout-minutes: 60
diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml
index f9be79c02f2..6c42ff61f4f 100644
--- a/.github/workflows/tgg-unit-tests.yaml
+++ b/.github/workflows/tgg-unit-tests.yaml
@@ -8,8 +8,6 @@ on:
jobs:
build-artifact:
uses: ./.github/workflows/build-artifact.yaml
- with:
- arch: '["wormhole_b0"]'
secrets: inherit
TGG-tests:
needs: build-artifact
diff --git a/.github/workflows/tt-train-post-commit.yaml b/.github/workflows/tt-train-post-commit.yaml
index 7a8f3971f1e..1ecdcabfd17 100644
--- a/.github/workflows/tt-train-post-commit.yaml
+++ b/.github/workflows/tt-train-post-commit.yaml
@@ -59,8 +59,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ inputs.arch }}
- name: ${{ matrix.test-group.name }} tests
timeout-minutes: ${{ inputs.timeout }}
run: |
diff --git a/.github/workflows/ttnn-post-commit-wrapper.yaml b/.github/workflows/ttnn-post-commit-wrapper.yaml
index 0f6f1f4a56f..324f6582f5d 100644
--- a/.github/workflows/ttnn-post-commit-wrapper.yaml
+++ b/.github/workflows/ttnn-post-commit-wrapper.yaml
@@ -18,11 +18,9 @@ jobs:
# Since pre-compiled builds only run on 20.04, we can only test on 20.04 for now
# The full 22.04 flow can be tested without precompiled
os: [ubuntu-20.04]
- arch: [grayskull, wormhole_b0]
uses: ./.github/workflows/_build-wheels-impl.yaml
with:
os: ${{ matrix.os }}
- arch: ${{ matrix.arch }}
from-precompiled: true
secrets: inherit
ttnn-unit-tests:
diff --git a/.github/workflows/ttnn-post-commit.yaml b/.github/workflows/ttnn-post-commit.yaml
index 3f4a7601bfb..15642748dcc 100644
--- a/.github/workflows/ttnn-post-commit.yaml
+++ b/.github/workflows/ttnn-post-commit.yaml
@@ -79,7 +79,7 @@ jobs:
- uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main
- uses: actions/download-artifact@v4
with:
- name: eager-dist-${{ matrix.os }}-${{ inputs.arch }}
+ name: eager-dist-${{ matrix.os }}-any
- name: Set ttnn fast runtime if exists in config
if: ${{ matrix.test-group.fast_runtime_mode_off }}
run: |
@@ -90,6 +90,8 @@ jobs:
with:
docker_username: ${{ github.actor }}
docker_password: ${{ secrets.GITHUB_TOKEN }}
+ docker_opts: |
+ -e ARCH_NAME=${{ inputs.arch }}
run_args: |
WHEEL_FILENAME=$(ls -1 *.whl)
pip3 install --user $WHEEL_FILENAME
diff --git a/.github/workflows/ttnn-run-sweeps.yaml b/.github/workflows/ttnn-run-sweeps.yaml
index 70354311c6c..8b511dee0a3 100644
--- a/.github/workflows/ttnn-run-sweeps.yaml
+++ b/.github/workflows/ttnn-run-sweeps.yaml
@@ -47,6 +47,7 @@ on:
- eltwise.unary.sin.sin
- eltwise.unary.sin.sin_pytorch2
- eltwise.unary.sin.sin_forge
+ - eltwise.unary.sin.sin_sharded
- eltwise.unary.tril.tril_pytorch2
- eltwise.unary.clamp.clamp
- eltwise.unary.clamp.clamp_forge
@@ -74,8 +75,11 @@ on:
- eltwise.unary.clone.clone
- eltwise.unary.elu.elu
- eltwise.unary.elu.elu_pytorch2
+ - eltwise.unary.elu.elu_sharded
- eltwise.unary.erfc.erfc
- eltwise.unary.erfc.erfc_sharded
+ - eltwise.unary.eqz.eqz
+ - eltwise.unary.eqz.eqz_sharded
- eltwise.unary.exp.exp
- eltwise.unary.exp.exp_sharded
- eltwise.unary.exp.exp_forge
@@ -133,9 +137,12 @@ on:
- eltwise.unary.neg.neg_pytorch2
- eltwise.unary.neg.neg_forge
- eltwise.unary.erf.erf
+ - eltwise.unary.erf.erf_sharded
- eltwise.unary.erfinv.erfinv
- eltwise.unary.erfinv.erfinv_sharded
- eltwise.unary.i0.i0
+ - eltwise.unary.reciprocal.reciprocal
+ - eltwise.unary.reciprocal.reciprocal_sharded
- eltwise.unary.silu.silu
- eltwise.unary.silu.silu_pytorch2
- eltwise.unary.glu.glu
@@ -191,9 +198,11 @@ on:
- eltwise.unary_backward.hardshrink_bw
- eltwise.unary_backward.softshrink_bw
- eltwise.unary_backward.acos_bw.acos_bw
+ - eltwise.unary_backward.acos_bw.acos_bw_sharded
- eltwise.unary_backward.acosh_bw.acosh_bw
- eltwise.unary_backward.atan_bw.atan_bw
- eltwise.unary_backward.cos_bw.cos_bw
+ - eltwise.unary_backward.cos_bw.cos_bw_sharded
- eltwise.unary_backward.frac_bw.frac_bw
- eltwise.unary_backward.i0_bw.i0_bw
- eltwise.unary_backward.rad2deg_bw.rad2deg_bw
@@ -226,9 +235,12 @@ on:
- eltwise.unary_backward.tanh_bw.tanh_bw
- eltwise.unary_backward.sqrt_bw.sqrt_bw
- eltwise.unary_backward.add_bw.add_bw
+ - eltwise.unary_backward.add_bw.add_bw_sharded
- eltwise.unary_backward.assign_bw.assign_bw
- eltwise.unary_backward.fill_bw.fill_bw
+ - eltwise.unary_backward.fill_bw.fill_bw_sharded
- eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw
+ - eltwise.unary_backward.hardsigmoid_bw.hardsigmoid_bw_sharded
- eltwise.unary_backward.lgamma_bw.lgamma_bw
- eltwise.unary_backward.multigammaln_bw.multigammaln_bw
- eltwise.unary_backward.leaky_relu_bw.leaky_relu_bw
@@ -251,6 +263,8 @@ on:
- eltwise.binary_complex.add_bw.add_bw
- eltwise.binary_complex.sub_bw.sub_bw
- eltwise.binary_complex.mul_bw.mul_bw
+ - eltwise.unary.digamma.digamma
+ - eltwise.unary.digamma.digamma_sharded
- eltwise.unary.lgamma.lgamma
- eltwise.unary.lgamma.lgamma_sharded
- eltwise.unary.logit.logit
@@ -278,12 +292,14 @@ on:
- eltwise.unary.ltz.ltz
- eltwise.unary.gez.gez
- eltwise.unary.lez.lez
+ - eltwise.unary.lez.lez_sharded
- eltwise.unary.nez.nez
- eltwise.unary.prelu.prelu
- eltwise.unary.prelu.prelu_sharded
- eltwise.unary.hardswish.hardswish_pytorch2
- eltwise.unary.hardtanh.hardtanh_pytorch2
- eltwise.unary.leaky_relu.leaky_relu
+ - eltwise.unary.leaky_relu.leaky_relu_sharded
- eltwise.unary.reglu.reglu
- eltwise.unary.round.round_sharded
- eltwise.unary_complex.polar.polar
@@ -494,8 +510,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: wormhole_b0
- name: Run ttnn sweeps generation (single sweep)
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sweep_name != 'ALL SWEEPS (Nightly)' }}
run: |
@@ -554,8 +568,6 @@ jobs:
run: |
echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV
- uses: ./.github/actions/prepare-metal-run
- with:
- arch: ${{ matrix.test-group.arch }}
- name: Run ttnn sweeps (single sweep)
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.sweep_name != 'ALL SWEEPS (Nightly)' }}
run: |
diff --git a/Doxyfile b/Doxyfile
index 0714e2f5fff..eaeb342c93c 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -922,9 +922,9 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
-INPUT = tt_metal/hw/inc/dataflow_api.h \
+INPUT = tt_metal/api/tt-metalium/dataflow_api.h \
tt_metal/hw/inc/ethernet/dataflow_api.h \
- tt_metal/host_api.hpp \
+ tt_metal/api/tt-metalium/host_api.hpp \
tt_metal/include/compute_kernel_api/eltwise_unary/erf_erfc.h \
tt_metal/include/compute_kernel_api/eltwise_unary/erfinv.h \
tt_metal/include/compute_kernel_api/eltwise_unary/exp.h \
@@ -953,12 +953,12 @@ INPUT = tt_metal/hw/inc/dataflow_api.h \
tt_metal/include/compute_kernel_api.h \
tt_metal/impl/kernels/kernel_args.hpp \
tt_metal/include/tt_metal/metal.hpp \
- tt_metal/include/tt_metal/types.hpp \
+ tt_metal/api/tt-metalium/types.hpp \
tt_metal/include/tt_metal/buffer.hpp \
tt_metal/include/tt_metal/command_queue.hpp \
tt_metal/include/tt_metal/device.hpp \
tt_metal/include/tt_metal/event.hpp \
- tt_metal/include/tt_metal/global_circular_buffer.hpp \
+ tt_metal/api/tt-metalium/global_circular_buffer.hpp \
tt_metal/include/tt_metal/kernel.hpp \
tt_metal/include/tt_metal/program.hpp \
tt_metal/include/tt_metal/trace.hpp
diff --git a/build_metal.sh b/build_metal.sh
index 7ff69bc1c2d..3e47dd263a3 100755
--- a/build_metal.sh
+++ b/build_metal.sh
@@ -240,7 +240,6 @@ fi
if [ "$build_tests" = "ON" ]; then
cmake_args+=("-DTT_METAL_BUILD_TESTS=ON")
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
- cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
fi
if [ "$build_metal_tests" = "ON" ]; then
@@ -276,7 +275,6 @@ fi
if [ "$build_all" = "ON" ]; then
cmake_args+=("-DTT_METAL_BUILD_TESTS=ON")
cmake_args+=("-DTTNN_BUILD_TESTS=ON")
- cmake_args+=("-DTT_UMD_BUILD_TESTS=ON")
cmake_args+=("-DBUILD_PROGRAMMING_EXAMPLES=ON")
cmake_args+=("-DBUILD_TT_TRAIN=ON")
fi
diff --git a/cmake/helper_functions.cmake b/cmake/helper_functions.cmake
index ca0cdbbbcee..60bc56372b6 100644
--- a/cmake/helper_functions.cmake
+++ b/cmake/helper_functions.cmake
@@ -53,14 +53,7 @@ function(CREATE_PGM_EXAMPLES_EXE TESTLIST SUBDIR)
m
pthread
)
- target_include_directories(
- ${TEST_TARGET}
- PRIVATE
- ${PROJECT_SOURCE_DIR}
- ${PROJECT_SOURCE_DIR}/tt_metal
- ${PROJECT_SOURCE_DIR}/tt_metal/common
- ${CMAKE_CURRENT_SOURCE_DIR}
- )
+ target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
set_target_properties(
${TEST_TARGET}
PROPERTIES
diff --git a/dependencies/CMakeLists.txt b/dependencies/CMakeLists.txt
index 3064d846fe5..f7f9d6c065f 100644
--- a/dependencies/CMakeLists.txt
+++ b/dependencies/CMakeLists.txt
@@ -111,3 +111,24 @@ CPMAddPackage(
OPTIONS
"XTENSOR_ENABLE_TESTS OFF"
)
+
+############################################################################################################################
+# benchmark : https://github.com/google/benchmark
+############################################################################################################################
+
+CPMAddPackage(NAME benchmark GITHUB_REPOSITORY google/benchmark GIT_TAG v1.9.1)
+
+if(benchmark_ADDED)
+ set_target_properties(
+ benchmark
+ PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY
+ "${CMAKE_BINARY_DIR}/lib"
+ )
+endif()
+
+# TODO(afuller): Move this to CPM and use upstream's CMake file, AFTER we move to Ubuntu 22.04 and drop 20.04 and bump
+# our minimum CMake version accordingly. Taskflow's CMake wants v3.18+
+add_library(Taskflow INTERFACE)
+add_library(Taskflow::Taskflow ALIAS Taskflow)
+target_include_directories(Taskflow SYSTEM INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/third_party/taskflow)
diff --git a/models/utility_functions.py b/models/utility_functions.py
index f13fd48d8ca..88dada95370 100644
--- a/models/utility_functions.py
+++ b/models/utility_functions.py
@@ -61,6 +61,27 @@ def torch_random(shape, low, high, dtype):
return torch.zeros(shape, dtype=dtype).uniform_(low, high)
+def torch_random_with_zeros(shape, low, high, dtype, zero_fraction=0.1):
+ total_elements = torch.prod(torch.tensor(shape)).item()
+ num_zeros = int(total_elements * zero_fraction)
+ num_random = total_elements - num_zeros
+
+ # Generate random values between low and high
+ random_values = torch.empty(num_random).uniform_(low, high)
+ zeros = torch.zeros(num_zeros)
+
+ # Combine zeros and random values
+ combined = torch.cat([zeros, random_values])
+
+ # Shuffle the tensor
+ shuffled = combined[torch.randperm(combined.size(0))]
+
+ # Reshape to the desired shape
+ result_tensor = shuffled.view(shape)
+ result_tensor = result_tensor.to(dtype)
+ return result_tensor
+
+
### Profiling ###
class Profiler:
def __init__(self):
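
For context on the helper added above, a hypothetical usage sketch follows (not part of this diff); the shape, bounds, and zero fraction are illustrative only.

```python
import torch

from models.utility_functions import torch_random_with_zeros

# Build a small activation-like tensor in which roughly 10% of entries are exactly
# zero, e.g. for tests that need to exercise zero-handling code paths.
x = torch_random_with_zeros((1, 1, 32, 32), low=-1.0, high=1.0, dtype=torch.bfloat16, zero_fraction=0.1)

assert x.shape == (1, 1, 32, 32)
assert (x == 0).sum().item() >= int(0.1 * x.numel())  # at least the requested number of zeros
```
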
diff --git a/setup.py b/setup.py
index 6cd62f16410..b8ab90ff22f 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@ def get_is_srcdir_build():
def get_arch_name():
- return attempt_get_env_var("ARCH_NAME")
+ return "any"
def get_metal_local_version_scheme(metal_build_config, version):
diff --git a/tech_reports/LLMs/llms.md b/tech_reports/LLMs/llms.md
index 3cd4da7eba5..1ae7f25d0b6 100644
--- a/tech_reports/LLMs/llms.md
+++ b/tech_reports/LLMs/llms.md
@@ -77,7 +77,7 @@ k_heads = ttnn.experimental.rotary_embedding_llama(
)
```
-#### Setting up inputs to RoPE
+#### 2.2.1 Setting up inputs to RoPE
The fused operation uses a different parallelization scheme internally, depending on whether the model is in *prefill* or *decode* mode. The following table describes the shapes and memory configurations for *prefill* and *decode* modes:
@@ -90,7 +90,7 @@ Fused operation uses a different parallelization scheme internally depending on
*Note: (TH, TW) = (TILE_HEIGHT, TILE_WIDTH)*
-#### Decode mode specifics
+#### 2.2.2 Decode mode specifics
The cos/sin matrices are generated in two slightly different ways, depending on the mode of operation. For *prefill* mode, the cos/sin matrices are computed once at initialization using the *prefill* sequence length, and then passed into the RoPE OP. However, in *decode* mode, since the position index of each user is updated from token to token, the cos/sin matrices must be updated across iterations. Here, we leverage our `TtLlamaRotarySetup` module, which can be used at each decode iteration to get the corresponding cos/sin matrices.
The following code sample shows how `TtLlamaRotarySetup` can be used in decode mode:
@@ -139,7 +139,7 @@ out = ttnn.experimental.rotary_embedding_llama(
Normalization is a critical operation in Large Language Models (LLMs), ensuring stable training and efficient inference. Two widely adopted normalization techniques in modern LLMs, **LayerNorm** and **RMSNorm**, are fully supported in TT-NN.
-#### Implementations of Normalization Operations
+#### 2.3.1 Implementations of Normalization Operations
TT-NN includes two primary implementations of normalization operations to handle diverse activation layouts efficiently:
@@ -147,7 +147,7 @@ TT-NN includes two primary implementations of normalization operations to handle
2. **Distributed Norm**
-#### 1. Non-Distributed Norm
+#### 2.3.1.1 Non-Distributed Norm
**Non-Distributed Norm** refers to the standard implementation of normalization operations applied to activations that are not distributed across multiple devices. This type of normalization is suitable for setups where the entire activation or embedding is available locally on a single device or is replicated identically across multiple devices in a data-parallel setup. This implementation supports both sharded and interleaved inputs.
@@ -205,14 +205,11 @@ ttnn_gamma_rm = ttnn.as_tensor(
)
```
-
-
-
-#### 2. Distributed Norm
+#### 2.3.1.2 Distributed Norm
The distributed implementation is designed for cases where activations are **sharded along the embedding dimension** across multiple devices. It ensures the correct computation of mean and variance across shards by leveraging cross-device communication. Both interleaved and width-sharded inputs are supported.
-#### Steps to Perform Distributed Normalization on TT-Devices
+##### 2.3.1.2.1 Steps to Perform Distributed Normalization on TT-Devices
1. **Compute Local Statistics** - Each device computes the required statistics (e.g., \(E[x]\), \(E[x^2]\)) locally on its shard of the input tensor.
- For **RMSNorm**, only \(E[x^2]\) is required.
@@ -258,7 +255,6 @@ The distributed implementation is designed for cases where activations are **sha
```
- **Output**: A tensor of shape `[1, 1, batch, embedding_dim // num_devices]`.
-
> [!NOTE]
> The following inputs are valid for both implementations.
> - **Interleaved Inputs**:
@@ -268,8 +264,7 @@ The distributed implementation is designed for cases where activations are **sha
For width-sharded inputs, the kernel splits the work across the embedding dimension.
This design is more **optimal for decode cases**, where the sequence length is typically `seq_len=1`.
-
-#### References
+#### 2.3.1.3 References
- Non-Distributed Norm Op Code [[1]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/layernorm) [[2]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/rmsnorm)
- Distributed Norm Op Code [[3]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/layernorm_distributed) [[4]](https://github.com/tenstorrent/tt-metal/tree/main/ttnn/cpp/ttnn/operations/normalization/rmsnorm_distributed)
- Non-Distributed Norms Unit Tests [[5]](https://github.com/tenstorrent/tt-metal/blob/main/tests/tt_eager/python_api_testing/unit_testing/misc/test_layernorm_sharded.py) [[6]](https://github.com/tenstorrent/tt-metal/blob/main/tests/tt_eager/python_api_testing/unit_testing/misc/test_layernorm.py)
@@ -345,7 +340,7 @@ Common Terminology:
| bsz | Batch Size |
| batch_id | Batch Index (used for prefill) |
| cur_pos/cur_pos_tensor | List/tensor of current positions in the sequence for each batch. |
-| cache_len | Length of the KV Cache |
+| cache_len | Length of the KV Cache. |
| seqlen | Sequence Length |
| dim | Hidden dimension of input x. |
| head_dim | Hidden dimension of Q, K, V. |
@@ -430,38 +425,39 @@ The attention module in decode mode expects input shape `(1, seqlen=1, bsz, hidd
An end-to-end example of the decode attention module is in the `models/demos/llama3/tt/llama_attention.py` file, under the `forward_decode` method. The decode mode is broken down into the following steps:
-1. QKV projections matmuls.
+1. **QKV Projections Matmuls**
- This works the same as in prefill mode, using `ttnn.linear`. Note that the input shape is `(1, 1, bsz, dim)` instead of `(1, 1, seqlen, dim)`.
- Input/Output shapes:
```python
(1, 1, bsz, dim) -> (1, 1, bsz, (n_q_heads+2*n_kv_heads)*head_dim)
```
-2. Reshape Q, K, V to match the expected input shape for scaled dot product attention.
+2. **Reshape QKV**
+ - Reshape Q, K, and V to match the expected input shape for scaled dot product attention.
- We split the fused QKV tensor into individual Q, K, V tensors using `ttnn.experimental.nlp_create_qkv_heads_decode`.
> [!NOTE]
> This is a different OP than `ttnn.experimental.nlp_create_qkv_heads` used in prefill mode. For example:
->
- ```python
- Q, K, V = ttnn.experimental.nlp_create_qkv_heads_decode(
- xqkv_fused,
- num_heads=n_q_heads,
- num_kv_heads=n_kv_heads,
- memory_config=ttnn.MemoryConfig(
- ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1
- )
- )
- ```
+> ```python
+> Q, K, V = ttnn.experimental.nlp_create_qkv_heads_decode(
+> xqkv_fused,
+> num_heads=n_q_heads,
+> num_kv_heads=n_kv_heads,
+> memory_config=ttnn.MemoryConfig(
+> ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ttnn.BufferType.L1
+> )
+> )
+> ```
+
- **Input/Output Shapes**: The output is height sharded across the batch dimension on `bsz` number of cores.
```python
(1, 1, bsz, (n_q_heads+2*n_kv_heads)*head_dim) -> (1, bsz, n_q_heads, head_dim), (1, bsz, n_kv_heads, head_dim), (1, bsz, n_kv_heads, head_dim)
```
-3. Apply RoPE to Q and K
- - Again, apply the RoPE transformation to Q and K using the rotary embedding op outlined in [2.2 RoPE](#22-rope). The input/output shapes remain the same as in step 2.
+3. **Apply RoPE to Q and K**
+ - Again, apply the RoPE transformation to Q and K using the rotary embedding OP outlined in [2.2 RoPE](#22-rope). The input/output shapes remain the same as in step 2.
-4. Cache K and V
- - Populate the KV cache at `cur_pos` for all batches with the current K and V tensors using the `ttnn.experimental.paged_update_cache` op. This OP takes in an optional `page_table` argument to support paged KV cache updates. Example:
+4. **Cache K and V**
+ - Populate the KV cache at `cur_pos` for all batches with the current K and V tensors using the `ttnn.experimental.paged_update_cache` OP. This OP takes in an optional `page_table` argument to support paged KV cache updates. Example:
```python
ttnn.experimental.paged_update_cache(keys, K, update_idxs=cur_pos, page_table=page_table)
ttnn.experimental.paged_update_cache(values, V, update_idxs=cur_pos, page_table=page_table)
@@ -471,13 +467,13 @@ An end-to-end example of the decode attention module is in the `models/demos/lla
ttnn.experimental.paged_update_cache(keys, K, update_idxs_tensor=cur_pos_tensor, page_table=page_table)
```
-5. Scaled Dot Product Attention Decode
+5. **Scaled Dot Product Attention Decode**
- Perform scaled dot product attention using custom flash attention kernel optimized for decode mode, `ttnn.transformer.scaled_dot_product_attention_decode` and `ttnn.transformer.paged_scaled_dot_product_attention_decode` for paged KV cache.
- `ttnn.transformer.scaled_dot_product_attention_decode` considers the following arguments:
- `q`: Query tensor of shape `(1, bsz, n_q_heads, head_dim)`.
- `k`: Key tensor of shape `(1, bsz, cache_len, head_dim)`.
- `v`: Value tensor of shape `(1, bsz, cache_len, head_dim)`.
- - `is_causal`: bool, defaults to `true`. Whether to apply causal masking.
+ - `is_causal`: Bool, defaults to `true`. Whether to apply causal masking.
- `attn_mask`: Optional attention mask tensor. Defaults to `None` and only used if `is_causal=False`.
- `cur_pos`: (Required for is_causal=True) List of current positions in the sequence for each batch. Defaults to `None`. Must be provided if `cur_pos_tensor` is not provided.
- `cur_pos_tensor`: (Required for is_causal=True) Optional current position tensor. Defaults to `None`. Must be provided if `cur_pos` is not provided.
@@ -495,7 +491,7 @@ An end-to-end example of the decode attention module is in the `models/demos/lla
attn_output = ttnn.transformer.paged_scaled_dot_product_attention_decode(Q, K, V, attn_mask=mask, is_causal=False)
```
-6. Output Reshape and Output Matmul
+6. **Output Reshape and Output Matmul**
- Finally, use `ttnn.experimental.nlp_concat_heads_decode` to reshape the output of the attention OP, followed by a standard `ttnn.linear` to do the output projection. For example:
```python
attn_output = ttnn.experimental.nlp_concat_heads_decode(attn_output, num_heads=n_q_heads)
@@ -511,8 +507,7 @@ Flash attention and flash decode are the major OPs for attention. They are optim
Here are some useful details regarding attention OPs for efficient and bug-free code writing:
-1. **Program Configs** in flash attention (and flash decode) OPs:
- The Program config has the following parameters:
+1. Program Configs in flash attention (and flash decode) OPs. The program config has the following parameters:
- `compute_with_storage_grid_size`: The grid size.
- `q_chunk_size`: The size of a chunk to process at a time for Q.
- `k_chunk_size`: The size of a chunk to process at a time for K and V.
@@ -525,9 +520,9 @@ Flash decode processes the entire Q (since query in decode mode is small) and K/
Finally, the `exp_approx_mode` field is to set the exponential approximation mode for softmax in flash attention and flash decode. We recommend setting this to `true` for small `seqlen/chunk_size` values. For large `seqlen/chunk_size` values, the error introduced by the exponential approximation can accumulate through chunk accumulation, causing major degradation in pcc. For example in Llama3 models, we use `q_chunk_size` and `k_chunk_size` of 512, and `exp_approx_mode` set to `false` for long sequence lengths greater than 16K.
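A rough sketch of what such a program config can look like is shown below; the field names follow the parameters listed above, while the grid size and chunk values are illustrative assumptions rather than values taken from this guide:

```python
# Hedged sketch: flash attention program config for long-sequence prefill
sdpa_program_config = ttnn.SDPAProgramConfig(
    compute_with_storage_grid_size=(8, 8),  # assumed grid; typically the device's compute grid
    q_chunk_size=512,
    k_chunk_size=512,
    exp_approx_mode=False,  # per the note above, avoid the approximation for very long sequences
)
```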
-2. **Current Position Tensor** for flash decode and kv cache OPs:
+2. Current Position Tensor for flash decode and KV cache OPs:
-In decode mode provide a list of current positions or a tensor. The tensor version can be more efficient because it supports **tracing**. For more information about tracing, see: [4.1 Tracing](#41-tracing). Tracing requires the traced variables to be statically known at the compile time. If you provide a list of current positions, you cannot modify it for the next token generation. However, if you provide a tensor, the position values are stored in device memory and can be updated using binary addition op, e.g. `ttnn.add`.
+In decode mode, provide either a list of current positions or a tensor. The tensor version can be more efficient because it supports **tracing**. For more information about tracing, see: [4.1 Tracing](#41-tracing). Tracing requires the traced variables to be statically known at compile time. If you provide a list of current positions, you cannot modify it for the next token generation. However, if you provide a tensor, the position values are stored in device memory and can be updated using a binary addition OP, e.g. `ttnn.add`.
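For example, a minimal sketch of advancing the position tensor on device between decode iterations (the tensor name is hypothetical):

```python
# Advance every user's position by one token using a binary addition OP
cur_pos_tensor = ttnn.add(cur_pos_tensor, 1)
```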
### 2.5 MLP
@@ -544,7 +539,7 @@ y = FF2(w2_in)
Let's dive into our implementation of MLP, and discuss what makes it performant across different WH systems.
-#### 1. Setup
+#### 2.5.1 Setup
When used in the model by the `TtLlamaDecoder` module class, the MLP class is initialized at the start, where the weights for `w1`, `w2`, and `w3` are loaded and fractured across devices in specific schemes, as outlined in the [Multi-Device](#33-multi-device) section. Specifically, in n300 and T3000 systems the weights are 1D column fractured, and in TG systems the weights are 2D fractured.
```py
@@ -559,7 +554,7 @@ self.feed_forward = TtLlamaMLP(
)
```
-#### 2. Inputs
+#### 2.5.2 Inputs
At runtime, the `forward` function of `TtLlamaMLP` is called with either *'prefill'* or *'decode'* mode, with inputs replicated across devices, for all WH system configurations.
> [!NOTE]
> In the actual model, the input `ff_in` is the output of the `norm` step prior to MLP.
@@ -597,9 +592,7 @@ ff_in_memory_config = ttnn.DRAM_MEMORY_CONFIG
> ff_in = ttnn.reshape(ff_in, [1, seq_len // 1024, 1024, -1])
> ```
-
-
-#### 2. Setting Up Program Configurations For Matmuls
+#### 2.5.3 Setting Up Program Configurations For Matmuls
Depending on the mode of operation, the `forward` function of `TtLlamaMLP` instantiates different program configs for matmuls of FF1/FF3, and FF2.
**Decode mode**
@@ -661,7 +654,6 @@ def matmul_config(
fuse_batch=fuse_batch,
)
-
_, _, m, k = ff_in.shape
n = hidden_dim // num_devices
pc1 = matmul_config(
@@ -674,8 +666,7 @@ pc1 = matmul_config(
)
```
-
-#### 3. FF1/FF3 Matmul
+#### 2.5.4 FF1/FF3 Matmul
The first set of operations in the MLP are:
```py
w1_out = FF1(x)
@@ -712,7 +703,7 @@ w3_out = ttnn.linear(
)
```
-#### 3.1 FF1/FF3 Matmul With 2D Weight Fracturing
+#### 2.5.5 FF1/FF3 Matmul With 2D Weight Fracturing
In the case of TG systems, where we have access to a 2D device mesh, we can leverage 2D weight fracturing. For a weight tensor with shape `[1, 1, K, N]`, using 2D weight fracturing on a `(8, 4)` device mesh, the resulting shape on each device would be: `[1, 1, K / 4, N / 8]`. In other words, the inner dimension (K) of the matmul is spread out across four devices, and to complete the entire matmul operation, a reduction step across the partials is necessary. We do this using an all-reduce operation along the four devices in `cluster_axis=1` of the device mesh.
```py
@@ -734,7 +725,7 @@ In the case of TG systems, where we have access to a 2D device mesh, we can leve
)
```
-#### 4. Multiply + Fused SiLU Activation
+#### 2.5.6 Multiply + Fused SiLU Activation
The output of the FF1/FF3 matmuls are column fractured tensors (the extra all-reduce operation for TG systems ensures this). The next operation is:
```py
@@ -757,7 +748,7 @@ w2_in = ttnn.multiply(
Following our pattern mentioned before, the outputs are L1 sharded in `decode` mode and DRAM interleaved in `prefill` mode.
-#### 5. FF2 Matmul
+#### 2.5.7 FF2 Matmul
The last computation in MLP is:
```py
y = FF2(w2_in)
@@ -784,7 +775,7 @@ if seq_len >= 1024: # Reshape back to intended shape
w2_out = ttnn.reshape(w2_out, [1, 1, seq_len, -1])
```
-###### 5.1 Accumulating the partial outputs of FF2
+###### 2.5.7.1 Accumulating the partial outputs of FF2
Since the output of FF2 is the correct shape but only a partial on each device, the output of the MLP module is required to be fractured where each device has fully accumulated the inner dim of the matmul, but only has a fraction of the outer dim. There are two different ways to handle this, depending on if the WH system has a 1D or 2D device mesh.
@@ -812,14 +803,12 @@ Since the output of FF2 is the correct shape but only a partial on each device,
```
### 2.6 Decoder
-
@@ -929,7 +924,7 @@ for i, split_size in enumerate(split_sizes):
We use DRAM-sharded matmul for LMHead with `program_config` and `memory_config` generated by the code below.
For more information check [Section: Op Configs](#44-op-configs).
-The primary reason for having multiple `program_configs` is that the weight shapes may result in unequal split sizes. This variability means the same configuration cannot be used for every matrix multiplication.
+The primary reason for having multiple `program_configs` is that the weight shapes may result in unequal split sizes. This variability means the same configuration cannot be used for every matmul.
```py
# Generate dram-sharded memory_config
@@ -950,7 +945,7 @@ self.program_configs = [
Once weights are pushed to the devices and the decoders are executed, the `LMHead` forward pass needs to be executed in iterations.
The code below shows that after each iteration outputs are converted from sharded to interleaved tensors. Once all iterations are completed, the final output is produced by concatenation over the last dim and returned as `output`.
-When executing the model, it is essential to ensure that the output of the last decoder is already replicated across tensors. Since this replication is enforced earlier, no additional code is required in the `LMHead` forward pass to handle it.
+When executing the model, you must ensure that the output of the last decoder is already replicated across tensors. Since this replication is enforced earlier, no additional code is required in the `LMHead` forward pass to handle it.
```py
def forward(self, x: ttnn.Tensor):
@@ -972,19 +967,18 @@ def forward(self, x: ttnn.Tensor):
return output
```
-
### 2.8 Model
-Once the model components (discussed in previous sections) are implemented, there isn’t much left to finalize. In our implementation, embeddings are managed outside the model class, as explained in [Section 2.1 Embedding](#21-embedding).
+Once the previous model components are implemented, there isn’t much left to finalize. In our implementation, embeddings are managed outside the model class, as explained in [Section 2.1 Embedding](#21-embedding).
-The model’s constructor initializes N decoders (e.g. 80 for Llama3.1-70b), the `RMSNorm` and the `LMHead`, ensuring that weights for all components are loaded onto the appropriate devices.
+The model’s constructor initializes N decoders (for example, 80 for Llama3.1-70b), the `RMSNorm`, and the `LMHead`, ensuring that weights for all components are loaded onto the appropriate devices.
-During the forward pass, the decoders are executed sequentially, followed by normalization and the `LMHead` computation at the end.
-A specific optimization is applied for the prefill mode: since only the last token is relevant, the `LMHead` is executed only on the final tile in this mode.
+During the forward pass, decoders are executed sequentially, followed by normalization and `LMHead` computation at the end.
+A specific optimization is applied in prefill mode: since only the last token is relevant, the `LMHead` is executed only on the final tile.
In prefill mode, the RMSNorm output is interleaved, but the LMHead requires a sharded tensor. To accommodate this, the `interleaved_to_sharded` function is used to prepare the output accordingly.
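A minimal sketch of that conversion is shown below; the memory config handle is hypothetical and would be built as described in [Section: Op Configs](#44-op-configs):

```python
# Convert the interleaved norm output into the width-sharded layout the LMHead expects
x = ttnn.interleaved_to_sharded(x, lm_head_input_memory_config)
```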
@@ -1026,31 +1020,31 @@ def forward(
## 3. Features
### 3.1 Generative Decoding
-Almost every LLM generates text in the same manner: Given a prompt from the user, the LLM predicts the next token. Then, the LLM takes that new token and uses it as context to predict the following token. This process repeats until the LLM generates a token that indicates the end of the sequence, or until the user decides to stop the generation. The process is called "autoregressive generation" because each new token is used to predict the next token.
+Almost every LLM generates text in the same manner: given a prompt from the user, the LLM predicts the next token. Then, the LLM takes that new token and uses it as context to predict the following token. This process repeats until the LLM generates a token that indicates the end of the sequence, or until the user decides to stop the generation. The process is called "autoregressive generation" because each new token is used to predict the next token.
-#### Model Inputs and Outputs
+#### 3.1.1 Model Inputs and Outputs
Inputs to the model for generative decoding are generally:
-- tokens: produced by the tokenizer
-- position ids: the position of the tokens in the sequence
-- KV cache: an inference optimization that caches intermediate values
+- **Tokens:** Produced by the tokenizer.
+- **Position IDs:** Position of the tokens in the sequence.
+- **KV Cache:** Inference optimization that caches intermediate values.
-In the model, tokens are embedded from the vocabulary space to the embedding space. Position ids are necessary for updating the KV cache and for positional embeddings like RoPE.
+In the model, tokens are embedded from the vocabulary space to the embedding space. Position IDs are necessary for updating the KV cache and for positional embeddings like RoPE.
The model outputs:
-- logits for the next token
-- an updated KV cache
+- Logits for the next token.
+- Updated KV cache.
-The logits are unnormalized probabilities over the vocabulary. Given these probabilities, the sampler must decide which of these tokens in the vocabulary will be chosen. There are a few sampling methods that are commonly used to pick the next token:
-- Greedy decoding (argmax of the logits, picks the most likely next token)
-- Top-p/top-k sampling (restricts the logits according to p and k values, then samples according to the remaining probabilities)
+The logits are unnormalized probabilities over the vocabulary. Given these probabilities, the sampler must decide which token in the vocabulary is chosen next. There are a few sampling methods that are commonly used to pick the next token:
+- **Greedy Decoding:** Argmax of the logits, picks the most likely next token.
+- **Top-p/top-k Sampling:** Restricts the logits according to p and k values, then samples according to the remaining probabilities.
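A host-side sketch of these two samplers in PyTorch is shown below; the vocabulary size and `k` value are hypothetical:

```python
import torch

logits = torch.randn(128256)  # hypothetical logits over a Llama3-sized vocabulary

# Greedy decoding: pick the most likely next token
next_token = torch.argmax(logits, dim=-1)

# Top-k sampling: keep the k most likely tokens, renormalize, then sample
k = 40
topk_vals, topk_idx = torch.topk(logits, k)
probs = torch.softmax(topk_vals, dim=-1)
next_token = topk_idx[torch.multinomial(probs, num_samples=1)]
```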
-#### KV cache
-The KV cache is an inference optimization. It allows us to cache some intermediate values during the first inference step which are reused in later steps.
+#### 3.1.2 KV Cache
+The KV cache is an inference optimization. It allows us to cache intermediate values during the first inference step for reuse in later steps.
On the first inference step, the model processes the full prompt and caches the K and V projections for each layer. Subsequent inference steps compute a Q, K, V projection only for the new token, then use the cached K and V projections in attention. Therefore the first step (prefill) creates the KV cache and subsequent steps (decode) use and update the cache.
-The size of the KV cache depends on the batch size and sequence length. Since accelerators have finite memory, it can be necessary to tradeoff batch size and sequence length to allow the KV cache to fit in memory.
+The size of the KV cache depends on the batch size and sequence length. Since accelerators have finite memory, it is often necessary to trade off batch size and sequence length to allow the KV cache to fit in memory.
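As a back-of-the-envelope sketch, with all values hypothetical (roughly Llama3-70b-like), the cache size can be estimated as:

```python
# Two cached tensors (K and V) per layer, per user, per cached position
n_layers, batch_size, max_seq_len = 80, 32, 8192
n_kv_heads, head_dim, bytes_per_datum = 8, 128, 2  # GQA KV heads, 2-byte datums

kv_cache_bytes = 2 * n_layers * batch_size * max_seq_len * n_kv_heads * head_dim * bytes_per_datum
print(kv_cache_bytes / 1e9)  # ~85.9 GB, which is why batch size and sequence length are traded off
```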
-#### Batching
+#### 3.1.3 Batching
LLMs use batching to process multiple sequences in parallel. There are a few reasons why batching is useful:
- Real-world LLM services need to handle multiple concurrent requests.
- LLM inference is bound by time to read model weights from DRAM. Batching allows model weight reuse across multiple sequences.
@@ -1060,15 +1054,15 @@ However, there are tradeoffs with batching. In decode mode, latency scales subli
It is typical to use different batch sizes for different use cases, depending on the goal of the system.
-#### Performance Metrics
-**Time to first token (TTFT)** measures the latency to generate the first token of the sequence. This is the time to prefill a prompt and generate the first token. It is a measure of interactivity.
-
-**Total throughput (tokens per second)** tells us the total number of tokens that the model can generate per second. `total throughput = batch size / decode step latency`. Total throughput is important for cost-sensitive systems or offline processing, where interactivity is less important than throughput. Generally, increasing batch size will increase total throughput.
+#### 3.1.4 Performance Metrics
+**Time to First Token (TTFT):** Measures the latency to generate the first token of the sequence. This is the time to prefill a prompt and generate the first token. It is a measure of interactivity.
-**User throughput (tokens per second per user)** is calculated as `user throughput = 1 / decode step latency`. User throughput tells us how interactive the model is, and tells us how fast the generation is for a single user. Generally, decreasing batch size will increase user throughput.
+**Total Throughput (Tokens per Second):** The total number of tokens the model can generate per second, computed as `total throughput = batch size / decode step latency`. Total throughput is important for cost-sensitive systems or offline processing, where interactivity is less important than throughput. Generally, increasing batch size will increase total throughput.
-Note that each of these metrics change with batch size and sequence length. When reporting TTFT, total throughput, and user throughput, the batch size and sequence length must be specified.
+**User Throughput (Tokens per Second per User):** Calculated as `user throughput = 1 / decode step latency`. User throughput tells us how interactive the model is and how fast generation is for a single user. Generally, decreasing batch size will increase user throughput.
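As a quick illustration of how these metrics relate (numbers are hypothetical):

```python
batch_size = 32
decode_step_latency = 0.020  # seconds per decode step

total_throughput = batch_size / decode_step_latency  # 1600 tokens/s across all users
user_throughput = 1 / decode_step_latency            # 50 tokens/s for each individual user
```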
+> [!NOTE]
+> Each of these metrics change with batch size and sequence length. When reporting TTFT, total throughput, and user throughput, the batch size and sequence length must be specified.
### 3.2 Prefill and Decode
@@ -1078,15 +1072,15 @@ In our LLM implementations, the prefill phase is done sequentially for each user
The decode phase is parallel-computed for all users, but sequential for each token within a batch of users. Each new token can only be generated after the previous one, as the model must maintain causality in attention computations.
-#### **Technical Implementation Differences**
+#### 3.2.1 Technical Implementation Differences
-The intermediate activations in prefill mode are kept in DRAM, due to the large size of the tensors which contain the entire sequence length. In decode mode, the intermediate activations are kept in L1 memory instead, since in this mode the sequence length to compute is just 1 (one token at the time), reducing latency.
+The intermediate activations in prefill mode are kept in DRAM, due to the large size of the tensors which contain the entire sequence length. In decode mode, the intermediate activations are kept in L1 memory instead, since in this mode the sequence length to compute is just one token at a time, reducing latency.
-##### 1. Reshaping for Large Matrix Multiplications
+##### 3.2.1.1 Reshaping for Large Matrix Multiplications
Please see the [attention source code](../../models/demos/llama3/tt/llama_attention.py) for reference.
-In prefill mode, when the input sequence length is very large, the model reshapes its input tensors to process sequences in smaller chunks in parallel for larger matrix multiplications, such as `wqkv`, `wo` in the attention module, and `w1`, `w2`, `w3` in the MLP module. This reshaping prevents running out of memory in cases of long prefill sequence lengths. For instance:
+In prefill mode, when the input sequence length is very large, the model reshapes its input tensors to process sequences in smaller chunks in parallel for larger matmuls, such as `wqkv`, `wo` in the attention module, and `w1`, `w2`, `w3` in the MLP module. This reshaping prevents running out of memory in cases of long prefill sequence lengths. For example:
```python
if seq_len > 2048:
@@ -1102,9 +1096,9 @@ xqkv_fused = ttnn.linear(
)
```
-This reshaping is not needed for decode mode because it only processes one token at a time. Instead, the parallelization for decode mode is done over user batches, which currently only goes up to 32.
+Reshaping is not needed for decode mode because it only processes one token at a time. Instead, the parallelization for decode mode is done over user batches, which currently only goes up to 32.
-##### 2. KV Cache Management
+##### 3.2.1.2 KV Cache Management
The KV-cache is filled during prefill using the `ttnn.experimental.paged_fill_cache` operation. This supports page tables, which enables the hot-swapping of new users when the full model is deployed.
@@ -1130,8 +1124,8 @@ ttnn.experimental.paged_update_cache(
)
```
-##### 3. Attention Computation
-###### Prefill:
+##### 3.2.1.3 Attention Computation
+###### 3.2.1.3.1 Prefill:
```python
# Split q_heads into num_groups and kv_heads for parallel group computation for grouped query attention (GQA)
q_heads_84SD_8b = ttnn.reshape(
@@ -1149,7 +1143,7 @@ attn_output_84SD = ttnn.transformer.scaled_dot_product_attention(
)
```
-###### Decode:
+###### 3.2.1.3.2 Decode:
```python
# Decode uses cached states instead of recomputing
attn_output_11BH = ttnn.transformer.scaled_dot_product_attention_decode(
@@ -1160,27 +1154,27 @@ attn_output_11BH = ttnn.transformer.scaled_dot_product_attention_decode(
)
```
-##### 4. Slicing Before the LM Head
-At the end of prefill, the model should generate the first decoded token, then signaling the start of the decode phase. To this end, the model slices the output of the last decoder layer to the last tile before computing the LM head. This is necessary because only last token from prefill is needed to start the autoregressive decoding.
+##### 3.2.1.4 Slicing Before the LM Head
+At the end of prefill, the model should generate the first decoded token, then signal the start of the decode phase. To this end, the model slices the output of the last decoder layer to the last tile before computing the LM head. This is necessary because only the last token from prefill is needed to start the autoregressive decoding.
```python
x = ttnn.slice(x, (0, 0, get_last_token, 0), (1, 1, get_last_token + 32, x.shape[-1]))
```
-#### **Prefill vs. Decode: Comparison Summary**
+#### 3.2.2 Prefill vs. Decode: Comparison Summary
| | Prefill Mode | Decode Mode |
| --- | --- | --- |
-| Purpose | Bulk sequence processing for initialization or training | Incremental processing for autoregressive inference |
-| Demo Parallelization | Sequential for each user, parallel for the sequence length of each user | Parallel for 32 users, sequential for each token within a batch of users |
-| Batch and sequence Length | Processes long sequences (≥ 128 tokens), single user | Processes batch of users (≤ 32 users), single token |
-| Memory Use | DRAM, with reshaping into smaller chunks for long sequence lengths | L1 on-chip memory for fast, low-latency processing |
-| Attention | Handles sequences in bulk; more memory-intensive | Incremental attention with precomputed components |
-| LM head slicing | Slices to last tile before Lm head matmul to extract the last token | Slicing not required |
+| **Purpose** | Bulk sequence processing for initialization or training. | Incremental processing for autoregressive inference. |
+| **Demo Parallelization** | Sequential for each user, parallel for the sequence length of each user. | Parallel for 32 users, sequential for each token within a batch of users. |
+| **Batch and Sequence Length** | Processes long sequences (≥ 128 tokens), single user. | Processes batch of users (≤ 32 users), single token. |
+| **Memory Use** | DRAM, with reshaping into smaller chunks for long sequence lengths. | L1 on-chip memory for fast, low-latency processing. |
+| **Attention** | Handles sequences in bulk; more memory-intensive. | Incremental attention with precomputed components. |
+| **LM Head Slicing** | Slices to the last tile before the LM head matmul to extract the last token. | Slicing not required. |
### 3.3 Multi-Device
-Please note that this section refers to sharding schemes across devices and not on a multi-core level. For details about different matmul versions and sharding on a core level, please see the [matmul configuration section](#44-op-configs).
+This section refers to sharding schemes across devices and not on a multi-core level. For details about different matmul versions and sharding on a core level, see the [matmul configuration section](#44-op-configs).
There are two main approaches for scaling across multiple devices: `data parallel` and `tensor parallel`.
@@ -1190,14 +1184,12 @@ In tensor parallel scaling there is _one_ instance of the model executed on mult
There are also hybrid forms of those two modes where a cluster of devices runs multiple independent instances of the model, but each of those model instances uses multiple chips in a tensor parallel fashion.
-In the report [Programming Mesh of Devices with TT-NN](../Programming_Mesh_of_Devices/Programming_Mesh_of_Devices_with_TT-NN.md), there is a good introduction to using TTNN's key concepts for scaling to multiple devices. It shows how to use a single handle for a mesh of devices, and how a tensor can be sharded or replicated to that mesh of devices (tensor parallelism).
+In the report [Programming Mesh of Devices with TT-NN](../Programming_Mesh_of_Devices/Programming_Mesh_of_Devices_with_TT-NN.md), there is a good introduction to using TT-NN's key concepts for scaling to multiple devices. It shows how to use a single handle for a mesh of devices, and how a tensor can be sharded or replicated to that mesh of devices (tensor parallelism).
The tensor handle is used analogously to single device tensors, with the only difference being that all operations on that tensor are then executed in parallel on each device and operate on their respective local chunk of data.
-TT-Metal supports different multi-device topologies. The most important ones for us are `Ring` topology, where all devices are connected in a ring shape with each other, and `Line` topology, where a (sub-)group of devices is connected in a line with each other. `Line` topology can be a 1D or 2D grid of devices, where each row and column are connected in a line.
-
-Below is a summary and example code of the most important concepts for mapping a tensor to a mesh of devices in TTNN:
+TT-Metal supports different multi-device topologies. The most important ones for us are `Ring` topology, where all devices are connected in a ring shape with each other, and `Line` topology, where a subgroup of devices is connected in a line with each other. `Line` topology can be a 1D or 2D grid of devices, where each row and column are connected in a line.
-*Figure: Example usage of mesh_device, ShardTensorToMesh and ReplicateTensorToMesh*
+Below is a summary and example code of the most important concepts for mapping a tensor to a mesh of devices in TT-NN:
```python
import ttnn
@@ -1227,18 +1219,18 @@ mesh_tensor_replicated = ttnn.from_torch(
)
```
-The second key concept to scaling a model to multiple devices are Collective Communication Library (CCL) operations. They are used to efficiently exchange data between multiple devices. TTNN currently supports the following CCL Operations:
+The second key concept to scaling a model to multiple devices are Collective Communication Library (CCL) operations. They are used to efficiently exchange data between multiple devices. TT-NN currently supports the following CCL Operations:
- AllGather
- ReduceScatter
- AllReduce
See the [CCL Developer Guide](../EthernetMultichip/CclDeveloperGuide.md) for more comprehensive coverage about CCL and their implementation details. Our library of supported operations can be found [here](../EthernetMultichip/CclDeveloperGuide.md#op-list-op-list).
-#### AllGather
+#### 3.3.1 AllGather
The AllGather operation collects data from all devices, concatenating each chunk along a specified dimension. The result is stored on each device (replication).
- Supported Topologies: Ring, Linear
-- Supported number of links
+- Supported Number of Links
- N300, T3000: 1
- TG: 4 along cluster_axis=0, 3 along cluster_axis=1
- Arguments
@@ -1248,7 +1240,7 @@ The AllGather operation collects data from all devices, concatenating each chunk
- cluster_axis: cluster axis to gather along
- mesh_device: mesh device the tensor is mapped to
-*Figure: Example usage of Ring All-Gather on 2x4 mesh_device*
+The following is an example of Ring All-Gather on a 2x4 mesh_device:
```py
# Execute All-Gather on the sharded tensor
@@ -1256,7 +1248,7 @@ The AllGather operation collects data from all devices, concatenating each chunk
output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=1)
```
-*Figure: Example usage of Linear All-Gather on 2x4 mesh_device*
+The following is an example of Linear All-Gather on a 2x4 mesh_device:
```py
# Execute All-Gather on the sharded tensor
@@ -1264,19 +1256,19 @@ output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=1)
output_tensor = ttnn.all_gather(mesh_tensor_sharded, dim=3, num_links=2, cluster_axis=1, mesh_device=mesh_device, topology=ttnn.Topology.Linear)
```
-#### ReduceScatter
-The ReduceScatter operation reduces the data across all devices and shards the result of the reduction over a specified dimension across all devices.
+#### 3.3.2 ReduceScatter
+The ReduceScatter operation reduces data across all devices and shards the result of the reduction over a specified dimension across all devices.
-- Supported Topologies: Ring, Linear
-- Supported number of links: 1
-- Arguments
- - mesh_tensor: a tensor mapped to a mesh_device via mesh_mapper
- - dim: the dimension to concatenate
- - cluster_axis: cluster axis to gather along
- - num_links: number of ethernet links to be used
- - topology: topology configuration ttnn.Ring or ttn.Linear
+- **Supported Topologies:** Ring, Linear
+- **Supported Number of Links:** One
+- **Arguments:**
+  - **mesh_tensor:** A tensor mapped to a mesh_device via mesh_mapper
+  - **dim:** The dimension along which the reduced result is sharded
+  - **cluster_axis:** The cluster axis to perform the reduction along
+  - **num_links:** Number of ethernet links to be used
+  - **topology:** Topology configuration, `ttnn.Topology.Ring` or `ttnn.Topology.Linear`
-*Figure: Example usage of Ring Reduce-Scatter on 2x4 mesh_device*
+The following is example usage of Ring Reduce-Scatter on a 2x4 mesh_device:
```py
# Execute Reduce-Scatter on the sharded tensor
@@ -1284,7 +1276,7 @@ The ReduceScatter operation reduces the data across all devices and shards the r
output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1)
```
-*Figure: Example usage of Linear Reduce-Scatter on 2x4 mesh_device*
+The following is example usage of Linear Reduce-Scatter on a 2x4 mesh_device:
```py
# Execute Reduce-Scatter on the sharded tensor
@@ -1292,66 +1284,66 @@ output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1)
output_tensor = ttnn.reduce_scatter(mesh_tensor_sharded, dim=3, num_links=1, cluster_axis=1, mesh_device=mesh_device, topology=ttnn.Topology.Linear)
```
-#### AllReduce
+#### 3.3.3 AllReduce
The AllReduce operation reduces data across all devices and stores the entire tensor on each device (replication). It is performed using an AllGather followed by a ReduceScatter.
A fused version of AllReduce is planned, but currently only the composite of AllGather+ReduceScatter is supported.
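A minimal sketch of building that composite from the two CCL calls shown above is given below, written in the ReduceScatter-then-AllGather order used in the sharding schemes later in this section; the input handle `mesh_tensor_partials` is hypothetical:

```py
# Composite AllReduce: reduce the partials across devices, then gather the reduced shards
reduced_shards = ttnn.reduce_scatter(mesh_tensor_partials, dim=3, num_links=1)
output_tensor = ttnn.all_gather(reduced_shards, dim=3, num_links=1)
```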
-#### Sharding schemes for decode
-In decode mode, activations are generally stored in L1 memory, while weights, which are too large, need to be stored in DRAM. The main bottleneck in decode mode is thereby DRAM bandwidth required to load model weights.
+#### 3.3.4 Sharding schemes for decode
+In decode mode, activations are generally stored in L1 memory, while weights, which are too large, must be stored in DRAM. The main bottleneck in decode mode is therefore the DRAM bandwidth required to load model weights.
-The activations in decode mode are so small because they contain the batch size (=users) in the height dimension while sequence length is 1.
+The activations in decode mode are small because they contain the batch size (=users) in the height dimension while sequence length is one.
The only exception is the attention operations computing `softmax(Q*KˆT)*V`. The activation width is the model dim (e.g. 8192 for Llama3-70b).
Activations are not sharded in the height dimension; however, depending on the operation and model, they may be sharded in the width dimension.
-Matmul weights on the other hand can be sharded in width, height or both. Sharding weights across multiple devices significantly reduces DRAM pressure per device, resulting in notable latency improvements. Below is a summary of useful sharding schemes for sharding weights in decode mode. Which scheme to use will depend on the shape and size of the model weights and the target device topology.
+Matmul weights on the other hand can be sharded in width, height, or both. Sharding weights across multiple devices significantly reduces DRAM pressure per device, resulting in notable latency improvements. Below is a summary of useful sharding schemes for sharding weights in decode mode. Which scheme to use will depend on the shape and size of the model weights and the target device topology.
-##### **1D Column parallel**
+##### 3.3.5 1D Column parallel
-Weights are sharded in width, such that each device contains a horizontal slice of the weights. For this scheme the activations need to be gathered beforehead, i.e. each device processes the whole activation. The result of a column parallel matmul is an activation that is sharded in width. An AllGather operation is used on dim=3 to gather (i.e., replicate) activations.
+Weights are sharded in width, such that each device contains a horizontal slice of the weights. For this scheme the activations must be gathered beforehand, i.e. each device processes the whole activation. The result of a column parallel matmul is an activation that is sharded in width. Use an AllGather operation on dim=3 to gather (i.e., replicate) activations.
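A hedged sketch of this scheme is shown below, reusing the `mesh_device` handle and CCL call signatures from the examples above; the weight `w_torch` and the replicated activation `x` are hypothetical:

```py
# 1D column parallel: shard the weight in width, matmul, then gather the width-fractured output
w_sharded = ttnn.from_torch(
    w_torch,
    dtype=ttnn.bfloat8_b,
    layout=ttnn.TILE_LAYOUT,
    device=mesh_device,
    mesh_mapper=ttnn.ShardTensorToMesh(mesh_device, dim=3),  # each device holds a horizontal slice
)
y_fractured = ttnn.linear(x, w_sharded)                        # width-sharded output per device
y_gathered = ttnn.all_gather(y_fractured, dim=3, num_links=1)  # replicate the full output everywhere
```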
-##### **1D Row parallel**
+##### 3.3.6 1D Row parallel
-Weights are sharded in height, such that each device contains a vertical slice of the weights. For this scheme the activations need to be sharded beforehand, i.e. each device processes a width-shard of the activation. The result of a row parallel matmul are activation partials with the final result's output dimensions, each device containing a partial result. To reduce the activations, i.e. compute the final output, a ReduceScatter operation is used to compute the reduced result across all devices and shard the result along a specified dimension.
-Additionally an AllGather operation is used (ReduceScatter+AllGather = AllReduce) to gather the reduced shards and thus replicate the final output on each device.
+Weights are sharded in height, such that each device contains a vertical slice of the weights. For this scheme the activations must be sharded beforehand, i.e. each device processes a width-shard of the activation. The result of a row parallel matmul is a set of activation partials with the final result's output dimensions, each device containing a partial result. To reduce the activations, i.e. compute the final output, use a ReduceScatter operation to compute the reduced result across all devices and shard the result along a specified dimension.
+Additionally use an AllGather operation (ReduceScatter+AllGather = AllReduce) to gather the reduced shards and thus replicate the final output on each device.
-##### **1D Column parallel followed by row parallel (1D weight sharding) **
+##### 3.3.7 1D Column Parallel Followed by Row Parallel (1D Weight Sharding)
-1D Weight Sharding is a sharding scheme that combines column and row parallel matmuls and can reduce the data volume sent over CCL operation and thus speed up computation. It consists of a column parallel matmul followed by a row parallel matmul. In this scheme the initial activations are gathered, and the column parallel matmul produces width-sharded outputs. The row parallel matmul consumes those sharded activations and produces parial outputs. We need an AllReduce (ReduceScatter+AllGather) operation to compute the final reduced and gathered outputs.
+1D Weight Sharding is a sharding scheme that combines column and row parallel matmuls and can reduce the data volume sent over CCL operations and thus speed up computation. It consists of a column parallel matmul followed by a row parallel matmul. In this scheme the initial activations are gathered, and the column parallel matmul produces width-sharded outputs. The row parallel matmul consumes those sharded activations and produces partial outputs. Use an AllReduce (ReduceScatter+AllGather) operation to compute the final reduced and gathered outputs.
-Optimization potential in this scheme depends highly on the input dimensions to the CCL operations. We can use this scheme for the MLP and any sequence of matmuls that expands and then narrows the output dimension again, becuase it moves the CCL operation to a more beneficial location in the computational graph and thus reduces the CCL data volume.
+Optimization potential in this scheme depends highly on the input dimensions to the CCL operations. Use this scheme for the MLP and any sequence of matmuls that expand and then narrow the output dimension again, because it moves the CCL operation to a more beneficial location in the computational graph and thus reduces the CCL data volume.
-Let's look at the MLP as concrete example: in Llama3-70b we have `FF1` and `FF3` with dimensions `[32, 8k] x [8k, 28k]` and then the `FF2` with dimension `[32, 28k] x [28k, 8k]`.
+Let's look at the MLP as a concrete example: in Llama3-70b we have `FF1` and `FF3` with dimensions `[32, 8k] x [8k, 28k]` and then the `FF2` with dimension `[32, 28k] x [28k, 8k]`.
If we gather after `FF1` and `FF3` we have to gather activations of size `[32, 28k/num_devices] -> [32, 28k]` for each of `FF1` and `FF3`; after the `FF2` we'd need to gather again `[32, 8k/num_devices] -> [32, 8k]`.
If instead, we use the 1D weight sharding scheme and thus move the CCL operation after the `FF2`, we only have to ReduceScatter #num_devices partials of size `[32, 8k] -> [32, 8k/num_devices]` and then optionally AllGather to obtain the `[32, 8k]` gathered outputs.
-##### **2D Weight Sharding**
+##### 3.3.8 2D Weight Sharding
In 2D Weight Sharding on a 2D cluster, weights are sharded both in width and height, such that each device contains a block of the weights.
For this scheme the activations are width-sharded along `cluster_axis=0` and are replicated along `cluster_axis=1`, and the weights are block-sharded. Thus, each device processes a width-shard of the activation, and a block of the weights where the activations are replicated over one axis but the weights are not.
The matmul result will be width-sharded along `cluster_axis=0` and contain partial results along `cluster_axis=1`.
-Typically an AllReduce (ReduceScatter+AllGather) is used to first reduce along `cluster_axis=1` and then gather the shards along `cluster_axis=0`.
+Typically we use an AllReduce (ReduceScatter+AllGather) to first reduce along `cluster_axis=1` and then gather the shards along `cluster_axis=0`.
-##### **Optimal strategy**
+##### 3.3.9 Optimal strategy
-The optimal usage strategy of different parallelisation schemes depends on the specific shapes and model architecture, as well as the target device topology. To select the best parallelisation strategy, the overall data movement for each scheme can be computed; selecting the parallelisation stratgy with the lowest overall data movement will generally result in the best performance.
+The optimal usage strategy of different parallelization schemes depends on the specific shapes and model architecture, as well as the target device topology. To select the best parallelization strategy, the overall data movement for each scheme can be computed; selecting the parallelization strategy with the lowest overall data movement will generally result in the best performance.
-To compute the data movement for a given parallelisation strategy, first the required sequence of parallelisation strategies and corresponding CCL operations is sketched out, and then the resulting dat movement is computed. The following table shows constraints on input and output activations for each parallelisation strategy. A partial activation always has to be reduced (ReduceScatter or AllReduce), while fractured activations may or may not need to be gathered, dependent on the consumer operation. A binary op for example is executed on the fractured activaiton to parallelise computation, while a matmul 1D column parallel operation requires inputs to be gathered in k.
+To compute the data movement for a given parallelization strategy, first sketch out the required sequence of parallelization steps and corresponding CCL operations, and then compute the resulting data movement. The following table shows constraints on input and output activations for each parallelization strategy. A partial activation always has to be reduced (ReduceScatter or AllReduce), while fractured activations may or may not need to be gathered, depending on the consumer operation. A binary OP, for example, is executed on the fractured activation to parallelize computation, while a matmul 1D column parallel operation requires inputs to be gathered in K.
-| Parallelisation strategy | Input activation requirement | Output activation requirement |
+| Parallelization strategy | Input activation requirement | Output activation requirement |
|---------------------------|-----------------|-----------------|
-| 1D Column parallel | Gathered in k | Fractured in k |
-| 1D row parallel | Fractured in k | Partials of full size |
-| 1D column + row parallel | Gathered in k | Partials of full size |
-| 2D parallel | Fractured in k | Partials over one cluster axis |
+| 1D Column Parallel | Gathered in K | Fractured in K |
+| 1D Row Parallel | Fractured in K | Partials of full size |
+| 1D Column + Row Parallel | Gathered in K | Partials of full size |
+| 2D Parallel | Fractured in K | Partials over one cluster axis |
The overall data movement (DM) is then computed using:
@@ -1360,21 +1352,19 @@ The overall data movement (DM) is then computed using:
| AllGather | DM = (K⋅N⋅DF/D)⋅(D−1)⋅D | DM = (K⋅N⋅DF)⋅D⋅log2(D) |
| ReduceScatter | DM = (K⋅N⋅DF)⋅(1-(1/D)) | DM = (K⋅N⋅DF) ⋅ (D-1) / D |
-where K and N are height and width of the weight tensor, DF is the data format multiplyer (number of bytes per datum) and D is the number of devices along the axis that the CCL operation is performed on. Ring topology is more optimised and results in less overall data movement.
-
+Where K and N are the height and width of the weight tensor, DF is the data format multiplier (number of bytes per datum), and D is the number of devices along the axis that the CCL operation is performed on. Ring topology is more optimized and results in less overall data movement.
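To get a feel for the magnitudes involved, the sketch below evaluates the two AllGather expressions and the first ReduceScatter expression from the table for a hypothetical 8192 x 1024 weight in a 2-byte format across 4 devices:

```python
import math

K, N, DF, D = 8192, 1024, 2, 4  # hypothetical weight dims, bytes per datum, devices

dm_allgather_a = (K * N * DF / D) * (D - 1) * D   # ~50.3 MB
dm_allgather_b = (K * N * DF) * D * math.log2(D)  # ~134.2 MB
dm_reducescatter = (K * N * DF) * (1 - 1 / D)     # ~12.6 MB
```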
-
-##### **Examplary parallelisation scheme: Llama3**
+##### 3.3.10 Example parallelization scheme: Llama3
For our [Llama3 family of models](../../models/demos/llama3) we are using the following sharding schemes in our multi-device architectures:
| Matmul | N300 | T3000 | TG |
|-------------------|-----------------|-----------------|-----------------|
-| [_QKV projection_](../../models/demos/llama3/tt/llama_attention.py) | Column parallel | Column parallel | 2D |
-| [_Dense out_](../../models/demos/llama3/tt/llama_attention.py) | Row parallel | Row parallel | 2D |
-| [_FF1_](../../models/demos/llama3/tt/llama_mlp.py) | Column parallel | Column parallel | 2D |
-| [_FF3_](../../models/demos/llama3/tt/llama_mlp.py) | Column parallel | Column parallel | 2D |
-| [_FF2_](../../models/demos/llama3/tt/llama_mlp.py) | Row parallel | Row parallel | 2D |
+| [_QKV projection_](../../models/demos/llama3/tt/llama_attention.py) | Column Parallel | Column Parallel | 2D |
+| [_Dense out_](../../models/demos/llama3/tt/llama_attention.py) | Row Parallel | Row Parallel | 2D |
+| [_FF1_](../../models/demos/llama3/tt/llama_mlp.py) | Column Parallel | Column Parallel | 2D |
+| [_FF3_](../../models/demos/llama3/tt/llama_mlp.py) | Column Parallel | Column Parallel | 2D |
+| [_FF2_](../../models/demos/llama3/tt/llama_mlp.py) | Row Parallel | Row Parallel | 2D |
### 3.4 Continuous Batching
@@ -1382,7 +1372,7 @@ Continuous batching is a serving optimization. To describe continuous batching,
Without continuous batching, an LLM service waits for `batch_size` requests to come in. The service then prefills each request. Then, the service decodes the batched requests token by token. Once all users in the batch finish generation, the service accepts new requests. This is suboptimal because 1) some requests might end generation early, so 2) some slots in the batch are not doing useful computation, while 3) new requests are waiting.
-In contrast, continuous batching allows the service to process new requests as soon as there is a free slot in the batch. The pseudo-code for this algorithm is shown below.
+In contrast, continuous batching allows the service to process new requests as soon as there is a free slot in the batch. The pseudo-code for this algorithm is shown below:
```python
while True:
@@ -1399,27 +1389,27 @@ The above image from anyscale (https://www.anyscale.com/blog/continuous-batching
Continuous batching improves TTFT by reducing wait times for incoming users. It also increases total throughput by keeping the decode batch full of useful work.
-Continuous batching is an LLM serving optimization but it requires some support in the model. The model has to support single user prefill so that when a slot is open, the model can prefill a new request into a specific slot of the batch. The model also has to support batched decode where position ids can be different for each user in the batch, to avoid context contamination.
+Continuous batching is an LLM serving optimization but it requires some support in the model. The model has to support single user prefill so that when a slot is open, the model can prefill a new request into a specific slot of the batch. The model also has to support batched decode where position IDs can be different for each user in the batch, to avoid context contamination.
Implementing continuous batching requires that the serving code track data for each slot of the batch. An example of our continuous batching demo can be found [here](../../models/demos/t3000/llama2_70b/demo/demo_continuous_batching.py). In production deployment, vLLM handles continuous batching for the LLM service.
### 3.5 vLLM Integration
-#### Overview
+#### 3.5.1 Overview
vLLM is an [open-source LLM serving library](https://github.com/vllm-project/vllm). We use vLLM to serve our models in production because of the features it enables. On the serving side, vLLM supports continuous batching and [paged attention](https://arxiv.org/pdf/2309.06180). In addition, vLLM provides an OpenAI-compatible server which is useful for deployment.
Tenstorrent maintains a [fork of vLLM](https://github.com/tenstorrent/vllm/tree/dev) for serving models on Tenstorrent hardware. The [README](https://github.com/tenstorrent/vllm/tree/dev/tt_metal/README.md) has instructions for setting up the environment.
-#### Implementation Requirements
+#### 3.5.2 Implementation Requirements
In order to add vLLM support to a new model, the model must conform to a certain interface. An example of the interface is the [Llama2-70b generation code](../../models/demos/t3000/llama2_70b/tt/llama_generation.py), which implements `prefill_forward`, `decode_forward`, and `initialize_vllm_model`.
Beyond implementing the functionality needed for continuous batching, a model must also implement paged attention. For an example, see [Llama2-70b attention](../../models/demos/t3000/llama2_70b/tt/llama_attention_optimized.py).
-#### vLLM modifications
+#### 3.5.3 vLLM modifications
On the vLLM side there may be additional changes needed to support the new model.
- Modify [`tt_loader.py`](https://github.com/tenstorrent/vllm/blob/dev/vllm/model_executor/model_loader/tt_loader.py) if the model requires a different initialization.
- Modify [`tt_model_runner.py`](https://github.com/tenstorrent/vllm/blob/dev/vllm/worker/tt_model_runner.py) if it is missing functionality for the new model.
-#### Testing
+#### 3.5.4 Testing
Finally, test the new model through vLLM. Register the new model as seen in [`offline_inference_tt.py`](https://github.com/tenstorrent/vllm/blob/dev/examples/offline_inference_tt.py).
```python
@@ -1476,7 +1466,7 @@ output = ttnn.linear(
When you don't pass memory configs or program configs the operation will choose default values. These defaults are often sub-optimal. `memory_config` typically defaults to a DRAM interleaved configuration, while `program_config` defaults to something reasonable but still sub-optimal.
See [Matrix Engine](../matrix_engine/matrix_engine.md) for background on `compute_kernel_config`.
-#### Memory Configs
+#### 4.4.1 Memory Configs
For the LLM context, memory configs are not as important in prefill mode, where activations are large (due to the long sequence lengths) and thus should generally be DRAM interleaved (otherwise wouldn't fit on L1). In prefill mode, each op should consume DRAM interleaved inputs and produce DRAM interleaved output(s).
Memory configs are most important in decode mode. For some operation like `ttnn.matmul`, both the activation and the output will be sharded according to their memory configs. Decode mode activations are of shape `[batch_size, hidden_size]` and should be width-sharded in L1 (sharding the `hidden_size` dimension). By keeping activations and outputs width-sharded in L1 we reduce DRAM traffic and get better performance. The Llama3 codebase has examples of how to create a width-sharded memory config (see [Llama3 model config](../../models/demos/llama3/tt/model_config.py)).
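One way to build such a config is with `ttnn.create_sharded_memory_config`; the core grid and tensor shape below are illustrative assumptions, not values from the Llama3 config:

```python
# Width-shard a [32, 8192] decode activation across an 8x8 core grid in L1
sharded_memory_config = ttnn.create_sharded_memory_config(
    shape=(32, 8192),  # logical tensor shape; each core gets a [32, 128] width shard
    core_grid=ttnn.CoreGrid(y=8, x=8),
    strategy=ttnn.ShardStrategy.WIDTH,
    orientation=ttnn.ShardOrientation.ROW_MAJOR,
)
```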
@@ -1504,12 +1494,12 @@ As always, you should try running your `ttnn` op in a unit test with whichever s
Be careful when your memory config creates shards that require padding (i.e., the shard shape does not divide evenly into 32x32 tiles). Padded shards and padded ops are under active development and can be sources of bugs. When your memory config requires padding, you probably want to instead find a core grid which divides evenly into the tensor shape.
-#### Program Configs and Picking the Right Matmul
+#### 4.4.2 Program Configs and Picking the Right Matmul
Each `ttnn` operation has its own program config class. In general, program configs set hyperparameters that affect an op's functionality and performance. There are too many ops and program configs to cover in detail, so we will focus on `ttnn.matmul`, since it has multiple variants and usually requires the most care.
Picking a matmul variant is a key decision in optimizing a model. The choice depends on the shapes of the inputs and outputs and how the matmul fits into the rest of the model. You choose a variant by providing a specific `program_config` to `ttnn.matmul`. The following presents three matmul variants that are commonly used in LLMs.
-##### Matmul 2D
+##### 4.4.3 Matmul 2D
Matmul 2D gets its name because it parallelizes an `(M x K) @ (K x N)` matmul over the M and N dimensions. It is useful to have this 2D parallelization when M and N are large (usually >= 256). Rule of thumb: use matmul 2D for all matmuls in prefill mode. Generally, inputs and output to matmul 2D will be interleaved in DRAM because these matmuls should be compute bound rather than memory bound and the inputs may be too large to fit in L1. NOTE: the weights can be DRAM sharded and still work with matmul 2D.
The following is a description of the program config for matmul 2D.
@@ -1557,7 +1547,7 @@ fuse_batch=False,
Since we use matmul 2D for large matmuls, the kernel may run out of L1 space just to store intermediate values. When this happens, try reducing `in0_block_w`, `out_subblock_h`, and `out_subblock_w`.
-##### DRAM-Sharded Matmul
+##### 4.4.4 DRAM-Sharded Matmul
DRAM-Sharded matmul should be used in decode mode, where activations are small and DRAM-bandwidth to read weights is the limiting factor in op performance. This matmul gets its name because rather than having weights interleaved in DRAM, they are sharded across DRAM banks to optimally collocate weights with compute. See the [DRAM-Sharded Matmul](../Saturating_DRAM_bandwidth/Saturating_DRAM_bandwidth.md) writeup for details on the implementation.
We use DRAM-Sharded matmul for all matmuls in decode mode. The activation and output are width-sharded in L1, and the weights are width-sharded in DRAM.
@@ -1595,7 +1585,7 @@ output = ttnn.linear(
Be careful that the core grid evenly divides both the activations and the output. Padding functionality is not yet implemented for DRAM-Sharded matmuls.
-#### Matmul 1D
+#### 4.4.5 Matmul 1D
Matmul 1D is the final variant to cover. Before ttnn implemented DRAM-Sharded matmul, this was the matmul of choice for decode mode. Now that DRAM-Sharded matmul exists and is much faster, matmul 1D is less often used.
Matmul 1D gets its name because it only parallelizes over the N dimension. The activation and output(s) should be width-sharded in L1. Weights should be DRAM interleaved.
@@ -1623,7 +1613,7 @@ When creating a matmul 1D program config, maximize the `in0_block_w` and `out_su
While we work on maximizing the performance of large language models on Tenstorrent hardware, we must also ensure that the models are functionally correct and produce outputs of the expected quality. The subsections below describe our methods for evaluating the accuracy (also referred to as functionality or correctness for our purposes) of a given model and how to debug accuracy issues.
-#### Accuracy Testing
+#### 4.5.1 Accuracy Testing
Below is a list of metrics that are used when evaluating accuracy:
- **Pearson Correlation Coefficient (PCC)**: A measure of the linear relationship between two variables, where a PCC of 1 indicates a perfect positive correlation and a PCC of 0 indicates no linear correlation (a minimal computation sketch is shown after this list).
@@ -1637,7 +1627,7 @@ In order to thoroughly test the accuracy of a model, a bottom up approach is tak
- **Model-level unit tests**: In addition to the sub-module unit tests, there should also be unit tests for a full layer of the model with all sub-modules, and for the full model comprising all layers. For example, the [llama3 model test](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/llama3/tests/test_llama_model.py) runs one or more layers of the model over multiple iterations and checks the PCC against the reference model. A rule of thumb is that the full model PCC should be approximately 0.99.
- **Dataset evaluation**: Once a model has been brought up with sufficient accuracy on the smaller unit tests, it should be tested on a larger set of prompts such as a full dataset or a subset of it. For example, the [Falcon7b perplexity test](https://github.com/tenstorrent/tt-metal/blob/main/models/demos/falcon7b_common/tests/perplexity/test_perplexity_falcon.py) loads a subset of the [WikiText dataset](https://huggingface.co/datasets/Salesforce/wikitext) and computes several metrics (including perplexity and top-1/5 accuracy) for evaluating the TT model with respect to the ground truth from the dataset. The results of these metrics should be comparable (e.g. within a couple percentage points of difference) to those obtained from running the evaluation with the reference model on CPU / GPU.
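Below is a minimal sketch of the PCC metric referenced in the list above, using `torch.corrcoef`. This is an illustrative helper, not the exact utility used in the test files.
```python
import torch


def pearson_correlation(expected: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both tensors and compute the Pearson correlation coefficient
    # between the reference (CPU/GPU) output and the TT output.
    x = expected.flatten().float()
    y = actual.flatten().float()
    return torch.corrcoef(torch.stack([x, y]))[0, 1].item()


# Usage: pcc = pearson_correlation(reference_logits, tt_logits)  # expect ~0.99 for a full model
```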
-#### Debugging Accuracy
+#### 4.5.2 Debugging Accuracy
If, during model bringup or optimization, the model outputs do not seem reasonable or any of the evaluations above fail, take the following steps to debug accuracy:
1. Locate the smallest module test that is failing. The fewer operations that could be causing the issue, the easier it is to find the root cause. In most cases, the issue can be found with a 1-layer or submodule test.
@@ -1658,7 +1648,7 @@ In some cases, it may be possible that the issue is not with the model and that
### 4.6 Performance Analysis
-ttnn performance has five components:
+TT-NN performance has five components:
![Performance components overview](images/4.6-overview.png)
@@ -1670,9 +1660,10 @@ ttnn performance has five components:
Each of these components is described in more detail below. It is important to confirm that tracing has been enabled; see [4.1 Tracing](#41-tracing) for details. Tracing should be used for decode mode but not for prefill mode.
-**This means that for decode mode you won’t have to worry about 1-3 but for prefill mode you will.**
+> [!NOTE]
+> This means that for decode mode you won’t have to worry about 1-3 but for prefill mode you will.
-#### 1. Main Python Thread
+#### 4.6.1 Main Python Thread
The main Python thread only matters if you are not tracing; if you are using tracing, it is not important. The Metal Profiler/Tracy can also show Python performance, but for pure Python analysis [viztracer](https://github.com/gaogaotiantian/viztracer) is the recommended tool:
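A minimal sketch using viztracer's Python API is shown below (the traced function is a placeholder for your forward pass, and this is not the exact snippet from the report):
```python
from viztracer import VizTracer


def forward_pass():
    # Placeholder for your model's decode step; replace with the real call.
    sum(range(10_000))


# Trace only the region of interest and write a JSON trace (open it with `vizviewer`).
with VizTracer(output_file="decode_trace.json"):
    for _ in range(32):
        forward_pass()
```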
@@ -1704,11 +1695,11 @@ Top tips:
* Generate shard spec and compute kernel config objects once (e.g. in a constructor) instead of recreating them every time you run the forward pass. Keep the forward pass clean.
* Make sure Metal is compiled in Release mode (default) and you are using ttnn’s async mode (see above).
-#### 2. Host API
+#### 4.6.2 Host API
Any overhead here is outside your control and in our experience is minimal. Use a C++ profiler or [Metal Profiler/Tracy](https://github.com/tenstorrent/tt-metal/blob/main/tech_reports/MetalProfiler/metal-profiler.md) with host stack traces enabled to see this time.
-#### 3. Host-device communications
+#### 4.6.3 Host-device communications
As little communication as possible between the host and the device is preferred. For LLMs this means:
@@ -1734,7 +1725,7 @@ Looking at host-device communications in a python profiler like `viztracer` is p
If you want to measure calls this way, turn async mode off. The time your main Python thread spends in `to_torch` will not include any time spent waiting for the device and will be a closer approximation of the measures above.
-#### 4+5. Device dispatch and op performance
+#### 4.6.4 Device dispatch and OP performance
This is the fun bit, but we need to do a little prep to get started. First, metal must be compiled with `-p` to enable device profiling:
@@ -1762,7 +1753,7 @@ python models/perf/perf_report.py OPS_CSV_FILE
For device performance we recommend looking at a single layer. You can do this by using `--id-range` or by changing your test to run only a single layer of the model. For more information, including how to select specific ranges of OPs, see the [Performance Report Analysis Tool](https://github.com/tenstorrent/tt-metal/tree/main/models/perf).
-##### What makes a good performance test?
+**What makes a good performance test?**
Ideally you should run your model as close to its end-user form as possible, while simplifying it where you can. In practice this means:
@@ -1771,7 +1762,7 @@ Ideally you should run your model in as close to end-user form as possible, simp
* Run a single layer of the model - but be aware of which OPs are run for every layer and which ones are only run at the start and end (e.g. embedding, final norm and LM head).
* Add a tracy signpost, e.g. `tracy.signpost("Performance pass")`, before the part you want to record - `perf_report.py` focuses on this region by default, saving you some work (see the sketch after this list).
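Here is a minimal sketch of the signpost tip above, assuming the `tracy` module is importable as used in the report; the iteration counts and the single-layer function are placeholder assumptions.
```python
import tracy


def run_single_layer():
    # Placeholder for a single decoder-layer forward pass on device.
    pass


# Warmup / compile iterations that we do not want in the report.
for _ in range(2):
    run_single_layer()

tracy.signpost("Performance pass")  # perf_report.py focuses on the region after this signpost

# Measured iterations.
for _ in range(5):
    run_single_layer()
```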
-##### What does such a report look like?
+**What does such a report look like?**
Here is an example without tracing enabled. You can instantly see that more time (756us) is spent in between OPs (op-to-op gap) than running OPs on device (362us)!
@@ -1799,7 +1790,7 @@ There are many individual tips, let’s start with overall advice:
The perfect OP runs on the entire core grid using sharded inputs from L1. Let's look at data movement first, then at specific tips.
-#### Data movement
+#### 4.7.1 Data movement
OPs can read data from:
@@ -1816,7 +1807,7 @@ Activations are placed in L1 and weights placed in DRAM.
See the [op config section](#44-op-configs) for more details on writing shard specs in your code.
-#### Specific tips
+#### 4.7.2 Specific tips
Situation: OPs are reading from the fastest memory they can, sharded if possible. What might still make things slow?
@@ -1863,7 +1854,7 @@ self.compute_kernel_config_hifi2 = ttnn.WormholeComputeKernelConfig(
As always, do not recreate these every single forward pass if you want your python thread to be fast (which you do).
### 4.8 Module Tests
-#### Llama3 Module and Test Differences
+#### 4.8.1 Llama3 Module and Test Differences
In our current Llama3 model, the attention module class (`TtLlamaAttention`) implements two primary methods for attention computation: `forward_prefill` and `forward_decode`.
To test these, we provide two separate attention test files, `test_attention_decode` and `test_attention_prefill`, which create the appropriate input tensors:
diff --git a/tests/nightly/tg/ccl/test_all_gather_async_nightly.py b/tests/nightly/tg/ccl/test_all_gather_async_nightly.py
new file mode 120000
index 00000000000..f342d96f5be
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_all_gather_async_nightly.py
@@ -0,0 +1 @@
+../../../ttnn/unit_tests/operations/ccl/test_all_gather_async_TG_nightly.py
\ No newline at end of file
diff --git a/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py b/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py
new file mode 120000
index 00000000000..2187a4cc4fb
--- /dev/null
+++ b/tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py
@@ -0,0 +1 @@
+../../../ttnn/unit_tests/operations/ccl/test_reduce_scatter_async_TG_nightly.py
\ No newline at end of file
diff --git a/tests/scripts/tg/run_tg_nightly_tests.sh b/tests/scripts/tg/run_tg_nightly_tests.sh
index 89e5c253c7c..d3f23a6a50c 100755
--- a/tests/scripts/tg/run_tg_nightly_tests.sh
+++ b/tests/scripts/tg/run_tg_nightly_tests.sh
@@ -7,8 +7,7 @@ run_tg_llama3_70b_tests() {
echo "LOG_METAL: Running run_tg_llama3_70b_tests"
- pytest tests/nightly/tg/ccl/test_all_gather_nightly.py ; fail+=$?
- pytest tests/nightly/tg/ccl/test_reduce_scatter_nightly.py ; fail+=$?
+ pytest -n auto tests/nightly/tg/ccl --timeout=180 ; fail+=$?
# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
index 0b6bb8b0fff..c482877e082 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_and/logical_and_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
index f32030aae75..826f3a52682 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_or/logical_or_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py b/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
index 59e5cbf0572..cdb9e1c4473 100644
--- a/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/binary/logical_xor/logical_xor_sharded.py
@@ -10,7 +10,11 @@
import random
import ttnn
from tests.sweep_framework.sweep_utils.utils import gen_shapes, tensor_to_dtype, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import assert_equal, check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -74,6 +75,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -94,6 +96,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py b/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
index 9eb1b8d2254..6bc3cde251e 100644
--- a/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/addcdiv/addcdiv_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -99,6 +101,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py b/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
index 309c4466057..45ab2f8f4e0 100644
--- a/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/ternary/addcmul/addcmul_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -99,6 +101,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
index 65f83fb5545..08e6e827a8a 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/ceil/ceil_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -90,6 +92,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py
new file mode 100644
index 00000000000..55169b42e1b
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=0.0001, high=100, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.digamma)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.digamma(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py
new file mode 100644
index 00000000000..35273e221d0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/digamma/digamma_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=20 * 1024, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=0.0001, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.digamma)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.digamma(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py
new file mode 100644
index 00000000000..cc03eb013bf
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/elu/elu_sharded.py
@@ -0,0 +1,115 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(6, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "alpha": [-0.5, 0, 0.5],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ alpha,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ torch_output_tensor = torch.nn.functional.elu(torch_input_tensor_a, alpha=alpha)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.elu(input_tensor_a, alpha, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py
new file mode 100644
index 00000000000..e1bd10f6f75
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz.py
@@ -0,0 +1,68 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random_with_zeros
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random_with_zeros, low=-10, high=10, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.eqz)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
+ start_time = start_measuring_time()
+ result = ttnn.eqz(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py
new file mode 100644
index 00000000000..0c52d028425
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/eqz/eqz_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random_with_zeros
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random_with_zeros, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.eqz)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.eqz(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py
new file mode 100644
index 00000000000..e1fed6bc25a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/erf/erf_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.erf)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.erf(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py
new file mode 100644
index 00000000000..88880a65e92
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/leaky_relu/leaky_relu_sharded.py
@@ -0,0 +1,116 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(6, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ "negative_slope": [-0.5, 0, 0.01, 0.5],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# This is the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ negative_slope,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.leaky_relu)
+ torch_output_tensor = golden_function(torch_input_tensor_a, negative_slope=negative_slope)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.leaky_relu(input_tensor_a, negative_slope, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py
new file mode 100644
index 00000000000..2c4ffbf69c4
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/lez/lez_sharded.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "suite_1" and "suite_2") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ print(
+ f"{input_shape} {core_grid} {sharding_strategy} {shard_orientation} {tensor_hw_as_shard_shape} {input_a_dtype} {input_layout} {shard_height_mul_of_32}"
+ )
+
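+    # Build the sharded memory config described by the parsed spec; the same config is
+    # reused for the op output, so the result stays sharded on the same core grid.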
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.lez)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
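+    # Only the device op is inside the measured window; readback to torch happens afterwards.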
+ start_time = start_measuring_time()
+ output_tensor = ttnn.lez(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ print(pcc)
+ return [pcc, e2e_perf]
+
+
+# # Run sweeps locally
+# from tests.sweep_framework.framework.permutations import *
+
+# start_time = start_measuring_time()
+# for suite in parameters.keys():
+# device_id = 0
+# device = ttnn.open_device(device_id=device_id)
+# suite_vectors = list(permutations(parameters[suite]))
+# print(len(suite_vectors))
+# for vector in suite_vectors:
+# invalidate_res = invalidate_vector(vector)
+# if invalidate_res[0]:
+# print(f"Invalidated: {invalidate_res[1]}")
+# continue
+# try:
+# passed, _ = run(**vector, device=device)
+# # if passed[0] != True:
+# # print(passed)
+# except Exception as e:
+# print(e)
+
+# ttnn.close_device(device)
+
+# e2e_perf = stop_measuring_time(start_time)
+# print(f"time {e2e_perf / 1000000000}s")
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
index acc77eba40a..75dcb176d1c 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rdiv/rdiv_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -91,6 +93,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py
new file mode 100644
index 00000000000..16404f972a1
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import torch
+import ttnn
+from tests.sweep_framework.sweep_utils.utils import gen_shapes
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_shape": gen_shapes([1, 1, 1, 1], [6, 12, 256, 256], [1, 1, 1, 1], 32)
+ + gen_shapes([1, 1, 1], [12, 256, 256], [1, 1, 1], 32)
+ + gen_shapes([1, 1], [256, 256], [1, 1], 32),
+ "input_dtype": [ttnn.bfloat16],
+ "input_layout": [ttnn.TILE_LAYOUT],
+ "input_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
+ },
+}
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a device_mesh_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_shape,
+ input_dtype,
+ input_layout,
+ input_memory_config,
+ output_memory_config,
+ *,
+ device,
+) -> list:
+ torch.manual_seed(0)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.reciprocal)
+ torch_output_tensor = golden_function(torch_input_tensor)
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=input_memory_config,
+ )
+
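+    # Note: the readback (ttnn.to_torch) is inside the measured window here, so e2e_perf
+    # covers device execution plus host transfer.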
+ start_time = start_measuring_time()
+ result = ttnn.reciprocal(input_tensor, memory_config=output_memory_config)
+ output_tensor = ttnn.to_torch(result)
+ e2e_perf = stop_measuring_time(start_time)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py
new file mode 100644
index 00000000000..137ff5755f9
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/reciprocal/reciprocal_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
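+    # Generate a random input and compute the reference output via the golden (torch)
+    # implementation registered for ttnn.reciprocal.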
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.reciprocal)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.reciprocal(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
index e6ff531b973..f7aec1d0ee4 100644
--- a/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
+++ b/tests/sweep_framework/sweeps/eltwise/unary/rsub/rsub_sharded.py
@@ -11,7 +11,11 @@
import ttnn
import math
from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
-from tests.sweep_framework.sweep_utils.sharding_utils import gen_sharded_spec_unary, parse_sharding_spec
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
@@ -39,16 +43,13 @@
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
- input_shape, X, Y, sharding_strategy, _, _, input_layout = test_vector["input_spec"].values()
- pre_sharded_height = math.prod(input_shape[:-1])
- pre_sharded_width = input_shape[-1]
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
if input_layout == "ROW_MAJOR_LAYOUT":
- return True, "Input to eltwise binary must be tilized"
-
- if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
- return True, "bfloat8_b is only supported on tiled layout"
-
+ return True, "Inputs to eltwise binary must be tilized"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
return False, None
@@ -72,6 +73,7 @@ def run(
shard_orientation,
tensor_hw_as_shard_shape,
input_layout,
+ shard_height_mul_of_32,
) = parse_sharding_spec(input_spec)
if input_layout == ttnn.ROW_MAJOR_LAYOUT:
@@ -91,6 +93,7 @@ def run(
strategy=sharding_strategy,
orientation=shard_orientation,
use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
)
input_tensor_a = ttnn.from_torch(
diff --git a/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py
new file mode 100644
index 00000000000..aa8402d106e
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary/sin/sin_sharded.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
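+    # Inputs span [-100, 100], covering many periods of sine; accuracy is judged with a 0.999 PCC threshold.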
+ torch_input_tensor_a = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+ golden_function = ttnn.get_golden_function(ttnn.sin)
+ torch_output_tensor = golden_function(torch_input_tensor_a)
+
+ input_tensor_a = ttnn.from_torch(
+ torch_input_tensor_a,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.sin(input_tensor_a, memory_config=sharded_config)
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py
new file mode 100644
index 00000000000..2ecdfcbe108
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/acos_bw/acos_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
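+    # Backward sweeps build two tensors: an incoming gradient and a forward input
+    # (with requires_grad set) so the golden function can produce the reference input gradient.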
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.acos_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.acos_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py
new file mode 100644
index 00000000000..fddb3d4360a
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/add_bw/add_bw_sharded.py
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+
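+    # Random scalar addend drawn in bfloat16; add_bw computes the gradient w.r.t. the tensor input.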
+ scalar = torch.tensor(1, dtype=torch.bfloat16).uniform_(-100, 100).item()
+ golden_function = ttnn.get_golden_function(ttnn.add_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor, scalar)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.add_bw(grad_tensor, input_tensor, scalar, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py
new file mode 100644
index 00000000000..f45031b69c0
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/cos_bw/cos_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.cos_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
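+    # ttnn backward ops return a list of gradient tensors; [0] selects the gradient w.r.t. the input.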
+ start_time = start_measuring_time()
+ output_tensor = ttnn.cos_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py
new file mode 100644
index 00000000000..c10f4c90c45
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/fill_bw/fill_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.fill_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)
+
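+    # The golden for fill_bw returns a list of gradients; the PCC check below indexes [0]
+    # to match the single tensor produced by the device op.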
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.fill_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor[0], output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py
new file mode 100644
index 00000000000..89fd978b802
--- /dev/null
+++ b/tests/sweep_framework/sweeps/eltwise/unary_backward/hardsigmoid_bw/hardsigmoid_bw_sharded.py
@@ -0,0 +1,128 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from functools import partial
+
+import json
+import torch
+import random
+import ttnn
+import math
+from tests.sweep_framework.sweep_utils.utils import gen_shapes, sanitize_shape_rm
+from tests.sweep_framework.sweep_utils.sharding_utils import (
+ gen_sharded_spec_unary,
+ parse_sharding_spec,
+ invalidate_vector_sharding,
+)
+from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
+
+from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
+from models.utility_functions import torch_random
+
+# Override the default timeout in seconds for hang detection.
+TIMEOUT = 120
+
+random.seed(0)
+
+
+# Parameters provided to the test vector generator are defined here.
+# They are defined as dict-type suites that contain the arguments to the run function as keys, and lists of possible inputs as values.
+# Each suite has a key name (in this case "nightly") which will associate the test vectors to this specific suite of inputs.
+# Developers can create their own generator functions and pass them to the parameters as inputs.
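+# Unlike the other sharded sweeps here, this one caps max_tensor_size_per_core (16 * 1024),
+# presumably to keep hardsigmoid_bw shard sizes small enough to finish within TIMEOUT.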
+parameters = {
+ "nightly": {
+ "input_spec": gen_sharded_spec_unary(16, max_tensor_size_per_core=16 * 1024, layouts=["TILE_LAYOUT"]),
+ "input_a_dtype": [ttnn.bfloat16],
+ },
+}
+
+
+# Invalidate vector is called during the generation phase where each vector will be passed in.
+# If invalidated, the vector will still be stored but will be skipped.
+# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
+def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
+ input_layout = test_vector["input_spec"]["input_layout"]
+ sharding_invalidated, output_str = invalidate_vector_sharding(test_vector["input_spec"])
+
+ if input_layout == "ROW_MAJOR_LAYOUT":
+ return True, "Input to eltwise binary must be tilized"
+ if input_layout == "ROW_MAJOR_LAYOUT" and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
+ return True, "bfloat8_b is only supported on tiled layout"
+ if sharding_invalidated:
+ return sharding_invalidated, output_str
+
+ return False, None
+
+
+# These are the run instructions for the test, defined by the developer.
+# The run function must take the above-defined parameters as inputs.
+# The runner will call this run function with each test vector, and the returned results from this function will be stored.
+# If you defined a mesh_device_fixture above, the object you yielded will be passed into this function as 'device'. Otherwise, it will be the default ttnn device opened by the infra.
+def run(
+ input_spec,
+ input_a_dtype,
+ *,
+ device,
+) -> list:
+ data_seed = random.randint(0, 20000000)
+ torch.manual_seed(data_seed)
+
+ (
+ input_shape,
+ core_grid,
+ sharding_strategy,
+ shard_orientation,
+ tensor_hw_as_shard_shape,
+ input_layout,
+ shard_height_mul_of_32,
+ ) = parse_sharding_spec(input_spec)
+
+ if input_layout == ttnn.ROW_MAJOR_LAYOUT:
+ input_shape = sanitize_shape_rm(input_shape)
+
+ sharded_config = ttnn.create_sharded_memory_config_(
+ shape=input_shape,
+ core_grid=core_grid,
+ strategy=sharding_strategy,
+ orientation=shard_orientation,
+ use_height_and_width_as_shard_shape=tensor_hw_as_shard_shape,
+ tile_layout=shard_height_mul_of_32,
+ )
+
+ torch_grad_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor = gen_func_with_cast_tt(
+ partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
+ )(input_shape)
+
+ torch_input_tensor.requires_grad = True
+ golden_function = ttnn.get_golden_function(ttnn.hardsigmoid_bw)
+ torch_output_tensor = golden_function(torch_grad_tensor, torch_input_tensor)[0]
+
+ grad_tensor = ttnn.from_torch(
+ torch_grad_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ input_tensor = ttnn.from_torch(
+ torch_input_tensor,
+ dtype=input_a_dtype,
+ layout=input_layout,
+ device=device,
+ memory_config=sharded_config,
+ )
+
+ start_time = start_measuring_time()
+ output_tensor = ttnn.hardsigmoid_bw(grad_tensor, input_tensor, memory_config=sharded_config)[0]
+ e2e_perf = stop_measuring_time(start_time)
+ output_tensor = ttnn.to_torch(output_tensor)
+
+ pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
+ return [pcc, e2e_perf]
diff --git a/tests/tt_eager/integration_tests/test_bert.cpp b/tests/tt_eager/integration_tests/test_bert.cpp
index 60728c57d08..f6d72c7c85d 100644
--- a/tests/tt_eager/integration_tests/test_bert.cpp
+++ b/tests/tt_eager/integration_tests/test_bert.cpp
@@ -8,8 +8,8 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operation.hpp"
#include "ttnn/operations/normalization/softmax/softmax.hpp"
-#include "tt_metal/common/constants.hpp"
-#include "tt_metal/host_api.hpp"
+#include
+#include
#include "ttnn/operations/functions.hpp"
#include "ttnn/operations/matmul/matmul.hpp"
#include "ttnn/operations/normalization/layernorm/layernorm.hpp"
diff --git a/tests/tt_eager/ops/test_average_pool.cpp b/tests/tt_eager/ops/test_average_pool.cpp
index dbf1a1beca8..0608e998dfb 100644
--- a/tests/tt_eager/ops/test_average_pool.cpp
+++ b/tests/tt_eager/ops/test_average_pool.cpp
@@ -7,7 +7,7 @@
#include "ttnn/operations/functions.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "common/constants.hpp"
+#include
using tt::tt_metal::DataType;
using tt::tt_metal::IDevice;
diff --git a/tests/tt_eager/ops/test_bcast_op.cpp b/tests/tt_eager/ops/test_bcast_op.cpp
index 3a331d8e226..dcef94c93a7 100644
--- a/tests/tt_eager/ops/test_bcast_op.cpp
+++ b/tests/tt_eager/ops/test_bcast_op.cpp
@@ -2,11 +2,11 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/bcast/bcast.hpp"
-#include "common/constants.hpp"
+#include
#include
#include
diff --git a/tests/tt_eager/ops/test_bmm_op.cpp b/tests/tt_eager/ops/test_bmm_op.cpp
index 286ca727baa..8e73545f435 100644
--- a/tests/tt_eager/ops/test_bmm_op.cpp
+++ b/tests/tt_eager/ops/test_bmm_op.cpp
@@ -2,12 +2,12 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/types.hpp"
#include "ttnn/operations/matmul/device/matmul_op.hpp"
-#include "common/constants.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp b/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
index a79ec56ef08..a2ea8ff96d2 100644
--- a/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
+++ b/tests/tt_eager/ops/test_conv_prepare_weights_and_biases.cpp
@@ -2,8 +2,8 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/assert.hpp"
-#include "common/bfloat16.hpp"
+#include
+#include
#include "ttnn/cpp/ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/cpp/ttnn/tensor/types.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
diff --git a/tests/tt_eager/ops/test_eltwise_binary_op.cpp b/tests/tt_eager/ops/test_eltwise_binary_op.cpp
index 5b62dc878e9..e5251876b0a 100644
--- a/tests/tt_eager/ops/test_eltwise_binary_op.cpp
+++ b/tests/tt_eager/ops/test_eltwise_binary_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
diff --git a/tests/tt_eager/ops/test_eltwise_unary_op.cpp b/tests/tt_eager/ops/test_eltwise_unary_op.cpp
index e8ea5e37385..462979b0b38 100644
--- a/tests/tt_eager/ops/test_eltwise_unary_op.cpp
+++ b/tests/tt_eager/ops/test_eltwise_unary_op.cpp
@@ -4,7 +4,7 @@
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
@@ -12,7 +12,7 @@
#include "ttnn/operations/eltwise/unary/device/unary_device_operation.hpp"
#include "ttnn/operations/data_movement/pad/pad.hpp"
#include "ttnn/operation.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using tt::tt_metal::DataType;
diff --git a/tests/tt_eager/ops/test_fold_op.cpp b/tests/tt_eager/ops/test_fold_op.cpp
index 386b8f41f19..a2a5e8e2d7c 100644
--- a/tests/tt_eager/ops/test_fold_op.cpp
+++ b/tests/tt_eager/ops/test_fold_op.cpp
@@ -9,7 +9,7 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/fold/fold.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt::tt_metal;
diff --git a/tests/tt_eager/ops/test_layernorm_op.cpp b/tests/tt_eager/ops/test_layernorm_op.cpp
index 320b127eb3a..e6605614d0b 100644
--- a/tests/tt_eager/ops/test_layernorm_op.cpp
+++ b/tests/tt_eager/ops/test_layernorm_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/normalization/layernorm/layernorm.hpp"
#include
diff --git a/tests/tt_eager/ops/test_pad_op.cpp b/tests/tt_eager/ops/test_pad_op.cpp
index 8a551740cf0..7d8619d0a42 100644
--- a/tests/tt_eager/ops/test_pad_op.cpp
+++ b/tests/tt_eager/ops/test_pad_op.cpp
@@ -4,12 +4,12 @@
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operation.hpp"
#include "ttnn/operations/data_movement/pad/pad.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using tt::tt_metal::DataType;
diff --git a/tests/tt_eager/ops/test_sfpu.cpp b/tests/tt_eager/ops/test_sfpu.cpp
index cb857da85b7..b4e94afa4dc 100644
--- a/tests/tt_eager/ops/test_sfpu.cpp
+++ b/tests/tt_eager/ops/test_sfpu.cpp
@@ -10,10 +10,10 @@
#include
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/impl/buffers/buffer.hpp"
-#include "common/bfloat16.hpp"
+#include
+#include
+#include
+#include
#include "tests_common/sfpu_helper/sfpu_helper.hpp"
#include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp"
// #include "tt_gdb/tt_gdb.hpp"
diff --git a/tests/tt_eager/ops/test_sliding_window_ops.cpp b/tests/tt_eager/ops/test_sliding_window_ops.cpp
index a0e62d2038d..0abdf062283 100644
--- a/tests/tt_eager/ops/test_sliding_window_ops.cpp
+++ b/tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -10,7 +10,7 @@
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/sliding_window/reference_sliding_window.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
#include "ttnn/tensor/types.hpp"
diff --git a/tests/tt_eager/ops/test_softmax_op.cpp b/tests/tt_eager/ops/test_softmax_op.cpp
index b843d54a856..c6b583ad5b5 100644
--- a/tests/tt_eager/ops/test_softmax_op.cpp
+++ b/tests/tt_eager/ops/test_softmax_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/normalization/softmax/softmax.hpp"
#include "ttnn/operations/functions.hpp"
diff --git a/tests/tt_eager/ops/test_tilize_op.cpp b/tests/tt_eager/ops/test_tilize_op.cpp
index 731cc295fff..ca3d831c669 100644
--- a/tests/tt_eager/ops/test_tilize_op.cpp
+++ b/tests/tt_eager/ops/test_tilize_op.cpp
@@ -7,12 +7,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize/tilize.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
index 9d292fda6e8..be19f2cac10 100644
--- a/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
+++ b/tests/tt_eager/ops/test_tilize_op_channels_last.cpp
@@ -6,12 +6,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize/tilize.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_tilize_zero_padding.cpp b/tests/tt_eager/ops/test_tilize_zero_padding.cpp
index 3ae2ecbd80f..6037c70de3e 100644
--- a/tests/tt_eager/ops/test_tilize_zero_padding.cpp
+++ b/tests/tt_eager/ops/test_tilize_zero_padding.cpp
@@ -6,12 +6,12 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp"
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
index 2181552b33b..26199d4e833 100644
--- a/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
+++ b/tests/tt_eager/ops/test_tilize_zero_padding_channels_last.cpp
@@ -6,13 +6,13 @@
#include
#include
-#include "common/constants.hpp"
+#include
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/tilize_with_val_padding/tilize_with_val_padding.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_transpose_op.cpp b/tests/tt_eager/ops/test_transpose_op.cpp
index 42f5a12d8ea..e157d30b86d 100644
--- a/tests/tt_eager/ops/test_transpose_op.cpp
+++ b/tests/tt_eager/ops/test_transpose_op.cpp
@@ -2,7 +2,7 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "tt_metal/host_api.hpp"
+#include
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
#include
diff --git a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
index 9749652d152..7b332af4e87 100644
--- a/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
+++ b/tests/tt_eager/ops/test_transpose_wh_multi_core.cpp
@@ -11,7 +11,7 @@
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
index 9749652d152..7b332af4e87 100644
--- a/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
+++ b/tests/tt_eager/ops/test_transpose_wh_single_core.cpp
@@ -11,7 +11,7 @@
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/operations/data_movement/transpose/transpose.hpp"
-#include "tt_metal/host_api.hpp"
+#include
using namespace tt;
using namespace tt_metal;
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
index f720ed60cbc..557382809f7 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_transpose.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
# SPDX-License-Identifier: Apache-2.0
@@ -1092,3 +1092,31 @@ def test_transpose_hw_rm(shape, device):
tt_output = ttnn.transpose(tt_input, 2, 3)
tt_output = ttnn.to_torch(tt_output)
assert_with_pcc(torch_output, tt_output, 0.9999)
+
+
+@skip_for_grayskull("Grayskull does not support float32")
+def test_transpose_16411(device):
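+    # Exercises transpose on a rank-6 tensor across several dim pairs; the number in the
+    # test name presumably refers to the tracking issue for this case.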
+ torch.manual_seed(2005)
+ input_shape = (5, 3, 1, 1, 12, 8)
+ a = torch.rand(input_shape, dtype=torch.bfloat16)
+ p_b2 = torch.transpose(a, 1, 3)
+ p_b3 = torch.transpose(a, 1, 5)
+ p_c = torch.transpose(a, 0, 4)
+ p_c2 = torch.transpose(a, 1, 4)
+ p_c3 = torch.transpose(a, 2, 4)
+ p_c4 = torch.transpose(a, 3, 4)
+
+ b = ttnn.from_torch(a, dtype=ttnn.float32, layout=ttnn.TILE_LAYOUT, device=device)
+ b2 = ttnn.transpose(b, 1, 3)
+ b3 = ttnn.transpose(b, 1, 5)
+ c = ttnn.transpose(b, 0, 4)
+ c2 = ttnn.transpose(b, 1, 4)
+ c3 = ttnn.transpose(b, 2, 4)
+ c4 = ttnn.transpose(b, 3, 4)
+
+ assert_with_pcc(p_b2, ttnn.to_torch(b2), 0.9999)
+ assert_with_pcc(p_b3, ttnn.to_torch(b3), 0.9999)
+ assert_with_pcc(p_c, ttnn.to_torch(c), 0.9999)
+ assert_with_pcc(p_c2, ttnn.to_torch(c2), 0.9999)
+ assert_with_pcc(p_c3, ttnn.to_torch(c3), 0.9999)
+ assert_with_pcc(p_c4, ttnn.to_torch(c4), 0.9999)
diff --git a/tests/tt_eager/tensors/test_async_tensor_apis.cpp b/tests/tt_eager/tensors/test_async_tensor_apis.cpp
index 191e2308373..a7e3538efce 100644
--- a/tests/tt_eager/tensors/test_async_tensor_apis.cpp
+++ b/tests/tt_eager/tensors/test_async_tensor_apis.cpp
@@ -4,8 +4,8 @@
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
@@ -13,7 +13,7 @@
#include "ttnn/tensor/tensor_impl.hpp"
#include "ttnn/tensor/types.hpp"
#include "tests/tt_metal/tt_metal/common/dispatch_fixture.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/eltwise/binary/binary.hpp"
#include "ttnn/operations/eltwise/unary/unary.hpp"
diff --git a/tests/tt_eager/tensors/test_copy_and_move.cpp b/tests/tt_eager/tensors/test_copy_and_move.cpp
index 1b6b4e15b4e..96ba15e48a7 100644
--- a/tests/tt_eager/tensors/test_copy_and_move.cpp
+++ b/tests/tt_eager/tensors/test_copy_and_move.cpp
@@ -2,14 +2,14 @@
//
// SPDX-License-Identifier: Apache-2.0
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/cpp/ttnn/operations/creation.hpp"
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_host_device_loopback.cpp b/tests/tt_eager/tensors/test_host_device_loopback.cpp
index c50b8ee06af..40d8886fcb9 100644
--- a/tests/tt_eager/tensors/test_host_device_loopback.cpp
+++ b/tests/tt_eager/tensors/test_host_device_loopback.cpp
@@ -6,11 +6,11 @@
#include
#include
-#include "common/constants.hpp"
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_ranks.cpp b/tests/tt_eager/tensors/test_ranks.cpp
index 593ea923810..10dd0acbc77 100644
--- a/tests/tt_eager/tensors/test_ranks.cpp
+++ b/tests/tt_eager/tensors/test_ranks.cpp
@@ -6,13 +6,13 @@
#include
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
using namespace tt;
diff --git a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
index 720a9fe0c93..bc68c910fe5 100644
--- a/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
+++ b/tests/tt_eager/tensors/test_raw_host_memory_pointer.cpp
@@ -6,15 +6,15 @@
#include
#include
-#include "common/bfloat16.hpp"
-#include "common/constants.hpp"
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/constants.hpp>
#include "ttnn/tensor/host_buffer/functions.hpp"
#include "ttnn/tensor/host_buffer/types.hpp"
#include "ttnn/tensor/tensor.hpp"
#include "ttnn/tensor/tensor_impl.hpp"
#include "ttnn/operations/eltwise/binary/binary.hpp"
#include "ttnn/operations/eltwise/unary/unary.hpp"
-#include "tt_metal/host_api.hpp"
+#include <tt-metalium/host_api.hpp>
#include "ttnn/operations/functions.hpp"
/*
diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp
index 1556024c57c..5e87bb84f8d 100644
--- a/tests/tt_metal/distributed/test_distributed.cpp
+++ b/tests/tt_metal/distributed/test_distributed.cpp
@@ -3,7 +3,7 @@
// SPDX-License-Identifier: Apache-2.0
#include "tests/tt_metal/distributed/distributed_fixture.hpp"
-#include "tt_metal/distributed/system_mesh.hpp"
+#include <tt-metalium/system_mesh.hpp>
namespace tt::tt_metal::distributed::test {
diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp
index c746d6a80a3..43f9a17b8bf 100644
--- a/tests/tt_metal/distributed/test_mesh_workload.cpp
+++ b/tests/tt_metal/distributed/test_mesh_workload.cpp
@@ -6,9 +6,9 @@
#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp"
#include "tests/tt_metal/distributed/distributed_fixture.hpp"
-#include "tt_metal/host_api.hpp"
-#include "tt_metal/detail/tt_metal.hpp"
-#include "tt_metal/common/bfloat16.hpp"
+#include <tt-metalium/host_api.hpp>
+#include <tt-metalium/tt_metal.hpp>
+#include <tt-metalium/bfloat16.hpp>
namespace tt::tt_metal::distributed::test {
diff --git a/tests/tt_metal/llrt/test_libs/conv_pattern.hpp b/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
index aeacc91ee23..e1144d75f9f 100644
--- a/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
+++ b/tests/tt_metal/llrt/test_libs/conv_pattern.hpp
@@ -5,8 +5,8 @@
#pragma once
#include